1 /* $Id: Parser.xs,v 2.137 2007/01/12 10:18:39 gisle Exp $
3 * Copyright 1999-2005, Gisle Aas.
4 * Copyright 1999-2000, Michael A. Chase.
6 * This library is free software; you can redistribute it and/or
7 * modify it under the same terms as Perl itself.
12 * Standard XS greeting.
17 #define PERL_NO_GET_CONTEXT /* we want efficiency */
28 * Some perl version compatibility gruff.
30 #include "patchlevel.h"
31 #if PATCHLEVEL <= 4 /* perl5.004_XX */
34 #define PL_sv_undef sv_undef
35 #define PL_sv_yes sv_yes
39 #define PL_hexdigit hexdigit
43 #define ERRSV GvSV(errgv)
46 #if (PATCHLEVEL == 4 && SUBVERSION <= 4)
47 /* The newSVpvn function was introduced in perl5.004_05 */
49 newSVpvn(char *s, STRLEN len)
51 register SV *sv = newSV(0);
55 #endif /* not perl5.004_05 */
56 #endif /* perl5.004_XX */
59 #define dNOOP extern int errno
67 #ifndef MEMBER_TO_FPTR
68 #define MEMBER_TO_FPTR(x) (x)
72 #define INT2PTR(any,d) (any)(d)
73 #define PTR2IV(p) (IV)(p)
77 #if PATCHLEVEL > 6 || (PATCHLEVEL == 6 && SUBVERSION > 0)
78 #define RETHROW croak(Nullch)
80 #define RETHROW { STRLEN my_na; croak("%s", SvPV(ERRSV, my_na)); }
84 /* No useable Unicode support */
85 /* Make these harmless if present */
90 #define SvUTF8_on(sv) 0
91 #define SvUTF8_off(sv) 0
93 #define UNICODE_HTML_PARSER
97 #define DOWARN (PL_dowarn & G_WARN_ON)
99 #define DOWARN PL_dowarn
103 * Include stuff. We include .c files instead of linking them,
104 * so that they don't have to pollute the external dll name space.
111 #define EXTERN static /* Don't pollute */
119 * Support functions for the XS glue
123 check_handler(pTHX_ SV* h)
127 if (SvTYPE(myref) == SVt_PVCV)
129 if (SvTYPE(myref) == SVt_PVAV)
130 return SvREFCNT_inc(myref);
131 croak("Only code or array references allowed as handler");
133 return SvOK(h) ? newSVsv(h) : 0;
138 get_pstate_iv(pTHX_ SV* sv)
142 p = INT2PTR(PSTATE*, SvIV(sv));
144 MAGIC *mg = SvMAGICAL(sv) ? mg_find(sv, '~') : NULL;
147 croak("Lost parser state magic");
148 p = (PSTATE *)mg->mg_ptr;
150 croak("Lost parser state magic");
152 if (p->signature != P_SIGNATURE)
153 croak("Bad signature in parser state object at %p", p);
159 get_pstate_hv(pTHX_ SV* sv) /* used by XS typemap */
165 if (!sv || SvTYPE(sv) != SVt_PVHV)
166 croak("Not a reference to a hash");
168 svp = hv_fetch(hv, "_hparser_xs_state", 17, 0);
171 return get_pstate_iv(aTHX_ SvRV(*svp));
173 croak("_hparser_xs_state element is not a reference");
175 croak("Can't find '_hparser_xs_state' element in HTML::Parser hash");
181 free_pstate(pTHX_ PSTATE* pstate)
184 SvREFCNT_dec(pstate->buf);
185 SvREFCNT_dec(pstate->pend_text);
186 SvREFCNT_dec(pstate->skipped_text);
187 #ifdef MARKED_SECTION
188 SvREFCNT_dec(pstate->ms_stack);
190 SvREFCNT_dec(pstate->bool_attr_val);
191 for (i = 0; i < EVENT_COUNT; i++) {
192 SvREFCNT_dec(pstate->handlers[i].cb);
193 SvREFCNT_dec(pstate->handlers[i].argspec);
196 SvREFCNT_dec(pstate->report_tags);
197 SvREFCNT_dec(pstate->ignore_tags);
198 SvREFCNT_dec(pstate->ignore_elements);
199 SvREFCNT_dec(pstate->ignoring_element);
201 SvREFCNT_dec(pstate->tmp);
203 pstate->signature = 0;
208 magic_free_pstate(pTHX_ SV *sv, MAGIC *mg)
211 free_pstate(aTHX_ get_pstate_iv(aTHX_ sv));
213 free_pstate(aTHX_ (PSTATE *)mg->mg_ptr);
218 #if defined(USE_ITHREADS) && PATCHLEVEL >= 8
221 dup_pstate(pTHX_ PSTATE *pstate, CLONE_PARAMS *params)
226 Newz(56, pstate2, 1, PSTATE);
227 pstate2->signature = pstate->signature;
229 pstate2->buf = SvREFCNT_inc(sv_dup(pstate->buf, params));
230 pstate2->offset = pstate->offset;
231 pstate2->line = pstate->line;
232 pstate2->column = pstate->column;
233 pstate2->start_document = pstate->start_document;
234 pstate2->parsing = pstate->parsing;
235 pstate2->eof = pstate->eof;
237 pstate2->literal_mode = pstate->literal_mode;
238 pstate2->is_cdata = pstate->is_cdata;
239 pstate2->no_dash_dash_comment_end = pstate->no_dash_dash_comment_end;
240 pstate2->pending_end_tag = pstate->pending_end_tag;
242 pstate2->pend_text = SvREFCNT_inc(sv_dup(pstate->pend_text, params));
243 pstate2->pend_text_is_cdata = pstate->pend_text_is_cdata;
244 pstate2->pend_text_offset = pstate->pend_text_offset;
245 pstate2->pend_text_line = pstate->pend_text_offset;
246 pstate2->pend_text_column = pstate->pend_text_column;
248 pstate2->skipped_text = SvREFCNT_inc(sv_dup(pstate->skipped_text, params));
250 #ifdef MARKED_SECTION
251 pstate2->ms = pstate->ms;
253 (AV *)SvREFCNT_inc(sv_dup((SV *)pstate->ms_stack, params));
254 pstate2->marked_sections = pstate->marked_sections;
257 pstate2->strict_comment = pstate->strict_comment;
258 pstate2->strict_names = pstate->strict_names;
259 pstate2->strict_end = pstate->strict_end;
260 pstate2->xml_mode = pstate->xml_mode;
261 pstate2->unbroken_text = pstate->unbroken_text;
262 pstate2->attr_encoded = pstate->attr_encoded;
263 pstate2->case_sensitive = pstate->case_sensitive;
264 pstate2->closing_plaintext = pstate->closing_plaintext;
265 pstate2->utf8_mode = pstate->utf8_mode;
266 pstate2->empty_element_tags = pstate->empty_element_tags;
267 pstate2->xml_pic = pstate->xml_pic;
269 pstate2->bool_attr_val =
270 SvREFCNT_inc(sv_dup(pstate->bool_attr_val, params));
271 for (i = 0; i < EVENT_COUNT; i++) {
272 pstate2->handlers[i].cb =
273 SvREFCNT_inc(sv_dup(pstate->handlers[i].cb, params));
274 pstate2->handlers[i].argspec =
275 SvREFCNT_inc(sv_dup(pstate->handlers[i].argspec, params));
277 pstate2->argspec_entity_decode = pstate->argspec_entity_decode;
279 pstate2->report_tags =
280 (HV *)SvREFCNT_inc(sv_dup((SV *)pstate->report_tags, params));
281 pstate2->ignore_tags =
282 (HV *)SvREFCNT_inc(sv_dup((SV *)pstate->ignore_tags, params));
283 pstate2->ignore_elements =
284 (HV *)SvREFCNT_inc(sv_dup((SV *)pstate->ignore_elements, params));
286 pstate2->ignoring_element =
287 SvREFCNT_inc(sv_dup(pstate->ignoring_element, params));
288 pstate2->ignore_depth = pstate->ignore_depth;
290 if (params->flags & CLONEf_JOIN_IN) {
291 pstate2->entity2char =
292 perl_get_hv("HTML::Entities::entity2char", TRUE);
294 pstate2->entity2char = (HV *)sv_dup((SV *)pstate->entity2char, params);
296 pstate2->tmp = SvREFCNT_inc(sv_dup(pstate->tmp, params));
302 magic_dup_pstate(pTHX_ MAGIC *mg, CLONE_PARAMS *params)
304 mg->mg_ptr = (char *)dup_pstate(aTHX_ (PSTATE *)mg->mg_ptr, params);
316 MEMBER_TO_FPTR(magic_free_pstate),
317 #if defined(USE_ITHREADS) && PATCHLEVEL >= 8
319 MEMBER_TO_FPTR(magic_dup_pstate),
325 * XS interface definition.
328 MODULE = HTML::Parser PACKAGE = HTML::Parser
343 if (!sv || SvTYPE(sv) != SVt_PVHV)
344 croak("Not a reference to a hash");
347 Newz(56, pstate, 1, PSTATE);
348 pstate->signature = P_SIGNATURE;
349 pstate->entity2char = perl_get_hv("HTML::Entities::entity2char", TRUE);
350 pstate->tmp = NEWSV(0, 20);
352 sv = newSViv(PTR2IV(pstate));
354 sv_magic(sv, 0, '~', 0, 0);
356 sv_magic(sv, 0, '~', (char *)pstate, 0);
358 mg = mg_find(sv, '~');
360 mg->mg_virtual = &vtbl_pstate;
361 #if defined(USE_ITHREADS) && PATCHLEVEL >= 8
362 mg->mg_flags |= MGf_DUP;
366 hv_store(hv, "_hparser_xs_state", 17, newRV_noinc(sv), 0);
373 PSTATE* p_state = get_pstate_hv(aTHX_ self);
375 if (p_state->parsing)
376 croak("Parse loop not allowed");
377 p_state->parsing = 1;
378 if (SvROK(chunk) && SvTYPE(SvRV(chunk)) == SVt_PVCV) {
379 SV* generator = chunk;
384 count = perl_call_sv(generator, G_SCALAR|G_EVAL);
386 chunk = count ? POPs : 0;
390 p_state->parsing = 0;
395 if (chunk && SvOK(chunk)) {
396 (void)SvPV(chunk, len); /* get length */
401 parse(aTHX_ p_state, len ? chunk : 0, self);
404 } while (len && !p_state->eof);
407 parse(aTHX_ p_state, chunk, self);
410 p_state->parsing = 0;
413 PUSHs(sv_newmortal());
423 PSTATE* p_state = get_pstate_hv(aTHX_ self);
425 if (p_state->parsing)
428 p_state->parsing = 1;
429 parse(aTHX_ p_state, 0, self); /* flush */
430 p_state->parsing = 0;
435 strict_comment(pstate,...)
438 HTML::Parser::strict_comment = 1
439 HTML::Parser::strict_names = 2
440 HTML::Parser::xml_mode = 3
441 HTML::Parser::unbroken_text = 4
442 HTML::Parser::marked_sections = 5
443 HTML::Parser::attr_encoded = 6
444 HTML::Parser::case_sensitive = 7
445 HTML::Parser::strict_end = 8
446 HTML::Parser::closing_plaintext = 9
447 HTML::Parser::utf8_mode = 10
448 HTML::Parser::empty_element_tags = 11
449 HTML::Parser::xml_pic = 12
454 case 1: attr = &pstate->strict_comment; break;
455 case 2: attr = &pstate->strict_names; break;
456 case 3: attr = &pstate->xml_mode; break;
457 case 4: attr = &pstate->unbroken_text; break;
459 #ifdef MARKED_SECTION
460 attr = &pstate->marked_sections; break;
462 croak("marked sections not supported"); break;
464 case 6: attr = &pstate->attr_encoded; break;
465 case 7: attr = &pstate->case_sensitive; break;
466 case 8: attr = &pstate->strict_end; break;
467 case 9: attr = &pstate->closing_plaintext; break;
468 #ifdef UNICODE_HTML_PARSER
469 case 10: attr = &pstate->utf8_mode; break;
471 case 10: croak("The utf8_mode does not work with this perl; perl-5.8 or better required");
473 case 11: attr = &pstate->empty_element_tags; break;
474 case 12: attr = &pstate->xml_pic; break;
476 croak("Unknown boolean attribute (%d)", ix);
478 RETVAL = boolSV(*attr);
480 *attr = SvTRUE(ST(1));
485 boolean_attribute_value(pstate,...)
488 RETVAL = pstate->bool_attr_val ? newSVsv(pstate->bool_attr_val)
491 SvREFCNT_dec(pstate->bool_attr_val);
492 pstate->bool_attr_val = newSVsv(ST(1));
498 ignore_tags(pstate,...)
501 HTML::Parser::report_tags = 1
502 HTML::Parser::ignore_tags = 2
503 HTML::Parser::ignore_elements = 3
509 case 1: attr = &pstate->report_tags; break;
510 case 2: attr = &pstate->ignore_tags; break;
511 case 3: attr = &pstate->ignore_elements; break;
513 croak("Unknown tag-list attribute (%d)", ix);
515 if (GIMME_V != G_VOID)
516 croak("Can't report tag lists yet");
518 items--; /* pstate */
525 for (i = 0; i < items; i++) {
529 if (SvTYPE(sv) == SVt_PVAV) {
532 STRLEN len = av_len(av) + 1;
533 for (j = 0; j < len; j++) {
534 SV**svp = av_fetch(av, j, 0);
536 hv_store_ent(*attr, *svp, newSViv(0), 0);
541 croak("Tag list must be plain scalars and arrays");
544 hv_store_ent(*attr, sv, newSViv(0), 0);
554 handler(pstate, eventname,...)
559 char *name = SvPV(eventname, name_len);
564 /* map event name string to event_id */
565 for (i = 0; i < EVENT_COUNT; i++) {
566 if (strEQ(name, event_id_str[i])) {
572 croak("No handler for %s events", name);
574 h = &pstate->handlers[event];
576 /* set up return value */
578 PUSHs((SvTYPE(h->cb) == SVt_PVAV)
579 ? sv_2mortal(newRV_inc(h->cb))
580 : sv_2mortal(newSVsv(h->cb)));
588 SvREFCNT_dec(h->argspec);
590 h->argspec = argspec_compile(ST(3), pstate);
595 h->cb = check_handler(aTHX_ ST(2));
599 MODULE = HTML::Parser PACKAGE = HTML::Entities
605 HV *entity2char = perl_get_hv("HTML::Entities::entity2char", FALSE);
607 if (GIMME_V == G_SCALAR && items > 1)
609 for (i = 0; i < items; i++) {
610 if (GIMME_V != G_VOID)
611 ST(i) = sv_2mortal(newSVsv(ST(i)));
612 else if (SvREADONLY(ST(i)))
613 croak("Can't inline decode readonly string");
614 decode_entities(aTHX_ ST(i), entity2char, 0);
619 _decode_entities(string, entities, ...)
624 bool expand_prefix = (items > 2) ? SvTRUE(ST(2)) : 0;
626 if (SvOK(entities)) {
627 if (SvROK(entities) && SvTYPE(SvRV(entities)) == SVt_PVHV) {
628 entities_hv = (HV*)SvRV(entities);
631 croak("2nd argument must be hash reference");
637 if (SvREADONLY(string))
638 croak("Can't inline decode readonly string");
639 decode_entities(aTHX_ string, entities_hv, expand_prefix);
642 _probably_utf8_chunk(string)
648 #ifdef UNICODE_HTML_PARSER
649 sv_utf8_downgrade(string, 0);
650 s = SvPV(string, len);
651 RETVAL = probably_utf8_chunk(aTHX_ s, len);
653 RETVAL = 0; /* avoid never initialized complains from compiler */
654 croak("_probably_utf8_chunk() only works for Unicode enabled perls");
663 #ifdef UNICODE_HTML_PARSER
672 MODULE = HTML::Parser PACKAGE = HTML::Parser