Debian lenny version packages
[pkg-perl] / deb-src / libhtml-parser-perl / libhtml-parser-perl-3.56 / Parser.xs
1 /* $Id: Parser.xs,v 2.137 2007/01/12 10:18:39 gisle Exp $
2  *
3  * Copyright 1999-2005, Gisle Aas.
4  * Copyright 1999-2000, Michael A. Chase.
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the same terms as Perl itself.
8  */
9
10
11 /*
12  * Standard XS greeting.
13  */
14 #ifdef __cplusplus
15 extern "C" {
16 #endif
17 #define PERL_NO_GET_CONTEXT     /* we want efficiency */
18 #include "EXTERN.h"
19 #include "perl.h"
20 #include "XSUB.h"
21 #ifdef __cplusplus
22 }
23 #endif
24
25
26
27 /*
28  * Some perl version compatibility gruff.
29  */
30 #include "patchlevel.h"
31 #if PATCHLEVEL <= 4 /* perl5.004_XX */
32
33 #ifndef PL_sv_undef
34    #define PL_sv_undef sv_undef
35    #define PL_sv_yes   sv_yes
36 #endif
37
38 #ifndef PL_hexdigit
39    #define PL_hexdigit hexdigit
40 #endif
41
42 #ifndef ERRSV
43    #define ERRSV GvSV(errgv)
44 #endif
45
46 #if (PATCHLEVEL == 4 && SUBVERSION <= 4)
47 /* The newSVpvn function was introduced in perl5.004_05 */
48 static SV *
49 newSVpvn(char *s, STRLEN len)
50 {
51     register SV *sv = newSV(0);
52     sv_setpvn(sv,s,len);
53     return sv;
54 }
55 #endif /* not perl5.004_05 */
56 #endif /* perl5.004_XX */
57
58 #ifndef dNOOP
59    #define dNOOP extern int errno
60 #endif
61 #ifndef dTHX
62    #define dTHX dNOOP
63    #define pTHX_
64    #define aTHX_
65 #endif
66
67 #ifndef MEMBER_TO_FPTR
68    #define MEMBER_TO_FPTR(x) (x)
69 #endif
70
71 #ifndef INT2PTR
72    #define INT2PTR(any,d)  (any)(d)
73    #define PTR2IV(p)       (IV)(p)
74 #endif
75
76
77 #if PATCHLEVEL > 6 || (PATCHLEVEL == 6 && SUBVERSION > 0)
78    #define RETHROW         croak(Nullch)
79 #else
80    #define RETHROW    { STRLEN my_na; croak("%s", SvPV(ERRSV, my_na)); }
81 #endif
82
83 #if PATCHLEVEL < 8
84    /* No useable Unicode support */
85    /* Make these harmless if present */
86    #undef SvUTF8
87    #undef SvUTF8_on
88    #undef SvUTF8_off
89    #define SvUTF8(sv)      0
90    #define SvUTF8_on(sv)   0
91    #define SvUTF8_off(sv)  0
92 #else
93    #define UNICODE_HTML_PARSER
94 #endif
95
96 #ifdef G_WARN_ON
97    #define DOWARN (PL_dowarn & G_WARN_ON)
98 #else
99    #define DOWARN PL_dowarn
100 #endif
101
102 /*
103  * Include stuff.  We include .c files instead of linking them,
104  * so that they don't have to pollute the external dll name space.
105  */
106
107 #ifdef EXTERN
108   #undef EXTERN
109 #endif
110
111 #define EXTERN static /* Don't pollute */
112
113 #include "hparser.h"
114 #include "util.c"
115 #include "hparser.c"
116
117
118 /*
119  * Support functions for the XS glue
120  */
121
122 static SV*
123 check_handler(pTHX_ SV* h)
124 {
125     if (SvROK(h)) {
126         SV* myref = SvRV(h);
127         if (SvTYPE(myref) == SVt_PVCV)
128             return newSVsv(h);
129         if (SvTYPE(myref) == SVt_PVAV)
130             return SvREFCNT_inc(myref);
131         croak("Only code or array references allowed as handler");
132     }
133     return SvOK(h) ? newSVsv(h) : 0;
134 }
135
136
137 static PSTATE*
138 get_pstate_iv(pTHX_ SV* sv)
139 {
140     PSTATE *p;
141 #if PATCHLEVEL < 8
142     p = INT2PTR(PSTATE*, SvIV(sv));
143 #else
144     MAGIC *mg = SvMAGICAL(sv) ? mg_find(sv, '~') : NULL;
145
146     if (!mg)
147         croak("Lost parser state magic");
148     p = (PSTATE *)mg->mg_ptr;
149     if (!p)
150         croak("Lost parser state magic");
151 #endif
152     if (p->signature != P_SIGNATURE)
153         croak("Bad signature in parser state object at %p", p);
154     return p;
155 }
156
157
158 static PSTATE*
159 get_pstate_hv(pTHX_ SV* sv)                               /* used by XS typemap */
160 {
161     HV* hv;
162     SV** svp;
163
164     sv = SvRV(sv);
165     if (!sv || SvTYPE(sv) != SVt_PVHV)
166         croak("Not a reference to a hash");
167     hv = (HV*)sv;
168     svp = hv_fetch(hv, "_hparser_xs_state", 17, 0);
169     if (svp) {
170         if (SvROK(*svp))
171             return get_pstate_iv(aTHX_ SvRV(*svp));
172         else
173             croak("_hparser_xs_state element is not a reference");
174     }
175     croak("Can't find '_hparser_xs_state' element in HTML::Parser hash");
176     return 0;
177 }
178
179
180 static void
181 free_pstate(pTHX_ PSTATE* pstate)
182 {
183     int i;
184     SvREFCNT_dec(pstate->buf);
185     SvREFCNT_dec(pstate->pend_text);
186     SvREFCNT_dec(pstate->skipped_text);
187 #ifdef MARKED_SECTION
188     SvREFCNT_dec(pstate->ms_stack);
189 #endif
190     SvREFCNT_dec(pstate->bool_attr_val);
191     for (i = 0; i < EVENT_COUNT; i++) {
192         SvREFCNT_dec(pstate->handlers[i].cb);
193         SvREFCNT_dec(pstate->handlers[i].argspec);
194     }
195
196     SvREFCNT_dec(pstate->report_tags);
197     SvREFCNT_dec(pstate->ignore_tags);
198     SvREFCNT_dec(pstate->ignore_elements);
199     SvREFCNT_dec(pstate->ignoring_element);
200
201     SvREFCNT_dec(pstate->tmp);
202
203     pstate->signature = 0;
204     Safefree(pstate);
205 }
206
207 static int
208 magic_free_pstate(pTHX_ SV *sv, MAGIC *mg)
209 {
210 #if PATCHLEVEL < 8
211     free_pstate(aTHX_ get_pstate_iv(aTHX_ sv));
212 #else
213     free_pstate(aTHX_ (PSTATE *)mg->mg_ptr);
214 #endif
215     return 0;
216 }
217
218 #if defined(USE_ITHREADS) && PATCHLEVEL >= 8
219
220 static PSTATE *
221 dup_pstate(pTHX_ PSTATE *pstate, CLONE_PARAMS *params)
222 {
223     PSTATE *pstate2;
224     int i;
225
226     Newz(56, pstate2, 1, PSTATE);
227     pstate2->signature = pstate->signature;
228
229     pstate2->buf = SvREFCNT_inc(sv_dup(pstate->buf, params));
230     pstate2->offset = pstate->offset;
231     pstate2->line = pstate->line;
232     pstate2->column = pstate->column;
233     pstate2->start_document = pstate->start_document;
234     pstate2->parsing = pstate->parsing;
235     pstate2->eof = pstate->eof;
236
237     pstate2->literal_mode = pstate->literal_mode;
238     pstate2->is_cdata = pstate->is_cdata;
239     pstate2->no_dash_dash_comment_end = pstate->no_dash_dash_comment_end;
240     pstate2->pending_end_tag = pstate->pending_end_tag;
241
242     pstate2->pend_text = SvREFCNT_inc(sv_dup(pstate->pend_text, params));
243     pstate2->pend_text_is_cdata = pstate->pend_text_is_cdata;
244     pstate2->pend_text_offset = pstate->pend_text_offset;
245     pstate2->pend_text_line = pstate->pend_text_offset;
246     pstate2->pend_text_column = pstate->pend_text_column;
247
248     pstate2->skipped_text = SvREFCNT_inc(sv_dup(pstate->skipped_text, params));
249
250 #ifdef MARKED_SECTION
251     pstate2->ms = pstate->ms;
252     pstate2->ms_stack =
253         (AV *)SvREFCNT_inc(sv_dup((SV *)pstate->ms_stack, params));
254     pstate2->marked_sections = pstate->marked_sections;
255 #endif
256
257     pstate2->strict_comment = pstate->strict_comment;
258     pstate2->strict_names = pstate->strict_names;
259     pstate2->strict_end = pstate->strict_end;
260     pstate2->xml_mode = pstate->xml_mode;
261     pstate2->unbroken_text = pstate->unbroken_text;
262     pstate2->attr_encoded = pstate->attr_encoded;
263     pstate2->case_sensitive = pstate->case_sensitive;
264     pstate2->closing_plaintext = pstate->closing_plaintext;
265     pstate2->utf8_mode = pstate->utf8_mode;
266     pstate2->empty_element_tags = pstate->empty_element_tags;
267     pstate2->xml_pic = pstate->xml_pic;
268
269     pstate2->bool_attr_val =
270         SvREFCNT_inc(sv_dup(pstate->bool_attr_val, params));
271     for (i = 0; i < EVENT_COUNT; i++) {
272         pstate2->handlers[i].cb =
273             SvREFCNT_inc(sv_dup(pstate->handlers[i].cb, params));
274         pstate2->handlers[i].argspec =
275             SvREFCNT_inc(sv_dup(pstate->handlers[i].argspec, params));
276     }
277     pstate2->argspec_entity_decode = pstate->argspec_entity_decode;
278
279     pstate2->report_tags =
280         (HV *)SvREFCNT_inc(sv_dup((SV *)pstate->report_tags, params));
281     pstate2->ignore_tags =
282         (HV *)SvREFCNT_inc(sv_dup((SV *)pstate->ignore_tags, params));
283     pstate2->ignore_elements =
284         (HV *)SvREFCNT_inc(sv_dup((SV *)pstate->ignore_elements, params));
285
286     pstate2->ignoring_element =
287         SvREFCNT_inc(sv_dup(pstate->ignoring_element, params));
288     pstate2->ignore_depth = pstate->ignore_depth;
289
290     if (params->flags & CLONEf_JOIN_IN) {
291         pstate2->entity2char =
292             perl_get_hv("HTML::Entities::entity2char", TRUE);
293     } else {
294         pstate2->entity2char = (HV *)sv_dup((SV *)pstate->entity2char, params);
295     }
296     pstate2->tmp = SvREFCNT_inc(sv_dup(pstate->tmp, params));
297
298     return pstate2;
299 }
300
301 static int
302 magic_dup_pstate(pTHX_ MAGIC *mg, CLONE_PARAMS *params)
303 {
304     mg->mg_ptr = (char *)dup_pstate(aTHX_ (PSTATE *)mg->mg_ptr, params);
305     return 0;
306 }
307
308 #endif
309
310 MGVTBL vtbl_pstate =
311 {
312     0,
313     0,
314     0,
315     0,
316     MEMBER_TO_FPTR(magic_free_pstate),
317 #if defined(USE_ITHREADS) && PATCHLEVEL >= 8
318     0,
319     MEMBER_TO_FPTR(magic_dup_pstate),
320 #endif
321 };
322
323
324 /*
325  *  XS interface definition.
326  */
327
328 MODULE = HTML::Parser           PACKAGE = HTML::Parser
329
330 PROTOTYPES: DISABLE
331
332 void
333 _alloc_pstate(self)
334         SV* self;
335     PREINIT:
336         PSTATE* pstate;
337         SV* sv;
338         HV* hv;
339         MAGIC* mg;
340
341     CODE:
342         sv = SvRV(self);
343         if (!sv || SvTYPE(sv) != SVt_PVHV)
344             croak("Not a reference to a hash");
345         hv = (HV*)sv;
346
347         Newz(56, pstate, 1, PSTATE);
348         pstate->signature = P_SIGNATURE;
349         pstate->entity2char = perl_get_hv("HTML::Entities::entity2char", TRUE);
350         pstate->tmp = NEWSV(0, 20);
351
352         sv = newSViv(PTR2IV(pstate));
353 #if PATCHLEVEL < 8
354         sv_magic(sv, 0, '~', 0, 0);
355 #else
356         sv_magic(sv, 0, '~', (char *)pstate, 0);
357 #endif
358         mg = mg_find(sv, '~');
359         assert(mg);
360         mg->mg_virtual = &vtbl_pstate;
361 #if defined(USE_ITHREADS) && PATCHLEVEL >= 8
362         mg->mg_flags |= MGf_DUP;
363 #endif
364         SvREADONLY_on(sv);
365
366         hv_store(hv, "_hparser_xs_state", 17, newRV_noinc(sv), 0);
367
368 void
369 parse(self, chunk)
370         SV* self;
371         SV* chunk
372     PREINIT:
373         PSTATE* p_state = get_pstate_hv(aTHX_ self);
374     PPCODE:
375         if (p_state->parsing)
376             croak("Parse loop not allowed");
377         p_state->parsing = 1;
378         if (SvROK(chunk) && SvTYPE(SvRV(chunk)) == SVt_PVCV) {
379             SV* generator = chunk;
380             STRLEN len;
381             do {
382                 int count;
383                 PUSHMARK(SP);
384                 count = perl_call_sv(generator, G_SCALAR|G_EVAL);
385                 SPAGAIN;
386                 chunk = count ? POPs : 0;
387                 PUTBACK;
388
389                 if (SvTRUE(ERRSV)) {
390                     p_state->parsing = 0;
391                     p_state->eof = 0;
392                     RETHROW;
393                 }
394
395                 if (chunk && SvOK(chunk)) {
396                     (void)SvPV(chunk, len);  /* get length */
397                 }
398                 else {
399                     len = 0;
400                 }
401                 parse(aTHX_ p_state, len ? chunk : 0, self);
402                 SPAGAIN;
403
404             } while (len && !p_state->eof);
405         }
406         else {
407             parse(aTHX_ p_state, chunk, self);
408             SPAGAIN;
409         }
410         p_state->parsing = 0;
411         if (p_state->eof) {
412             p_state->eof = 0;
413             PUSHs(sv_newmortal());
414         }
415         else {
416             PUSHs(self);
417         }
418
419 void
420 eof(self)
421         SV* self;
422     PREINIT:
423         PSTATE* p_state = get_pstate_hv(aTHX_ self);
424     PPCODE:
425         if (p_state->parsing)
426             p_state->eof = 1;
427         else {
428             p_state->parsing = 1;
429             parse(aTHX_ p_state, 0, self); /* flush */
430             p_state->parsing = 0;
431         }
432         PUSHs(self);
433
434 SV*
435 strict_comment(pstate,...)
436         PSTATE* pstate
437     ALIAS:
438         HTML::Parser::strict_comment = 1
439         HTML::Parser::strict_names = 2
440         HTML::Parser::xml_mode = 3
441         HTML::Parser::unbroken_text = 4
442         HTML::Parser::marked_sections = 5
443         HTML::Parser::attr_encoded = 6
444         HTML::Parser::case_sensitive = 7
445         HTML::Parser::strict_end = 8
446         HTML::Parser::closing_plaintext = 9
447         HTML::Parser::utf8_mode = 10
448         HTML::Parser::empty_element_tags = 11
449         HTML::Parser::xml_pic = 12
450     PREINIT:
451         bool *attr;
452     CODE:
453         switch (ix) {
454         case  1: attr = &pstate->strict_comment;       break;
455         case  2: attr = &pstate->strict_names;         break;
456         case  3: attr = &pstate->xml_mode;             break;
457         case  4: attr = &pstate->unbroken_text;        break;
458         case  5:
459 #ifdef MARKED_SECTION
460                  attr = &pstate->marked_sections;      break;
461 #else
462                  croak("marked sections not supported"); break;
463 #endif
464         case  6: attr = &pstate->attr_encoded;         break;
465         case  7: attr = &pstate->case_sensitive;       break;
466         case  8: attr = &pstate->strict_end;           break;
467         case  9: attr = &pstate->closing_plaintext;    break;
468 #ifdef UNICODE_HTML_PARSER
469         case 10: attr = &pstate->utf8_mode;            break;
470 #else
471         case 10: croak("The utf8_mode does not work with this perl; perl-5.8 or better required");
472 #endif
473         case 11: attr = &pstate->empty_element_tags;    break;
474         case 12: attr = &pstate->xml_pic;              break;
475         default:
476             croak("Unknown boolean attribute (%d)", ix);
477         }
478         RETVAL = boolSV(*attr);
479         if (items > 1)
480             *attr = SvTRUE(ST(1));
481     OUTPUT:
482         RETVAL
483
484 SV*
485 boolean_attribute_value(pstate,...)
486         PSTATE* pstate
487     CODE:
488         RETVAL = pstate->bool_attr_val ? newSVsv(pstate->bool_attr_val)
489                                        : &PL_sv_undef;
490         if (items > 1) {
491             SvREFCNT_dec(pstate->bool_attr_val);
492             pstate->bool_attr_val = newSVsv(ST(1));
493         }
494     OUTPUT:
495         RETVAL
496
497 void
498 ignore_tags(pstate,...)
499         PSTATE* pstate
500     ALIAS:
501         HTML::Parser::report_tags = 1
502         HTML::Parser::ignore_tags = 2
503         HTML::Parser::ignore_elements = 3
504     PREINIT:
505         HV** attr;
506         int i;
507     CODE:
508         switch (ix) {
509         case  1: attr = &pstate->report_tags;     break;
510         case  2: attr = &pstate->ignore_tags;     break;
511         case  3: attr = &pstate->ignore_elements; break;
512         default:
513             croak("Unknown tag-list attribute (%d)", ix);
514         }
515         if (GIMME_V != G_VOID)
516             croak("Can't report tag lists yet");
517
518         items--;  /* pstate */
519         if (items) {
520             if (*attr)
521                 hv_clear(*attr);
522             else
523                 *attr = newHV();
524
525             for (i = 0; i < items; i++) {
526                 SV* sv = ST(i+1);
527                 if (SvROK(sv)) {
528                     sv = SvRV(sv);
529                     if (SvTYPE(sv) == SVt_PVAV) {
530                         AV* av = (AV*)sv;
531                         STRLEN j;
532                         STRLEN len = av_len(av) + 1;
533                         for (j = 0; j < len; j++) {
534                             SV**svp = av_fetch(av, j, 0);
535                             if (svp) {
536                                 hv_store_ent(*attr, *svp, newSViv(0), 0);
537                             }
538                         }
539                     }
540                     else
541                         croak("Tag list must be plain scalars and arrays");
542                 }
543                 else {
544                     hv_store_ent(*attr, sv, newSViv(0), 0);
545                 }
546             }
547         }
548         else if (*attr) {
549             SvREFCNT_dec(*attr);
550             *attr = 0;
551         }
552
553 void
554 handler(pstate, eventname,...)
555         PSTATE* pstate
556         SV* eventname
557     PREINIT:
558         STRLEN name_len;
559         char *name = SvPV(eventname, name_len);
560         int event = -1;
561         int i;
562         struct p_handler *h;
563     PPCODE:
564         /* map event name string to event_id */
565         for (i = 0; i < EVENT_COUNT; i++) {
566             if (strEQ(name, event_id_str[i])) {
567                 event = i;
568                 break;
569             }
570         }
571         if (event < 0)
572             croak("No handler for %s events", name);
573
574         h = &pstate->handlers[event];
575
576         /* set up return value */
577         if (h->cb) {
578             PUSHs((SvTYPE(h->cb) == SVt_PVAV)
579                          ? sv_2mortal(newRV_inc(h->cb))
580                          : sv_2mortal(newSVsv(h->cb)));
581         }
582         else {
583             PUSHs(&PL_sv_undef);
584         }
585
586         /* update */
587         if (items > 3) {
588             SvREFCNT_dec(h->argspec);
589             h->argspec = 0;
590             h->argspec = argspec_compile(ST(3), pstate);
591         }
592         if (items > 2) {
593             SvREFCNT_dec(h->cb);
594             h->cb = 0;
595             h->cb = check_handler(aTHX_ ST(2));
596         }
597
598
599 MODULE = HTML::Parser           PACKAGE = HTML::Entities
600
601 void
602 decode_entities(...)
603     PREINIT:
604         int i;
605         HV *entity2char = perl_get_hv("HTML::Entities::entity2char", FALSE);
606     PPCODE:
607         if (GIMME_V == G_SCALAR && items > 1)
608             items = 1;
609         for (i = 0; i < items; i++) {
610             if (GIMME_V != G_VOID)
611                 ST(i) = sv_2mortal(newSVsv(ST(i)));
612             else if (SvREADONLY(ST(i)))
613                 croak("Can't inline decode readonly string");
614             decode_entities(aTHX_ ST(i), entity2char, 0);
615         }
616         SP += items;
617
618 void
619 _decode_entities(string, entities, ...)
620     SV* string
621     SV* entities
622     PREINIT:
623         HV* entities_hv;
624         bool expand_prefix = (items > 2) ? SvTRUE(ST(2)) : 0;
625     CODE:
626         if (SvOK(entities)) {
627             if (SvROK(entities) && SvTYPE(SvRV(entities)) == SVt_PVHV) {
628                 entities_hv = (HV*)SvRV(entities);
629             }
630             else {
631                 croak("2nd argument must be hash reference");
632             }
633         }
634         else {
635             entities_hv = 0;
636         }
637         if (SvREADONLY(string))
638             croak("Can't inline decode readonly string");
639         decode_entities(aTHX_ string, entities_hv, expand_prefix);
640
641 bool
642 _probably_utf8_chunk(string)
643     SV* string
644     PREINIT:
645         STRLEN len;
646         char *s;
647     CODE:
648 #ifdef UNICODE_HTML_PARSER
649         sv_utf8_downgrade(string, 0);
650         s = SvPV(string, len);
651         RETVAL = probably_utf8_chunk(aTHX_ s, len);
652 #else
653         RETVAL = 0; /* avoid never initialized complains from compiler */
654         croak("_probably_utf8_chunk() only works for Unicode enabled perls");
655 #endif
656     OUTPUT:
657         RETVAL
658
659 int
660 UNICODE_SUPPORT()
661     PROTOTYPE:
662     CODE:
663 #ifdef UNICODE_HTML_PARSER
664        RETVAL = 1;
665 #else
666        RETVAL = 0;
667 #endif
668     OUTPUT:
669        RETVAL
670
671
672 MODULE = HTML::Parser           PACKAGE = HTML::Parser