1 /* $Id: util.c,v 2.30 2006/03/22 09:15:17 gisle Exp $
3 * Copyright 1999-2006, Gisle Aas.
5 * This library is free software; you can redistribute it and/or
6 * modify it under the same terms as Perl itself.
15 sv_lower(pTHX_ SV* sv)
18 char *s = SvPV_force(sv, len);
25 strnEQx(const char* s1, const char* s2, STRLEN n, int ignore_case)
29 if (toLOWER(*s1) != toLOWER(*s2))
43 grow_gap(pTHX_ SV* sv, STRLEN grow, char** t, char** s, char** e)
46 SvPVX ---> AAAAAA...BBBBBB
50 STRLEN t_offset = *t - SvPVX(sv);
51 STRLEN s_offset = *s - SvPVX(sv);
52 STRLEN e_offset = *e - SvPVX(sv);
54 SvGROW(sv, e_offset + grow + 1);
56 *t = SvPVX(sv) + t_offset;
57 *s = SvPVX(sv) + s_offset;
58 *e = SvPVX(sv) + e_offset;
60 Move(*s, *s+grow, *e - *s, char);
66 decode_entities(pTHX_ SV* sv, HV* entity2char, bool expand_prefix)
69 char *s = SvPV_force(sv, len);
76 #ifdef UNICODE_HTML_PARSER
77 char buf[UTF8_MAXLEN];
79 int high_surrogate = 0;
84 #if defined(__GNUC__) && defined(UNICODE_HTML_PARSER)
85 /* gcc -Wall reports this variable as possibly used uninitialized */
92 if ((*t++ = *s++) != '&')
98 if (s < end && *s == '#') {
103 if (s < end && (*s == 'x' || *s == 'X')) {
106 char *tmp = strchr(PL_hexdigit, *s);
109 num = num << 4 | ((tmp - PL_hexdigit) & 15);
110 if (prev && num <= prev) {
121 while (s < end && isDIGIT(*s)) {
122 num = num * 10 + (*s - '0');
123 if (prev && num < prev) {
134 #ifdef UNICODE_HTML_PARSER
135 if (!SvUTF8(sv) && num <= 255) {
143 if ((num & 0xFFFFFC00) == 0xDC00) { /* low-surrogate */
144 if (high_surrogate != 0) {
145 t -= 3; /* Back up past 0xFFFD */
146 num = ((high_surrogate - 0xD800) << 10) +
147 (num - 0xDC00) + 0x10000;
153 else if ((num & 0xFFFFFC00) == 0xD800) { /* high-surrogate */
154 high_surrogate = num;
159 /* otherwise invalid? */
160 if ((num >= 0xFDD0 && num <= 0xFDEF) ||
161 ((num & 0xFFFE) == 0xFFFE) ||
168 tmp = (char*)uvuni_to_utf8((U8*)buf, num);
170 repl_len = tmp - buf;
175 buf[0] = (char) num & 0xFF;
184 while (s < end && isALNUM(*s))
186 if (ent_name != s && entity2char) {
188 if ( (svp = hv_fetch(entity2char, ent_name, s - ent_name, 0)) ||
189 (*s == ';' && (svp = hv_fetch(entity2char, ent_name, s - ent_name + 1, 0)))
192 repl = SvPV(*svp, repl_len);
193 #ifdef UNICODE_HTML_PARSER
194 repl_utf8 = SvUTF8(*svp);
197 else if (expand_prefix) {
199 while (ss > ent_name) {
200 svp = hv_fetch(entity2char, ent_name, ss - ent_name, 0);
202 repl = SvPV(*svp, repl_len);
203 #ifdef UNICODE_HTML_PARSER
204 repl_utf8 = SvUTF8(*svp);
213 #ifdef UNICODE_HTML_PARSER
219 char *repl_allocated = 0;
220 if (s < end && *s == ';')
222 t--; /* '&' already copied, undo it */
224 #ifdef UNICODE_HTML_PARSER
229 if (!SvUTF8(sv) && repl_utf8) {
230 /* need to upgrade sv before we continue */
231 STRLEN before_gap_len = t - SvPVX(sv);
232 char *before_gap = (char*)bytes_to_utf8((U8*)SvPVX(sv), &before_gap_len);
233 STRLEN after_gap_len = end - s;
234 char *after_gap = (char*)bytes_to_utf8((U8*)s, &after_gap_len);
236 sv_setpvn(sv, before_gap, before_gap_len);
237 sv_catpvn(sv, after_gap, after_gap_len);
240 Safefree(before_gap);
243 s = t = SvPVX(sv) + before_gap_len;
244 end = SvPVX(sv) + before_gap_len + after_gap_len;
246 else if (SvUTF8(sv) && !repl_utf8) {
247 repl = (char*)bytes_to_utf8((U8*)repl, &repl_len);
248 repl_allocated = repl;
252 if (t + repl_len > s) {
253 /* need to grow the string */
254 grow_gap(aTHX_ sv, repl_len - (s - t), &t, &s, &end);
257 /* copy replacement string into string */
262 Safefree(repl_allocated);
265 while (ent_start < s)
271 SvCUR_set(sv, t - SvPVX(sv));
276 #ifdef UNICODE_HTML_PARSER
278 has_hibit(char *s, char *e)
282 if (!UTF8_IS_INVARIANT(ch)) {
291 probably_utf8_chunk(pTHX_ char *s, STRLEN len)
296 /* ignore partial utf8 char at end of buffer */
297 while (s < e && UTF8_IS_CONTINUATION((U8)*(e - 1)))
299 if (s < e && UTF8_IS_START((U8)*(e - 1)))
301 clen = len - (e - s);
302 if (clen && UTF8SKIP(e) == clen) {
303 /* all promised continuation bytes are present */
307 if (!has_hibit(s, e))
310 return is_utf8_string((U8*)s, e - s);