3 #define __PHYSICSFS_INTERNAL__
4 #include "physfs_internal.h"
8 * From rfc3629, the UTF-8 spec:
9 * http://www.ietf.org/rfc/rfc3629.txt
11 * Char. number range | UTF-8 octet sequence
12 * (hexadecimal) | (binary)
13 * --------------------+---------------------------------------------
14 * 0000 0000-0000 007F | 0xxxxxxx
15 * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
16 * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
17 * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
22 * This may not be the best value, but it's one that isn't represented
23 * in Unicode (0x10FFFF is the largest codepoint value). We return this
24 * value from utf8codepoint() if there are bogus bits in the
25 * stream. Callers such as PHYSFS_utf8ToUcs4() turn this value into something
26 * reasonable (like a question mark), for text that wants to try to recover,
27 * whereas utf8valid() will use the value to determine if a string has bad
/* Sentinel (not a codepoint) that utf8codepoint() returns for malformed input. */
30 #define UNICODE_BOGUS_CHAR_VALUE 0xFFFFFFFF
33 * This is the codepoint we currently return when there were bogus bits in a
34 * UTF-8 string. May not fly in Asian locales?
/* Stored by the converters in this file in place of UNICODE_BOGUS_CHAR_VALUE. */
36 #define UNICODE_BOGUS_CHAR_CODEPOINT '?'
38 static PHYSFS_uint32 utf8codepoint(const char **_str)
40 const char *str = *_str;
41 PHYSFS_uint32 retval = 0;
42 PHYSFS_uint32 octet = (PHYSFS_uint32) ((PHYSFS_uint8) *str);
43 PHYSFS_uint32 octet2, octet3, octet4;
45 if (octet == 0) /* null terminator, end of string. */
48 else if (octet < 128) /* one octet char: 0 to 127 */
50 (*_str)++; /* skip to next possible start of codepoint. */
54 else if ((octet > 127) && (octet < 192)) /* bad (starts with 10xxxxxx). */
57 * Apparently each of these is supposed to be flagged as a bogus
58 * char, instead of just resyncing to the next valid codepoint.
60 (*_str)++; /* skip to next possible start of codepoint. */
61 return UNICODE_BOGUS_CHAR_VALUE;
64 else if (octet < 224) /* two octets */
67 octet2 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
68 if ((octet2 & (128+64)) != 128) /* Format isn't 10xxxxxx? */
69 return UNICODE_BOGUS_CHAR_VALUE;
71 *_str += 2; /* skip to next possible start of codepoint. */
72 retval = ((octet << 6) | (octet2 - 128));
73 if ((retval >= 0x80) && (retval <= 0x7FF))
77 else if (octet < 240) /* three octets */
80 octet2 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
81 if ((octet2 & (128+64)) != 128) /* Format isn't 10xxxxxx? */
82 return UNICODE_BOGUS_CHAR_VALUE;
84 octet3 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
85 if ((octet3 & (128+64)) != 128) /* Format isn't 10xxxxxx? */
86 return UNICODE_BOGUS_CHAR_VALUE;
88 *_str += 3; /* skip to next possible start of codepoint. */
89 retval = ( ((octet << 12)) | ((octet2-128) << 6) | ((octet3-128)) );
91 /* There are seven "UTF-16 surrogates" that are illegal in UTF-8. */
101 return UNICODE_BOGUS_CHAR_VALUE;
104 /* 0xFFFE and 0xFFFF are illegal, too, so we check them at the edge. */
105 if ((retval >= 0x800) && (retval <= 0xFFFD))
109 else if (octet < 248) /* four octets */
111 octet -= (128+64+32+16);
112 octet2 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
113 if ((octet2 & (128+64)) != 128) /* Format isn't 10xxxxxx? */
114 return UNICODE_BOGUS_CHAR_VALUE;
116 octet3 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
117 if ((octet3 & (128+64)) != 128) /* Format isn't 10xxxxxx? */
118 return UNICODE_BOGUS_CHAR_VALUE;
120 octet4 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
121 if ((octet4 & (128+64)) != 128) /* Format isn't 10xxxxxx? */
122 return UNICODE_BOGUS_CHAR_VALUE;
124 *_str += 4; /* skip to next possible start of codepoint. */
125 retval = ( ((octet << 18)) | ((octet2 - 128) << 12) |
126 ((octet3 - 128) << 6) | ((octet4 - 128)) );
127 if ((retval >= 0x10000) && (retval <= 0x10FFFF))
132 * Five and six octet sequences became illegal in rfc3629.
133 * We throw the codepoint away, but parse them to make sure we move
134 * ahead the right number of bytes and don't overflow the buffer.
137 else if (octet < 252) /* five octets */
139 octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
140 if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
141 return UNICODE_BOGUS_CHAR_VALUE;
143 octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
144 if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
145 return UNICODE_BOGUS_CHAR_VALUE;
147 octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
148 if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
149 return UNICODE_BOGUS_CHAR_VALUE;
151 octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
152 if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
153 return UNICODE_BOGUS_CHAR_VALUE;
155 *_str += 5; /* skip to next possible start of codepoint. */
156 return UNICODE_BOGUS_CHAR_VALUE;
159 else /* six octets */
161 octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
162 if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
163 return UNICODE_BOGUS_CHAR_VALUE;
165 octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
166 if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
167 return UNICODE_BOGUS_CHAR_VALUE;
169 octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
170 if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
171 return UNICODE_BOGUS_CHAR_VALUE;
173 octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
174 if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
175 return UNICODE_BOGUS_CHAR_VALUE;
177 octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
178 if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
179 return UNICODE_BOGUS_CHAR_VALUE;
181 *_str += 6; /* skip to next possible start of codepoint. */
182 return UNICODE_BOGUS_CHAR_VALUE;
185 return UNICODE_BOGUS_CHAR_VALUE;
186 } /* utf8codepoint */
189 void PHYSFS_utf8ToUcs4(const char *src, PHYSFS_uint32 *dst, PHYSFS_uint64 len)
191 len -= sizeof (PHYSFS_uint32); /* save room for null char. */
192 while (len >= sizeof (PHYSFS_uint32))
194 PHYSFS_uint32 cp = utf8codepoint(&src);
197 else if (cp == UNICODE_BOGUS_CHAR_VALUE)
198 cp = UNICODE_BOGUS_CHAR_CODEPOINT;
200 len -= sizeof (PHYSFS_uint32);
204 } /* PHYSFS_utf8ToUcs4 */
207 void PHYSFS_utf8ToUcs2(const char *src, PHYSFS_uint16 *dst, PHYSFS_uint64 len)
209 len -= sizeof (PHYSFS_uint16); /* save room for null char. */
210 while (len >= sizeof (PHYSFS_uint16))
212 PHYSFS_uint32 cp = utf8codepoint(&src);
215 else if (cp == UNICODE_BOGUS_CHAR_VALUE)
216 cp = UNICODE_BOGUS_CHAR_CODEPOINT;
218 /* !!! BLUESKY: UTF-16 surrogates? */
220 cp = UNICODE_BOGUS_CHAR_CODEPOINT;
223 len -= sizeof (PHYSFS_uint16);
227 } /* PHYSFS_utf8ToUcs2 */
/*
 * Encode the codepoint `cp` as UTF-8, storing bytes through a `dst` cursor.
 *
 *  Before encoding, values the file treats as illegal are replaced with
 *  UNICODE_BOGUS_CHAR_CODEPOINT. The encoding step then stores one, two,
 *  three or four bytes using the lead-byte patterns 0xxxxxxx, 110xxxxx,
 *  1110xxxx and 11110xxx, each followed by 10xxxxxx continuation bytes.
 *
 *  `_len` is read on entry as the remaining space.
 *  NOTE(review): several lines of this function are not visible in this
 *  listing (the guards that pick each branch, the declaration of `dst`, and
 *  whatever writes the cursor and remaining length back through _dst/_len).
 *  Confirm those against the full file.
 */
229 static void utf8fromcodepoint(PHYSFS_uint32 cp, char **_dst, PHYSFS_uint64 *_len)
232 PHYSFS_uint64 len = *_len;
/* NOTE(review): the condition guarding this first replacement is not visible here. */
238 cp = UNICODE_BOGUS_CHAR_CODEPOINT;
239 else if ((cp == 0xFFFE) || (cp == 0xFFFF)) /* illegal values. */
240 cp = UNICODE_BOGUS_CHAR_CODEPOINT;
243 /* There are seven "UTF-16 surrogates" that are illegal in UTF-8. */
253 cp = UNICODE_BOGUS_CHAR_CODEPOINT;
257 /* Do the encoding... */
/* One byte: the codepoint is stored as-is. */
260 *(dst++) = (char) cp;
/*
 * Two bytes: 110xxxxx then 10xxxxxx. In the continuation-byte stores below,
 *  the (char) cast binds to the masked value only; the "| 128" is then done
 *  in int and narrowed by the assignment.
 */
270 *(dst++) = (char) ((cp >> 6) | 128 | 64);
271 *(dst++) = (char) (cp & 0x3F) | 128;
276 else if (cp < 0x10000)
/* Three bytes: 1110xxxx then two continuation bytes of six bits each. */
282 *(dst++) = (char) ((cp >> 12) | 128 | 64 | 32);
283 *(dst++) = (char) ((cp >> 6) & 0x3F) | 128;
284 *(dst++) = (char) (cp & 0x3F) | 128;
/* Four bytes: 11110xxx then three continuation bytes of six bits each. */
295 *(dst++) = (char) ((cp >> 18) | 128 | 64 | 32 | 16);
296 *(dst++) = (char) ((cp >> 12) & 0x3F) | 128;
297 *(dst++) = (char) ((cp >> 6) & 0x3F) | 128;
298 *(dst++) = (char) (cp & 0x3F) | 128;
305 } /* utf8fromcodepoint */
/*
 * Shared body of the PHYSFS_utf8From*() converters.
 *
 *  Reads elements from `src`, casts each one through `typ` and then to a
 *  PHYSFS_uint32 codepoint, stops at a zero element, and hands every other
 *  codepoint to utf8fromcodepoint() along with the addresses of `dst` and
 *  `len`.
 *
 *  The expansion is a run of bare statements and contains a `return` (taken
 *  when len is zero) and a `break`, and it evaluates its arguments more than
 *  once. So it is only suitable as the body of a void function, with plain
 *  variable names as arguments.
 *  NOTE(review): some lines of the macro (including the loop that the
 *  `break` belongs to) are not visible in this listing.
 */
307 #define UTF8FROMTYPE(typ, src, dst, len) \
308 if (len == 0) return; \
312 const PHYSFS_uint32 cp = (PHYSFS_uint32) ((typ) (*(src++))); \
313 if (cp == 0) break; \
314 utf8fromcodepoint(cp, &dst, &len); \
/*
 * Convert the zero-terminated UCS-4 string `src` to UTF-8 in `dst`.
 *  The work is done by UTF8FROMTYPE with PHYSFS_uint32 as the element type;
 *  `len` is passed through as the space available in `dst`.
 *  NOTE(review): presumably `len` is in bytes; confirm against the header.
 */
318 void PHYSFS_utf8FromUcs4(const PHYSFS_uint32 *src, char *dst, PHYSFS_uint64 len)
320 UTF8FROMTYPE(PHYSFS_uint32, src, dst, len);
321 } /* PHYSFS_utf8FromUcs4 */
323 void PHYSFS_utf8FromUcs2(const PHYSFS_uint16 *src, char *dst, PHYSFS_uint64 len)
325 UTF8FROMTYPE(PHYSFS_uint64, src, dst, len);
326 } /* PHYSFS_utf8FromUcs4 */
/*
 * Convert the null-terminated Latin-1 string `src` to UTF-8 in `dst`.
 *  Each byte is cast through PHYSFS_uint8 before widening, so bytes with the
 *  high bit set become codepoints 0x80-0xFF whether or not plain char is
 *  signed. `len` is passed through as the space available in `dst`.
 */
328 /* latin1 maps to unicode codepoints directly, we just utf-8 encode it. */
329 void PHYSFS_utf8FromLatin1(const char *src, char *dst, PHYSFS_uint64 len)
331 UTF8FROMTYPE(PHYSFS_uint8, src, dst, len);
332 } /* PHYSFS_utf8FromLatin1 */
/*
 * One case-folding table entry. The lookup code in this file matches on its
 *  `from` member and copies out `to0`, `to1` and `to2`.
 *  NOTE(review): the member declarations are not visible in this listing.
 */
337 typedef struct CaseFoldMapping
/*
 * One bucket of the case-folding hash table: `list` points at `count`
 *  consecutive CaseFoldMapping entries, which the lookup scans linearly.
 */
345 typedef struct CaseFoldHashBucket
347 const PHYSFS_uint8 count;
348 const CaseFoldMapping *list;
349 } CaseFoldHashBucket;
351 #include "physfs_casefolding.h"
/*
 * Look up the case-folded form of the codepoint `from`.
 *
 *  The bucket index is the low byte of (from ^ (from >> 8)). That bucket of
 *  case_fold_hash is scanned linearly, and on a match the entry's to0, to1
 *  and to2 are copied into to[0], to[1] and to[2].
 *
 *  NOTE(review): the second parameter line, the declaration of `i`, and the
 *  statements on the not-found path are not visible in this listing.
 *  case_fold_hash is presumably supplied by physfs_casefolding.h; confirm.
 */
353 static void locate_case_fold_mapping(const PHYSFS_uint32 from,
357 const PHYSFS_uint8 hashed = ((from ^ (from >> 8)) & 0xFF);
358 const CaseFoldHashBucket *bucket = &case_fold_hash[hashed];
359 const CaseFoldMapping *mapping = bucket->list;
361 for (i = 0; i < bucket->count; i++, mapping++)
363 if (mapping->from == from)
365 to[0] = mapping->to0;
366 to[1] = mapping->to1;
367 to[2] = mapping->to2;
372 /* Not found...there's no remapping for this codepoint. */
376 } /* locate_case_fold_mapping */
379 static int utf8codepointcmp(const PHYSFS_uint32 cp1, const PHYSFS_uint32 cp2)
381 PHYSFS_uint32 folded1[3], folded2[3];
382 locate_case_fold_mapping(cp1, folded1);
383 locate_case_fold_mapping(cp2, folded2);
384 return ( (folded1[0] == folded2[0]) &&
385 (folded1[1] == folded2[1]) &&
386 (folded1[2] == folded2[2]) );
387 } /* utf8codepointcmp */
/*
 * Case-insensitive comparison of two UTF-8 strings, decoding one codepoint
 *  from each with utf8codepoint() and comparing the pair with
 *  utf8codepointcmp().
 *
 *  Despite the strcasecmp-like name, the visible returns are boolean: 0 as
 *  soon as a pair of codepoints differs after folding, and 1 once a matching
 *  pair turns out to be the terminator (cp1 == 0).
 *  NOTE(review): the loop header around these statements is not visible in
 *  this listing.
 */
390 int __PHYSFS_utf8strcasecmp(const char *str1, const char *str2)
394 const PHYSFS_uint32 cp1 = utf8codepoint(&str1);
395 const PHYSFS_uint32 cp2 = utf8codepoint(&str2);
396 if (!utf8codepointcmp(cp1, cp2)) return 0;
397 if (cp1 == 0) return 1;
400 return 0; /* shouldn't hit this. */
401 } /* __PHYSFS_utf8strcasecmp */
/*
 * Length-limited, case-insensitive comparison of two UTF-8 strings,
 *  decoding one codepoint from each with utf8codepoint() and comparing the
 *  pair with utf8codepointcmp().
 *
 *  The visible returns are boolean: 0 as soon as a pair differs after
 *  folding, 1 when a matching pair is the terminator, and 1 when the limit
 *  is reached without a mismatch.
 *  NOTE(review): the loop header and the statement that consumes `n` are not
 *  visible in this listing. Presumably `n` counts codepoints rather than
 *  bytes; confirm.
 */
404 int __PHYSFS_utf8strnicmp(const char *str1, const char *str2, PHYSFS_uint32 n)
408 const PHYSFS_uint32 cp1 = utf8codepoint(&str1);
409 const PHYSFS_uint32 cp2 = utf8codepoint(&str2);
410 if (!utf8codepointcmp(cp1, cp2)) return 0;
411 if (cp1 == 0) return 1;
415 return 1; /* matched to n chars. */
416 } /* __PHYSFS_utf8strnicmp */
/*
 * Compare two null-terminated strings, ignoring the case of ASCII letters.
 *
 *  Only 'A'-'Z' are folded (to 'a'-'z'); every other byte compares as
 *  itself. Bytes are compared as unsigned char, the way strcmp() orders
 *  them, so bytes with the high bit set sort after all ASCII characters
 *  regardless of whether plain char is signed on the platform.
 *
 *  Returns -1 if str1 sorts before str2, 1 if it sorts after, 0 if equal.
 */
int __PHYSFS_stricmpASCII(const char *str1, const char *str2)
{
    while (1)
    {
        const unsigned char ch1 = (unsigned char) *(str1++);
        const unsigned char ch2 = (unsigned char) *(str2++);
        const unsigned char cp1 =
            ((ch1 >= 'A') && (ch1 <= 'Z')) ? (unsigned char) (ch1+32) : ch1;
        const unsigned char cp2 =
            ((ch2 >= 'A') && (ch2 <= 'Z')) ? (unsigned char) (ch2+32) : ch2;

        if (cp1 < cp2)
            return -1;
        else if (cp1 > cp2)
            return 1;
        else if (cp1 == 0)  /* they're both null chars? */
            return 0;
    } /* while */

    return 0;  /* shouldn't hit this. */
} /* __PHYSFS_stricmpASCII */
439 int __PHYSFS_strnicmpASCII(const char *str1, const char *str2, PHYSFS_uint32 n)
443 const char ch1 = *(str1++);
444 const char ch2 = *(str2++);
445 const char cp1 = ((ch1 >= 'A') && (ch1 <= 'Z')) ? (ch1+32) : ch1;
446 const char cp2 = ((ch2 >= 'A') && (ch2 <= 'Z')) ? (ch2+32) : ch2;
451 else if (cp1 == 0) /* they're both null chars? */
456 } /* __PHYSFS_stricmpASCII */
459 /* end of physfs_unicode.c ... */