vcs.maemo.org Git - physicsfs/blob - physfs_unicode.c

   1 #include "physfs.h"
   2
   3 #define __PHYSICSFS_INTERNAL__
   4 #include "physfs_internal.h"
   5
   6
   7 /*
   8  * From rfc3629, the UTF-8 spec:
   9  *  http://www.ietf.org/rfc/rfc3629.txt
  10  *
  11  *   Char. number range  |        UTF-8 octet sequence
  12  *      (hexadecimal)    |              (binary)
  13  *   --------------------+---------------------------------------------
  14  *   0000 0000-0000 007F | 0xxxxxxx
  15  *   0000 0080-0000 07FF | 110xxxxx 10xxxxxx
  16  *   0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
  17  *   0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  18  */
  19
  20
/*
 * This may not be the best value, but it's one that isn't represented
 *  in Unicode (0x10FFFF is the largest codepoint value). We return this
 *  value from utf8codepoint() if there are bogus bits in the
 *  stream. Callers such as PHYSFS_utf8ToUcs4() will turn this value into
 *  something reasonable (like a question mark), for text that wants to try
 *  to recover, whereas utf8valid() will use the value to determine if a
 *  string has bad bits.
 */
  30 #define UNICODE_BOGUS_CHAR_VALUE 0xFFFFFFFF
  31
/*
 * This is the codepoint we currently return when there were bogus bits in a
 *  UTF-8 string. May not fly in Asian locales?
 */
  36 #define UNICODE_BOGUS_CHAR_CODEPOINT '?'
  37
  38 static PHYSFS_uint32 utf8codepoint(const char **_str)
  39 {
  40     const char *str = *_str;
  41     PHYSFS_uint32 retval = 0;
  42     PHYSFS_uint32 octet = (PHYSFS_uint32) ((PHYSFS_uint8) *str);
  43     PHYSFS_uint32 octet2, octet3, octet4;
  44
  45     if (octet == 0)  /* null terminator, end of string. */
  46         return 0;
  47
  48     else if (octet < 128)  /* one octet char: 0 to 127 */
  49     {
  50         (*_str)++;  /* skip to next possible start of codepoint. */
  51         return(octet);
  52     } /* else if */
  53
  54     else if ((octet > 127) && (octet < 192))  /* bad (starts with 10xxxxxx). */
  55     {
  56         /*
  57          * Apparently each of these is supposed to be flagged as a bogus
  58          *  char, instead of just resyncing to the next valid codepoint.
  59          */
  60         (*_str)++;  /* skip to next possible start of codepoint. */
  61         return UNICODE_BOGUS_CHAR_VALUE;
  62     } /* else if */
  63
  64     else if (octet < 224)  /* two octets */
  65     {
  66         octet -= (128+64);
  67         octet2 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  68         if ((octet2 & (128+64)) != 128)  /* Format isn't 10xxxxxx? */
  69             return UNICODE_BOGUS_CHAR_VALUE;
  70
  71         *_str += 2;  /* skip to next possible start of codepoint. */
  72         retval = ((octet << 6) | (octet2 - 128));
  73         if ((retval >= 0x80) && (retval <= 0x7FF))
  74             return retval;
  75     } /* else if */
  76
  77     else if (octet < 240)  /* three octets */
  78     {
  79         octet -= (128+64+32);
  80         octet2 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  81         if ((octet2 & (128+64)) != 128)  /* Format isn't 10xxxxxx? */
  82             return UNICODE_BOGUS_CHAR_VALUE;
  83
  84         octet3 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  85         if ((octet3 & (128+64)) != 128)  /* Format isn't 10xxxxxx? */
  86             return UNICODE_BOGUS_CHAR_VALUE;
  87
  88         *_str += 3;  /* skip to next possible start of codepoint. */
  89         retval = ( ((octet << 12)) | ((octet2-128) << 6) | ((octet3-128)) );
  90
  91         /* There are seven "UTF-16 surrogates" that are illegal in UTF-8. */
  92         switch (retval)
  93         {
  94             case 0xD800:
  95             case 0xDB7F:
  96             case 0xDB80:
  97             case 0xDBFF:
  98             case 0xDC00:
  99             case 0xDF80:
 100             case 0xDFFF:
 101                 return UNICODE_BOGUS_CHAR_VALUE;
 102         } /* switch */
 103
 104         /* 0xFFFE and 0xFFFF are illegal, too, so we check them at the edge. */
 105         if ((retval >= 0x800) && (retval <= 0xFFFD))
 106             return retval;
 107     } /* else if */
 108
 109     else if (octet < 248)  /* four octets */
 110     {
 111         octet -= (128+64+32+16);
 112         octet2 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
 113         if ((octet2 & (128+64)) != 128)  /* Format isn't 10xxxxxx? */
 114             return UNICODE_BOGUS_CHAR_VALUE;
 115
 116         octet3 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
 117         if ((octet3 & (128+64)) != 128)  /* Format isn't 10xxxxxx? */
 118             return UNICODE_BOGUS_CHAR_VALUE;
 119
 120         octet4 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
 121         if ((octet4 & (128+64)) != 128)  /* Format isn't 10xxxxxx? */
 122             return UNICODE_BOGUS_CHAR_VALUE;
 123
 124         *_str += 4;  /* skip to next possible start of codepoint. */
 125         retval = ( ((octet << 18)) | ((octet2 - 128) << 12) |
 126                    ((octet3 - 128) << 6) | ((octet4 - 128)) );
 127         if ((retval >= 0x10000) && (retval <= 0x10FFFF))
 128             return retval;
 129     } /* else if */
 130
 131     /*
 132      * Five and six octet sequences became illegal in rfc3629.
 133      *  We throw the codepoint away, but parse them to make sure we move
 134      *  ahead the right number of bytes and don't overflow the buffer.
 135      */
 136
 137     else if (octet < 252)  /* five octets */
 138     {
 139         octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
 140         if ((octet & (128+64)) != 128)  /* Format isn't 10xxxxxx? */
 141             return UNICODE_BOGUS_CHAR_VALUE;
 142
 143         octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
 144         if ((octet & (128+64)) != 128)  /* Format isn't 10xxxxxx? */
 145             return UNICODE_BOGUS_CHAR_VALUE;
 146
 147         octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
 148         if ((octet & (128+64)) != 128)  /* Format isn't 10xxxxxx? */
 149             return UNICODE_BOGUS_CHAR_VALUE;
 150
 151         octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
 152         if ((octet & (128+64)) != 128)  /* Format isn't 10xxxxxx? */
 153             return UNICODE_BOGUS_CHAR_VALUE;
 154
 155         *_str += 5;  /* skip to next possible start of codepoint. */
 156         return UNICODE_BOGUS_CHAR_VALUE;
 157     } /* else if */
 158
 159     else  /* six octets */
 160     {
 161         octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
 162         if ((octet & (128+64)) != 128)  /* Format isn't 10xxxxxx? */
 163             return UNICODE_BOGUS_CHAR_VALUE;
 164
 165         octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
 166         if ((octet & (128+64)) != 128)  /* Format isn't 10xxxxxx? */
 167             return UNICODE_BOGUS_CHAR_VALUE;
 168
 169         octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
 170         if ((octet & (128+64)) != 128)  /* Format isn't 10xxxxxx? */
 171             return UNICODE_BOGUS_CHAR_VALUE;
 172
 173         octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
 174         if ((octet & (128+64)) != 128)  /* Format isn't 10xxxxxx? */
 175             return UNICODE_BOGUS_CHAR_VALUE;
 176
 177         octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
 178         if ((octet & (128+64)) != 128)  /* Format isn't 10xxxxxx? */
 179             return UNICODE_BOGUS_CHAR_VALUE;
 180
 181         *_str += 6;  /* skip to next possible start of codepoint. */
 182         return UNICODE_BOGUS_CHAR_VALUE;
 183     } /* else if */
 184
 185     return UNICODE_BOGUS_CHAR_VALUE;
 186 } /* utf8codepoint */
 187
 188
 189 void PHYSFS_utf8ToUcs4(const char *src, PHYSFS_uint32 *dst, PHYSFS_uint64 len)
 190 {
 191     len -= sizeof (PHYSFS_uint32);   /* save room for null char. */
 192     while (len >= sizeof (PHYSFS_uint32))
 193     {
 194         PHYSFS_uint32 cp = utf8codepoint(&src);
 195         if (cp == 0)
 196             break;
 197         else if (cp == UNICODE_BOGUS_CHAR_VALUE)
 198             cp = UNICODE_BOGUS_CHAR_CODEPOINT;
 199         *(dst++) = cp;
 200         len -= sizeof (PHYSFS_uint32);
 201     } /* while */
 202
 203     *dst = 0;
 204 } /* PHYSFS_utf8ToUcs4 */
 205
 206
 207 void PHYSFS_utf8ToUcs2(const char *src, PHYSFS_uint16 *dst, PHYSFS_uint64 len)
 208 {
 209     len -= sizeof (PHYSFS_uint16);   /* save room for null char. */
 210     while (len >= sizeof (PHYSFS_uint16))
 211     {
 212         PHYSFS_uint32 cp = utf8codepoint(&src);
 213         if (cp == 0)
 214             break;
 215         else if (cp == UNICODE_BOGUS_CHAR_VALUE)
 216             cp = UNICODE_BOGUS_CHAR_CODEPOINT;
 217
 218         /* !!! BLUESKY: UTF-16 surrogates? */
 219         if (cp > 0xFFFF)
 220             cp = UNICODE_BOGUS_CHAR_CODEPOINT;
 221
 222         *(dst++) = cp;
 223         len -= sizeof (PHYSFS_uint16);
 224     } /* while */
 225
 226     *dst = 0;
 227 } /* PHYSFS_utf8ToUcs2 */
 228
 229 static void utf8fromcodepoint(PHYSFS_uint32 cp, char **_dst, PHYSFS_uint64 *_len)
 230 {
 231     char *dst = *_dst;
 232     PHYSFS_uint64 len = *_len;
 233
 234     if (len == 0)
 235         return;
 236
 237     if (cp > 0x10FFFF)
 238         cp = UNICODE_BOGUS_CHAR_CODEPOINT;
 239     else if ((cp == 0xFFFE) || (cp == 0xFFFF))  /* illegal values. */
 240         cp = UNICODE_BOGUS_CHAR_CODEPOINT;
 241     else
 242     {
 243         /* There are seven "UTF-16 surrogates" that are illegal in UTF-8. */
 244         switch (cp)
 245         {
 246             case 0xD800:
 247             case 0xDB7F:
 248             case 0xDB80:
 249             case 0xDBFF:
 250             case 0xDC00:
 251             case 0xDF80:
 252             case 0xDFFF:
 253                 cp = UNICODE_BOGUS_CHAR_CODEPOINT;
 254         } /* switch */
 255     } /* else */
 256
 257     /* Do the encoding... */
 258     if (cp < 0x80)
 259     {
 260         *(dst++) = (char) cp;
 261         len--;
 262     } /* if */
 263
 264     else if (cp < 0x800)
 265     {
 266         if (len < 2)
 267             len = 0;
 268         else
 269         {
 270             *(dst++) = (char) ((cp >> 6) | 128 | 64);
 271             *(dst++) = (char) (cp & 0x3F) | 128;
 272             len -= 2;
 273         } /* else */
 274     } /* else if */
 275
 276     else if (cp < 0x10000)
 277     {
 278         if (len < 3)
 279             len = 0;
 280         else
 281         {
 282             *(dst++) = (char) ((cp >> 12) | 128 | 64 | 32);
 283             *(dst++) = (char) ((cp >> 6) & 0x3F) | 128;
 284             *(dst++) = (char) (cp & 0x3F) | 128;
 285             len -= 3;
 286         } /* else */
 287     } /* else if */
 288
 289     else
 290     {
 291         if (len < 4)
 292             len = 0;
 293         else
 294         {
 295             *(dst++) = (char) ((cp >> 18) | 128 | 64 | 32 | 16);
 296             *(dst++) = (char) ((cp >> 12) & 0x3F) | 128;
 297             *(dst++) = (char) ((cp >> 6) & 0x3F) | 128;
 298             *(dst++) = (char) (cp & 0x3F) | 128;
 299             len -= 4;
 300         } /* else if */
 301     } /* else */
 302
 303     *_dst = dst;
 304     *_len = len;
 305 } /* utf8fromcodepoint */
 306
 307 #define UTF8FROMTYPE(typ, src, dst, len) \
 308     if (len == 0) return; \
 309     len--;  \
 310     while (len) \
 311     { \
 312         const PHYSFS_uint32 cp = (PHYSFS_uint32) ((typ) (*(src++))); \
 313         if (cp == 0) break; \
 314         utf8fromcodepoint(cp, &dst, &len); \
 315     } \
 316     *dst = '\0'; \
 317
 318 void PHYSFS_utf8FromUcs4(const PHYSFS_uint32 *src, char *dst, PHYSFS_uint64 len)
 319 {
 320     UTF8FROMTYPE(PHYSFS_uint32, src, dst, len);
 321 } /* PHYSFS_utf8FromUcs4 */
 322
 323 void PHYSFS_utf8FromUcs2(const PHYSFS_uint16 *src, char *dst, PHYSFS_uint64 len)
 324 {
 325     UTF8FROMTYPE(PHYSFS_uint64, src, dst, len);
 326 } /* PHYSFS_utf8FromUcs4 */
 327
 328 /* latin1 maps to unicode codepoints directly, we just utf-8 encode it. */
 329 void PHYSFS_utf8FromLatin1(const char *src, char *dst, PHYSFS_uint64 len)
 330 {
 331     UTF8FROMTYPE(PHYSFS_uint8, src, dst, len);
 332 } /* PHYSFS_utf8FromLatin1 */
 333
 334 #undef UTF8FROMTYPE
 335
 336
 337 typedef struct CaseFoldMapping
 338 {
 339     PHYSFS_uint32 from;
 340     PHYSFS_uint32 to0;
 341     PHYSFS_uint32 to1;
 342     PHYSFS_uint32 to2;
 343 } CaseFoldMapping;
 344
 345 typedef struct CaseFoldHashBucket
 346 {
 347     const PHYSFS_uint8 count;
 348     const CaseFoldMapping *list;
 349 } CaseFoldHashBucket;
 350
 351 #include "physfs_casefolding.h"
 352
 353 static void locate_case_fold_mapping(const PHYSFS_uint32 from,
 354                                      PHYSFS_uint32 *to)
 355 {
 356     PHYSFS_uint32 i;
 357     const PHYSFS_uint8 hashed = ((from ^ (from >> 8)) & 0xFF);
 358     const CaseFoldHashBucket *bucket = &case_fold_hash[hashed];
 359     const CaseFoldMapping *mapping = bucket->list;
 360
 361     for (i = 0; i < bucket->count; i++, mapping++)
 362     {
 363         if (mapping->from == from)
 364         {
 365             to[0] = mapping->to0;
 366             to[1] = mapping->to1;
 367             to[2] = mapping->to2;
 368             return;
 369         } /* if */
 370     } /* for */
 371
 372     /* Not found...there's no remapping for this codepoint. */
 373     to[0] = from;
 374     to[1] = 0;
 375     to[2] = 0;
 376 } /* locate_case_fold_mapping */
 377
 378
 379 static int utf8codepointcmp(const PHYSFS_uint32 cp1, const PHYSFS_uint32 cp2)
 380 {
 381     PHYSFS_uint32 folded1[3], folded2[3];
 382     locate_case_fold_mapping(cp1, folded1);
 383     locate_case_fold_mapping(cp2, folded2);
 384     return ( (folded1[0] == folded2[0]) &&
 385              (folded1[1] == folded2[1]) &&
 386              (folded1[2] == folded2[2]) );
 387 } /* utf8codepointcmp */
 388
 389
 390 int __PHYSFS_utf8strcasecmp(const char *str1, const char *str2)
 391 {
 392     while (1)
 393     {
 394         const PHYSFS_uint32 cp1 = utf8codepoint(&str1);
 395         const PHYSFS_uint32 cp2 = utf8codepoint(&str2);
 396         if (!utf8codepointcmp(cp1, cp2)) return 0;
 397         if (cp1 == 0) return 1;
 398     } /* while */
 399
 400     return 0;  /* shouldn't hit this. */
 401 } /* __PHYSFS_utf8strcasecmp */
 402
 403
 404 int __PHYSFS_utf8strnicmp(const char *str1, const char *str2, PHYSFS_uint32 n)
 405 {
 406     while (n > 0)
 407     {
 408         const PHYSFS_uint32 cp1 = utf8codepoint(&str1);
 409         const PHYSFS_uint32 cp2 = utf8codepoint(&str2);
 410         if (!utf8codepointcmp(cp1, cp2)) return 0;
 411         if (cp1 == 0) return 1;
 412         n--;
 413     } /* while */
 414
 415     return 1;  /* matched to n chars. */
 416 } /* __PHYSFS_utf8strnicmp */
 417
 418
 419 int __PHYSFS_stricmpASCII(const char *str1, const char *str2)
 420 {
 421     while (1)
 422     {
 423         const char ch1 = *(str1++);
 424         const char ch2 = *(str2++);
 425         const char cp1 = ((ch1 >= 'A') && (ch1 <= 'Z')) ? (ch1+32) : ch1;
 426         const char cp2 = ((ch2 >= 'A') && (ch2 <= 'Z')) ? (ch2+32) : ch2;
 427         if (cp1 < cp2)
 428             return -1;
 429         else if (cp1 > cp2)
 430             return 1;
 431         else if (cp1 == 0)  /* they're both null chars? */
 432             return 0;
 433     } /* while */
 434
 435     return 0;  /* shouldn't hit this. */
 436 } /* __PHYSFS_stricmpASCII */
 437
 438
 439 int __PHYSFS_strnicmpASCII(const char *str1, const char *str2, PHYSFS_uint32 n)
 440 {
 441     while (n-- > 0)
 442     {
 443         const char ch1 = *(str1++);
 444         const char ch2 = *(str2++);
 445         const char cp1 = ((ch1 >= 'A') && (ch1 <= 'Z')) ? (ch1+32) : ch1;
 446         const char cp2 = ((ch2 >= 'A') && (ch2 <= 'Z')) ? (ch2+32) : ch2;
 447         if (cp1 < cp2)
 448             return -1;
 449         else if (cp1 > cp2)
 450             return 1;
 451         else if (cp1 == 0)  /* they're both null chars? */
 452             return 0;
 453     } /* while */
 454
 455     return 0;
 456 } /* __PHYSFS_stricmpASCII */
 457
 458
 459 /* end of physfs_unicode.c ... */
 460