vcs.maemo.org Git - mstardict/blob - src/lib/dictziplib.cpp

   1 /* dictziplib.c --
   2  * http://stardict.sourceforge.net
   3  * Copyright (C) 2003-2003 Hu Zheng <huzheng_001@163.com>
   4  * This file is a modified version of dictd-1.9.7's data.c
   5  *
   6  * data.c --
   7  * Created: Tue Jul 16 12:45:41 1996 by faith@dict.org
   8  * Revised: Sat Mar 30 10:46:06 2002 by faith@dict.org
   9  * Copyright 1996, 1997, 1998, 2000, 2002 Rickard E. Faith (faith@dict.org)
  10  *
  11  *
  12  *  This program is free software; you can redistribute it and/or modify
  13  *  it under the terms of the GNU General Public License as published by
  14  *  the Free Software Foundation; either version 3 of the License, or
  15  *  (at your option) any later version.
  16  *
  17  *  This program is distributed in the hope that it will be useful,
  18  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20  *  GNU Library General Public License for more details.
  21  *
  22  *  You should have received a copy of the GNU General Public License
  23  *  along with this program; if not, write to the Free Software
  24  *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  25  */
  26
  27 //#define HAVE_MMAP //it will be defined in config.h; this can be done in configure.in with AC_FUNC_MMAP.
  28 #ifdef HAVE_CONFIG_H
  29 #  include "config.h"
  30 #endif
  31
  32 #include <cassert>
  33 #include <cstdio>
  34 #include <cstdlib>
  35 #include <cstring>
  36 #ifdef _WIN32
  37 #  include <io.h>
  38 #else
  39 #  include <unistd.h>
  40 #endif
  41 #include <limits.h>
  42 #include <fcntl.h>
  43
  44 #include <sys/stat.h>
  45
  46
  47 #include "dictziplib.hpp"
  48
  49 #define USE_CACHE 1
  50
  51 #define BUFFERSIZE 10240
  52
  53 /*
  54  * Output buffer must be greater than or
  55  * equal to 110% of input buffer size, plus
  56  * 12 bytes.
  57 */
  58 #define OUT_BUFFER_SIZE 0xffffL
  59
  60 #define IN_BUFFER_SIZE ((unsigned long)((double)(OUT_BUFFER_SIZE - 12) * 0.89))
  61
  62 /* For gzip-compatible header, as defined in RFC 1952 */
  63
  64                                 /* Magic for GZIP (rfc1952)                */
  65 #define GZ_MAGIC1     0x1f      /* First magic byte                        */
  66 #define GZ_MAGIC2     0x8b      /* Second magic byte                       */
  67
  68                                 /* FLaGs (bitmapped), from rfc1952         */
  69 #define GZ_FTEXT      0x01      /* Set for ASCII text                      */
  70 #define GZ_FHCRC      0x02      /* Header CRC16                            */
  71 #define GZ_FEXTRA     0x04      /* Optional field (random access index)    */
  72 #define GZ_FNAME      0x08      /* Original name                           */
  73 #define GZ_COMMENT    0x10      /* Zero-terminated, human-readable comment */
  74 #define GZ_MAX           2      /* Maximum compression                     */
  75 #define GZ_FAST          4      /* Fasted compression                      */
  76
  77                                 /* These are from rfc1952                  */
  78 #define GZ_OS_FAT        0      /* FAT filesystem (MS-DOS, OS/2, NT/Win32) */
  79 #define GZ_OS_AMIGA      1      /* Amiga                                   */
  80 #define GZ_OS_VMS        2      /* VMS (or OpenVMS)                        */
  81 #define GZ_OS_UNIX       3      /* Unix                                    */
  82 #define GZ_OS_VMCMS      4      /* VM/CMS                                  */
  83 #define GZ_OS_ATARI      5      /* Atari TOS                               */
  84 #define GZ_OS_HPFS       6      /* HPFS filesystem (OS/2, NT)              */
  85 #define GZ_OS_MAC        7      /* Macintosh                               */
  86 #define GZ_OS_Z          8      /* Z-System                                */
  87 #define GZ_OS_CPM        9      /* CP/M                                    */
  88 #define GZ_OS_TOPS20    10      /* TOPS-20                                 */
  89 #define GZ_OS_NTFS      11      /* NTFS filesystem (NT)                    */
  90 #define GZ_OS_QDOS      12      /* QDOS                                    */
  91 #define GZ_OS_ACORN     13      /* Acorn RISCOS                            */
  92 #define GZ_OS_UNKNOWN  255      /* unknown                                 */
  93
  94 #define GZ_RND_S1       'R'     /* First magic for random access format    */
  95 #define GZ_RND_S2       'A'     /* Second magic for random access format   */
  96
  97 #define GZ_ID1           0      /* GZ_MAGIC1                               */
  98 #define GZ_ID2           1      /* GZ_MAGIC2                               */
  99 #define GZ_CM            2      /* Compression Method (Z_DEFALTED)         */
 100 #define GZ_FLG           3      /* FLaGs (see above)                       */
 101 #define GZ_MTIME         4      /* Modification TIME                       */
 102 #define GZ_XFL           8      /* eXtra FLags (GZ_MAX or GZ_FAST)         */
 103 #define GZ_OS            9      /* Operating System                        */
 104 #define GZ_XLEN         10      /* eXtra LENgth (16bit)                    */
 105 #define GZ_FEXTRA_START 12      /* Start of extra fields                   */
 106 #define GZ_SI1          12      /* Subfield ID1                            */
 107 #define GZ_SI2          13      /* Subfield ID2                            */
 108 #define GZ_SUBLEN       14      /* Subfield length (16bit)                 */
 109 #define GZ_VERSION      16      /* Version for subfield format             */
 110 #define GZ_CHUNKLEN     18      /* Chunk length (16bit)                    */
 111 #define GZ_CHUNKCNT     20      /* Number of chunks (16bit)                */
 112 #define GZ_RNDDATA      22      /* Random access data (16bit)              */
 113
 114 #define DICT_UNKNOWN    0
 115 #define DICT_TEXT       1
 116 #define DICT_GZIP       2
 117 #define DICT_DZIP       3
 118
 119
 120 int dictData::read_header(const std::string &fname, int computeCRC)
 121 {
 122         FILE          *str;
 123         int           id1, id2, si1, si2;
 124         char          buffer[BUFFERSIZE];
 125         int           extraLength, subLength;
 126         int           i;
 127         char          *pt;
 128         int           c;
 129         struct stat   sb;
 130         unsigned long crc   = crc32( 0L, Z_NULL, 0 );
 131         int           count;
 132         unsigned long offset;
 133
 134         if (!(str = fopen(fname.c_str(), "rb"))) {
 135                 //err_fatal_errno( __FUNCTION__,
 136                 //       "Cannot open data file \"%s\" for read\n", filename );
 137         }
 138
 139         this->headerLength = GZ_XLEN - 1;
 140         this->type         = DICT_UNKNOWN;
 141
 142         id1                  = getc( str );
 143         id2                  = getc( str );
 144
 145         if (id1 != GZ_MAGIC1 || id2 != GZ_MAGIC2) {
 146                 this->type = DICT_TEXT;
 147 #if defined(_MSC_VER)
 148                 fstat( _fileno( str ), &sb );
 149 #else
 150                 fstat( fileno( str ), &sb );
 151 #endif
 152                 this->compressedLength = this->length = sb.st_size;
 153                 this->origFilename     = fname;
 154                 this->mtime            = sb.st_mtime;
 155                 if (computeCRC) {
 156                         rewind( str );
 157                         while (!feof( str )) {
 158                                 if ((count = fread( buffer, 1, BUFFERSIZE, str ))) {
 159                                         crc = crc32(crc, (Bytef *)buffer, count);
 160                                 }
 161                         }
 162                 }
 163                 this->crc = crc;
 164                 fclose( str );
 165                 return 0;
 166         }
 167         this->type = DICT_GZIP;
 168
 169         this->method       = getc( str );
 170         this->flags        = getc( str );
 171         this->mtime        = getc( str ) <<  0;
 172         this->mtime       |= getc( str ) <<  8;
 173         this->mtime       |= getc( str ) << 16;
 174         this->mtime       |= getc( str ) << 24;
 175         this->extraFlags   = getc( str );
 176         this->os           = getc( str );
 177
 178         if (this->flags & GZ_FEXTRA) {
 179                 extraLength          = getc( str ) << 0;
 180                 extraLength         |= getc( str ) << 8;
 181                 this->headerLength += extraLength + 2;
 182                 si1                  = getc( str );
 183                 si2                  = getc( str );
 184
 185                 if (si1 == GZ_RND_S1 || si2 == GZ_RND_S2) {
 186                         subLength            = getc( str ) << 0;
 187                         subLength           |= getc( str ) << 8;
 188                         this->version      = getc( str ) << 0;
 189                         this->version     |= getc( str ) << 8;
 190
 191                         if (this->version != 1) {
 192                                 //err_internal( __FUNCTION__,
 193                                 //        "dzip header version %d not supported\n",
 194                                 //        this->version );
 195                         }
 196
 197                         this->chunkLength  = getc( str ) << 0;
 198                         this->chunkLength |= getc( str ) << 8;
 199                         this->chunkCount   = getc( str ) << 0;
 200                         this->chunkCount  |= getc( str ) << 8;
 201
 202                         if (this->chunkCount <= 0) {
 203                                 fclose( str );
 204                                 return 5;
 205                         }
 206                         this->chunks = (int *)malloc(sizeof( this->chunks[0] )
 207                                                                                                                                                  * this->chunkCount );
 208                         for (i = 0; i < this->chunkCount; i++) {
 209                                 this->chunks[i]  = getc( str ) << 0;
 210                                 this->chunks[i] |= getc( str ) << 8;
 211                         }
 212                         this->type = DICT_DZIP;
 213                 } else {
 214                         fseek( str, this->headerLength, SEEK_SET );
 215                 }
 216         }
 217
 218         if (this->flags & GZ_FNAME) { /* FIXME! Add checking against header len */
 219                 pt = buffer;
 220                 while ((c = getc( str )) && c != EOF)
 221                         *pt++ = c;
 222                 *pt = '\0';
 223
 224                 this->origFilename = buffer;
 225                 this->headerLength += this->origFilename.length() + 1;
 226         } else {
 227                 this->origFilename = "";
 228         }
 229
 230    if (this->flags & GZ_COMMENT) { /* FIXME! Add checking for header len */
 231       pt = buffer;
 232       while ((c = getc( str )) && c != EOF)
 233          *pt++ = c;
 234       *pt = '\0';
 235       comment = buffer;
 236       headerLength += comment.length()+1;
 237    } else {
 238       comment = "";
 239    }
 240
 241    if (this->flags & GZ_FHCRC) {
 242       getc( str );
 243       getc( str );
 244       this->headerLength += 2;
 245    }
 246
 247    if (ftell( str ) != this->headerLength + 1) {
 248       //err_internal( __FUNCTION__,
 249                 //    "File position (%lu) != header length + 1 (%d)\n",
 250                   //  ftell( str ), this->headerLength + 1 );
 251    }
 252
 253    fseek( str, -8, SEEK_END );
 254    this->crc     = getc( str ) <<  0;
 255    this->crc    |= getc( str ) <<  8;
 256    this->crc    |= getc( str ) << 16;
 257    this->crc    |= getc( str ) << 24;
 258    this->length  = getc( str ) <<  0;
 259    this->length |= getc( str ) <<  8;
 260    this->length |= getc( str ) << 16;
 261    this->length |= getc( str ) << 24;
 262    this->compressedLength = ftell( str );
 263
 264                                 /* Compute offsets */
 265    this->offsets = (unsigned long *)malloc( sizeof( this->offsets[0] )
 266                                                                                                                                                                                         * this->chunkCount );
 267    for (offset = this->headerLength + 1, i = 0;
 268         i < this->chunkCount;
 269         i++) {
 270       this->offsets[i] = offset;
 271       offset += this->chunks[i];
 272    }
 273
 274    fclose( str );
 275    return 0;
 276 }
 277
 278 bool dictData::open(const std::string& fname, int computeCRC)
 279 {
 280         struct stat sb;
 281         int         j;
 282         int fd;
 283
 284         this->initialized = 0;
 285         if (!g_file_test(fname.c_str(),
 286                 GFileTest(G_FILE_TEST_EXISTS | G_FILE_TEST_IS_REGULAR))) {
 287                 //err_warning( __FUNCTION__,
 288                 //   "%s is not a regular file -- ignoring\n", fname );
 289                 return false;
 290         }
 291
 292         if (read_header(fname, computeCRC)) {
 293                 //err_fatal( __FUNCTION__,
 294                 // "\"%s\" not in text or dzip format\n", fname );
 295                 return false;
 296         }
 297
 298 #if defined(_MSC_VER)
 299         if ((fd = ::_open(fname.c_str(), O_RDONLY )) < 0) {
 300 #else
 301         if ((fd = ::open(fname.c_str(), O_RDONLY )) < 0) {
 302 #endif
 303                 //err_fatal_errno( __FUNCTION__,
 304                 //       "Cannot open data file \"%s\"\n", fname );
 305                 return false;
 306    }
 307    if (fstat(fd, &sb)) {
 308                  //err_fatal_errno( __FUNCTION__,
 309                  //       "Cannot stat data file \"%s\"\n", fname );
 310                  return false;
 311    }
 312
 313    this->size = sb.st_size;
 314 #if defined(_MSC_VER)
 315         ::_close(fd);
 316 #else
 317         ::close(fd);
 318 #endif
 319          if (!mapfile.open(fname.c_str(), size))
 320                  return false;
 321
 322          this->start=mapfile.begin();
 323    this->end = this->start + this->size;
 324
 325    for (j = 0; j < DICT_CACHE_SIZE; j++) {
 326                  cache[j].chunk    = -1;
 327                  cache[j].stamp    = -1;
 328                  cache[j].inBuffer = NULL;
 329                  cache[j].count    = 0;
 330    }
 331
 332    return true;
 333 }
 334
 335 void dictData::close()
 336 {
 337         int i;
 338
 339         if (this->chunks)
 340                 free(this->chunks);
 341         if (this->offsets)
 342                 free(this->offsets);
 343
 344         if (this->initialized) {
 345                 if (inflateEnd( &this->zStream )) {
 346                         //err_internal( __FUNCTION__,
 347                         //       "Cannot shut down inflation engine: %s\n",
 348                   //     this->zStream.msg );
 349           }
 350         }
 351
 352         for (i = 0; i < DICT_CACHE_SIZE; ++i){
 353                 if (this -> cache [i].inBuffer)
 354                         free (this -> cache [i].inBuffer);
 355         }
 356 }
 357
 358 void dictData::read(char *buffer, unsigned long start, unsigned long size)
 359 {
 360         char          *pt;
 361         unsigned long end;
 362         int           count;
 363         char          *inBuffer;
 364         char          outBuffer[OUT_BUFFER_SIZE];
 365         int           firstChunk, lastChunk;
 366         int           firstOffset, lastOffset;
 367         int           i, j;
 368         int           found, target, lastStamp;
 369         static int    stamp = 0;
 370
 371         end  = start + size;
 372
 373         //buffer = malloc( size + 1 );
 374
 375         //PRINTF(DBG_UNZIP,
 376         // ("dict_data_read( %p, %lu, %lu )\n",
 377         //h, start, size ));
 378
 379
 380         switch (this->type) {
 381         case DICT_GZIP:
 382                 //err_fatal( __FUNCTION__,
 383                 // "Cannot seek on pure gzip format files.\n"
 384                 // "Use plain text (for performance)"
 385                 // " or dzip format (for space savings).\n" );
 386                 break;
 387         case DICT_TEXT:
 388                 memcpy( buffer, this->start + start, size );
 389                 //buffer[size] = '\0';
 390                 break;
 391         case DICT_DZIP:
 392                 if (!this->initialized) {
 393                         ++this->initialized;
 394                         this->zStream.zalloc    = NULL;
 395                         this->zStream.zfree     = NULL;
 396                         this->zStream.opaque    = NULL;
 397                         this->zStream.next_in   = 0;
 398                         this->zStream.avail_in  = 0;
 399                         this->zStream.next_out  = NULL;
 400                         this->zStream.avail_out = 0;
 401                         if (inflateInit2( &this->zStream, -15 ) != Z_OK) {
 402                                 //err_internal( __FUNCTION__,
 403                                 //  "Cannot initialize inflation engine: %s\n",
 404                           //this->zStream.msg );
 405                         }
 406                 }
 407                 firstChunk  = start / this->chunkLength;
 408                 firstOffset = start - firstChunk * this->chunkLength;
 409                 lastChunk   = end / this->chunkLength;
 410                 lastOffset  = end - lastChunk * this->chunkLength;
 411                 //PRINTF(DBG_UNZIP,
 412                 // ("   start = %lu, end = %lu\n"
 413                 //"firstChunk = %d, firstOffset = %d,"
 414                 //" lastChunk = %d, lastOffset = %d\n",
 415                 //start, end, firstChunk, firstOffset, lastChunk, lastOffset ));
 416                 for (pt = buffer, i = firstChunk; i <= lastChunk; i++) {
 417
 418                         /* Access cache */
 419                         found  = 0;
 420                         target = 0;
 421                         lastStamp = INT_MAX;
 422                         for (j = 0; j < DICT_CACHE_SIZE; j++) {
 423 #if USE_CACHE
 424                                 if (this->cache[j].chunk == i) {
 425                                         found  = 1;
 426                                         target = j;
 427                                         break;
 428                                 }
 429 #endif
 430                                 if (this->cache[j].stamp < lastStamp) {
 431                                         lastStamp = this->cache[j].stamp;
 432                                         target = j;
 433                                 }
 434                         }
 435
 436                         this->cache[target].stamp = ++stamp;
 437                         if (found) {
 438                                 count = this->cache[target].count;
 439                                 inBuffer = this->cache[target].inBuffer;
 440                         } else {
 441                                 this->cache[target].chunk = i;
 442                                 if (!this->cache[target].inBuffer)
 443                                         this->cache[target].inBuffer = (char *)malloc( IN_BUFFER_SIZE );
 444                                 inBuffer = this->cache[target].inBuffer;
 445
 446                                 if (this->chunks[i] >= OUT_BUFFER_SIZE ) {
 447                                         //err_internal( __FUNCTION__,
 448                                         //    "this->chunks[%d] = %d >= %ld (OUT_BUFFER_SIZE)\n",
 449                                         //  i, this->chunks[i], OUT_BUFFER_SIZE );
 450                                 }
 451                                 memcpy( outBuffer, this->start + this->offsets[i], this->chunks[i] );
 452
 453                                 this->zStream.next_in   = (Bytef *)outBuffer;
 454                                 this->zStream.avail_in  = this->chunks[i];
 455                                 this->zStream.next_out  = (Bytef *)inBuffer;
 456                                 this->zStream.avail_out = IN_BUFFER_SIZE;
 457                                 if (inflate( &this->zStream,  Z_PARTIAL_FLUSH ) != Z_OK) {
 458                                         //err_fatal( __FUNCTION__, "inflate: %s\n", this->zStream.msg );
 459                                 }
 460                                 if (this->zStream.avail_in) {
 461                                         //err_internal( __FUNCTION__,
 462                                         //    "inflate did not flush (%d pending, %d avail)\n",
 463                                         //  this->zStream.avail_in, this->zStream.avail_out );
 464                                 }
 465
 466                                 count = IN_BUFFER_SIZE - this->zStream.avail_out;
 467
 468                                 this->cache[target].count = count;
 469                         }
 470
 471                         if (i == firstChunk) {
 472                                 if (i == lastChunk) {
 473                                         memcpy( pt, inBuffer + firstOffset, lastOffset-firstOffset);
 474                                         pt += lastOffset - firstOffset;
 475                                 } else {
 476                                         if (count != this->chunkLength ) {
 477                                                 //err_internal( __FUNCTION__,
 478                                                 //      "Length = %d instead of %d\n",
 479                                                 //count, this->chunkLength );
 480                                         }
 481                                         memcpy( pt, inBuffer + firstOffset,
 482                                                                         this->chunkLength - firstOffset );
 483                                         pt += this->chunkLength - firstOffset;
 484                                 }
 485                         } else if (i == lastChunk) {
 486                                 memcpy( pt, inBuffer, lastOffset );
 487                                 pt += lastOffset;
 488                         } else {
 489                                 assert( count == this->chunkLength );
 490                                 memcpy( pt, inBuffer, this->chunkLength );
 491                                 pt += this->chunkLength;
 492                         }
 493                 }
 494                 //*pt = '\0';
 495                 break;
 496         case DICT_UNKNOWN:
 497                 //err_fatal( __FUNCTION__, "Cannot read unknown file type\n" );
 498                 break;
 499         }
 500 }