Imported version 0.2-1
[mstardict] / src / lib / dictziplib.cpp
diff --git a/src/lib/dictziplib.cpp b/src/lib/dictziplib.cpp
new file mode 100644 (file)
index 0000000..29a6e3a
--- /dev/null
@@ -0,0 +1,500 @@
+/* dictziplib.c --
+ * http://stardict.sourceforge.net
+ * Copyright (C) 2003-2003 Hu Zheng <huzheng_001@163.com>
+ * This file is a modify version of dictd-1.9.7's data.c
+ *
+ * data.c -- 
+ * Created: Tue Jul 16 12:45:41 1996 by faith@dict.org
+ * Revised: Sat Mar 30 10:46:06 2002 by faith@dict.org
+ * Copyright 1996, 1997, 1998, 2000, 2002 Rickard E. Faith (faith@dict.org)
+ * 
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU Library General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+
+//#define HAVE_MMAP //it will defined in config.h. this can be done by configure.in with a AC_FUNC_MMAP.
+#ifdef HAVE_CONFIG_H
+#  include "config.h"
+#endif
+
+#include <cassert>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#ifdef _WIN32
+#  include <io.h>
+#else
+#  include <unistd.h>
+#endif
+#include <limits.h>
+#include <fcntl.h>
+
+#include <sys/stat.h>
+
+
+#include "dictziplib.hpp"
+
+#define USE_CACHE 1
+
+#define BUFFERSIZE 10240
+
+/* 
+ * Output buffer must be greater than or
+ * equal to 110% of input buffer size, plus
+ * 12 bytes. 
+*/
+#define OUT_BUFFER_SIZE 0xffffL
+
+#define IN_BUFFER_SIZE ((unsigned long)((double)(OUT_BUFFER_SIZE - 12) * 0.89))
+
+/* For gzip-compatible header, as defined in RFC 1952 */
+
+                               /* Magic for GZIP (rfc1952)                */
+#define GZ_MAGIC1     0x1f     /* First magic byte                        */
+#define GZ_MAGIC2     0x8b     /* Second magic byte                       */
+
+                               /* FLaGs (bitmapped), from rfc1952         */
+#define GZ_FTEXT      0x01     /* Set for ASCII text                      */
+#define GZ_FHCRC      0x02     /* Header CRC16                            */
+#define GZ_FEXTRA     0x04     /* Optional field (random access index)    */
+#define GZ_FNAME      0x08     /* Original name                           */
+#define GZ_COMMENT    0x10     /* Zero-terminated, human-readable comment */
+#define GZ_MAX           2     /* Maximum compression                     */
+#define GZ_FAST          4     /* Fasted compression                      */
+
+                               /* These are from rfc1952                  */
+#define GZ_OS_FAT        0     /* FAT filesystem (MS-DOS, OS/2, NT/Win32) */
+#define GZ_OS_AMIGA      1     /* Amiga                                   */
+#define GZ_OS_VMS        2     /* VMS (or OpenVMS)                        */
+#define GZ_OS_UNIX       3      /* Unix                                    */
+#define GZ_OS_VMCMS      4      /* VM/CMS                                  */
+#define GZ_OS_ATARI      5      /* Atari TOS                               */
+#define GZ_OS_HPFS       6      /* HPFS filesystem (OS/2, NT)              */
+#define GZ_OS_MAC        7      /* Macintosh                               */
+#define GZ_OS_Z          8      /* Z-System                                */
+#define GZ_OS_CPM        9      /* CP/M                                    */
+#define GZ_OS_TOPS20    10      /* TOPS-20                                 */
+#define GZ_OS_NTFS      11      /* NTFS filesystem (NT)                    */
+#define GZ_OS_QDOS      12      /* QDOS                                    */
+#define GZ_OS_ACORN     13      /* Acorn RISCOS                            */
+#define GZ_OS_UNKNOWN  255      /* unknown                                 */
+
+#define GZ_RND_S1       'R'    /* First magic for random access format    */
+#define GZ_RND_S2       'A'    /* Second magic for random access format   */
+
+#define GZ_ID1           0     /* GZ_MAGIC1                               */
+#define GZ_ID2           1     /* GZ_MAGIC2                               */
+#define GZ_CM            2     /* Compression Method (Z_DEFALTED)         */
+#define GZ_FLG          3      /* FLaGs (see above)                       */
+#define GZ_MTIME         4     /* Modification TIME                       */
+#define GZ_XFL           8     /* eXtra FLags (GZ_MAX or GZ_FAST)         */
+#define GZ_OS            9     /* Operating System                        */
+#define GZ_XLEN         10     /* eXtra LENgth (16bit)                    */
+#define GZ_FEXTRA_START 12     /* Start of extra fields                   */
+#define GZ_SI1          12     /* Subfield ID1                            */
+#define GZ_SI2          13      /* Subfield ID2                            */
+#define GZ_SUBLEN       14     /* Subfield length (16bit)                 */
+#define GZ_VERSION      16      /* Version for subfield format             */
+#define GZ_CHUNKLEN     18     /* Chunk length (16bit)                    */
+#define GZ_CHUNKCNT     20     /* Number of chunks (16bit)                */
+#define GZ_RNDDATA      22     /* Random access data (16bit)              */
+
+#define DICT_UNKNOWN    0
+#define DICT_TEXT       1
+#define DICT_GZIP       2
+#define DICT_DZIP       3
+
+
+int dictData::read_header(const std::string &fname, int computeCRC)
+{
+       FILE          *str;
+       int           id1, id2, si1, si2;
+       char          buffer[BUFFERSIZE];
+       int           extraLength, subLength;
+       int           i;
+       char          *pt;
+       int           c;
+       struct stat   sb;
+       unsigned long crc   = crc32( 0L, Z_NULL, 0 );
+       int           count;
+       unsigned long offset;
+       
+       if (!(str = fopen(fname.c_str(), "rb"))) {
+               //err_fatal_errno( __FUNCTION__,
+               //       "Cannot open data file \"%s\" for read\n", filename );
+       }       
+
+       this->headerLength = GZ_XLEN - 1;
+       this->type         = DICT_UNKNOWN;
+   
+       id1                  = getc( str );
+       id2                  = getc( str );
+       
+       if (id1 != GZ_MAGIC1 || id2 != GZ_MAGIC2) {
+               this->type = DICT_TEXT;
+#if defined(_MSC_VER)
+               fstat( _fileno( str ), &sb );
+#else
+               fstat( fileno( str ), &sb );
+#endif
+               this->compressedLength = this->length = sb.st_size;
+               this->origFilename     = fname;
+               this->mtime            = sb.st_mtime;
+               if (computeCRC) {
+                       rewind( str );
+                       while (!feof( str )) {
+                               if ((count = fread( buffer, 1, BUFFERSIZE, str ))) {
+                                       crc = crc32(crc, (Bytef *)buffer, count);
+                               }
+                       }
+               }
+               this->crc = crc;
+               fclose( str );
+               return 0;
+       }
+       this->type = DICT_GZIP;
+  
+       this->method       = getc( str );
+       this->flags        = getc( str );
+       this->mtime        = getc( str ) <<  0;
+       this->mtime       |= getc( str ) <<  8;
+       this->mtime       |= getc( str ) << 16;
+       this->mtime       |= getc( str ) << 24;
+       this->extraFlags   = getc( str );
+       this->os           = getc( str );
+  
+       if (this->flags & GZ_FEXTRA) {
+               extraLength          = getc( str ) << 0;
+               extraLength         |= getc( str ) << 8;
+               this->headerLength += extraLength + 2;
+               si1                  = getc( str );
+               si2                  = getc( str );
+    
+               if (si1 == GZ_RND_S1 || si2 == GZ_RND_S2) {
+                       subLength            = getc( str ) << 0;
+                       subLength           |= getc( str ) << 8;
+                       this->version      = getc( str ) << 0;
+                       this->version     |= getc( str ) << 8;
+                       
+                       if (this->version != 1) {
+                               //err_internal( __FUNCTION__,
+                               //        "dzip header version %d not supported\n",
+                               //        this->version );
+                       }
+                       
+                       this->chunkLength  = getc( str ) << 0;
+                       this->chunkLength |= getc( str ) << 8;
+                       this->chunkCount   = getc( str ) << 0;
+                       this->chunkCount  |= getc( str ) << 8;
+                       
+                       if (this->chunkCount <= 0) {
+                               fclose( str );
+                               return 5;
+                       }
+                       this->chunks = (int *)malloc(sizeof( this->chunks[0] )
+                                                                                                                                                * this->chunkCount );
+                       for (i = 0; i < this->chunkCount; i++) {
+                               this->chunks[i]  = getc( str ) << 0;
+                               this->chunks[i] |= getc( str ) << 8;
+                       }
+                       this->type = DICT_DZIP;
+               } else {
+                       fseek( str, this->headerLength, SEEK_SET );
+               }
+       }
+       
+       if (this->flags & GZ_FNAME) { /* FIXME! Add checking against header len */
+               pt = buffer;
+               while ((c = getc( str )) && c != EOF)
+                       *pt++ = c;
+               *pt = '\0';
+               
+               this->origFilename = buffer;
+               this->headerLength += this->origFilename.length() + 1;
+       } else {
+               this->origFilename = "";
+       }
+   
+   if (this->flags & GZ_COMMENT) { /* FIXME! Add checking for header len */
+      pt = buffer;
+      while ((c = getc( str )) && c != EOF)
+        *pt++ = c;
+      *pt = '\0';
+      comment = buffer;
+      headerLength += comment.length()+1;
+   } else {
+      comment = "";
+   }
+
+   if (this->flags & GZ_FHCRC) {
+      getc( str );
+      getc( str );
+      this->headerLength += 2;
+   }
+
+   if (ftell( str ) != this->headerLength + 1) {
+      //err_internal( __FUNCTION__,
+               //    "File position (%lu) != header length + 1 (%d)\n",
+                 //  ftell( str ), this->headerLength + 1 );
+   }
+
+   fseek( str, -8, SEEK_END );
+   this->crc     = getc( str ) <<  0;
+   this->crc    |= getc( str ) <<  8;
+   this->crc    |= getc( str ) << 16;
+   this->crc    |= getc( str ) << 24;
+   this->length  = getc( str ) <<  0;
+   this->length |= getc( str ) <<  8;
+   this->length |= getc( str ) << 16;
+   this->length |= getc( str ) << 24;
+   this->compressedLength = ftell( str );
+
+                               /* Compute offsets */
+   this->offsets = (unsigned long *)malloc( sizeof( this->offsets[0] )
+                                                                                                                                                                                       * this->chunkCount );
+   for (offset = this->headerLength + 1, i = 0;
+       i < this->chunkCount;
+       i++) {
+      this->offsets[i] = offset;
+      offset += this->chunks[i];
+   }
+
+   fclose( str );
+   return 0;
+}
+
+bool dictData::open(const std::string& fname, int computeCRC)
+{
+       struct stat sb;
+       int         j;
+       int fd;
+
+       this->initialized = 0;
+       if (!g_file_test(fname.c_str(),
+               GFileTest(G_FILE_TEST_EXISTS | G_FILE_TEST_IS_REGULAR))) {      
+               //err_warning( __FUNCTION__,
+               //   "%s is not a regular file -- ignoring\n", fname );
+               return false;
+       }
+   
+       if (read_header(fname, computeCRC)) {
+               //err_fatal( __FUNCTION__,
+               // "\"%s\" not in text or dzip format\n", fname );
+               return false;
+       }
+
+#if defined(_MSC_VER)
+       if ((fd = ::_open(fname.c_str(), O_RDONLY )) < 0) {
+#else
+       if ((fd = ::open(fname.c_str(), O_RDONLY )) < 0) {
+#endif
+               //err_fatal_errno( __FUNCTION__,
+               //       "Cannot open data file \"%s\"\n", fname );
+               return false;
+   }
+   if (fstat(fd, &sb)) {
+                //err_fatal_errno( __FUNCTION__,
+                //       "Cannot stat data file \"%s\"\n", fname );
+                return false;
+   }
+
+   this->size = sb.st_size;
+#if defined(_MSC_VER)
+       ::_close(fd);
+#else
+       ::close(fd);
+#endif
+        if (!mapfile.open(fname.c_str(), size))
+                return false;          
+
+        this->start=mapfile.begin();
+   this->end = this->start + this->size;
+
+   for (j = 0; j < DICT_CACHE_SIZE; j++) {
+                cache[j].chunk    = -1;
+                cache[j].stamp    = -1;
+                cache[j].inBuffer = NULL;
+                cache[j].count    = 0;
+   }
+   
+   return true;
+}
+
+void dictData::close()
+{
+       int i;   
+       
+       if (this->chunks)
+               free(this->chunks);
+       if (this->offsets)
+               free(this->offsets);
+
+       if (this->initialized) {
+               if (inflateEnd( &this->zStream )) {
+                       //err_internal( __FUNCTION__,
+                       //       "Cannot shut down inflation engine: %s\n",
+                 //     this->zStream.msg );
+         }
+       }
+
+       for (i = 0; i < DICT_CACHE_SIZE; ++i){
+               if (this -> cache [i].inBuffer)
+                       free (this -> cache [i].inBuffer);
+       }
+}
+
+void dictData::read(char *buffer, unsigned long start, unsigned long size)
+{
+       char          *pt;
+       unsigned long end;
+       int           count;
+       char          *inBuffer;
+       char          outBuffer[OUT_BUFFER_SIZE];
+       int           firstChunk, lastChunk;
+       int           firstOffset, lastOffset;
+       int           i, j;
+       int           found, target, lastStamp;
+       static int    stamp = 0;
+       
+       end  = start + size;
+       
+       //buffer = malloc( size + 1 );
+  
+       //PRINTF(DBG_UNZIP,
+       // ("dict_data_read( %p, %lu, %lu )\n",
+       //h, start, size ));
+       
+  
+       switch (this->type) {
+       case DICT_GZIP:
+               //err_fatal( __FUNCTION__,
+               // "Cannot seek on pure gzip format files.\n"
+               // "Use plain text (for performance)"
+               // " or dzip format (for space savings).\n" );
+               break;
+       case DICT_TEXT:
+               memcpy( buffer, this->start + start, size );
+               //buffer[size] = '\0';
+               break;
+       case DICT_DZIP:
+               if (!this->initialized) {
+                       ++this->initialized;
+                       this->zStream.zalloc    = NULL;
+                       this->zStream.zfree     = NULL;
+                       this->zStream.opaque    = NULL;
+                       this->zStream.next_in   = 0;
+                       this->zStream.avail_in  = 0;
+                       this->zStream.next_out  = NULL;
+                       this->zStream.avail_out = 0;
+                       if (inflateInit2( &this->zStream, -15 ) != Z_OK) {
+                               //err_internal( __FUNCTION__,
+                               //  "Cannot initialize inflation engine: %s\n",
+                         //this->zStream.msg );
+                       }
+               }
+               firstChunk  = start / this->chunkLength;
+               firstOffset = start - firstChunk * this->chunkLength;
+               lastChunk   = end / this->chunkLength;
+               lastOffset  = end - lastChunk * this->chunkLength;
+               //PRINTF(DBG_UNZIP,
+               // ("   start = %lu, end = %lu\n"
+               //"firstChunk = %d, firstOffset = %d,"
+               //" lastChunk = %d, lastOffset = %d\n",
+               //start, end, firstChunk, firstOffset, lastChunk, lastOffset ));
+               for (pt = buffer, i = firstChunk; i <= lastChunk; i++) {
+                       
+                       /* Access cache */
+                       found  = 0;
+                       target = 0;
+                       lastStamp = INT_MAX;
+                       for (j = 0; j < DICT_CACHE_SIZE; j++) {
+#if USE_CACHE
+                               if (this->cache[j].chunk == i) {
+                                       found  = 1;
+                                       target = j;
+                                       break;
+                               }
+#endif
+                               if (this->cache[j].stamp < lastStamp) {
+                                       lastStamp = this->cache[j].stamp;
+                                       target = j;
+                               }
+                       }
+                       
+                       this->cache[target].stamp = ++stamp;
+                       if (found) {
+                               count = this->cache[target].count;
+                               inBuffer = this->cache[target].inBuffer;
+                       } else {
+                               this->cache[target].chunk = i;
+                               if (!this->cache[target].inBuffer)
+                                       this->cache[target].inBuffer = (char *)malloc( IN_BUFFER_SIZE );
+                               inBuffer = this->cache[target].inBuffer;
+                               
+                               if (this->chunks[i] >= OUT_BUFFER_SIZE ) {
+                                       //err_internal( __FUNCTION__,
+                                       //    "this->chunks[%d] = %d >= %ld (OUT_BUFFER_SIZE)\n",
+                                       //  i, this->chunks[i], OUT_BUFFER_SIZE );
+                               }
+                               memcpy( outBuffer, this->start + this->offsets[i], this->chunks[i] );
+                               
+                               this->zStream.next_in   = (Bytef *)outBuffer;
+                               this->zStream.avail_in  = this->chunks[i];
+                               this->zStream.next_out  = (Bytef *)inBuffer;
+                               this->zStream.avail_out = IN_BUFFER_SIZE;
+                               if (inflate( &this->zStream,  Z_PARTIAL_FLUSH ) != Z_OK) {
+                                       //err_fatal( __FUNCTION__, "inflate: %s\n", this->zStream.msg );
+                               }
+                               if (this->zStream.avail_in) {
+                                       //err_internal( __FUNCTION__,
+                                       //    "inflate did not flush (%d pending, %d avail)\n",
+                                       //  this->zStream.avail_in, this->zStream.avail_out );
+                               }
+                               
+                               count = IN_BUFFER_SIZE - this->zStream.avail_out;
+                               
+                               this->cache[target].count = count;
+                       }
+                       
+                       if (i == firstChunk) {
+                               if (i == lastChunk) {
+                                       memcpy( pt, inBuffer + firstOffset, lastOffset-firstOffset);
+                                       pt += lastOffset - firstOffset;
+                               } else {
+                                       if (count != this->chunkLength ) {
+                                               //err_internal( __FUNCTION__,
+                                               //      "Length = %d instead of %d\n",
+                                               //count, this->chunkLength );
+                                       }
+                                       memcpy( pt, inBuffer + firstOffset,
+                                                                       this->chunkLength - firstOffset );
+                                       pt += this->chunkLength - firstOffset;
+                               }
+                       } else if (i == lastChunk) {
+                               memcpy( pt, inBuffer, lastOffset );
+                               pt += lastOffset;
+                       } else {
+                               assert( count == this->chunkLength );
+                               memcpy( pt, inBuffer, this->chunkLength );
+                               pt += this->chunkLength;
+                       }
+               }
+               //*pt = '\0';
+               break;
+       case DICT_UNKNOWN:
+               //err_fatal( __FUNCTION__, "Cannot read unknown file type\n" );
+               break;
+       }
+}