X-Git-Url: http://vcs.maemo.org/git/?a=blobdiff_plain;f=plugins%2Fstardict%2Flib.cpp;fp=plugins%2Fstardict%2Flib.cpp;h=e18183f39651449a9c72fe65bd50dc819b0d4e18;hb=15ae4fc455d1171c3406300abf80a18014a61dff;hp=0000000000000000000000000000000000000000;hpb=a495e0bb50b65271284b6ded45b6b61bc9d8dc20;p=qstardict diff --git a/plugins/stardict/lib.cpp b/plugins/stardict/lib.cpp new file mode 100644 index 0000000..e18183f --- /dev/null +++ b/plugins/stardict/lib.cpp @@ -0,0 +1,1925 @@ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include +#include +#include + +#include +#include +#include + +#include "distance.h" +#include "file.hpp" +#include "mapfile.hpp" + +#include "lib.h" + +// Notice: read src/tools/DICTFILE_FORMAT for the dictionary +// file's format information! + + +static inline bool bIsVowel(gchar inputchar) +{ + gchar ch = g_ascii_toupper(inputchar); + return ( ch == 'A' || ch == 'E' || ch == 'I' || ch == 'O' || ch == 'U' ); +} + +static bool bIsPureEnglish(const gchar *str) +{ + // i think this should work even when it is UTF8 string :). + for (int i = 0; str[i] != 0; i++) + //if(str[i]<0) + //if(str[i]<32 || str[i]>126) // tab equal 9,so this is not OK. + // Better use isascii() but not str[i]<0 while char is default unsigned in arm + if (!isascii(str[i])) + return false; + return true; +} + +static inline gint stardict_strcmp(const gchar *s1, const gchar *s2) +{ + gint a = g_ascii_strcasecmp(s1, s2); + if (a == 0) + return strcmp(s1, s2); + else + return a; +} + +bool DictInfo::load_from_ifo_file(const std::string& ifofilename, + bool istreedict) +{ + ifo_file_name = ifofilename; + gchar *buffer; + if (!g_file_get_contents(ifofilename.c_str(), &buffer, NULL, NULL)) + return false; + +#define TREEDICT_MAGIC_DATA "StarDict's treedict ifo file\nversion=2.4.2\n" +#define DICT_MAGIC_DATA "StarDict's dict ifo file\nversion=2.4.2\n" + + const gchar *magic_data = istreedict ? TREEDICT_MAGIC_DATA : DICT_MAGIC_DATA; + if (!g_str_has_prefix(buffer, magic_data)) + { + g_free(buffer); + return false; + } + + gchar *p1, *p2, *p3; + + p1 = buffer + strlen(magic_data) - 1; + + p2 = strstr(p1, "\nwordcount="); + if (!p2) + { + g_free(buffer); + return false; + } + + p3 = strchr(p2 + sizeof("\nwordcount=") - 1, '\n'); + gchar *tmpstr = (gchar *)g_memdup(p2 + sizeof("\nwordcount=") - 1, p3 - (p2 + sizeof("\nwordcount=") - 1) + 1); + tmpstr[p3 - (p2 + sizeof("\nwordcount=") - 1)] = '\0'; + wordcount = atol(tmpstr); + g_free(tmpstr); + + if (istreedict) + { + p2 = strstr(p1, "\ntdxfilesize="); + if (!p2) + { + g_free(buffer); + return false; + } + p3 = strchr(p2 + sizeof("\ntdxfilesize=") - 1, '\n'); + tmpstr = (gchar *)g_memdup(p2 + sizeof("\ntdxfilesize=") - 1, p3 - (p2 + sizeof("\ntdxfilesize=") - 1) + 1); + tmpstr[p3 - (p2 + sizeof("\ntdxfilesize=") - 1)] = '\0'; + index_file_size = atol(tmpstr); + g_free(tmpstr); + } + else + { + + p2 = strstr(p1, "\nidxfilesize="); + if (!p2) + { + g_free(buffer); + return false; + } + + p3 = strchr(p2 + sizeof("\nidxfilesize=") - 1, '\n'); + tmpstr = (gchar *)g_memdup(p2 + sizeof("\nidxfilesize=") - 1, p3 - (p2 + sizeof("\nidxfilesize=") - 1) + 1); + tmpstr[p3 - (p2 + sizeof("\nidxfilesize=") - 1)] = '\0'; + index_file_size = atol(tmpstr); + g_free(tmpstr); + } + + p2 = strstr(p1, "\nbookname="); + + if (!p2) + { + g_free(buffer); + return false; + } + + p2 = p2 + sizeof("\nbookname=") - 1; + p3 = strchr(p2, '\n'); + bookname.assign(p2, p3 - p2); + + p2 = strstr(p1, "\nauthor="); + if (p2) + { + p2 = p2 + sizeof("\nauthor=") - 1; + p3 = strchr(p2, '\n'); + author.assign(p2, p3 - p2); + } + + p2 = strstr(p1, "\nemail="); + if (p2) + { + p2 = p2 + sizeof("\nemail=") - 1; + p3 = strchr(p2, '\n'); + email.assign(p2, p3 - p2); + } + + p2 = strstr(p1, "\nwebsite="); + if (p2) + { + p2 = p2 + sizeof("\nwebsite=") - 1; + p3 = strchr(p2, '\n'); + website.assign(p2, p3 - p2); + } + + p2 = strstr(p1, "\ndate="); + if (p2) + { + p2 = p2 + sizeof("\ndate=") - 1; + p3 = strchr(p2, '\n'); + date.assign(p2, p3 - p2); + } + + p2 = strstr(p1, "\ndescription="); + if (p2) + { + p2 = p2 + sizeof("\ndescription=") - 1; + p3 = strchr(p2, '\n'); + description.assign(p2, p3 - p2); + } + + p2 = strstr(p1, "\nsametypesequence="); + if (p2) + { + p2 += sizeof("\nsametypesequence=") - 1; + p3 = strchr(p2, '\n'); + sametypesequence.assign(p2, p3 - p2); + } + + g_free(buffer); + + return true; +} +//=================================================================== +DictBase::DictBase() +{ + dictfile = NULL; + cache_cur = 0; +} + +DictBase::~DictBase() +{ + if (dictfile) + fclose(dictfile); +} + +gchar* DictBase::GetWordData(guint32 idxitem_offset, guint32 idxitem_size) +{ + for (int i = 0; i < WORDDATA_CACHE_NUM; i++) + if (cache[i].data && cache[i].offset == idxitem_offset) + return cache[i].data; + + if (dictfile) + fseek(dictfile, idxitem_offset, SEEK_SET); + + gchar *data; + if (!sametypesequence.empty()) + { + gchar *origin_data = (gchar *)g_malloc(idxitem_size); + + if (dictfile) + fread(origin_data, idxitem_size, 1, dictfile); + else + dictdzfile->read(origin_data, idxitem_offset, idxitem_size); + + guint32 data_size; + gint sametypesequence_len = sametypesequence.length(); + //there have sametypesequence_len char being omitted. + data_size = idxitem_size + sizeof(guint32) + sametypesequence_len; + //if the last item's size is determined by the end up '\0',then +=sizeof(gchar); + //if the last item's size is determined by the head guint32 type data,then +=sizeof(guint32); + switch (sametypesequence[sametypesequence_len - 1]) + { + case 'm': + case 't': + case 'y': + case 'l': + case 'g': + case 'x': + data_size += sizeof(gchar); + break; + case 'W': + case 'P': + data_size += sizeof(guint32); + break; + default: + if (g_ascii_isupper(sametypesequence[sametypesequence_len - 1])) + data_size += sizeof(guint32); + else + data_size += sizeof(gchar); + break; + } + data = (gchar *)g_malloc(data_size); + gchar *p1, *p2; + p1 = data + sizeof(guint32); + p2 = origin_data; + guint32 sec_size; + //copy the head items. + for (int i = 0; i < sametypesequence_len - 1; i++) + { + *p1 = sametypesequence[i]; + p1 += sizeof(gchar); + switch (sametypesequence[i]) + { + case 'm': + case 't': + case 'y': + case 'l': + case 'g': + case 'x': + sec_size = strlen(p2) + 1; + memcpy(p1, p2, sec_size); + p1 += sec_size; + p2 += sec_size; + break; + case 'W': + case 'P': + sec_size = *reinterpret_cast(p2); + sec_size += sizeof(guint32); + memcpy(p1, p2, sec_size); + p1 += sec_size; + p2 += sec_size; + break; + default: + if (g_ascii_isupper(sametypesequence[i])) + { + sec_size = *reinterpret_cast(p2); + sec_size += sizeof(guint32); + } + else + { + sec_size = strlen(p2) + 1; + } + memcpy(p1, p2, sec_size); + p1 += sec_size; + p2 += sec_size; + break; + } + } + //calculate the last item 's size. + sec_size = idxitem_size - (p2 - origin_data); + *p1 = sametypesequence[sametypesequence_len - 1]; + p1 += sizeof(gchar); + switch (sametypesequence[sametypesequence_len - 1]) + { + case 'm': + case 't': + case 'y': + case 'l': + case 'g': + case 'x': + memcpy(p1, p2, sec_size); + p1 += sec_size; + *p1 = '\0'; //add the end up '\0'; + break; + case 'W': + case 'P': + *reinterpret_cast(p1) = sec_size; + p1 += sizeof(guint32); + memcpy(p1, p2, sec_size); + break; + default: + if (g_ascii_isupper(sametypesequence[sametypesequence_len - 1])) + { + *reinterpret_cast(p1) = sec_size; + p1 += sizeof(guint32); + memcpy(p1, p2, sec_size); + } + else + { + memcpy(p1, p2, sec_size); + p1 += sec_size; + *p1 = '\0'; + } + break; + } + g_free(origin_data); + *reinterpret_cast(data) = data_size; + } + else + { + data = (gchar *)g_malloc(idxitem_size + sizeof(guint32)); + if (dictfile) + fread(data + sizeof(guint32), idxitem_size, 1, dictfile); + else + dictdzfile->read(data + sizeof(guint32), idxitem_offset, idxitem_size); + *reinterpret_cast(data) = idxitem_size + sizeof(guint32); + } + g_free(cache[cache_cur].data); + + cache[cache_cur].data = data; + cache[cache_cur].offset = idxitem_offset; + cache_cur++; + if (cache_cur == WORDDATA_CACHE_NUM) + cache_cur = 0; + return data; +} + +inline bool DictBase::containSearchData() +{ + if (sametypesequence.empty()) + return true; + + return sametypesequence.find_first_of("mlgxty") != std::string::npos; +} + +bool DictBase::SearchData(std::vector &SearchWords, guint32 idxitem_offset, guint32 idxitem_size, gchar *origin_data) +{ + int nWord = SearchWords.size(); + std::vector WordFind(nWord, false); + int nfound = 0; + + if (dictfile) + fseek(dictfile, idxitem_offset, SEEK_SET); + if (dictfile) + fread(origin_data, idxitem_size, 1, dictfile); + else + dictdzfile->read(origin_data, idxitem_offset, idxitem_size); + gchar *p = origin_data; + guint32 sec_size; + int j; + if (!sametypesequence.empty()) + { + gint sametypesequence_len = sametypesequence.length(); + for (int i = 0; i < sametypesequence_len - 1; i++) + { + switch (sametypesequence[i]) + { + case 'm': + case 't': + case 'y': + case 'l': + case 'g': + case 'x': + for (j = 0; j < nWord; j++) + if (!WordFind[j] && strstr(p, SearchWords[j].c_str())) + { + WordFind[j] = true; + ++nfound; + } + + + if (nfound == nWord) + return true; + sec_size = strlen(p) + 1; + p += sec_size; + break; + default: + if (g_ascii_isupper(sametypesequence[i])) + { + sec_size = *reinterpret_cast(p); + sec_size += sizeof(guint32); + } + else + { + sec_size = strlen(p) + 1; + } + p += sec_size; + } + } + switch (sametypesequence[sametypesequence_len - 1]) + { + case 'm': + case 't': + case 'y': + case 'l': + case 'g': + case 'x': + sec_size = idxitem_size - (p - origin_data); + for (j = 0; j < nWord; j++) + if (!WordFind[j] && + g_strstr_len(p, sec_size, SearchWords[j].c_str())) + { + WordFind[j] = true; + ++nfound; + } + + + if (nfound == nWord) + return true; + break; + } + } + else + { + while (guint32(p - origin_data) < idxitem_size) + { + switch (*p) + { + case 'm': + case 't': + case 'y': + case 'l': + case 'g': + case 'x': + for (j = 0; j < nWord; j++) + if (!WordFind[j] && strstr(p, SearchWords[j].c_str())) + { + WordFind[j] = true; + ++nfound; + } + + if (nfound == nWord) + return true; + sec_size = strlen(p) + 1; + p += sec_size; + break; + default: + if (g_ascii_isupper(*p)) + { + sec_size = *reinterpret_cast(p); + sec_size += sizeof(guint32); + } + else + { + sec_size = strlen(p) + 1; + } + p += sec_size; + } + } + } + return false; +} + +class offset_index : public index_file +{ + public: + offset_index() : idxfile(NULL) + {} + ~offset_index(); + bool load(const std::string& url, gulong wc, gulong fsize); + const gchar *get_key(glong idx); + void get_data(glong idx); + const gchar *get_key_and_data(glong idx); + bool lookup(const char *str, glong &idx); + private: + static const gint ENTR_PER_PAGE = 32; + static const char *CACHE_MAGIC; + + std::vector wordoffset; + FILE *idxfile; + gulong wordcount; + + gchar wordentry_buf[256 + sizeof(guint32)*2]; // The length of "word_str" should be less than 256. See src/tools/DICTFILE_FORMAT. + struct index_entry + { + glong idx; + std::string keystr; + void assign(glong i, const std::string& str) + { + idx = i; + keystr.assign(str); + } + }; + index_entry first, last, middle, real_last; + + struct page_entry + { + gchar *keystr; + guint32 off, size; + }; + std::vector page_data; + struct page_t + { + glong idx; + page_entry entries[ENTR_PER_PAGE]; + + page_t(): idx( -1) + {} + void fill(gchar *data, gint nent, glong idx_); + } + page; + gulong load_page(glong page_idx); + const gchar *read_first_on_page_key(glong page_idx); + const gchar *get_first_on_page_key(glong page_idx); + bool load_cache(const std::string& url); + bool save_cache(const std::string& url); + static strlist_t get_cache_variant(const std::string& url); +}; + +const char *offset_index::CACHE_MAGIC = "StarDict's Cache, Version: 0.1"; + +class wordlist_index : public index_file +{ + public: + wordlist_index() : idxdatabuf(NULL) + {} + ~wordlist_index(); + bool load(const std::string& url, gulong wc, gulong fsize); + const gchar *get_key(glong idx); + void get_data(glong idx); + const gchar *get_key_and_data(glong idx); + bool lookup(const char *str, glong &idx); + private: + gchar *idxdatabuf; + std::vector wordlist; +}; + +void offset_index::page_t::fill(gchar *data, gint nent, glong idx_) +{ + idx = idx_; + gchar *p = data; + glong len; + for (gint i = 0; i < nent; ++i) + { + entries[i].keystr = p; + len = strlen(p); + p += len + 1; + entries[i].off = g_ntohl(*reinterpret_cast(p)); + p += sizeof(guint32); + entries[i].size = g_ntohl(*reinterpret_cast(p)); + p += sizeof(guint32); + } +} + +offset_index::~offset_index() +{ + if (idxfile) + fclose(idxfile); +} + +inline const gchar *offset_index::read_first_on_page_key(glong page_idx) +{ + fseek(idxfile, wordoffset[page_idx], SEEK_SET); + guint page_size = wordoffset[page_idx + 1] - wordoffset[page_idx]; + fread(wordentry_buf, std::min(sizeof(wordentry_buf), page_size), 1, idxfile); //TODO: check returned values, deal with word entry that strlen>255. + return wordentry_buf; +} + +inline const gchar *offset_index::get_first_on_page_key(glong page_idx) +{ + if (page_idx < middle.idx) + { + if (page_idx == first.idx) + return first.keystr.c_str(); + return read_first_on_page_key(page_idx); + } + else if (page_idx > middle.idx) + { + if (page_idx == last.idx) + return last.keystr.c_str(); + return read_first_on_page_key(page_idx); + } + else + return middle.keystr.c_str(); +} + +bool offset_index::load_cache(const std::string& url) +{ + strlist_t vars = get_cache_variant(url); + + for (strlist_t::const_iterator it = vars.begin(); it != vars.end(); ++it) + { + struct stat idxstat, cachestat; + if (g_stat(url.c_str(), &idxstat) != 0 || + g_stat(it->c_str(), &cachestat) != 0) + continue; + if (cachestat.st_mtime < idxstat.st_mtime) + continue; + MapFile mf; + if (!mf.open(it->c_str(), cachestat.st_size)) + continue; + if (strncmp(mf.begin(), CACHE_MAGIC, strlen(CACHE_MAGIC)) != 0) + continue; + memcpy(&wordoffset[0], mf.begin() + strlen(CACHE_MAGIC), wordoffset.size()*sizeof(wordoffset[0])); + return true; + + } + + return false; +} + +strlist_t offset_index::get_cache_variant(const std::string& url) +{ + strlist_t res; + res.push_back(url + ".oft"); + if (!g_file_test(g_get_user_cache_dir(), G_FILE_TEST_EXISTS) && + g_mkdir(g_get_user_cache_dir(), 0700) == -1) + return res; + + std::string cache_dir = std::string(g_get_user_cache_dir()) + G_DIR_SEPARATOR_S + "sdcv"; + + if (!g_file_test(cache_dir.c_str(), G_FILE_TEST_EXISTS)) + { + if (g_mkdir(cache_dir.c_str(), 0700) == -1) + return res; + } + else if (!g_file_test(cache_dir.c_str(), G_FILE_TEST_IS_DIR)) + return res; + + gchar *base = g_path_get_basename(url.c_str()); + res.push_back(cache_dir + G_DIR_SEPARATOR_S + base + ".oft"); + g_free(base); + return res; +} + +bool offset_index::save_cache(const std::string& url) +{ + strlist_t vars = get_cache_variant(url); + for (strlist_t::const_iterator it = vars.begin(); it != vars.end(); ++it) + { + FILE *out = fopen(it->c_str(), "wb"); + if (!out) + continue; + if (fwrite(CACHE_MAGIC, 1, strlen(CACHE_MAGIC), out) != strlen(CACHE_MAGIC)) + continue; + if (fwrite(&wordoffset[0], sizeof(wordoffset[0]), wordoffset.size(), out) != wordoffset.size()) + continue; + fclose(out); + printf("save to cache %s\n", url.c_str()); + return true; + } + return false; +} + +bool offset_index::load(const std::string& url, gulong wc, gulong fsize) +{ + wordcount = wc; + gulong npages = (wc - 1) / ENTR_PER_PAGE + 2; + wordoffset.resize(npages); + if (!load_cache(url)) + { //map file will close after finish of block + MapFile map_file; + if (!map_file.open(url.c_str(), fsize)) + return false; + const gchar *idxdatabuffer = map_file.begin(); + + const gchar *p1 = idxdatabuffer; + gulong index_size; + guint32 j = 0; + for (guint32 i = 0; i < wc; i++) + { + index_size = strlen(p1) + 1 + 2 * sizeof(guint32); + if (i % ENTR_PER_PAGE == 0) + { + wordoffset[j] = p1 - idxdatabuffer; + ++j; + } + p1 += index_size; + } + wordoffset[j] = p1 - idxdatabuffer; + if (!save_cache(url)) + fprintf(stderr, "cache update failed\n"); + } + + if (!(idxfile = fopen(url.c_str(), "rb"))) + { + wordoffset.resize(0); + return false; + } + + first.assign(0, read_first_on_page_key(0)); + last.assign(wordoffset.size() - 2, read_first_on_page_key(wordoffset.size() - 2)); + middle.assign((wordoffset.size() - 2) / 2, read_first_on_page_key((wordoffset.size() - 2) / 2)); + real_last.assign(wc - 1, get_key(wc - 1)); + + return true; +} + +inline gulong offset_index::load_page(glong page_idx) +{ + gulong nentr = ENTR_PER_PAGE; + if (page_idx == glong(wordoffset.size() - 2)) + if ((nentr = wordcount % ENTR_PER_PAGE) == 0) + nentr = ENTR_PER_PAGE; + + + if (page_idx != page.idx) + { + page_data.resize(wordoffset[page_idx + 1] - wordoffset[page_idx]); + fseek(idxfile, wordoffset[page_idx], SEEK_SET); + fread(&page_data[0], 1, page_data.size(), idxfile); + page.fill(&page_data[0], nentr, page_idx); + } + + return nentr; +} + +const gchar *offset_index::get_key(glong idx) +{ + load_page(idx / ENTR_PER_PAGE); + glong idx_in_page = idx % ENTR_PER_PAGE; + wordentry_offset = page.entries[idx_in_page].off; + wordentry_size = page.entries[idx_in_page].size; + + return page.entries[idx_in_page].keystr; +} + +void offset_index::get_data(glong idx) +{ + get_key(idx); +} + +const gchar *offset_index::get_key_and_data(glong idx) +{ + return get_key(idx); +} + +bool offset_index::lookup(const char *str, glong &idx) +{ + bool bFound = false; + glong iFrom; + glong iTo = wordoffset.size() - 2; + gint cmpint; + glong iThisIndex; + if (stardict_strcmp(str, first.keystr.c_str()) < 0) + { + idx = 0; + return false; + } + else if (stardict_strcmp(str, real_last.keystr.c_str()) > 0) + { + idx = INVALID_INDEX; + return false; + } + else + { + iFrom = 0; + iThisIndex = 0; + while (iFrom <= iTo) + { + iThisIndex = (iFrom + iTo) / 2; + cmpint = stardict_strcmp(str, get_first_on_page_key(iThisIndex)); + if (cmpint > 0) + iFrom = iThisIndex + 1; + else if (cmpint < 0) + iTo = iThisIndex - 1; + else + { + bFound = true; + break; + } + } + if (!bFound) + idx = iTo; //prev + else + idx = iThisIndex; + } + if (!bFound) + { + gulong netr = load_page(idx); + iFrom = 1; // Needn't search the first word anymore. + iTo = netr - 1; + iThisIndex = 0; + while (iFrom <= iTo) + { + iThisIndex = (iFrom + iTo) / 2; + cmpint = stardict_strcmp(str, page.entries[iThisIndex].keystr); + if (cmpint > 0) + iFrom = iThisIndex + 1; + else if (cmpint < 0) + iTo = iThisIndex - 1; + else + { + bFound = true; + break; + } + } + idx *= ENTR_PER_PAGE; + if (!bFound) + idx += iFrom; //next + else + idx += iThisIndex; + } + else + { + idx *= ENTR_PER_PAGE; + } + return bFound; +} + +wordlist_index::~wordlist_index() +{ + g_free(idxdatabuf); +} + +bool wordlist_index::load(const std::string& url, gulong wc, gulong fsize) +{ + gzFile in = gzopen(url.c_str(), "rb"); + if (in == NULL) + return false; + + idxdatabuf = (gchar *)g_malloc(fsize); + + gulong len = gzread(in, idxdatabuf, fsize); + gzclose(in); + + if (len != fsize) + return false; + + wordlist.resize(wc + 1); + gchar *p1 = idxdatabuf; + guint32 i; + for (i = 0; i < wc; i++) + { + wordlist[i] = p1; + p1 += strlen(p1) + 1 + 2 * sizeof(guint32); + } + wordlist[wc] = p1; + + return true; +} + +const gchar *wordlist_index::get_key(glong idx) +{ + return wordlist[idx]; +} + +void wordlist_index::get_data(glong idx) +{ + gchar *p1 = wordlist[idx] + strlen(wordlist[idx]) + sizeof(gchar); + wordentry_offset = g_ntohl(*reinterpret_cast(p1)); + p1 += sizeof(guint32); + wordentry_size = g_ntohl(*reinterpret_cast(p1)); +} + +const gchar *wordlist_index::get_key_and_data(glong idx) +{ + get_data(idx); + return get_key(idx); +} + +bool wordlist_index::lookup(const char *str, glong &idx) +{ + bool bFound = false; + glong iTo = wordlist.size() - 2; + + if (stardict_strcmp(str, get_key(0)) < 0) + { + idx = 0; + } + else if (stardict_strcmp(str, get_key(iTo)) > 0) + { + idx = INVALID_INDEX; + } + else + { + glong iThisIndex = 0; + glong iFrom = 0; + gint cmpint; + while (iFrom <= iTo) + { + iThisIndex = (iFrom + iTo) / 2; + cmpint = stardict_strcmp(str, get_key(iThisIndex)); + if (cmpint > 0) + iFrom = iThisIndex + 1; + else if (cmpint < 0) + iTo = iThisIndex - 1; + else + { + bFound = true; + break; + } + } + if (!bFound) + idx = iFrom; //next + else + idx = iThisIndex; + } + return bFound; +} + +//=================================================================== +bool Dict::load(const std::string& ifofilename) +{ + gulong idxfilesize; + if (!load_ifofile(ifofilename, idxfilesize)) + return false; + + std::string fullfilename(ifofilename); + fullfilename.replace(fullfilename.length() - sizeof("ifo") + 1, sizeof("ifo") - 1, "dict.dz"); + + if (g_file_test(fullfilename.c_str(), G_FILE_TEST_EXISTS)) + { + dictdzfile.reset(new dictData); + if (!dictdzfile->open(fullfilename, 0)) + { + //g_print("open file %s failed!\n",fullfilename); + return false; + } + } + else + { + fullfilename.erase(fullfilename.length() - sizeof(".dz") + 1, sizeof(".dz") - 1); + dictfile = fopen(fullfilename.c_str(), "rb"); + if (!dictfile) + { + //g_print("open file %s failed!\n",fullfilename); + return false; + } + } + + fullfilename = ifofilename; + fullfilename.replace(fullfilename.length() - sizeof("ifo") + 1, sizeof("ifo") - 1, "idx.gz"); + + if (g_file_test(fullfilename.c_str(), G_FILE_TEST_EXISTS)) + { + idx_file.reset(new wordlist_index); + } + else + { + fullfilename.erase(fullfilename.length() - sizeof(".gz") + 1, sizeof(".gz") - 1); + idx_file.reset(new offset_index); + } + + if (!idx_file->load(fullfilename, wordcount, idxfilesize)) + return false; + + //g_print("bookname: %s , wordcount %lu\n", bookname.c_str(), narticles()); + return true; +} + +bool Dict::load_ifofile(const std::string& ifofilename, gulong &idxfilesize) +{ + DictInfo dict_info; + if (!dict_info.load_from_ifo_file(ifofilename, false)) + return false; + if (dict_info.wordcount == 0) + return false; + + + + ifo_file_name = dict_info.ifo_file_name; + wordcount = dict_info.wordcount; + bookname = dict_info.bookname; + + idxfilesize = dict_info.index_file_size; + + sametypesequence = dict_info.sametypesequence; + + return true; +} + +bool Dict::LookupWithRule(GPatternSpec *pspec, glong *aIndex, int iBuffLen) +{ + int iIndexCount = 0; + + for (guint32 i = 0; i < narticles() && iIndexCount < iBuffLen - 1; i++) + if (g_pattern_match_string(pspec, get_key(i))) + aIndex[iIndexCount++] = i; + + aIndex[iIndexCount] = -1; // -1 is the end. + + return (iIndexCount > 0); +} + +//=================================================================== +Libs::Libs(progress_func_t f) +{ + progress_func = f; + iMaxFuzzyDistance = MAX_FUZZY_DISTANCE; //need to read from cfg. +} + +Libs::~Libs() +{ + for (std::vector::iterator p = oLib.begin(); p != oLib.end(); ++p) + delete *p; +} + +void Libs::load_dict(const std::string& url) +{ + Dict *lib = new Dict; + if (lib->load(url)) + oLib.push_back(lib); + else + delete lib; +} + +class DictLoader +{ + public: + DictLoader(Libs& lib_): lib(lib_) + {} + void operator()(const std::string& url, bool disable) + { + if (!disable) + lib.load_dict(url); + } + private: + Libs& lib; +}; + +void Libs::load(const strlist_t& dicts_dirs, + const strlist_t& order_list, + const strlist_t& disable_list) +{ + for_each_file(dicts_dirs, ".ifo", order_list, disable_list, + DictLoader(*this)); +} + +class DictReLoader +{ + public: + DictReLoader(std::vector &p, std::vector &f, + Libs& lib_) : prev(p), future(f), lib(lib_) + {} + void operator()(const std::string& url, bool disable) + { + if (!disable) + { + Dict *dict = find(url); + if (dict) + future.push_back(dict); + else + lib.load_dict(url); + } + } + private: + std::vector &prev; + std::vector &future; + Libs& lib; + + Dict *find(const std::string& url) + { + std::vector::iterator it; + for (it = prev.begin(); it != prev.end(); ++it) + if ((*it)->ifofilename() == url) + break; + if (it != prev.end()) + { + Dict *res = *it; + prev.erase(it); + return res; + } + return NULL; + } +}; + +void Libs::reload(const strlist_t& dicts_dirs, + const strlist_t& order_list, + const strlist_t& disable_list) +{ + std::vector prev(oLib); + oLib.clear(); + for_each_file(dicts_dirs, ".ifo", order_list, disable_list, + DictReLoader(prev, oLib, *this)); + for (std::vector::iterator it = prev.begin(); it != prev.end(); ++it) + delete *it; +} + +const gchar *Libs::poGetCurrentWord(glong * iCurrent) +{ + const gchar *poCurrentWord = NULL; + const gchar *word; + for (std::vector::size_type iLib = 0; iLib= narticles(iLib) || iCurrent[iLib] < 0) + continue; + if ( poCurrentWord == NULL ) + { + poCurrentWord = poGetWord(iCurrent[iLib], iLib); + } + else + { + word = poGetWord(iCurrent[iLib], iLib); + + if (stardict_strcmp(poCurrentWord, word) > 0 ) + poCurrentWord = word; + } + } + return poCurrentWord; +} + +const gchar * +Libs::poGetNextWord(const gchar *sWord, glong *iCurrent) +{ + // the input can be: + // (word,iCurrent),read word,write iNext to iCurrent,and return next word. used by TopWin::NextCallback(); + // (NULL,iCurrent),read iCurrent,write iNext to iCurrent,and return next word. used by AppCore::ListWords(); + const gchar *poCurrentWord = NULL; + std::vector::size_type iCurrentLib = 0; + const gchar *word; + + for (std::vector::size_type iLib = 0;iLibLookup(sWord, iCurrent[iLib]); + if (iCurrent[iLib] == INVALID_INDEX) + continue; + if (iCurrent[iLib] >= narticles(iLib) || iCurrent[iLib] < 0) + continue; + if (poCurrentWord == NULL ) + { + poCurrentWord = poGetWord(iCurrent[iLib], iLib); + iCurrentLib = iLib; + } + else + { + word = poGetWord(iCurrent[iLib], iLib); + + if (stardict_strcmp(poCurrentWord, word) > 0 ) + { + poCurrentWord = word; + iCurrentLib = iLib; + } + } + } + if (poCurrentWord) + { + iCurrent[iCurrentLib] + ++; + for (std::vector::size_type iLib = 0;iLib= narticles(iLib) || iCurrent[iLib] < 0) + continue; + if (strcmp(poCurrentWord, poGetWord(iCurrent[iLib], iLib)) == 0 ) + iCurrent[iLib]++; + } + poCurrentWord = poGetCurrentWord(iCurrent); + } + return poCurrentWord; +} + + +const gchar * +Libs::poGetPreWord(glong * iCurrent) +{ + // used by TopWin::PreviousCallback(); the iCurrent is cached by AppCore::TopWinWordChange(); + const gchar *poCurrentWord = NULL; + std::vector::size_type iCurrentLib = 0; + const gchar *word; + + for (std::vector::size_type iLib = 0;iLib narticles(iLib) || iCurrent[iLib] <= 0) + continue; + } + if ( poCurrentWord == NULL ) + { + poCurrentWord = poGetWord(iCurrent[iLib] - 1, iLib); + iCurrentLib = iLib; + } + else + { + word = poGetWord(iCurrent[iLib] - 1, iLib); + if (stardict_strcmp(poCurrentWord, word) < 0 ) + { + poCurrentWord = word; + iCurrentLib = iLib; + } + } + } + + if (poCurrentWord) + { + iCurrent[iCurrentLib] + --; + for (std::vector::size_type iLib = 0;iLib narticles(iLib) || iCurrent[iLib] <= 0) + continue; + if (strcmp(poCurrentWord, poGetWord(iCurrent[iLib] - 1, iLib)) == 0) + { + iCurrent[iLib]--; + } + else + { + if (iCurrent[iLib] == narticles(iLib)) + iCurrent[iLib] = INVALID_INDEX; + } + } + } + return poCurrentWord; +} + +bool Libs::LookupSimilarWord(const gchar* sWord, glong & iWordIndex, int iLib) +{ + glong iIndex; + bool bFound = false; + gchar *casestr; + + if (!bFound) + { + // to lower case. + casestr = g_utf8_strdown(sWord, -1); + if (strcmp(casestr, sWord)) + { + if (oLib[iLib]->Lookup(casestr, iIndex)) + bFound = true; + } + g_free(casestr); + // to upper case. + if (!bFound) + { + casestr = g_utf8_strup(sWord, -1); + if (strcmp(casestr, sWord)) + { + if (oLib[iLib]->Lookup(casestr, iIndex)) + bFound = true; + } + g_free(casestr); + } + // Upper the first character and lower others. + if (!bFound) + { + gchar *nextchar = g_utf8_next_char(sWord); + gchar *firstchar = g_utf8_strup(sWord, nextchar - sWord); + nextchar = g_utf8_strdown(nextchar, -1); + casestr = g_strdup_printf("%s%s", firstchar, nextchar); + g_free(firstchar); + g_free(nextchar); + if (strcmp(casestr, sWord)) + { + if (oLib[iLib]->Lookup(casestr, iIndex)) + bFound = true; + } + g_free(casestr); + } + } + + if (bIsPureEnglish(sWord)) + { + // If not Found , try other status of sWord. + int iWordLen = strlen(sWord); + bool isupcase; + + gchar *sNewWord = (gchar *)g_malloc(iWordLen + 1); + + //cut one char "s" or "d" + if (!bFound && iWordLen > 1) + { + isupcase = sWord[iWordLen - 1] == 'S' || !strncmp(&sWord[iWordLen - 2], "ED", 2); + if (isupcase || sWord[iWordLen - 1] == 's' || !strncmp(&sWord[iWordLen - 2], "ed", 2)) + { + strcpy(sNewWord, sWord); + sNewWord[iWordLen - 1] = '\0'; // cut "s" or "d" + if (oLib[iLib]->Lookup(sNewWord, iIndex)) + bFound = true; + else if (isupcase || g_ascii_isupper(sWord[0])) + { + casestr = g_ascii_strdown(sNewWord, -1); + if (strcmp(casestr, sNewWord)) + { + if (oLib[iLib]->Lookup(casestr, iIndex)) + bFound = true; + } + g_free(casestr); + } + } + } + + //cut "ly" + if (!bFound && iWordLen > 2) + { + isupcase = !strncmp(&sWord[iWordLen - 2], "LY", 2); + if (isupcase || (!strncmp(&sWord[iWordLen - 2], "ly", 2))) + { + strcpy(sNewWord, sWord); + sNewWord[iWordLen - 2] = '\0'; // cut "ly" + if (iWordLen > 5 && sNewWord[iWordLen - 3] == sNewWord[iWordLen - 4] + && !bIsVowel(sNewWord[iWordLen - 4]) && + bIsVowel(sNewWord[iWordLen - 5])) + { //doubled + + sNewWord[iWordLen - 3] = '\0'; + if ( oLib[iLib]->Lookup(sNewWord, iIndex) ) + bFound = true; + else + { + if (isupcase || g_ascii_isupper(sWord[0])) + { + casestr = g_ascii_strdown(sNewWord, -1); + if (strcmp(casestr, sNewWord)) + { + if (oLib[iLib]->Lookup(casestr, iIndex)) + bFound = true; + } + g_free(casestr); + } + if (!bFound) + sNewWord[iWordLen - 3] = sNewWord[iWordLen - 4]; //restore + } + } + if (!bFound) + { + if (oLib[iLib]->Lookup(sNewWord, iIndex)) + bFound = true; + else if (isupcase || g_ascii_isupper(sWord[0])) + { + casestr = g_ascii_strdown(sNewWord, -1); + if (strcmp(casestr, sNewWord)) + { + if (oLib[iLib]->Lookup(casestr, iIndex)) + bFound = true; + } + g_free(casestr); + } + } + } + } + + //cut "ing" + if (!bFound && iWordLen > 3) + { + isupcase = !strncmp(&sWord[iWordLen - 3], "ING", 3); + if (isupcase || !strncmp(&sWord[iWordLen - 3], "ing", 3) ) + { + strcpy(sNewWord, sWord); + sNewWord[iWordLen - 3] = '\0'; + if ( iWordLen > 6 && (sNewWord[iWordLen - 4] == sNewWord[iWordLen - 5]) + && !bIsVowel(sNewWord[iWordLen - 5]) && + bIsVowel(sNewWord[iWordLen - 6])) + { //doubled + sNewWord[iWordLen - 4] = '\0'; + if (oLib[iLib]->Lookup(sNewWord, iIndex)) + bFound = true; + else + { + if (isupcase || g_ascii_isupper(sWord[0])) + { + casestr = g_ascii_strdown(sNewWord, -1); + if (strcmp(casestr, sNewWord)) + { + if (oLib[iLib]->Lookup(casestr, iIndex)) + bFound = true; + } + g_free(casestr); + } + if (!bFound) + sNewWord[iWordLen - 4] = sNewWord[iWordLen - 5]; //restore + } + } + if ( !bFound ) + { + if (oLib[iLib]->Lookup(sNewWord, iIndex)) + bFound = true; + else if (isupcase || g_ascii_isupper(sWord[0])) + { + casestr = g_ascii_strdown(sNewWord, -1); + if (strcmp(casestr, sNewWord)) + { + if (oLib[iLib]->Lookup(casestr, iIndex)) + bFound = true; + } + g_free(casestr); + } + } + if (!bFound) + { + if (isupcase) + strcat(sNewWord, "E"); // add a char "E" + else + strcat(sNewWord, "e"); // add a char "e" + if (oLib[iLib]->Lookup(sNewWord, iIndex)) + bFound = true; + else if (isupcase || g_ascii_isupper(sWord[0])) + { + casestr = g_ascii_strdown(sNewWord, -1); + if (strcmp(casestr, sNewWord)) + { + if (oLib[iLib]->Lookup(casestr, iIndex)) + bFound = true; + } + g_free(casestr); + } + } + } + } + + //cut two char "es" + if (!bFound && iWordLen > 3) + { + isupcase = (!strncmp(&sWord[iWordLen - 2], "ES", 2) && + (sWord[iWordLen - 3] == 'S' || + sWord[iWordLen - 3] == 'X' || + sWord[iWordLen - 3] == 'O' || + (iWordLen > 4 && sWord[iWordLen - 3] == 'H' && + (sWord[iWordLen - 4] == 'C' || + sWord[iWordLen - 4] == 'S')))); + if (isupcase || + (!strncmp(&sWord[iWordLen - 2], "es", 2) && + (sWord[iWordLen - 3] == 's' || sWord[iWordLen - 3] == 'x' || + sWord[iWordLen - 3] == 'o' || + (iWordLen > 4 && sWord[iWordLen - 3] == 'h' && + (sWord[iWordLen - 4] == 'c' || sWord[iWordLen - 4] == 's'))))) + { + strcpy(sNewWord, sWord); + sNewWord[iWordLen - 2] = '\0'; + if (oLib[iLib]->Lookup(sNewWord, iIndex)) + bFound = true; + else if (isupcase || g_ascii_isupper(sWord[0])) + { + casestr = g_ascii_strdown(sNewWord, -1); + if (strcmp(casestr, sNewWord)) + { + if (oLib[iLib]->Lookup(casestr, iIndex)) + bFound = true; + } + g_free(casestr); + } + } + } + + //cut "ed" + if (!bFound && iWordLen > 3) + { + isupcase = !strncmp(&sWord[iWordLen - 2], "ED", 2); + if (isupcase || !strncmp(&sWord[iWordLen - 2], "ed", 2)) + { + strcpy(sNewWord, sWord); + sNewWord[iWordLen - 2] = '\0'; + if (iWordLen > 5 && (sNewWord[iWordLen - 3] == sNewWord[iWordLen - 4]) + && !bIsVowel(sNewWord[iWordLen - 4]) && + bIsVowel(sNewWord[iWordLen - 5])) + { //doubled + sNewWord[iWordLen - 3] = '\0'; + if (oLib[iLib]->Lookup(sNewWord, iIndex)) + bFound = true; + else + { + if (isupcase || g_ascii_isupper(sWord[0])) + { + casestr = g_ascii_strdown(sNewWord, -1); + if (strcmp(casestr, sNewWord)) + { + if (oLib[iLib]->Lookup(casestr, iIndex)) + bFound = true; + } + g_free(casestr); + } + if (!bFound) + sNewWord[iWordLen - 3] = sNewWord[iWordLen - 4]; //restore + } + } + if (!bFound) + { + if (oLib[iLib]->Lookup(sNewWord, iIndex)) + bFound = true; + else if (isupcase || g_ascii_isupper(sWord[0])) + { + casestr = g_ascii_strdown(sNewWord, -1); + if (strcmp(casestr, sNewWord)) + { + if (oLib[iLib]->Lookup(casestr, iIndex)) + bFound = true; + } + g_free(casestr); + } + } + } + } + + // cut "ied" , add "y". + if (!bFound && iWordLen > 3) + { + isupcase = !strncmp(&sWord[iWordLen - 3], "IED", 3); + if (isupcase || (!strncmp(&sWord[iWordLen - 3], "ied", 3))) + { + strcpy(sNewWord, sWord); + sNewWord[iWordLen - 3] = '\0'; + if (isupcase) + strcat(sNewWord, "Y"); // add a char "Y" + else + strcat(sNewWord, "y"); // add a char "y" + if (oLib[iLib]->Lookup(sNewWord, iIndex)) + bFound = true; + else if (isupcase || g_ascii_isupper(sWord[0])) + { + casestr = g_ascii_strdown(sNewWord, -1); + if (strcmp(casestr, sNewWord)) + { + if (oLib[iLib]->Lookup(casestr, iIndex)) + bFound = true; + } + g_free(casestr); + } + } + } + + // cut "ies" , add "y". + if (!bFound && iWordLen > 3) + { + isupcase = !strncmp(&sWord[iWordLen - 3], "IES", 3); + if (isupcase || (!strncmp(&sWord[iWordLen - 3], "ies", 3))) + { + strcpy(sNewWord, sWord); + sNewWord[iWordLen - 3] = '\0'; + if (isupcase) + strcat(sNewWord, "Y"); // add a char "Y" + else + strcat(sNewWord, "y"); // add a char "y" + if (oLib[iLib]->Lookup(sNewWord, iIndex)) + bFound = true; + else if (isupcase || g_ascii_isupper(sWord[0])) + { + casestr = g_ascii_strdown(sNewWord, -1); + if (strcmp(casestr, sNewWord)) + { + if (oLib[iLib]->Lookup(casestr, iIndex)) + bFound = true; + } + g_free(casestr); + } + } + } + + // cut "er". + if (!bFound && iWordLen > 2) + { + isupcase = !strncmp(&sWord[iWordLen - 2], "ER", 2); + if (isupcase || (!strncmp(&sWord[iWordLen - 2], "er", 2))) + { + strcpy(sNewWord, sWord); + sNewWord[iWordLen - 2] = '\0'; + if (oLib[iLib]->Lookup(sNewWord, iIndex)) + bFound = true; + else if (isupcase || g_ascii_isupper(sWord[0])) + { + casestr = g_ascii_strdown(sNewWord, -1); + if (strcmp(casestr, sNewWord)) + { + if (oLib[iLib]->Lookup(casestr, iIndex)) + bFound = true; + } + g_free(casestr); + } + } + } + + // cut "est". + if (!bFound && iWordLen > 3) + { + isupcase = !strncmp(&sWord[iWordLen - 3], "EST", 3); + if (isupcase || (!strncmp(&sWord[iWordLen - 3], "est", 3))) + { + strcpy(sNewWord, sWord); + sNewWord[iWordLen - 3] = '\0'; + if (oLib[iLib]->Lookup(sNewWord, iIndex)) + bFound = true; + else if (isupcase || g_ascii_isupper(sWord[0])) + { + casestr = g_ascii_strdown(sNewWord, -1); + if (strcmp(casestr, sNewWord)) + { + if (oLib[iLib]->Lookup(casestr, iIndex)) + bFound = true; + } + g_free(casestr); + } + } + } + + g_free(sNewWord); + } + + if (bFound) + iWordIndex = iIndex; +#if 0 + + else + { + //don't change iWordIndex here. + //when LookupSimilarWord all failed too, we want to use the old LookupWord index to list words. + //iWordIndex = INVALID_INDEX; + } +#endif + return bFound; +} + +bool Libs::SimpleLookupWord(const gchar* sWord, glong & iWordIndex, int iLib) +{ + bool bFound = oLib[iLib]->Lookup(sWord, iWordIndex); + if (!bFound) + bFound = LookupSimilarWord(sWord, iWordIndex, iLib); + return bFound; +} + +struct Fuzzystruct +{ + char * pMatchWord; + int iMatchWordDistance; +}; + +inline bool operator<(const Fuzzystruct & lh, const Fuzzystruct & rh) +{ + if (lh.iMatchWordDistance != rh.iMatchWordDistance) + return lh.iMatchWordDistance < rh.iMatchWordDistance; + + if (lh.pMatchWord && rh.pMatchWord) + return stardict_strcmp(lh.pMatchWord, rh.pMatchWord) < 0; + + return false; +} + +static inline void unicode_strdown(gunichar *str) +{ + while (*str) + { + *str = g_unichar_tolower(*str); + ++str; + } +} + +bool Libs::LookupWithFuzzy(const gchar *sWord, gchar *reslist[], gint reslist_size, gint iLib) +{ + if (sWord[0] == '\0') + return false; + + Fuzzystruct *oFuzzystruct = new Fuzzystruct[reslist_size]; + + for (int i = 0; i < reslist_size; i++) + { + oFuzzystruct[i].pMatchWord = NULL; + oFuzzystruct[i].iMatchWordDistance = iMaxFuzzyDistance; + } + int iMaxDistance = iMaxFuzzyDistance; + int iDistance; + bool Found = false; + EditDistance oEditDistance; + + glong iCheckWordLen; + const char *sCheck; + gunichar *ucs4_str1, *ucs4_str2; + glong ucs4_str2_len; + + ucs4_str2 = g_utf8_to_ucs4_fast(sWord, -1, &ucs4_str2_len); + unicode_strdown(ucs4_str2); + +// for (std::vector::size_type iLib = 0; iLib=0 && stardict_strcmp(sWord, poGetWord(narticles(iLib)-1,iLib))<=0) { + //there are Chinese dicts and English dicts... + if (TRUE) + { + const int iwords = narticles(iLib); + for (int index = 0; index < iwords; index++) + { + sCheck = poGetWord(index, iLib); + // tolower and skip too long or too short words + iCheckWordLen = g_utf8_strlen(sCheck, -1); + if (iCheckWordLen - ucs4_str2_len >= iMaxDistance || + ucs4_str2_len - iCheckWordLen >= iMaxDistance) + continue; + ucs4_str1 = g_utf8_to_ucs4_fast(sCheck, -1, NULL); + if (iCheckWordLen > ucs4_str2_len) + ucs4_str1[ucs4_str2_len] = 0; + unicode_strdown(ucs4_str1); + + iDistance = oEditDistance.CalEditDistance(ucs4_str1, ucs4_str2, iMaxDistance); + g_free(ucs4_str1); + if (iDistance < iMaxDistance && iDistance < ucs4_str2_len) + { + // when ucs4_str2_len=1,2 we need less fuzzy. + Found = true; + bool bAlreadyInList = false; + int iMaxDistanceAt = 0; + for (int j = 0; j < reslist_size; j++) + { + if (oFuzzystruct[j].pMatchWord && + strcmp(oFuzzystruct[j].pMatchWord, sCheck) == 0 ) + { //already in list + bAlreadyInList = true; + break; + } + //find the position,it will certainly be found (include the first time) as iMaxDistance is set by last time. + if (oFuzzystruct[j].iMatchWordDistance == iMaxDistance ) + { + iMaxDistanceAt = j; + } + } + if (!bAlreadyInList) + { + if (oFuzzystruct[iMaxDistanceAt].pMatchWord) + g_free(oFuzzystruct[iMaxDistanceAt].pMatchWord); + oFuzzystruct[iMaxDistanceAt].pMatchWord = g_strdup(sCheck); + oFuzzystruct[iMaxDistanceAt].iMatchWordDistance = iDistance; + // calc new iMaxDistance + iMaxDistance = iDistance; + for (int j = 0; j < reslist_size; j++) + { + if (oFuzzystruct[j].iMatchWordDistance > iMaxDistance) + iMaxDistance = oFuzzystruct[j].iMatchWordDistance; + } // calc new iMaxDistance + } // add to list + } // find one + } // each word + } // ok for search +// } // each lib + g_free(ucs4_str2); + + if (Found) // sort with distance + std::sort(oFuzzystruct, oFuzzystruct + reslist_size); + + for (gint i = 0; i < reslist_size; ++i) + reslist[i] = oFuzzystruct[i].pMatchWord; + + delete[] oFuzzystruct; + + return Found; +} + +inline bool less_for_compare(const char *lh, const char *rh) +{ + return stardict_strcmp(lh, rh) < 0; +} + +gint Libs::LookupWithRule(const gchar *word, gchar **ppMatchWord) +{ + glong aiIndex[MAX_MATCH_ITEM_PER_LIB + 1]; + gint iMatchCount = 0; + GPatternSpec *pspec = g_pattern_spec_new(word); + + for (std::vector::size_type iLib = 0; iLib + LookupWithRule(pspec, aiIndex, MAX_MATCH_ITEM_PER_LIB + 1)) + { + if (progress_func) + progress_func(); + for (int i = 0; aiIndex[i] != -1; i++) + { + const gchar * sMatchWord = poGetWord(aiIndex[i], iLib); + bool bAlreadyInList = false; + for (int j = 0; j < iMatchCount; j++) + { + if (strcmp(ppMatchWord[j], sMatchWord) == 0) + { //already in list + bAlreadyInList = true; + break; + } + } + if (!bAlreadyInList) + ppMatchWord[iMatchCount++] = g_strdup(sMatchWord); + } + } + } + g_pattern_spec_free(pspec); + + if (iMatchCount) // sort it. + std::sort(ppMatchWord, ppMatchWord + iMatchCount, less_for_compare); + + return iMatchCount; +} + +bool Libs::LookupData(const gchar *sWord, std::vector *reslist) +{ + std::vector SearchWords; + std::string SearchWord; + const char *p = sWord; + while (*p) + { + if (*p == '\\') + { + p++; + switch (*p) + { + case ' ': + SearchWord += ' '; + break; + case '\\': + SearchWord += '\\'; + break; + case 't': + SearchWord += '\t'; + break; + case 'n': + SearchWord += '\n'; + break; + default: + SearchWord += *p; + } + } + else if (*p == ' ') + { + if (!SearchWord.empty()) + { + SearchWords.push_back(SearchWord); + SearchWord.clear(); + } + } + else + { + SearchWord += *p; + } + p++; + } + if (!SearchWord.empty()) + { + SearchWords.push_back(SearchWord); + SearchWord.clear(); + } + if (SearchWords.empty()) + return false; + + guint32 max_size = 0; + gchar *origin_data = NULL; + for (std::vector::size_type i = 0; i + containSearchData()) + continue; + if (progress_func) + progress_func(); + const gulong iwords = narticles(i); + const gchar *key; + guint32 offset, size; + for (gulong j = 0; + j < iwords; + ++j) + { + oLib[i] + ->get_key_and_data(j, &key, &offset, &size); + if (size > max_size) + { + origin_data = (gchar *)g_realloc(origin_data, size); + max_size = size; + } + if (oLib[i]->SearchData(SearchWords, offset, size, origin_data)) + reslist[i].push_back(g_strdup(key)); + } + } + g_free(origin_data); + + std::vector::size_type i; + for (i = 0; i