4 cellwriter -- a character recognition input method
5 Copyright (C) 2007 Michael Levin <risujin@risujin.org>
7 This program is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public License
9 as published by the Free Software Foundation; either version 2
10 of the License, or (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
25 #include "recognize.h"
30 const char *cell_widget_word(void);
36 #ifndef DISABLE_WORDFREQ
38 /* TODO needs to be internationalized (wide char)
39 TODO user-made words list
40 TODO choose a list via GUI
41 FIXME the frequency list contains "n't" etc as separate endings, this
42 needs to be taken into consideration */
44 /* The number of word frequency entries to load; load_wordfreq() stops
   reading once this many entries have been filled */
45 #define WORDFREQS 15000
/* Global on/off switch for the word frequency engine; it is the first thing
   engine_wordfreq() tests */
52 int wordfreq_enable = TRUE;
/* Table of loaded entries. It has one slot more than WORDFREQS, which leaves
   room for the empty-string entry load_wordfreq() writes at index i after
   parsing (i never exceeds WORDFREQS). */
54 static WordFreq wordfreqs[WORDFREQS + 1];
/* wordfreqs_len bounds the table scan in engine_wordfreq(); wordfreqs_count
   accumulates the per-entry count values as they are parsed.
   NOTE(review): the assignment to wordfreqs_len is not on the lines visible
   here -- confirm where it is set. */
55 static int wordfreqs_len, wordfreqs_count;
/* Fill wordfreqs[] from a "wordfreq" file. The per-user file in the
   "." PACKAGE directory under the home directory is tried first, then the
   system file under PKGDATADIR. Each line yields one entry (word plus count)
   and at most WORDFREQS entries are read.
   NOTE(review): the declarations, braces and several statements of this
   function are not visible in this view; the notes below describe only the
   lines that are shown. */
57 void load_wordfreq(void)
58 /* Read in the word frequency file. The file format is: word\tcount\n */
/* Start from an empty table: the first entry is the empty string */
66 wordfreqs[0].string[0] = 0;
68 /* Try to open the user's word frequency file */
69 path = g_build_filename(g_get_home_dir(), "." PACKAGE, "wordfreq",
71 channel = g_io_channel_new_file(path, "r", &error);
/* NOTE(review): g_build_filename() returns a newly allocated string and a
   failed open sets error; confirm the lines not shown here free path and
   clear error before they are reused below. */
73 g_debug("User does not have a word frequency file, "
74 "loading system file");
80 /* Fall back to the system-wide word frequency file */
82 path = g_build_filename(PKGDATADIR, "wordfreq", NULL);
83 channel = g_io_channel_new_file(path, "r", &error);
85 g_warning("Failed to open system word frequency file "
86 "'%s' for reading: %s", path, error->message);
93 /* Read in every entry */
94 g_debug("Parsing word frequency list");
/* One table entry per iteration; the loop ends when a read returns no bytes
   or once WORDFREQS entries have been filled */
96 for (i = 0; bytes_read > 0 && i < WORDFREQS; i++) {
/* Pull the line into buf one character at a time until a newline, end of
   input, or the end of buf.
   NOTE(review): pbuf is advanced before each read but the bound is tested
   only afterwards, so when pbuf reaches the last byte of buf the next pass
   appears to write at buf + sizeof (buf) -- confirm against the hidden
   initialisation of pbuf. The status returned by g_io_channel_read_chars()
   also looks unchecked on the visible lines. */
103 g_io_channel_read_chars(channel, ++pbuf, 1,
104 &bytes_read, &error);
105 } while (bytes_read > 0 && *pbuf != '\n' &&
106 pbuf < buf + sizeof (buf));
/* Walk over the word: stop at the first tab, space or NUL */
111 while (*pbuf && *pbuf != '\t' && *pbuf != ' ')
/* Copy the word into the entry, clamping its length so the terminating NUL
   always fits in the string field */
120 if (len >= (int)sizeof (wordfreqs[i].string))
121 len = sizeof (wordfreqs[i].string) - 1;
122 memcpy(wordfreqs[i].string, buf, len);
123 wordfreqs[i].string[len] = 0;
125 /* Parse the count */
/* Skip the blanks or tabs separating the word from its count */
127 while (*pbuf == ' ' || *pbuf == '\t')
/* The entry stores the natural logarithm of the parsed count, and the same
   value is added to the running total wordfreqs_count.
   NOTE(review): atoi() yields 0 for text it cannot parse and log(0) is not a
   finite value; if the count field is an integer type that conversion is
   not well defined -- confirm the input can never produce a count below 1. */
129 wordfreqs_count += wordfreqs[i].count = log(atoi(pbuf));
/* Mark the entry at index i as empty; presumably this runs after the loop
   and terminates the table -- confirm against the hidden closing brace */
131 wordfreqs[i].string[0] = 0;
/* Drop this function's reference to the channel */
133 g_io_channel_unref(channel);
134 g_debug("%d words parsed", i);
/* Word frequency engine. It builds a per-character score table from the
   entries of wordfreqs[] that are compatible with the text returned by
   cell_widget_word(), then stores each sample's score in its
   ratings[ENGINE_WORDFREQ] slot.
   NOTE(review): braces, early exits and several statements of this function
   are not visible in this view; the notes below describe only the lines that
   are shown. */
139 void engine_wordfreq(void)
142 const char *pre, *post;
/* chars[] holds one accumulated score per 7-bit character code */
143 int i, pre_len, post_len, chars[128];
/* The engine is gated by the global wordfreq_enable flag */
145 if (!wordfreq_enable)
/* pre is the string returned by cell_widget_word(); post is read starting
   one byte past pre's terminating NUL.
   NOTE(review): this only works if cell_widget_word() hands back two
   NUL-terminated strings stored back to back -- confirm against its
   definition. */
147 pre = cell_widget_word();
148 pre_len = strlen(pre);
149 post = pre + pre_len + 1;
150 post_len = strlen(post);
/* Special case: both strings are empty */
151 if (!pre_len && !post_len)
/* Reset the score table */
153 memset(chars, 0, sizeof (chars));
155 /* Numbers follow numbers */
/* NOTE(review): pre_len may still be 0 here when only post is non-empty, in
   which case pre[pre_len - 1] reads one byte before the string -- confirm
   whether a hidden guard rules this out. */
156 if (g_ascii_isdigit(pre[pre_len - 1])) {
157 for (i = 0; i <= 9; i++)
162 /* Search the databases for matches (FIXME sort/index) */
/* Linear scan over the loaded entries; the visible parts of the condition
   compare pre and post case-insensitively against the entry's string */
163 for (i = 0; i < wordfreqs_len; i++)
165 !g_ascii_strncasecmp(pre, wordfreqs[i].string, pre_len)) &&
167 !g_ascii_strncasecmp(post, wordfreqs[i].string + pre_len +
/* ch is the entry's character at offset pre_len, i.e. the one that would
   come right after pre; it is the candidate being scored */
169 int ch = wordfreqs[i].string[pre_len],
170 ch_lower = ch, ch_upper = 0;
/* Candidates outside the printable ASCII range 32..126 get special
   handling (the action taken is on a line not shown) */
172 if (ch < 32 || ch >= 127)
175 /* Suggest proper case */
176 if (g_ascii_isalpha(ch)) {
177 ch_lower = g_ascii_tolower(ch);
178 ch_upper = g_ascii_toupper(ch);
/* The case of the last one or two characters of pre selects which variant
   is favoured (the adjustments themselves are on lines not shown).
   NOTE(review): pre[pre_len - 2] needs pre_len >= 2; with a one-character
   pre it reads before the start of the string -- confirm. */
180 if (g_ascii_islower(pre[pre_len - 1]))
183 if (g_ascii_isupper(pre[pre_len - 1]) &&
184 g_ascii_isupper(pre[pre_len - 2]))
/* Credit the entry's count to both case variants. For a non-alphabetic
   candidate ch_upper keeps its initial value 0 (unless a hidden line changes
   it), so the second addition lands in chars[0], a slot the final loop below
   never reads because it only accepts codes from 32 upwards. */
189 chars[ch_lower] += wordfreqs[i].count;
190 chars[ch_upper] += wordfreqs[i].count;
194 /* Apply characters table */
/* Every sample returned by sampleiter_next() whose character is printable
   ASCII receives its accumulated score; other samples are left untouched by
   the visible lines */
196 while ((sample = sampleiter_next()))
197 if (sample->ch >= 32 && sample->ch < 127)
198 sample->ratings[ENGINE_WORDFREQ] = chars[sample->ch];
201 #endif /* DISABLE_WORDFREQ */