Imported version 0.4-1
[mstardict] / stardict-plugins / stardict-powerword-parsedata-plugin / stardict_powerword_parsedata.cpp
1 #include "stardict_powerword_parsedata.h"
2 #include <glib/gi18n.h>
3
4 #ifdef _WIN32
5 #include <windows.h>
6 #endif
7
// Length in bytes of the UTF-8 sequence introduced by `lead`.
// Stray continuation bytes and the invalid leads 0xFE/0xFF count as a
// single byte so that scanning always makes progress.
static size_t xml_utf8_seq_len(unsigned char lead)
{
	if (lead < 0xC0)
		return 1;
	if (lead < 0xE0)
		return 2;
	if (lead < 0xF0)
		return 3;
	if (lead < 0xF8)
		return 4;
	if (lead < 0xFC)
		return 5;
	if (lead < 0xFE)
		return 6;
	return 1;
}

// Counts the characters of a markup string as they appear once the markup
// is interpreted:
//   - everything from '<' up to and including the next '>' counts as nothing
//     (a '<' without a closing '>' is skipped on its own);
//   - the entities &lt; &gt; &amp; &apos; &quot; count as one character;
//   - a '&' that does not start one of those entities counts as one character;
//   - every other UTF-8 sequence counts as one character.
// `xmlstr` must be NUL-terminated. The scan never steps over the terminator,
// even when the string ends in a truncated multibyte sequence.
static size_t xml_strlen(const char *xmlstr)
{
	static const char *const entities[] = { "lt;", "gt;", "amp;", "apos;", "quot;" };
	static const size_t n_entities = sizeof(entities) / sizeof(entities[0]);

	size_t count = 0;
	const char *q = xmlstr;
	while (*q) {
		if (*q == '<') {
			// Tags contribute no visible characters.
			const char *close = strchr(q + 1, '>');
			q = close ? close + 1 : q + 1;
			continue;
		}
		if (*q == '&') {
			size_t i;
			for (i = 0; i < n_entities; ++i) {
				const size_t ent_len = strlen(entities[i]);
				if (strncmp(entities[i], q + 1, ent_len) == 0) {
					q += ent_len + 1;
					break;
				}
			}
			if (i == n_entities)
				++q; // unknown entity: count the bare '&'
		} else {
			// Advance over one UTF-8 sequence, stopping early at NUL so a
			// truncated sequence cannot push the cursor past the buffer.
			size_t remaining = xml_utf8_seq_len(static_cast<unsigned char>(*q));
			while (remaining > 0 && *q) {
				++q;
				--remaining;
			}
		}
		++count;
	}
	return count;
}
39
40 static gchar* toUtfPhonetic(const gchar *text, gsize len)
41 {
42         std::string p;
43         gsize i;
44         for (i=0;i<len;i++) {
45                 switch (text[i]) {
46                         case 'A':
47                                 p+="æ"; break;
48                         case 'B':
49                                 p+="ɑ"; break;
50                         case 'C':
51                                 p+="ɔ"; break;
52                         case 'Q':
53                                 p+="ʌ"; break;
54                         case 'E':
55                                 p+="ә"; break;
56                         case 'Z':
57                                 p+="є"; break;
58                         case 'N':
59                                 p+="ŋ"; break;
60                         case 'W':
61                                 p+="θ"; break;
62                         case 'T':
63                                 p+="ð"; break;
64                         case 'F':
65                                 p+="ʃ"; break;
66                         case 'V':
67                                 p+="ʒ"; break;
68                         case 'L':
69                                 p+="ɚ"; break;
70                         case 'I':
71                                 p+="i"; break;
72                         case '^':
73                                 p+="ɡ"; break;
74                         case '9':
75                                 p+="ˏ"; break;
76                         case '5':
77                                 p+="'"; break;
78                         default:
79                                 p+=text[i];
80                                 break;
81                 }
82         }
83         return g_markup_escape_text(p.c_str(), -1);
84 }
85
86 static gchar* toUtfPhonetic2(const gchar *text, glong len)
87 {
88         std::string p;
89         const char *s = text;
90         const char *n;
91         std::string uc;
92         while (s-text < len) {
93                 n = g_utf8_next_char(s);
94                 uc.assign(s, n-s);
95                 if (uc == "8")
96                         p+=":";
97                 else if (uc == "0")
98                         p+="Ŋ";
99                 else if (uc == "¾")
100                         p+="ǔ";
101                 else if (uc == "%")
102                         p+="ɔ";
103                 else if (uc == "µ")
104                         p+="ě";
105                 else if (uc == "³")
106                         p+="ā";
107                 else if (uc == "!")
108                         p+="I";
109                 else if (uc == "W")
110                         p+="ɛ";
111                 else if (uc == "&")
112                         p+="U";
113                 else if (uc == "…")
114                         p+="ə";
115                 else if (uc == "¹")
116                         p+="ǐ";
117                 else if (uc == "“")
118                         p+="′";
119                 else if (uc == "*")
120                         p+="ə";
121                 else if (uc == "6")
122                         p+="ˋ";
123                 else if (uc == "+")
124                         p+="ɚ";
125                 else if (uc == "”")
126                         p+="´";
127                 else if (uc == "‘")
128                         p+="KH";
129                 else if (uc == "$")
130                         p+="ɑ";
131                 else if (uc == "7")
132                         p+="͵";
133                 else if (uc == "'")
134                         p+="KH";
135                 else if (uc == "½")
136                         p+="ō";
137                 else if (uc == "¼")
138                         p+="ǒ";
139                 else if (uc == "¶")
140                         p+="ē";
141                 else if (uc == "º")
142                         p+="ī";
143                 else if (uc == "G")
144                         p+="θ";
145                 else if (uc == "9")
146                         p+="ʒ";
147                 else if (uc == ".")
148                         p+="ʃ";
149                 else if (uc == "/")
150                         p+="ʒ";
151                 else if (uc == "²")
152                         p+="ǎ";
153                 else if (uc == "#")
154                         p+="æ";
155                 else if (uc == "’")
156                         p+="N";
157                 else if (uc == "Y")
158                         p+="t";
159                 else if (uc == "H")
160                         p+="ð";
161                 else if (uc == "÷")
162                         p+="ń";
163                 else if (uc == "é")
164                         p+="ê";
165                 else if (uc == "¿")
166                         p+="ū";
167                 else if (uc == ")")
168                         p+="ɜ";
169                 else if (uc == "Ó")
170                         p+="ǒ";
171                 else if (uc == "ï")
172                         p+="Ś";
173                 else if (uc == "Ä")
174                         p+="ǐ";
175                 else
176                         p+= uc;
177                 s = n;
178         }
179         return g_markup_escape_text(p.c_str(), -1);
180 }
181
182 static void powerword_markup_add_text(const gchar *text, gssize length, std::string *pango, std::string::size_type &cur_pos, LinksPosList *links_list)
183 {
184         const gchar *p;
185         const gchar *end;
186         p = text;
187         end = text + length;
188
189         GString *str;
190         str = g_string_sized_new (length);
191
192         const gchar *n;
193         bool find;
194         bool previous_islink = false;
195         std::string marktags;
196         guint currentmarktag = 0;
197         while (p != end) {
198                 const gchar *next;
199                 next = g_utf8_next_char (p);
200                 switch (*p) {
201                         case '}':
202                                 if (currentmarktag==0) {
203                                         g_string_append (str, "}");
204                                         previous_islink = false;
205                                 }
206                                 else {
207                                         currentmarktag--;
208                                         switch (marktags[currentmarktag]) {
209                                                 case 'b':
210                                                 case 'B':
211                                                         g_string_append (str, "</b>");
212                                                         previous_islink = false;
213                                                         break;
214                                                 case 'I':
215                                                         g_string_append (str, "</i>");
216                                                         previous_islink = false;
217                                                         break;
218                                                 case '+':
219                                                         g_string_append (str, "</sup>");
220                                                         previous_islink = false;
221                                                         break;
222                                                 case '-':
223                                                         g_string_append (str, "</sub>");
224                                                         previous_islink = false;
225                                                         break;
226                                                 case 'x':
227                                                         g_string_append (str, "</span>");
228                                                         previous_islink = false;
229                                                         break;
230                                                 case 'l':
231                                                 case 'D':
232                                                 case 'L':
233                                                 case 'U':
234                                                         g_string_append (str, "</span>");
235                                                         previous_islink = true;
236                                                         break;
237                                                 default:
238                                                         previous_islink = false;
239                                                         break;
240                                         }
241                                 }
242                                 break;
243                         case '&':
244                                 find = false;
245                                 if (next!=end) {
246                                         n = g_utf8_next_char(next);
247                                         if (n!=end && *n == '{') {
248                                                 find=true;
249                                                 currentmarktag++;
250                                                 if (marktags.length()<currentmarktag)
251                                                         marktags+=*next;
252                                                 else
253                                                         marktags[currentmarktag-1]=*next;
254                                                 switch (*next) {
255                                                         case 'b':
256                                                         case 'B':
257                                                                 g_string_append (str, "<b>");
258                                                                 next = n+1;
259                                                                 break;
260                                                         case 'I':
261                                                                 g_string_append (str, "<i>");
262                                                                 next = n+1;
263                                                                 break;
264                                                         case '+':
265                                                                 g_string_append (str, "<sup>");
266                                                                 next = n+1;
267                                                                 break;
268                                                         case '-':
269                                                                 g_string_append (str, "<sub>");
270                                                                 next = n+1;
271                                                                 break;
272                                                         case 'x':
273                                                                 g_string_append (str, "<span foreground=\"blue\" underline=\"single\">");
274                                                                 next = n+1;
275                                                                 break;
276                                                         case 'X':
277                                                         case '2':
278                                                                 {
279                                                                 const gchar *tag_end = n+1;
280                                                                 while (tag_end!=end) {
281                                                                         if (*tag_end=='}')
282                                                                                 break;
283                                                                         else
284                                                                                 tag_end++;
285                                                                 }
286                                                                 g_string_append (str, "<span foreground=\"blue\">");
287                                                                 gchar *tag_str;
288                                                                 if (*next == 'X') {
289                                                                         tag_str = toUtfPhonetic(n+1, tag_end - (n+1));
290                                                                 } else {
291                                                                         tag_str = toUtfPhonetic2(n+1, tag_end - (n+1));
292                                                                 }
293                                                                 g_string_append (str, tag_str);
294                                                                 g_free(tag_str);
295                                                                 g_string_append (str, "</span>");
296                                                                 currentmarktag--;
297                                                                 if (tag_end!=end)
298                                                                         next = tag_end+1;
299                                                                 else
300                                                                         next = end;
301                                                                 previous_islink = false;
302                                                                 break;
303                                                                 }
304                                                         case 'l':
305                                                         case 'D':
306                                                         case 'L':
307                                                         case 'U':
308                                                                 if (previous_islink)
309                                                                         g_string_append (str, "\t");
310                                                                 if (*next == 'l' || *next == 'D')
311                                                                         g_string_append (str, "<span foreground=\"blue\" underline=\"single\">");
312                                                                 else
313                                                                         g_string_append (str, "<span foreground=\"#008080\" underline=\"single\">");
314                                                                 *pango += str->str;
315                                                                 cur_pos += xml_strlen(str->str);
316                                                                 g_string_erase(str, 0, -1);
317                                                                 {
318                                                                 const gchar *tag_end = n+1;
319                                                                 while (tag_end!=end) {
320                                                                         if (*tag_end=='}')
321                                                                                 break;
322                                                                         else
323                                                                                 tag_end++;
324                                                                 }
325                                                                 char *tmpstr = g_markup_escape_text(n+1, tag_end - (n+1));
326                                                                 size_t xml_len = xml_strlen(tmpstr);
327                                                                 std::string link("query://");
328                                                                 link.append(n+1, tag_end - (n+1));
329                                                                 links_list->push_back(LinkDesc(cur_pos, xml_len, link));
330                                                                 *pango += tmpstr;
331                                                                 cur_pos += xml_len;
332                                                                 g_free(tmpstr);
333                                                                 g_string_append (str, "</span>");
334                                                                 currentmarktag--;
335                                                                 if (tag_end!=end)
336                                                                         next = tag_end+1;
337                                                                 else
338                                                                         next = end;
339                                                                 previous_islink = true;
340                                                                 break;
341                                                                 }
342                                                         /*case ' ':
343                                                         case '9':
344                                                         case 'S':*/
345                                                         default:
346                                                                 next = n+1;
347                                                                 break;
348                                                 }
349                                         }
350                                 }
351                                 if (!find) {
352                                         previous_islink = false;
353                                         g_string_append (str, "&amp;");
354                                 }
355                                 break;
356                         case '<':
357                                 previous_islink = false;
358                                 g_string_append (str, "&lt;");
359                                 break;
360                         case '>':
361                                 previous_islink = false;
362                                 g_string_append (str, "&gt;");
363                                 break;
364                         case '\'':
365                                 previous_islink = false;
366                                 g_string_append (str, "&apos;");
367                                 break;
368                         case '"':
369                                 previous_islink = false;
370                                 g_string_append (str, "&quot;");
371                                 break;
372                         default:
373                                 previous_islink = false;
374                                 g_string_append_len (str, p, next - p);
375                                 break;
376                 }
377                 p = next;
378         }
379         if (currentmarktag>0) {
380                 do {
381                         currentmarktag--;
382                         switch (marktags[currentmarktag]) {
383                                 case 'b':
384                                 case 'B':
385                                         g_string_append (str, "</b>");
386                                         break;
387                                 case 'I':
388                                         g_string_append (str, "</i>");
389                                         break;
390                                 case '+':
391                                         g_string_append (str, "</sup>");
392                                         break;
393                                 case '-':
394                                         g_string_append (str, "</sub>");
395                                         break;
396                                 case 'x':
397                                 case 'l':
398                                 case 'D':
399                                 case 'L':
400                                 case 'U':
401                                         g_string_append (str, "</span>");
402                                         break;
403                                 default:
404                                         break;
405                         }
406                 } while (currentmarktag>0);
407         }
408         *pango += str->str;
409         cur_pos += xml_strlen(str->str);
410         g_string_free (str, TRUE);
411 }
412
413 typedef struct _PwUserData {
414         std::string *pango;
415         LinksPosList *links_list;
416         std::string::size_type cur_pos;
417         const gchar *oword;
418         bool first_jbcy;
419 } PwUserData;
420
421 static void func_parse_passthrough(GMarkupParseContext *context, const gchar *passthrough_text, gsize text_len, gpointer user_data, GError **error)
422 {
423         if (!g_str_has_prefix(passthrough_text, "<![CDATA["))
424                 return;
425         const gchar *element = g_markup_parse_context_get_element(context);
426         if (!element)
427                 return;
428         const gchar *text = passthrough_text+9;
429         gsize len = text_len-9-3;
430         while (g_ascii_isspace(*text)) {
431                 text++;
432                 len--;
433         }
434         while (len>0 && g_ascii_isspace(*(text+len-1))) {
435                 len--;
436         }
437         if (len==0)
438                 return;
439         std::string *pango = ((PwUserData*)user_data)->pango;
440         std::string::size_type &cur_pos = ((PwUserData*)user_data)->cur_pos;
441         if (strcmp(element, "词典音标")==0) {
442                 if (!pango->empty()) {
443                         *pango+='\n';
444                         cur_pos++;
445                 }
446                 *pango+="[<span foreground=\"blue\">";
447                 cur_pos++;
448                 gchar *str = toUtfPhonetic(text, len);
449                 *pango+=str;
450                 cur_pos+=xml_strlen(str);
451                 g_free(str);
452                 *pango+="</span>]";
453                 cur_pos++;
454         } else if (strcmp(element, "单词原型")==0) {
455                 const gchar *oword = ((PwUserData*)user_data)->oword;
456                 if (strncmp(oword, text, len)) {
457                         if (!pango->empty()) {
458                                 *pango+='\n';
459                                 cur_pos++;
460                         }
461                         *pango+="<b>";
462                         gchar *str = g_markup_escape_text(text, len);
463                         pango->append(str);
464                         cur_pos+=xml_strlen(str);
465                         g_free(str);
466                         *pango+="</b>";
467                 }
468         } else if (strcmp(element, "单词词性")==0) {
469                 if (!pango->empty()) {
470                         *pango+='\n';
471                         cur_pos++;
472                 }
473                 *pango+="<i>";
474                 powerword_markup_add_text(text, len, pango, cur_pos, ((PwUserData*)user_data)->links_list);
475                 *pango+="</i>";
476         } else if (strcmp(element, "汉语拼音")==0) {
477                 if (!pango->empty()) {
478                         *pango+='\n';
479                         cur_pos++;
480                 }
481                 *pango+="<span foreground=\"blue\" underline=\"single\">";
482                 powerword_markup_add_text(text, len, pango, cur_pos, ((PwUserData*)user_data)->links_list);
483                 *pango+="</span>";
484         } else if (strcmp(element, "例句原型")==0) {
485                 if (!pango->empty()) {
486                         *pango+='\n';
487                         cur_pos++;
488                 }
489                 *pango+="<span foreground=\"#008080\">";
490                 powerword_markup_add_text(text, len, pango, cur_pos, ((PwUserData*)user_data)->links_list);
491                 *pango+="</span>";
492         } else if (strcmp(element, "例句解释")==0) {
493                 if (!pango->empty()) {
494                         *pango+='\n';
495                         cur_pos++;
496                 }
497                 *pango+="<span foreground=\"#01259A\">";
498                 powerword_markup_add_text(text, len, pango, cur_pos, ((PwUserData*)user_data)->links_list);
499                 *pango+="</span>";
500         /*} else if (strcmp(element, "相关词")==0) {
501                 if (!res->empty())
502                         *res+='\n';
503                 std::string tabstr;
504                 tabstr+=text[0];
505                 for (gsize i=1;i<len;i++) {
506                         if (text[i]=='&')
507                                 tabstr+="\t&";
508                         else
509                                 tabstr+=text[i];
510                 }
511                 gchar *str = powerword_markup_escape_text(tabstr.c_str(), tabstr.length());
512                 res->append(str);
513                 g_free(str);*/
514         } else
515         /*} else if (
516         strcmp(element, "解释项")==0 ||
517         strcmp(element, "跟随解释")==0 ||
518         strcmp(element, "相关词")==0 ||
519         strcmp(element, "预解释")==0 ||
520         strcmp(element, "繁体写法")==0 ||
521         strcmp(element, "台湾音标")==0 ||
522         strcmp(element, "图片名称")==0 ||
523         strcmp(element, "跟随注释")==0 ||
524         strcmp(element, "音节分段")==0 ||
525         strcmp(element, "AHD音标")==0 ||
526         strcmp(element, "国际音标")==0 ||
527         strcmp(element, "美国音标")==0 ||
528         strcmp(element, "子解释项")==0 ||
529         strcmp(element, "同义词")==0 ||
530         strcmp(element, "日文发音")==0 ||
531         strcmp(element, "惯用型原型")==0 ||
532         strcmp(element, "惯用型解释")==0 ||
533         strcmp(element, "另见")==0
534         ) {*/
535         {
536                 if (!pango->empty()) {
537                         *pango+='\n';
538                         cur_pos++;
539                 }
540                 powerword_markup_add_text(text, len, pango, cur_pos, ((PwUserData*)user_data)->links_list);
541         }
542 }
543
544 static void func_parse_start_element(GMarkupParseContext *context, const gchar *element_name, const gchar **attribute_names, const gchar **attribute_values, gpointer user_data, GError **error)
545 {
546         std::string res;
547         if (strcmp(element_name, "基本词义")==0) {
548                 if (((PwUserData*)user_data)->first_jbcy) {
549                         ((PwUserData*)user_data)->first_jbcy = false;
550                 } else {
551                         res="\n<span foreground=\"blue\">&lt;基本词义&gt;</span>";
552                 }
553         } else if (strcmp(element_name, "继承用法")==0) {
554                 res="\n<span foreground=\"blue\">&lt;继承用法&gt;</span>";
555         } else if (strcmp(element_name, "习惯用语")==0) {
556                 res="\n<span foreground=\"blue\">&lt;习惯用语&gt;</span>";
557         } else if (strcmp(element_name, "词性变化")==0) {
558                 res="\n<span foreground=\"blue\">&lt;词性变化&gt;</span>";
559         } else if (strcmp(element_name, "特殊用法")==0) {
560                 res="\n<span foreground=\"blue\">&lt;特殊用法&gt;</span>";
561         } else if (strcmp(element_name, "参考词汇")==0) {
562                 res="\n<span foreground=\"blue\">&lt;参考词汇&gt;</span>";
563         } else if (strcmp(element_name, "常用词组")==0) {
564                 res="\n<span foreground=\"blue\">&lt;常用词组&gt;</span>";
565         } else if (strcmp(element_name, "语源")==0) {
566                 res="\n<span foreground=\"blue\">&lt;语源&gt;</span>";
567         } else if (strcmp(element_name, "派生")==0) {
568                 res="\n<span foreground=\"blue\">&lt;派生&gt;</span>";
569         } else if (strcmp(element_name, "用法")==0) {
570                 res="\n<span foreground=\"blue\">&lt;用法&gt;</span>";
571         } else if (strcmp(element_name, "注释")==0) {
572                 res="\n<span foreground=\"blue\">&lt;注释&gt;</span>";
573         }
574         if (!res.empty()) {
575                 *(((PwUserData*)user_data)->pango) += res;
576                 ((PwUserData*)user_data)->cur_pos += xml_strlen(res.c_str());
577         }
578 }
579
580 static void powerword2link(const char *p, guint32 sec_size, const gchar *oword, std::string *pango, LinksPosList *links_list)
581 {
582         PwUserData Data;
583         Data.pango = pango;
584         Data.links_list = links_list;
585         Data.cur_pos = 0;
586         Data.oword = oword;
587         Data.first_jbcy = true;
588
589         GMarkupParser parser;
590         parser.start_element = func_parse_start_element;
591         parser.end_element = NULL;
592         parser.text = NULL;
593         parser.passthrough = func_parse_passthrough;
594         parser.error = NULL;
595         GMarkupParseContext* context = g_markup_parse_context_new(&parser, (GMarkupParseFlags)0, &Data, NULL);
596         g_markup_parse_context_parse(context, p, sec_size, NULL);
597         g_markup_parse_context_end_parse(context, NULL);
598         g_markup_parse_context_free(context);
599 }
600
601 static bool parse(const char *p, unsigned int *parsed_size, ParseResult &result, const char *oword)
602 {
603         if (*p != 'k')
604                 return false;
605         p++;
606         size_t len = strlen(p);
607         if (len) {
608                 std::string pango;
609                 LinksPosList links_list;
610                 powerword2link(p, len, oword, &pango, &links_list);
611                 ParseResultItem item;
612                 item.type = ParseResultItemType_link;
613                 item.link = new ParseResultLinkItem;
614                 item.link->pango = pango;
615                 item.link->links_list = links_list;
616                 result.item_list.push_back(item);
617         }
618         *parsed_size = 1 + len + 1;
619         return true;
620 }
621
// Configuration hook registered in stardict_plugin_init(); intentionally a
// no-op because this plug-in has nothing to configure.
static void configure()
{
	// nothing to do
}
625
626 DLLIMPORT bool stardict_plugin_init(StarDictPlugInObject *obj)
627 {
628         if (strcmp(obj->version_str, PLUGIN_SYSTEM_VERSION)!=0) {
629                 g_print("Error: PowerWord data parsing plugin version doesn't match!\n");
630                 return true;
631         }
632         obj->type = StarDictPlugInType_PARSEDATA;
633         obj->info_xml = g_strdup_printf("<plugin_info><name>%s</name><version>1.0</version><short_desc>%s</short_desc><long_desc>%s</long_desc><author>Hu Zheng &lt;huzheng_001@163.com&gt;</author><website>http://stardict.sourceforge.net</website></plugin_info>", _("PowerWord data parsing"), _("PowerWord data parsing engine."), _("Parse the PowerWord data."));
634         obj->configure_func = configure;
635         return false;
636 }
637
638 DLLIMPORT void stardict_plugin_exit(void)
639 {
640 }
641
642 DLLIMPORT bool stardict_parsedata_plugin_init(StarDictParseDataPlugInObject *obj)
643 {
644         obj->parse_func = parse;
645         g_print(_("PowerWord data parsing plug-in loaded.\n"));
646         return false;
647 }
648
649 #ifdef _WIN32
650 BOOL APIENTRY DllMain (HINSTANCE hInst     /* Library instance handle. */ ,
651                        DWORD reason        /* Reason this function is being called. */ ,
652                        LPVOID reserved     /* Not used. */ )
653 {
654     switch (reason)
655     {
656       case DLL_PROCESS_ATTACH:
657         break;
658
659       case DLL_PROCESS_DETACH:
660         break;
661
662       case DLL_THREAD_ATTACH:
663         break;
664
665       case DLL_THREAD_DETACH:
666         break;
667     }
668
669     /* Returns TRUE on success, FALSE on failure */
670     return TRUE;
671 }
672 #endif