Imported version 0.4-1
[mstardict] / stardict-plugins / stardict-html-parsedata-plugin / stardict_html_parsedata.cpp
1 #include "stardict_html_parsedata.h"
2 #include <glib/gi18n.h>
3
4 #ifdef _WIN32
5 #include <windows.h>
6
7 #ifdef _MSC_VER
8 #  define strncasecmp _strnicmp
9 #endif
10
/*
 * Case-insensitive substring search, provided for Windows builds.
 *
 * phaystack: NUL-terminated string to search in.
 * pneedle:   NUL-terminated string to search for.
 *
 * Returns a pointer to the first occurrence of pneedle inside phaystack,
 * comparing bytes through tolower(); returns phaystack itself when pneedle
 * is empty, and 0 when there is no match.
 */
static char *strcasestr (const char *phaystack, const char *pneedle)
{
        /*
         * Walk the strings as unsigned char: tolower() is only defined for
         * EOF and values representable as unsigned char, so feeding it a
         * plain (possibly negative) char is undefined for non-ASCII bytes.
         */
        const unsigned char *haystack = reinterpret_cast<const unsigned char *>(phaystack);
        const unsigned char *needle = reinterpret_cast<const unsigned char *>(pneedle);

        if (*needle == '\0')
                return const_cast<char *>(phaystack);

        for (; *haystack != '\0'; ++haystack) {
                const unsigned char *h = haystack;
                const unsigned char *n = needle;

                /*
                 * tolower() never maps a non-zero byte to zero, so running
                 * into the end of the haystack stops the comparison here.
                 */
                while (*n != '\0' && tolower(*h) == tolower(*n)) {
                        ++h;
                        ++n;
                }
                if (*n == '\0')
                        return const_cast<char *>(reinterpret_cast<const char *>(haystack));
        }
        return 0;
}
88 #endif
89
90 static void html_topango(const std::string& str, std::string &pango, size_t &pango_len)
91 {
92         const char *q, *p;
93         static const char* xml_entrs[] = { "lt;", "gt;", "amp;", "apos;", "quot;", 0 };
94         static const int xml_ent_len[] = { 3,     3,     4,      5,       5 };
95         static const char* html_entries[] = {"nbsp;", 0};
96         static const int html_entry_len[] = {5};
97         static const char* html_values[] = {" "};
98         static const int html_value_len[] = {1};
99         size_t cur_pos;
100         int i;
101         char *etext;
102
103         pango.clear();
104         for (cur_pos = 0, q = str.c_str(); *q; ++cur_pos) {
105                 if (*q == '&') {
106                         for (i = 0; xml_entrs[i]; ++i) {
107                                 if (strncasecmp(xml_entrs[i], q + 1,
108                                             xml_ent_len[i]) == 0) {
109                                         q += xml_ent_len[i] + 1;
110                                         pango += '&';
111                                         pango += xml_entrs[i];
112                                         break;
113                                 }
114                         }
115                         if (xml_entrs[i] == NULL) {
116                                 for (i = 0; html_entries[i]; ++i) {
117                                         if (strncasecmp(html_entries[i], q+1, html_entry_len[i])==0) {
118                                                 q += html_entry_len[i] + 1;
119                                                 pango += html_values[i];
120                                                 cur_pos += (html_value_len[i] -1);
121                                                 break;
122                                         }
123                                 }
124                                 if (html_entries[i] == NULL) {
125                                         if (*(q+1)=='#' && (p = strchr(q+2, ';'))) {
126                                                 std::string str(q+2, p-(q+2));
127                                                 gunichar uc;
128                                                 uc = atoi(str.c_str());
129                                                 gchar utf8[7];
130                                                 gint n = g_unichar_to_utf8(uc, utf8);
131                                                 utf8[n] = '\0';
132                                                 pango += utf8;
133                                                 q = p+1;
134                                         } else {
135                                                 ++q;
136                                                 pango += "&amp;";
137                                         }
138                                 }
139                         }
140                 } else if (*q == '\r' || *q == '\n') {
141                         q++;
142                         cur_pos--;
143                 } else {
144                         p = g_utf8_next_char(q);
145                         etext = g_markup_escape_text(q, p-q);
146                         pango += etext;
147                         g_free(etext);
148                         q = p;
149                 }
150         }
151
152         pango_len = cur_pos;
153 }
154
/*
 * Replaces the five predefined XML entities (&lt; &gt; &amp; &apos; &quot;)
 * in str with the characters they stand for and stores the outcome in
 * decoded.  Entity names are matched case-sensitively; an '&' that does not
 * start one of them is copied through unchanged.
 */
static void xml_decode(const char *str, std::string& decoded)
{
        struct Entity {
                const char *name;   /* entity text following the '&' */
                int len;            /* length of name */
                char raw;           /* decoded character */
        };
        static const Entity entities[] = {
                { "lt;",   3, '<'  },
                { "gt;",   3, '>'  },
                { "amp;",  4, '&'  },
                { "apos;", 5, '\'' },
                { "quot;", 5, '\"' },
        };
        static const int entity_count = sizeof(entities) / sizeof(entities[0]);

        const char *cursor = strchr(str, '&');
        if (!cursor) {
                /* Nothing to decode: plain copy. */
                decoded = str;
                return;
        }

        /* Everything before the first ampersand is taken verbatim. */
        decoded.assign(str, cursor - str);

        while (*cursor != '\0') {
                const Entity *hit = NULL;

                if (*cursor == '&') {
                        for (int k = 0; k < entity_count; ++k) {
                                if (strncmp(cursor + 1, entities[k].name, entities[k].len) == 0) {
                                        hit = &entities[k];
                                        break;
                                }
                        }
                }

                if (hit != NULL) {
                        decoded += hit->raw;
                        cursor += hit->len + 1;
                } else {
                        /* Ordinary character or unrecognized sequence. */
                        decoded += *cursor;
                        ++cursor;
                }
        }
}
191
192 static void html2result(const char *p, ParseResult &result)
193 {
194         LinksPosList links_list;
195         std::string res;
196         const char *tag, *next;
197         std::string name;
198         std::string::size_type cur_pos;
199         int i;
200
201         struct ReplaceTag {
202                 const char *match_;
203                 int match_len_;
204                 const char *replace_;
205                 int char_len_;
206         };
207         static const ReplaceTag replace_arr[] = {
208                 { "b>", 2, "<b>", 0 },
209                 { "/b>", 3, "</b>", 0 },
210                 { "big>", 4, "<big>", 0},
211                 { "/big>", 5, "</big>", 0},
212                 { "i>", 2, "<i>", 0  },
213                 { "/i>", 3, "</i>", 0 },
214                 { "s>", 2, "<s>", 0  },
215                 { "/s>", 3, "</s>", 0 },
216                 { "sub>", 4, "<sub>", 0 },
217                 { "/sub>", 5, "</sub>", 0},
218                 { "sup>", 4, "<sup>", 0},
219                 { "/sup>", 5, "</sup>", 0},
220                 { "small>", 6, "<small>", 0},
221                 { "/small>", 7, "</small>", 0},
222                 { "tt>", 3, "<tt>", 0},
223                 { "/tt>", 4, "</tt>", 0},
224                 { "u>", 2, "<u>", 0  },
225                 { "/u>", 3, "</u>", 0 },
226                 { "br>", 3, "\n", 1 },
227                 { "nl>", 3, "", 0 },
228                 { "hr>", 3, "\n<span foreground=\"gray\"><s>     </s></span>\n", 7 },
229                 { "/font>", 6, "</span>", 0 },
230                 { NULL, 0, NULL },
231         };
232
233         for (cur_pos = 0; *p && (tag = strchr(p, '<')) != NULL;) {
234                 std::string chunk(p, tag - p);
235                 size_t pango_len;
236                 std::string pango;
237                 html_topango(chunk, pango, pango_len);
238                 res += pango;
239                 cur_pos += pango_len;
240
241                 p = tag;
242                 for (i = 0; replace_arr[i].match_; ++i)
243                         if (strncasecmp(replace_arr[i].match_, p + 1,
244                                                 replace_arr[i].match_len_) == 0) {
245                                 res += replace_arr[i].replace_;
246                                 p += 1 + replace_arr[i].match_len_;
247                                 cur_pos += replace_arr[i].char_len_;
248                                 goto cycle_end;
249                         }
250
251                 if (strncasecmp(p+1, "font ", 5)==0) {
252                         next = strchr(p, '>');
253                         if (!next) {
254                                 ++p;
255                                 continue;
256                         }
257                         res += "<span";
258                         name.assign(p + 6, next - (p + 6));
259                         const char *p1 = strcasestr(name.c_str(), "face=");
260                         if (p1) {
261                                 p1 += sizeof("face=") -1 +1;
262                                 const char *p2 = p1;
263                                 while (true) {
264                                         if (*p2 == '\0') {
265                                                 p2 = NULL;
266                                                 break;
267                                         }
268                                         if (*p2 == '\'' || *p2 == '"')
269                                                 break;
270                                         p2++;
271                                 }
272                                 if (p2) {
273                                         std::string face(p1, p2-p1);
274                                         res += " face=\"";
275                                         res += face;
276                                         res += "\"";
277                                 }
278                         }
279                         p1 = strcasestr(name.c_str(), "color=");
280                         if (p1) {
281                                 p1 += sizeof("color=") -1;
282                                 if (*p1 == '\'' || *p1 == '\"')
283                                         p1++;
284                                 const char *p2 = p1;
285                                 while (true) {
286                                         if (*p2 == '\0') {
287                                                 p2 = NULL;
288                                                 break;
289                                         }
290                                         if (*p2 == '\'' || *p2 == '"' || *p2 == ' ' || *p2 == '>')
291                                                 break;
292                                         p2++;
293                                 }
294                                 if (p2) {
295                                         std::string color(p1, p2-p1);
296                                         if (pango_color_parse(NULL, color.c_str())) {
297                                                 res += " foreground=\"";
298                                                 res += color;
299                                                 res += "\"";
300                                         }
301                                 }
302                         }
303                         res += ">";
304                         p = next + 1;
305                 } else if ((*(p + 1) == 'a' || *(p + 1) == 'A') && *(p + 2) == ' ') {
306                         next = strchr(p, '>');
307                         if (!next) {
308                                 p++;
309                                 continue;
310                         }
311                         p+=3;
312                         name.assign(p, next - p);
313                         const char *p1 = strcasestr(name.c_str(), "href=");
314                         std::string link;
315                         if (p1) {
316                                 p1 += sizeof("href=") -1 +1;
317                                 const char *p2 = p1;
318                                 while (true) {
319                                         if (*p2 == '\0') {
320                                                 p2 = NULL;
321                                                 break;
322                                         }
323                                         if (*p2 == '\'' || *p2 == '"')
324                                                 break;
325                                         p2++;
326                                 }
327                                 if (p2) {
328                                         link.assign(p1, p2-p1);
329                                 }
330                         }
331                         p = next + 1;
332                         next = strcasestr(p, "</a>");
333                         if (!next) {
334                                 continue;
335                         }
336                         res += "<span foreground=\"blue\" underline=\"single\">";
337                         std::string::size_type link_len = next - p;
338                         std::string chunk(p, link_len);
339                         html_topango(chunk, pango, pango_len);
340                         links_list.push_back(LinkDesc(cur_pos, pango_len, link));
341                         res += pango;
342                         cur_pos += pango_len;
343                         res += "</span>";
344                         p = next + sizeof("</a>") - 1;
345                 } else if (strncasecmp(p+1, "ref>", 4)==0) {
346                         next = strcasestr(p, "</ref>");
347                         if (!next) {
348                                 p++;
349                                 continue;
350                         }
351                         p+=5;
352                         res += "<span foreground=\"blue\" underline=\"single\">";
353                         std::string::size_type link_len = next - p;
354                         std::string chunk(p, link_len);
355                         html_topango(chunk, pango, pango_len);
356                         std::string xml_enc;
357                         xml_decode(chunk.c_str(), xml_enc);
358                         std::string link;
359                         link = "query://";
360                         link += xml_enc;
361                         links_list.push_back(LinkDesc(cur_pos, pango_len, link));
362                         res += pango;
363                         cur_pos += pango_len;
364                         res += "</span>";
365                         p = next + sizeof("</ref>") - 1;
366                 } else if (strncasecmp(p+1, "img ", 4)==0) {
367                         next = strchr(p+5, '>');
368                         if (!next) {
369                                 p++;
370                                 continue;
371                         }
372                         name.assign(p+5, next - (p+5));
373                         p = next + 1;
374                         const char *p1 = strcasestr(name.c_str(), "src=");
375                         std::string src;
376                         if (p1) {
377                                 p1 += sizeof("src=") -1 +1;
378                                 const char *p2 = p1;
379                                 while (true) {
380                                         if (*p2 == '\0') {
381                                                 p2 = NULL;
382                                                 break;
383                                         }
384                                         if (*p2 == '\'' || *p2 == '"')
385                                                 break;
386                                         p2++;
387                                 }
388                                 if (p2) {
389                                         src.assign(p1, p2-p1);
390                                 }
391                         }
392                         if (!src.empty()) {
393                                 ParseResultItem item;
394                                 item.type = ParseResultItemType_link;
395                                 item.link = new ParseResultLinkItem;
396                                 item.link->pango = res;
397                                 item.link->links_list = links_list;
398                                 result.item_list.push_back(item);
399                                 res.clear();
400                                 cur_pos = 0;
401                                 links_list.clear();
402                                 item.type = ParseResultItemType_res;
403                                 item.res = new ParseResultResItem;
404                                 item.res->type = "image";
405                                 int n = src.length();
406                                 if (src[0]==0x1e && src[n-1]==0x1f) {
407                                         item.res->key.assign(src.c_str()+1, n-2);
408                                 } else {
409                                         item.res->key = src;
410                                 }
411                                 result.item_list.push_back(item);
412                         }
413                 } else {
414                         next = strchr(p+1, '>');
415                         if (!next) {
416                                 p++;
417                                 res += "&lt;";
418                                 cur_pos++;
419                                 continue;
420                         }
421                         p = next + 1;
422                 }
423 cycle_end:
424                 ;
425         }
426         res += p;
427         ParseResultItem item;
428         item.type = ParseResultItemType_link;
429         item.link = new ParseResultLinkItem;
430         item.link->pango = res;
431         item.link->links_list = links_list;
432         result.item_list.push_back(item);
433 }
434
435 static bool parse(const char *p, unsigned int *parsed_size, ParseResult &result, const char *oword)
436 {
437         if (*p != 'h')
438                 return false;
439         p++;
440         size_t len = strlen(p);
441         if (len) {
442                 html2result(p, result);
443         }
444         *parsed_size = 1 + len + 1;
445         return true;
446 }
447
/* Configuration callback registered with the plugin object; does nothing. */
static void configure() {}
451
452 DLLIMPORT bool stardict_plugin_init(StarDictPlugInObject *obj)
453 {
454         if (strcmp(obj->version_str, PLUGIN_SYSTEM_VERSION)!=0) {
455                 g_print("Error: HTML data parsing plugin version doesn't match!\n");
456                 return true;
457         }
458         obj->type = StarDictPlugInType_PARSEDATA;
459         obj->info_xml = g_strdup_printf("<plugin_info><name>%s</name><version>1.0</version><short_desc>%s</short_desc><long_desc>%s</long_desc><author>Hu Zheng &lt;huzheng_001@163.com&gt;</author><website>http://stardict.sourceforge.net</website></plugin_info>", _("HTML data parsing"), _("HTML data parsing engine."), _("Parse the HTML data."));
460         obj->configure_func = configure;
461         return false;
462 }
463
464 DLLIMPORT void stardict_plugin_exit(void)
465 {
466 }
467
468 DLLIMPORT bool stardict_parsedata_plugin_init(StarDictParseDataPlugInObject *obj)
469 {
470         obj->parse_func = parse;
471         g_print(_("HTML data parsing plug-in loaded.\n"));
472         return false;
473 }
474
475 #ifdef _WIN32
476 BOOL APIENTRY DllMain (HINSTANCE hInst     /* Library instance handle. */ ,
477                        DWORD reason        /* Reason this function is being called. */ ,
478                        LPVOID reserved     /* Not used. */ )
479 {
480     switch (reason)
481     {
482       case DLL_PROCESS_ATTACH:
483         break;
484
485       case DLL_PROCESS_DETACH:
486         break;
487
488       case DLL_THREAD_ATTACH:
489         break;
490
491       case DLL_THREAD_DETACH:
492         break;
493     }
494
495     /* Returns TRUE on success, FALSE on failure */
496     return TRUE;
497 }
498 #endif