1 #include "stardict_html_parsedata.h"
2 #include <glib/gi18n.h>
8 # define strncasecmp _strnicmp
// Case-insensitive substring search: looks for pneedle inside phaystack,
// folding characters with tolower() before each comparison.
// The last visible line returns a pointer into the haystack, cast back to char*.
// NOTE(review): presumably a local fallback for platforms whose C library lacks
// strcasestr; the guarding preprocessor condition is not visible here - confirm.
// NOTE(review): only part of the body is visible in this view; the comments
// below describe the visible lines only.
11 static char *strcasestr (const char *phaystack, const char *pneedle)
13 register const unsigned char *haystack, *needle;
// Work on unsigned char views of both input strings.
16 haystack = (const unsigned char *) phaystack;
17 needle = (const unsigned char *) pneedle;
21 haystack--; /* possible ANSI violation */
26 } while (tolower(c) != (int) b);
// Advance the needle and fetch its next character, lower-cased.
28 c = tolower(*++needle);
// Scratch cursors used while comparing the remainder of a candidate match.
36 register const unsigned char *rhaystack, *rneedle;
42 if (tolower(a) == (int) b)
50 while (tolower(a) != (int) b);
56 if (tolower(a) != (int) c)
// rhaystack starts one past the current haystack position; haystack itself
// is stepped back by the post-decrement.
59 rhaystack = haystack-- + 1;
61 a = tolower(*rneedle);
63 if (tolower(*rhaystack) == (int) a)
68 a = tolower(*++needle);
69 if (tolower(*rhaystack) != (int) a)
74 a = tolower(*++needle);
75 } while (tolower (*rhaystack) == (int) a);
77 needle = rneedle; /* took the register-poor approach */
84 return (char*) haystack;
// Converts the text in str into Pango markup that is appended to pango.
// Visible handling: the five predefined XML entities, the HTML entity nbsp,
// numeric character references, CR/LF characters, and ordinary characters
// (escaped one UTF-8 character at a time with g_markup_escape_text).
// cur_pos is incremented once per loop iteration.
// NOTE(review): pango_len is an output parameter; its assignment is not visible
// in this view - presumably it receives the final cur_pos, confirm.
90 static void html_topango(const std::string& str, std::string &pango, size_t &pango_len)
// Names of the predefined XML entities (the text that follows the ampersand)
// and the length of each name; both tables are indexed in parallel.
93 static const char* xml_entrs[] = { "lt;", "gt;", "amp;", "apos;", "quot;", 0 };
94 static const int xml_ent_len[] = { 3, 3, 4, 5, 5 };
// HTML-only entities, their name lengths, their replacement text and the
// length of each replacement; all four tables are indexed in parallel.
95 static const char* html_entries[] = {"nbsp;", 0};
96 static const int html_entry_len[] = {5};
97 static const char* html_values[] = {" "};
98 static const int html_value_len[] = {1};
// Walk the input until the terminating NUL; q is advanced inside the body.
104 for (cur_pos = 0, q = str.c_str(); *q; ++cur_pos) {
// Compare each XML entity name, ignoring case, with the text starting at q + 1.
106 for (i = 0; xml_entrs[i]; ++i) {
107 if (strncasecmp(xml_entrs[i], q + 1,
108 xml_ent_len[i]) == 0) {
// Skip the entity name plus the one character in front of it.
109 q += xml_ent_len[i] + 1;
// The entity name is copied to the output unchanged.
111 pango += xml_entrs[i];
// The loop above ran to the sentinel, so no XML entity matched: try HTML ones.
115 if (xml_entrs[i] == NULL) {
116 for (i = 0; html_entries[i]; ++i) {
117 if (strncasecmp(html_entries[i], q+1, html_entry_len[i])==0) {
118 q += html_entry_len[i] + 1;
119 pango += html_values[i];
// Add the replacement length minus one; the loop increment supplies the rest.
120 cur_pos += (html_value_len[i] -1);
// No HTML entity matched either: try a numeric reference, i.e. a hash sign
// at q + 1 followed somewhere later by a semicolon.
124 if (html_entries[i] == NULL) {
125 if (*(q+1)=='#' && (p = strchr(q+2, ';'))) {
// Text between the hash sign and the semicolon.
// Note: this local shadows the str parameter of the function.
126 std::string str(q+2, p-(q+2));
// atoi parses the text as a decimal integer.
128 uc = atoi(str.c_str());
// Encode the code point as UTF-8; n receives the value returned by the call.
130 gint n = g_unichar_to_utf8(uc, utf8);
// Carriage return and line feed get their own branch (body not visible here).
140 } else if (*q == '\r' || *q == '\n') {
// Ordinary text: take exactly one UTF-8 character and escape it for markup.
144 p = g_utf8_next_char(q);
145 etext = g_markup_escape_text(q, p-q);
// Decodes the predefined XML entities found in str and stores the result in decoded.
// Unlike html_topango, entity names are compared with strncmp, so the match
// is case-sensitive.
// NOTE(review): the handling of an unrecognized sequence and of the text after
// the last entity is not visible in this view.
155 static void xml_decode(const char *str, std::string& decoded)
// Raw characters, indexed in parallel with xml_entrs below.
157 static const char raw_entrs[] = {
158 '<', '>', '&', '\'', '\"', 0
// Entity names as they appear after the ampersand.
160 static const char* xml_entrs[] = {
161 "lt;", "gt;", "amp;", "apos;", "quot;", 0
// Length of each entity name (initializer not visible in this view).
163 static const int xml_ent_len[] = {
// Locate the first ampersand in the input.
167 const char *amp = strchr(str, '&');
// Copy everything in front of the ampersand unchanged.
173 decoded.assign(str, amp - str);
// Try every known entity name against the text right after the ampersand.
177 for (ient = 0; xml_entrs[ient] != 0; ++ient)
178 if (strncmp(amp + 1, xml_entrs[ient],
179 xml_ent_len[ient]) == 0) {
// Emit the raw character and step over the ampersand plus the entity name.
180 decoded += raw_entrs[ient];
181 amp += xml_ent_len[ient]+1;
184 if (xml_entrs[ient] == 0) // unrecognized sequence
// Converts the HTML text p into items appended to result.item_list.
// Visible behavior:
//  - text between tags is converted to Pango markup by html_topango;
//  - simple formatting tags are replaced through the replace_arr table;
//  - font, a, ref and img tags are handled by dedicated branches;
//  - accumulated markup (res) and the list of link positions (links_list) are
//    stored in a ParseResultLinkItem, images in a ParseResultResItem.
// cur_pos tracks the position that is recorded for each link; it is advanced
// by pango_len and by the char_len_ of table replacements.
// NOTE(review): many lines of this function are not visible in this view; the
// comments below describe the visible lines only.
// NOTE(review): items are allocated with new; who releases them is not visible here.
192 static void html2result(const char *p, ParseResult &result)
194 LinksPosList links_list;
196 const char *tag, *next;
198 std::string::size_type cur_pos;
204 const char *replace_;
// Tag replacement table. Going by its use further down: match_ is the tag text
// that follows the less-than sign, match_len_ its length, replace_ the markup
// appended to res, and char_len_ the amount added to cur_pos.
207 static const ReplaceTag replace_arr[] = {
208 { "b>", 2, "<b>", 0 },
209 { "/b>", 3, "</b>", 0 },
210 { "big>", 4, "<big>", 0},
211 { "/big>", 5, "</big>", 0},
212 { "i>", 2, "<i>", 0 },
213 { "/i>", 3, "</i>", 0 },
214 { "s>", 2, "<s>", 0 },
215 { "/s>", 3, "</s>", 0 },
216 { "sub>", 4, "<sub>", 0 },
217 { "/sub>", 5, "</sub>", 0},
218 { "sup>", 4, "<sup>", 0},
219 { "/sup>", 5, "</sup>", 0},
220 { "small>", 6, "<small>", 0},
221 { "/small>", 7, "</small>", 0},
222 { "tt>", 3, "<tt>", 0},
223 { "/tt>", 4, "</tt>", 0},
224 { "u>", 2, "<u>", 0 },
225 { "/u>", 3, "</u>", 0 },
226 { "br>", 3, "\n", 1 },
228 { "hr>", 3, "\n<span foreground=\"gray\"><s> </s></span>\n", 7 },
229 { "/font>", 6, "</span>", 0 },
// Main loop: runs while input remains and another less-than sign can be found.
233 for (cur_pos = 0; *p && (tag = strchr(p, '<')) != NULL;) {
// Plain text in front of the tag.
234 std::string chunk(p, tag - p);
237 html_topango(chunk, pango, pango_len);
239 cur_pos += pango_len;
// Table lookup: compare, ignoring case, against the text right after p.
242 for (i = 0; replace_arr[i].match_; ++i)
243 if (strncasecmp(replace_arr[i].match_, p + 1,
244 replace_arr[i].match_len_) == 0) {
245 res += replace_arr[i].replace_;
// Step over the character at p plus the matched tag text.
246 p += 1 + replace_arr[i].match_len_;
247 cur_pos += replace_arr[i].char_len_;
// font tag with attributes.
251 if (strncasecmp(p+1, "font ", 5)==0) {
252 next = strchr(p, '>');
// Attribute text between the tag name and the closing greater-than sign.
258 name.assign(p + 6, next - (p + 6));
259 const char *p1 = strcasestr(name.c_str(), "face=");
// Skips the attribute name plus one more character.
// NOTE(review): the extra character is presumably an opening quote; the color
// attribute below checks for the quote instead of assuming it - confirm intent.
261 p1 += sizeof("face=") -1 +1;
268 if (*p2 == '\'' || *p2 == '"')
273 std::string face(p1, p2-p1);
279 p1 = strcasestr(name.c_str(), "color=");
281 p1 += sizeof("color=") -1;
// Here a leading quote is tested for rather than skipped unconditionally.
282 if (*p1 == '\'' || *p1 == '\"')
// The color value ends at a quote, a space or a greater-than sign.
290 if (*p2 == '\'' || *p2 == '"' || *p2 == ' ' || *p2 == '>')
295 std::string color(p1, p2-p1);
// The foreground attribute is emitted only if Pango can parse the color.
296 if (pango_color_parse(NULL, color.c_str())) {
297 res += " foreground=\"";
// Anchor tag: the letter a in either case followed by a space.
305 } else if ((*(p + 1) == 'a' || *(p + 1) == 'A') && *(p + 2) == ' ') {
306 next = strchr(p, '>');
312 name.assign(p, next - p);
313 const char *p1 = strcasestr(name.c_str(), "href=");
// Skips the attribute name plus one more character (presumably the quote - confirm).
316 p1 += sizeof("href=") -1 +1;
323 if (*p2 == '\'' || *p2 == '"')
328 link.assign(p1, p2-p1);
// Find the closing anchor tag, ignoring case.
332 next = strcasestr(p, "</a>");
336 res += "<span foreground=\"blue\" underline=\"single\">";
// Link text between the current position and the closing tag.
337 std::string::size_type link_len = next - p;
338 std::string chunk(p, link_len);
339 html_topango(chunk, pango, pango_len);
// Record where the link starts, how long it is, and its target.
340 links_list.push_back(LinkDesc(cur_pos, pango_len, link));
342 cur_pos += pango_len;
// Continue right after the closing anchor tag.
344 p = next + sizeof("</a>") - 1;
// ref tag: the enclosed text is both displayed and recorded as a link.
345 } else if (strncasecmp(p+1, "ref>", 4)==0) {
346 next = strcasestr(p, "</ref>");
352 res += "<span foreground=\"blue\" underline=\"single\">";
353 std::string::size_type link_len = next - p;
354 std::string chunk(p, link_len);
355 html_topango(chunk, pango, pango_len);
// NOTE(review): the decoded text presumably becomes the link target; the
// statement that builds link is not visible here - confirm.
357 xml_decode(chunk.c_str(), xml_enc);
361 links_list.push_back(LinkDesc(cur_pos, pango_len, link));
363 cur_pos += pango_len;
365 p = next + sizeof("</ref>") - 1;
// img tag with attributes.
366 } else if (strncasecmp(p+1, "img ", 4)==0) {
367 next = strchr(p+5, '>');
372 name.assign(p+5, next - (p+5));
374 const char *p1 = strcasestr(name.c_str(), "src=");
// Skips the attribute name plus one more character (presumably the quote - confirm).
377 p1 += sizeof("src=") -1 +1;
384 if (*p2 == '\'' || *p2 == '"')
389 src.assign(p1, p2-p1);
// Store the markup and links collected so far as a link item, ahead of the image.
393 ParseResultItem item;
394 item.type = ParseResultItemType_link;
395 item.link = new ParseResultLinkItem;
396 item.link->pango = res;
397 item.link->links_list = links_list;
398 result.item_list.push_back(item);
// Then add the image itself as a resource item.
402 item.type = ParseResultItemType_res;
403 item.res = new ParseResultResItem;
404 item.res->type = "image";
405 int n = src.length();
// A source wrapped in the bytes 0x1e and 0x1f: the key is the text between them.
// The alternative branch is not visible in this view.
406 if (src[0]==0x1e && src[n-1]==0x1f) {
407 item.res->key.assign(src.c_str()+1, n-2);
411 result.item_list.push_back(item);
// Any other tag: locate its closing greater-than sign (handling not visible here).
414 next = strchr(p+1, '>');
// Store the accumulated markup and links as a link item.
427 ParseResultItem item;
428 item.type = ParseResultItemType_link;
429 item.link = new ParseResultLinkItem;
430 item.link->pango = res;
431 item.link->links_list = links_list;
432 result.item_list.push_back(item);
// Parse callback registered as parse_func in stardict_parsedata_plugin_init.
// The visible lines measure p with strlen, hand it to html2result, and report
// the number of consumed bytes through parsed_size.
// oword is not used in the visible lines.
// NOTE(review): any precondition check on p and the return statements are not
// visible in this view.
435 static bool parse(const char *p, unsigned int *parsed_size, ParseResult &result, const char *oword)
440 size_t len = strlen(p);
442 html2result(p, result);
// NOTE(review): the two extra bytes presumably cover a leading type byte and
// the terminating NUL - confirm against the caller.
444 *parsed_size = 1 + len + 1;
// Configuration callback; stardict_plugin_init stores it in obj->configure_func.
// The body is not visible in this view.
448 static void configure()
// Plugin entry point: checks the plugin system version and fills in obj.
// On the visible lines it sets the plugin type, an XML description string
// built with g_strdup_printf, and the configure callback.
// NOTE(review): the return statements are not visible in this view.
452 DLLIMPORT bool stardict_plugin_init(StarDictPlugInObject *obj)
// The version string supplied by the host must equal PLUGIN_SYSTEM_VERSION.
454 if (strcmp(obj->version_str, PLUGIN_SYSTEM_VERSION)!=0) {
455 g_print("Error: HTML data parsing plugin version doesn't match!\n");
458 obj->type = StarDictPlugInType_PARSEDATA;
// Name and descriptions are passed through gettext before being inserted.
// NOTE(review): the author element contains a raw less-than and greater-than
// sign around the e-mail address; if info_xml is parsed as XML these would
// need to be written as entities - confirm against the upstream file.
459 obj->info_xml = g_strdup_printf("<plugin_info><name>%s</name><version>1.0</version><short_desc>%s</short_desc><long_desc>%s</long_desc><author>Hu Zheng <huzheng_001@163.com></author><website>http://stardict.sourceforge.net</website></plugin_info>", _("HTML data parsing"), _("HTML data parsing engine."), _("Parse the HTML data."));
460 obj->configure_func = configure;
// Plugin exit entry point, exported next to stardict_plugin_init.
// The body is not visible in this view.
464 DLLIMPORT void stardict_plugin_exit(void)
// Parse-data plugin entry point: registers parse as the parse callback on obj
// and prints a translated message saying the plug-in was loaded.
// NOTE(review): the return statement is not visible in this view.
468 DLLIMPORT bool stardict_parsedata_plugin_init(StarDictParseDataPlugInObject *obj)
470 obj->parse_func = parse;
471 g_print(_("HTML data parsing plug-in loaded.\n"));
// Windows DLL entry point. The visible lines list case labels for the four
// notification reasons: process attach and detach, thread attach and detach.
// NOTE(review): the case bodies and the end of the function are not visible in
// this view; the enclosing platform guard is presumably a Windows-only
// preprocessor condition - confirm.
476 BOOL APIENTRY DllMain (HINSTANCE hInst /* Library instance handle. */ ,
477 DWORD reason /* Reason this function is being called. */ ,
478 LPVOID reserved /* Not used. */ )
482 case DLL_PROCESS_ATTACH:
485 case DLL_PROCESS_DETACH:
488 case DLL_THREAD_ATTACH:
491 case DLL_THREAD_DETACH:
495 /* Returns TRUE on success, FALSE on failure */