Google plugin: convert &#nnn; escaped characters to utf-8 in movie titles
author Philipp Zabel <philipp.zabel@gmail.com>
Wed, 18 Nov 2009 16:47:26 +0000 (17:47 +0100)
committer Philipp Zabel <philipp.zabel@gmail.com>
Wed, 18 Nov 2009 16:49:07 +0000 (17:49 +0100)
src/plugins/google-parser.vala

index 3eadbcc..452a044 100644 (file)
@@ -141,12 +141,40 @@ public class GoogleParser : Object {
                return value;
        }
 
+       /**
+        * Expands decimal numeric character references ("&#NNN;") to UTF-8.
+        *
+        * Only references with at least one digit, a terminating ';' and a
+        * non-zero, valid Unicode code point are replaced.  Anything else
+        * (for example "&#;", "&#0;", an unterminated "&#39" or an out-of-range
+        * number) is copied to the result unchanged.  Named entities such as
+        * "&amp;" are not handled here.
+        *
+        * @param s text that may contain "&#NNN;" sequences
+        * @return a new string with every valid reference expanded
+        */
+       public string unescape_unicode (string s) {
+               var result = new StringBuilder ();
+               int i, j;
+               long l = s.length;
+
+               for (i = 0; i < l; i++) {
+                       if (s[i] == '&' && i + 1 < l && s[i + 1] == '#') {
+                               int codepoint = 0;
+
+                               for (j = i + 2; j < l && s[j].isdigit (); j++) {
+                                       // Stop accumulating once the value is already out of
+                                       // range, so long digit runs cannot overflow the int.
+                                       if (codepoint <= 0x10FFFF)
+                                               codepoint = codepoint * 10 + s[j].digit_value ();
+                               }
+                               // Code point 0 is rejected as well: an embedded NUL would
+                               // truncate the resulting string.
+                               if (j > i + 2 && j < l && s[j] == ';'
+                                   && codepoint != 0 && ((unichar) codepoint).validate ()) {
+                                       result.append_unichar ((unichar) codepoint);
+                                       i = j; // the loop increment then steps past the ';'
+                                       continue;
+                               }
+                       }
+                       result.append (s.substring (i, 1));
+               }
+
+               return result.str;
+       }
+
        public void parse_movie () throws Error {
                expect_tag ("div"); // class=movie
                expect_tag ("div"); // class=name
                expect_tag ("a"); // href="/movies?near=city&amp;mid=..."
                expect_tag ("span"); // dir=ltr
-               var title = convert (parse_text ().replace ("&#39;", "'").replace ("&amp;", "&"), -1, "utf-8", "iso-8859-1"); // FIXME
+               var title = unescape_unicode (convert (parse_text (), -1, "utf-8", "iso-8859-1")).replace ("&amp;", "&"); // FIXME
                expect_tag ("/span");
                expect_tag ("/a");
                expect_tag ("/div");