From 45ff969eb86dd5d4b2c9644e7caa99f27c79e051 Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Tue, 13 Jul 2010 19:29:06 +0200 Subject: [PATCH] Google backend: replace parser with a libxml-2.0 based one --- Makefile.am | 6 +- configure.ac | 4 + src/backends/google/google-backend.vala | 2 +- src/backends/google/google-parser.vala | 403 +++++++++++-------------------- 4 files changed, 144 insertions(+), 271 deletions(-) diff --git a/Makefile.am b/Makefile.am index 31183a3..8236c8d 100644 --- a/Makefile.am +++ b/Makefile.am @@ -209,9 +209,9 @@ cinaest_google_backend_VALASOURCES = \ src/backends/google/google-parser.vala cinaest_google_backend_VALAFLAGS = --vapidir ./vapi --pkg dbus-glib-1 \ - --pkg gconf-2.0 --pkg gee-1.0 --pkg gio-2.0 -cinaest_google_backend_CFLAGS = ${DBUS_CFLAGS} ${GCONF_CFLAGS} ${GEE_CFLAGS} ${GIO_CFLAGS} -cinaest_google_backend_LDADD = ${DBUS_LIBS} ${GCONF_LIBS} ${GEE_LIBS} ${GIO_LIBS} + --pkg gconf-2.0 --pkg gee-1.0 --pkg gio-2.0 --pkg libxml-2.0 +cinaest_google_backend_CFLAGS = ${DBUS_CFLAGS} ${GCONF_CFLAGS} ${GEE_CFLAGS} ${GIO_CFLAGS} ${XML_CFLAGS} +cinaest_google_backend_LDADD = ${DBUS_LIBS} ${GCONF_LIBS} ${GEE_LIBS} ${GIO_LIBS} ${XML_LIBS} src/backends/google/google-backend.c: ${cinaest_google_backend_VALASOURCES} ${VALAC} -C ${cinaest_google_backend_VALASOURCES} ${cinaest_google_backend_VALAFLAGS} diff --git a/configure.ac b/configure.ac index 76919d6..f595916 100644 --- a/configure.ac +++ b/configure.ac @@ -94,6 +94,10 @@ PKG_CHECK_MODULES(DBUS, dbus-glib-1 >= 0.78) AC_SUBST(DBUS_LIBS) AC_SUBST(DBUS_CFLAGS) +PKG_CHECK_MODULES(XML, libxml-2.0) +AC_SUBST(XML_LIBS) +AC_SUBST(XML_CFLAGS) + PKG_CHECK_MODULES(OSSOSETTINGS, osso-af-settings >= 0.9.2) localedir=`$PKG_CONFIG osso-af-settings --variable=localedir` diff --git a/src/backends/google/google-backend.vala b/src/backends/google/google-backend.vala index 7e4d89e..1815970 100644 --- a/src/backends/google/google-backend.vala +++ b/src/backends/google/google-backend.vala @@ -73,7 +73,7 @@ public class MovieSearch : Object { var m = new string[results.length ()]; int i = 0; for (unowned GLib.List node = results.first (); node != null; node = node.next) { - m[i++] = "{\"title\":\"%s\",\"rating\":%f,\"runtime\":%d,\"showtimes\":\"%s\",\"cinema_name\":\"%s\",\"cinema_phone\":\"%s\"}".printf (node.data.title, node.data.rating, node.data.runtime, node.data.showtimes, node.data.cinema.name, node.data.cinema.phone); + m[i++] = "{\"title\":\"%s\",\"rating\":%f,\"runtime\":%d,\"showtimes\":\"%s\",\"cinema_name\":\"%s\",\"cinema_phone\":\"%s\"}".printf (node.data.title, node.data.rating, node.data.runtime, node.data.showtimes, node.data.theater.name, node.data.theater.phone); } movies_found (m, true); service.timeout_quit (); diff --git a/src/backends/google/google-parser.vala b/src/backends/google/google-parser.vala index e18a00b..439c0a3 100644 --- a/src/backends/google/google-parser.vala +++ b/src/backends/google/google-parser.vala @@ -16,33 +16,23 @@ * along with Cinaest. If not, see . */ -errordomain ParserError { - WRONG_TAG, - EOF -} - -public class Cinema { +public class Theater { public string name; public string address; public string phone; - - public Cinema (string _name) { - name = _name; - } } public class GoogleMovie { public string title; public int rating; - public Cinema cinema; + public Theater theater; public int runtime; public string fsk; public string showtimes; } -public class GoogleParser : Object { - char *current; - Cinema last_cinema; +class GoogleParser : Object { + int movies; public string location; string _title; PatternSpec pattern; @@ -50,227 +40,112 @@ public class GoogleParser : Object { public delegate void ReceiveMovie (GoogleMovie movie); public ReceiveMovie _get_callback; - public int next_tag_offset () { - int i = -1; - while (current[++i] != '<' && current[i] != 0); - return i; - } - - public void next_tag () { - if (current[0] == 0) - return; - current += next_tag_offset (); - } - - public void finish_tag () { - while (current[0] != '>' && current[0] != 0) - current++; - if (current[0] == '>') - current++; - } - - public unowned string parse_tag (bool finish = true) throws Error { - unowned string tag; - next_tag (); - int i = 1; - while (current[++i].isalnum ()); - if (current[i] == 0) - throw new ParserError.EOF ("EOF in tag"); - if (current[i] == '>') - finish = false; - current[i] = 0; - tag = (string) (current + 1); - current += i + 1; - if (finish) - finish_tag (); - return tag; - } - - public void expect_tag (string tag) throws Error { - var found = parse_tag (true); - if (tag != found) { - throw new ParserError.WRONG_TAG ("Wrong tag \"%s\", expected \"%s\"", - found, tag); - } - } - - public string parse_text () { - string text = ((string) current).ndup (next_tag_offset ()); - next_tag (); - return text; + private Html.Doc* get_html_document (ref char[] buf) { + return Html.Doc.read_memory (buf, (int) buf.length, + "http://movies.google.de", null, Html.ParserOption.NOERROR | Html.ParserOption.NOWARNING); } - public void parse_attribute (string _attr, out string value) { - string attr; - if (current[0] == 0) - return; - int i = -1; - while (current[++i] != '=' && current[i] != '>' && current[i] != 0) { - - } - attr = ((string) current).ndup (i); - current += i; - if (current[0] == 0) - return; - current++; - i = -1; - while (!current[++i].isspace () && current[i] != '>' && current[i] != 0) { - if (current[i] == '"') - while (current[++i] != '"' && current[i] != 0); - } - if (attr == _attr) { - if (current[0] == '"') - value = ((string) current).substring (1, i - 2); - else - value = ((string) current).ndup (i); + public int parse (ref char[] buf) throws Error { + var doc = get_html_document (ref buf); + if (doc == null) { + stderr.printf ("Error: parsing failed\n"); + return 0; } - current += i; - } - public void skip_whitespace () { - if (current[0] == 0) - return; - int i = -1; - while (current[++i].isspace () && current[i] != 0); - current += i; - } + // TODO: set up location + location = ""; - public string? parse_tag_attribute (string tag, string attribute) throws Error { - var found = parse_tag (false); - if (tag != found) { - throw new ParserError.WRONG_TAG ("Wrong tag \"%s\", expected \"%s\"", - found, tag); + var theater = search_tag_by_class (doc->children, "div", "theater"); + if (theater == null) { + stderr.printf ("Error: does not contain theater\n"); + return 0; } - - string? value = null; - skip_whitespace (); - while (current[0] != '>' && current[0] != 0) { - parse_attribute (attribute, out value); - skip_whitespace (); + movies = 0; + while (theater != null) { + theater = parse_theater (theater); } - // Skip the closing '>' bracket - if (current[0] != 0) - current++; - - return value; + return movies; } - public string unescape_unicode (string s) { - string result = ""; - int i, j; - long l = s.length; - - for (i = 0; i < l; i++) { - if (s[i] == '&' && s[i + 1] == '#') { - for (j = i + 2; j < l; j++) { - if (!s[j].isdigit ()) - break; - if (s[j] == ';') - break; - } - if (s[j] == ';') { - int codepoint = s.substring (i + 2, j - i - 2).to_int (); - char[] buf = new char[6]; - ((unichar) codepoint).to_utf8 ((string) buf); - result += (string) buf; - i = j; - continue; + private Xml.Node* parse_theater (Xml.Node* t) { + var theater = new Theater (); + var desc = t->children; + if (desc != null && desc->name == "div" && desc->get_prop ("class") == "desc") { + var name = desc->children; + if (name != null && name->name == "h2" && name->get_prop ("class") == "name") { + var a = name->children; + if (a != null && a->name == "a") + theater.name = get_child_text_content (a); + print ("THEATER \"%s\"\n", theater.name); + } + var info = name->next; + if (info != null && info->name == "div" && info->get_prop ("class") == "info") { + var text = info->children; + if (text != null && text->name == "text") { + var address_and_phone = text->content.split (" - "); + if (address_and_phone.length >= 2) { + theater.address = address_and_phone[0]; + theater.phone = address_and_phone[1].replace (" ", "").replace ("-", ""); + } } } - if (s.offset (i).has_prefix ("&")) { - result += "&"; - i += 4; - continue; + } + var showtimes = desc->next; + if (showtimes != null && showtimes->name == "div" && showtimes->get_prop ("class") == "showtimes") { + var left = search_tag_by_class (showtimes->children, "div", "show_left"); + if (left != null && left->children != null) { + print ("LEFT\n"); + var movie = search_tag_by_class (left->children, "div", "movie"); + while (movie != null) { + movie = parse_movie (movie, theater); + } } - if (s.offset (i).has_prefix (""")) { - result += "\""; - i += 5; - continue; + var right = search_tag_by_class (left->next, "div", "show_right"); + if (right != null && right->children != null) { + print ("RIGHT\n"); + var movie = search_tag_by_class (right->children, "div", "movie"); + while (movie != null) { + movie = parse_movie (movie, theater); + } } - result += s.substring (i, 1); - } - return result; + } + return t->next; } - public void parse_movie () throws Error { - expect_tag ("div"); // class=movie - expect_tag ("div"); // class=name - expect_tag ("a"); // href="/movies?near=city&mid=..." - expect_tag ("span"); // dir=ltr - var title = unescape_unicode (convert (parse_text (), -1, "utf-8", "iso-8859-1")); // FIXME - expect_tag ("/span"); - expect_tag ("/a"); - expect_tag ("/div"); - expect_tag ("span"); // class=info - string info_text = parse_text ().replace ("‎", ""); - string[] runtime_and_fsk = {}; - double rating = 0.0; - var tag = parse_tag (); - if (tag == "a") { - // Trailer - expect_tag ("/a"); - tag = parse_tag (); - } - if (tag == "a") { - // IMDb - expect_tag ("/a"); - tag = parse_tag (); - } - if (tag == "nobr") { - expect_tag ("nobr"); - string rating_string = parse_tag_attribute ("img", "alt").offset (6); // "Rated " ->"0.0 out of 5.0" - rating = rating_string.to_double (); - expect_tag ("img"); - expect_tag ("img"); - expect_tag ("img"); - expect_tag ("img"); - expect_tag ("/nobr"); - expect_tag ("/nobr"); - info_text = parse_text ().replace ("‎", "").offset (3); - if (parse_tag () == "a") { - // Trailer - expect_tag ("/a"); - if (parse_tag () == "a") { - // IMDb link - expect_tag ("/a"); - expect_tag ("/span"); - } - } - } - runtime_and_fsk = info_text.split (" - "); - expect_tag ("div"); // class=times - var showtimes = parse_text ().replace (" ", ","); - while (parse_tag () == "a") { - showtimes += parse_text () + ","; - expect_tag ("/a"); + private Xml.Node* parse_movie (Xml.Node* m, Theater theater) { + var movie = new GoogleMovie (); + movie.theater = theater; + Xml.Node* n; + for (n = m->children; n != null; n = n->next) { + if (n->name == "div" && n->get_prop ("class") == "name") + movie.title = parse_movie_name (n); + if (n->name == "span" && n->get_prop ("class") == "info") + parse_movie_info (n, movie); + if (n->name == "div" && n->get_prop ("class") == "times") + parse_movie_times (n, movie); } - if (pattern == null) { - if (!title.has_prefix (_title)) - return; + if (!movie.title.has_prefix (_title)) + return m->next; } else { - if (!pattern.match ((uint) title.length, title, null)) - return; + if (!pattern.match ((uint) movie.title.length, movie.title, null)) + return m->next; } + _get_callback (movie); + movies++; + return m->next; + } - var movie = new GoogleMovie (); - - movie.title = strip_tags (title).replace ("\"", "\\\""); - movie.rating = (int) (rating * 10); - - movie.cinema = last_cinema; - movie.runtime = 0; - if (runtime_and_fsk.length >= 2) { - unowned string runtime = runtime_and_fsk[0]; - movie.runtime = 3600 * runtime.to_int (); - runtime = runtime.str ("hr "); - if (runtime != null) - movie.runtime += 60 * runtime.offset (3).to_int (); - movie.fsk = runtime_and_fsk[1]; + private string? parse_movie_name (Xml.Node* n) { + var a = n->children; + if (a != null && a->name == "a") { + var text = a->children; + if (text != null && text->name == "text") + print ("\"%s\"\n", text->content); + return strip_tags (text->content); } - movie.showtimes = showtimes; - _get_callback (movie); + return null; } // FIXME - this is specific for Germany @@ -284,69 +159,63 @@ public class GoogleParser : Object { return title.dup (); } - public void parse_cinema () throws Error { - expect_tag ("div"); // class=theater - expect_tag ("div"); // class=desc id=theater_... - expect_tag ("h2"); // class=name - expect_tag ("a"); // href="/movies?near=city&tid=..." - expect_tag ("span"); // dir=ltr - var name = unescape_unicode (convert (parse_text (), -1, "utf-8", "iso-8859-1")); // FIXME - expect_tag ("/span"); - expect_tag ("/a"); - expect_tag ("/h2"); - expect_tag ("div"); // class=info - var address_and_phone = parse_text ().replace (" ", " ").split (" - "); - string address = null; - string phone = null; - if (address_and_phone.length >= 2) { - address = address_and_phone[0]; - phone = address_and_phone[1].replace (" ", "").replace ("-", ""); + private void parse_movie_info (Xml.Node* i, GoogleMovie movie) { + var text = i->children; + if (text != null && text->name == "text") + print ("\t\"%s\"\n", text->content); + // movie.runtime + for (var n = text->next; n != null; n = n->next) { + if (n->name == "nobr") { + movie.rating = parse_rating (n); + if (movie.rating == 0) + movie.rating = -1; + break; + } } - expect_tag ("a"); // target=_top - expect_tag ("/a"); - expect_tag ("/div"); - expect_tag ("/div"); + } - last_cinema = new Cinema (name); - last_cinema.address = address; - last_cinema.phone = phone; + private int parse_rating (Xml.Node* nobr) { + for (var n = nobr->children; n != null; n = n->next) { + if (n->name == "nobr") { + for (var img = n->children; img != null; img = img->next) { + if (img->name == "img") { + var alt = img->get_prop ("alt"); // "Rated 0.0 out of 5.0" + if (alt != null && alt != "") // ^ + return (int) (10 * alt.offset (6).to_double ()); + print ("\trating: %s - %f\n", alt, alt.offset (6).to_double ()); + } + } + } + } + return 0; } - public int parse (ref char[] buf) throws Error { - int movies = 0; + private void parse_movie_times (Xml.Node* node, GoogleMovie movie) { + movie.showtimes = get_child_text_content (node).replace ("\xc2\xa0", ","); // U+00A0 =   + } - current = buf; - next_tag (); - while (location == null && current[0] != 0) { - int i = 1; - while (current[i++] != '>'); - if (((string) current).has_prefix ("children->content; + else + return null; + } - while (p[++j] != '&' && p[j] != 0); - p[0] = p[0].toupper (); - location = ((string) p).ndup (j); + Xml.Node* search_tag_by_property (Xml.Node* node, string tag, string prop, string val) requires (node != null) { + for (var n = node; n != null; n = n->next) { + if (n->name == tag && n->get_prop (prop) == val) + return n; + if (n->children != null) { + var found = search_tag_by_property (n->children, tag, prop, val); + if (found != null) + return found; } - current += i; - next_tag (); - } - while (current[0] != 0) { - int i = 1; - while (current[i++] != '>'); - if (((string) current).has_prefix ("
")) { - parse_movie (); - movies++; - } else if (((string) current).has_prefix("
")) { - parse_cinema (); - } else { - current += i; - } - next_tag (); } + return null; + } - return movies; + Xml.Node* search_tag_by_class (Xml.Node* node, string tag, string @class) requires (node != null) { + return search_tag_by_property (node, tag, "class", @class); } public async int query (string title, string? location, ReceiveMovie callback, Cancellable? cancellable = null) { -- 1.7.9.5