Convert Google plugin into a D-Bus service
[cinaest] / src / backends / google / google-parser.vala
1 /* This file is part of Cinaest.
2  *
3  * Copyright (C) 2009 Philipp Zabel
4  *
5  * Cinaest is free software: you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation, either version 3 of the License, or
8  * (at your option) any later version.
9  *
10  * Cinaest is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with Cinaest. If not, see <http://www.gnu.org/licenses/>.
17  */
18
/**
 * Errors raised by GoogleParser while walking the HTML buffer.
 */
errordomain ParserError {
        // A tag other than the one the parser asked for was found.
        WRONG_TAG,
        // The end of the buffer was reached while reading a tag name.
        EOF
}
23
/**
 * A cinema as listed on the showtimes page.
 *
 * Only the name is set at construction time; address and phone stay
 * null until whoever creates the object fills them in.
 */
public class Cinema {
        // Display name of the cinema.
        public string name;
        // Postal address, or null when not known.
        public string address;
        // Phone number, or null when not known.
        public string phone;

        /**
         * Creates a cinema that only carries a name.
         *
         * @param _name name to store in the name field
         */
        public Cinema (string _name) {
                this.name = _name;
        }
}
33
/**
 * One movie listing as produced by GoogleParser.parse_movie ().
 */
public class GoogleMovie {
        // Movie title.
        public string title;
        // Rating as an integer; parse_movie () stores the parsed value times ten.
        public int rating;
        // One-line summary joined with " - " (runtime if known, cinema name, showtimes).
        public string secondary;
        // Cinema this listing belongs to.
        public Cinema cinema;
        // Running time text, or null when the page did not provide it.
        public string runtime;
        // Second field of the "runtime - ..." info text; presumably the
        // German FSK age rating - TODO confirm.
        public string fsk;
        // Showtimes text, separated by commas.
        public string showtimes;
}
43
44 public class GoogleParser : Object {
45         char *current;
46         Cinema last_cinema;
47         public string location;
48         string _title;
49         PatternSpec pattern;
50
51         public delegate void ReceiveMovie (GoogleMovie movie);
52         public ReceiveMovie _get_callback;
53
54         public int next_tag_offset () {
55                 int i = -1;
56                 while (current[++i] != '<' && current[i] != 0);
57                 return i;
58         }
59
60         public void next_tag () {
61                 if (current[0] == 0)
62                         return;
63                 current += next_tag_offset ();
64         }
65
66         public void finish_tag () {
67                 while (current[0] != '>' && current[0] != 0)
68                         current++;
69                 if (current[0] == '>')
70                         current++;
71         }
72
73         public weak string parse_tag (bool finish = true) throws Error {
74                 weak string tag;
75                 next_tag ();
76                 int i = 1;
77                 while (current[++i].isalnum ());
78                 if (current[i] == 0)
79                         throw new ParserError.EOF ("EOF in tag");
80                 if (current[i] == '>')
81                         finish = false;
82                 current[i] = 0;
83                 tag = (string) (current + 1);
84                 current += i + 1;
85                 if (finish)
86                         finish_tag ();
87                 return tag;
88         }
89
90         public void expect_tag (string tag) throws Error {
91                 var found = parse_tag (true);
92                 if (tag != found) {
93                         throw new ParserError.WRONG_TAG ("Wrong tag \"%s\", expected \"%s\"",
94                                                          found, tag);
95                 }
96         }
97
98         public string parse_text () {
99                 string text = ((string) current).ndup (next_tag_offset ());
100                 next_tag ();
101                 return text;
102         }
103
104         public void parse_attribute (string _attr, out string value) {
105                 string attr;
106                 if (current[0] == 0)
107                         return;
108                 int i = -1;
109                 while (current[++i] != '=' && current[i] != '>' && current[i] != 0) {
110
111                 }
112                 attr = ((string) current).ndup (i);
113                 current += i;
114                 if (current[0] == 0)
115                         return;
116                 current++;
117                 i = -1;
118                 while (!current[++i].isspace () && current[i] != '>' && current[i] != 0) {
119                         if (current[i] == '"')
120                                 while (current[++i] != '"' && current[i] != 0);
121                 }
122                 if (attr == _attr) {
123                         if (current[0] == '"')
124                                 value = ((string) current).substring (1, i - 2);
125                         else
126                                 value = ((string) current).ndup (i);
127                 }
128                 current += i;
129         }
130
131         public void skip_whitespace () {
132                 if (current[0] == 0)
133                         return;
134                 int i = -1;
135                 while (current[++i].isspace () && current[i] != 0);
136                 current += i;
137         }
138
139         public string? parse_tag_attribute (string tag, string attribute) throws Error {
140                 var found = parse_tag (false);
141                 if (tag != found) {
142                         throw new ParserError.WRONG_TAG ("Wrong tag \"%s\", expected \"%s\"",
143                                                          found, tag);
144                 }
145
146                 string? value = null;
147                 skip_whitespace ();
148                 while (current[0] != '>' && current[0] != 0) {
149                         parse_attribute (attribute, out value);
150                         skip_whitespace ();
151                 }
152                 // Skip the closing '>' bracket
153                 if (current[0] != 0)
154                         current++;
155
156                 return value;
157         }
158
159         public string unescape_unicode (string s) {
160                 string result = "";
161                 int i, j;
162                 long l = s.length;
163
164                 for (i = 0; i < l; i++) {
165                         if (s[i] == '&' && s[i + 1] == '#') {
166                                 for (j = i + 2; j < l; j++) {
167                                         if (!s[j].isdigit ())
168                                                 break;
169                                         if (s[j] == ';')
170                                                 break;
171                                 }
172                                 if (s[j] == ';') {
173                                         int codepoint = s.substring (i + 2, j - i - 2).to_int ();
174                                         char[] buf = new char[6];
175                                         ((unichar) codepoint).to_utf8 ((string) buf);
176                                         result += (string) buf;
177                                         i = j;
178                                         continue;
179                                 }
180                         }
181                         if (s.offset (i).has_prefix ("&amp;")) {
182                                 result += "&";
183                                 i += 4;
184                                 continue;
185                         }
186                         if (s.offset (i).has_prefix ("&quot;")) {
187                                 result += "\"";
188                                 i += 5;
189                                 continue;
190                         }
191                         result += s.substring (i, 1);
192                 }
193
194                 return result;
195         }
196
197         public void parse_movie () throws Error {
198                 expect_tag ("div"); // class=movie
199                 expect_tag ("div"); // class=name
200                 expect_tag ("a"); // href="/movies?near=city&amp;mid=..."
201                 expect_tag ("span"); // dir=ltr
202                 var title = unescape_unicode (convert (parse_text (), -1, "utf-8", "iso-8859-1")); // FIXME
203                 expect_tag ("/span");
204                 expect_tag ("/a");
205                 expect_tag ("/div");
206                 expect_tag ("span"); // class=info
207                 string[] runtime_and_fsk = {};
208                 double rating = 0.0;
209                 var tag = parse_tag ();
210                 if (tag == "a") {
211                         // Trailer
212                         expect_tag ("/a");
213                         tag = parse_tag ();
214                 }
215                 if (tag == "a") {
216                         // IMDb
217                         expect_tag ("/a");
218                         tag = parse_tag ();
219                 }
220                 if (tag == "nobr") {
221                         expect_tag ("nobr");
222                         string rating_string = parse_tag_attribute ("img", "alt").offset (6); // "Rated " ->"0.0 out of 5.0"
223                         rating = rating_string.to_double ();
224                         expect_tag ("img");
225                         expect_tag ("img");
226                         expect_tag ("img");
227                         expect_tag ("img");
228                         expect_tag ("/nobr");
229                         expect_tag ("/nobr");
230                         runtime_and_fsk = parse_text ().replace ("&#8206;", "").offset (3).split (" - ");
231                         if (parse_tag () == "a") {
232                                 // Trailer
233                                 expect_tag ("/a");
234                                 if (parse_tag () == "a") {
235                                         // IMDb link
236                                         expect_tag ("/a");
237                                         expect_tag ("/span");
238                                 }
239                         }
240                 }
241                 expect_tag ("div"); // class=times
242                 var showtimes = parse_text ().replace ("&nbsp;", ",");
243                 while (parse_tag () == "a") {
244                         showtimes += parse_text () + ",";
245                         expect_tag ("/a");
246                 }
247
248                 if (pattern == null) {
249                         if (!title.has_prefix (_title))
250                                 return;
251                 } else {
252                         if (!pattern.match ((uint) title.length, title, null))
253                                 return;
254                 }
255
256                 var movie = new GoogleMovie ();
257
258                 movie.title = strip_tags (title).replace ("\"", "\\\"");
259                 movie.rating = (int) (rating * 10);
260
261                 movie.cinema = last_cinema;
262                 if (runtime_and_fsk.length >= 2) {
263                         movie.runtime = runtime_and_fsk[0];
264                         movie.fsk = runtime_and_fsk[1];
265                 }
266                 movie.showtimes = showtimes;
267
268                 // TODO - could be configurable by settings
269                 if (movie.runtime != null)
270                         movie.secondary = "%s - %s - %s".printf (movie.runtime, last_cinema.name, showtimes);
271                 else
272                         movie.secondary = "%s - %s".printf (last_cinema.name, showtimes);
273
274                 _get_callback (movie);
275         }
276
277         // FIXME - this is specific for Germany
278         private string strip_tags (string title) {
279                 string tag_suffix = " (OmU)"; // original audio with subtitles
280                 if (title.has_suffix (tag_suffix))
281                         return title.substring (0, title.length - tag_suffix.length);
282                 tag_suffix = " (OV)"; // original audio
283                 if (title.has_suffix (tag_suffix))
284                         return title.substring (0, title.length - tag_suffix.length);
285                 return title.dup ();
286         }
287
288         public void parse_cinema () throws Error {
289                 expect_tag ("div"); // class=theater
290                 expect_tag ("div"); // class=desc id=theater_...
291                 expect_tag ("h2"); // class=name
292                 expect_tag ("a"); // href="/movies?near=city&amp;tid=..."
293                 expect_tag ("span"); // dir=ltr
294                 var name = unescape_unicode (convert (parse_text (), -1, "utf-8", "iso-8859-1")); // FIXME
295                 expect_tag ("/span");
296                 expect_tag ("/a");
297                 expect_tag ("/h2");
298                 expect_tag ("div"); // class=info
299                 var address_and_phone = parse_text ().replace ("&nbsp;", " ").split (" - ");
300                 string address = null;
301                 string phone = null;
302                 if (address_and_phone.length >= 2) {
303                         address = address_and_phone[0];
304                         phone = address_and_phone[1].replace (" ", "").replace ("-", "");
305                 }
306                 expect_tag ("a"); // target=_top
307                 expect_tag ("/a");
308                 expect_tag ("/div");
309                 expect_tag ("/div");
310
311                 last_cinema = new Cinema (name);
312                 last_cinema.address = address;
313                 last_cinema.phone = phone;
314         }
315
316         public int parse (ref char[] buf) throws Error {
317                 int movies = 0;
318
319                 current = buf;
320                 next_tag ();
321                 while (location == null && current[0] != 0) {
322                         int i = 1;
323                         while (current[i++] != '>');
324                         if (((string) current).has_prefix ("<a href=\"/movies?near=")) {
325                                 string href = parse_tag_attribute ("a", "href");
326                                 char* p = (char*) href.offset (13); // skip "/movies?near="
327                                 int j = -1;
328
329                                 while (p[++j] != '&' && p[j] != 0);
330                                 p[0] = p[0].toupper ();
331                                 location = ((string) p).ndup (j);
332                         }
333                         current += i;
334                         next_tag ();
335                 }
336                 while (current[0] != 0) {
337                         int i = 1;
338                         while (current[i++] != '>');
339                         if (((string) current).has_prefix ("<div class=movie>")) {
340                                 parse_movie ();
341                                 movies++;
342                         } else if (((string) current).has_prefix("<div class=theater>")) {
343                                 parse_cinema ();
344                         } else {
345                                 current += i;
346                         }
347                         next_tag ();
348                 }
349
350                 return movies;
351         }
352
353         public async int query (string title, string? location, ReceiveMovie callback, Cancellable? cancellable = null) {
354                 _get_callback = callback;
355                 _title = title;
356                 if (title.chr(title.length, '*') != null) {
357                         pattern = new PatternSpec (title);
358                 } else {
359                         pattern = null;
360                 }
361                 try {
362                         // TODO - use google.de in Germany, also provides genres
363                         string uri = "http://google.com/movies";
364                         if (location != null && location != "")
365                                 uri += "?near=" + location;
366
367                         stdout.printf ("GET: %s\n", uri);
368
369                         File file = File.new_for_uri (uri);
370                         InputStream stream = yield file.read_async (Priority.DEFAULT_IDLE, null);
371
372                         char[] buf = new char[256*1024];
373                         size_t nread;
374                         size_t total = 0;
375                         while (total < 256*1024) {
376                                 nread = yield stream.read_async ((char *)buf + total, 256*1024 - total, Priority.DEFAULT_IDLE, cancellable);
377                                 total += nread;
378                                 if (cancellable.is_cancelled ())
379                                         return 0;
380                                 if (nread == 0)
381                                         break;
382                         }
383                         buf[total] = 0;
384                         return parse (ref buf);
385                 } catch (Error e) {
386                         stderr.printf ("Error: %s\n", e.message);
387                 }
388
389                 return 0;
390         }
391 }