Google parser: fix for movies with showtime, trailer and IMDb links
[cinaest] / src / plugins / google-parser.vala
1 /* This file is part of Cinaest.
2  *
3  * Copyright (C) 2009 Philipp Zabel
4  *
5  * Cinaest is free software: you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation, either version 3 of the License, or
8  * (at your option) any later version.
9  *
10  * Cinaest is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with Cinaest. If not, see <http://www.gnu.org/licenses/>.
17  */
18
/**
 * Errors raised by GoogleParser while walking the HTML tag stream.
 */
errordomain ParserError {
        /** A tag other than the one the parser expected was found. */
        WRONG_TAG,
        /** The input ended while a tag name was being read. */
        EOF
}
23
/**
 * A movie theater as listed on the Google movies page.
 *
 * Only the name is mandatory. parse_cinema () assigns null to address and
 * phone when the theater's info line cannot be split into both parts, so
 * both fields are declared nullable.
 */
public class Cinema {
        /** Display name of the theater. */
        public string name;
        /** Street address, or null when it could not be extracted. */
        public string? address;
        /** Phone number, or null when it could not be extracted. */
        public string? phone;

        /**
         * Creates a theater that has a name but no address or phone yet.
         *
         * @param _name display name of the theater
         */
        public Cinema (string _name) {
                name = _name;
        }
}
33
/**
 * A movie entry scraped from the Google movies page.
 *
 * parse_movie () only fills in runtime and fsk when the info text splits
 * into at least two parts, and tests runtime against null afterwards, so
 * both are declared nullable.
 */
public class GoogleMovie : Movie {
        /** Theater the parser had most recently read when this movie was parsed. */
        public Cinema? cinema;
        /** Runtime text as shown on the page, or null when not available. */
        public string? runtime;
        /** Second part of the info text (stored as "fsk"), or null when not available. */
        public string? fsk;
        /** Comma separated show times. */
        public string showtimes;
}
40
41 public class GoogleParser : Object {
42         private MovieSource.ReceiveMovieFunction _get_callback;
43         char *current;
44         Cinema last_cinema;
45         public string location;
46         MovieFilter _filter;
47         PatternSpec pattern;
48
49         public int next_tag_offset () {
50                 int i = -1;
51                 while (current[++i] != '<' && current[i] != 0);
52                 return i;
53         }
54
55         public void next_tag () {
56                 if (current[0] == 0)
57                         return;
58                 current += next_tag_offset ();
59         }
60
61         public void finish_tag () {
62                 while (current[0] != '>' && current[0] != 0)
63                         current++;
64                 if (current[0] == '>')
65                         current++;
66         }
67
68         public weak string parse_tag (bool finish = true) throws Error {
69                 weak string tag;
70                 next_tag ();
71                 int i = 1;
72                 while (current[++i].isalnum ());
73                 if (current[i] == 0)
74                         throw new ParserError.EOF ("EOF in tag");
75                 if (current[i] == '>')
76                         finish = false;
77                 current[i] = 0;
78                 tag = (string) (current + 1);
79                 current += i + 1;
80                 if (finish)
81                         finish_tag ();
82                 return tag;
83         }
84
85         public void expect_tag (string tag) throws Error {
86                 var found = parse_tag (true);
87                 if (tag != found) {
88                         throw new ParserError.WRONG_TAG ("Wrong tag \"%s\", expected \"%s\"",
89                                                          found, tag);
90                 }
91         }
92
93         public string parse_text () {
94                 string text = ((string) current).ndup (next_tag_offset ());
95                 next_tag ();
96                 return text;
97         }
98
99         public void parse_attribute (string _attr, out string value) {
100                 string attr;
101                 if (current[0] == 0)
102                         return;
103                 int i = -1;
104                 while (current[++i] != '=' && current[i] != '>' && current[i] != 0) {
105                         
106                 }
107                 attr = ((string) current).ndup (i);
108                 current += i;
109                 if (current[0] == 0)
110                         return;
111                 current++;
112                 i = -1;
113                 while (!current[++i].isspace () && current[i] != '>' && current[i] != 0) {
114                         if (current[i] == '"')
115                                 while (current[++i] != '"' && current[i] != 0);
116                 }
117                 if (attr == _attr) {
118                         if (current[0] == '"')
119                                 value = ((string) current).substring (1, i - 2);
120                         else
121                                 value = ((string) current).ndup (i);
122                 }
123                 current += i;
124         }
125
126         public void skip_whitespace () {
127                 if (current[0] == 0)
128                         return;
129                 int i = -1;
130                 while (current[++i].isspace () && current[i] != 0);
131                 current += i;
132         }
133
134         public string? parse_tag_attribute (string tag, string attribute) throws Error {
135                 var found = parse_tag (false);
136                 if (tag != found) {
137                         throw new ParserError.WRONG_TAG ("Wrong tag \"%s\", expected \"%s\"",
138                                                          found, tag);
139                 }
140
141                 string? value = null;
142                 skip_whitespace ();
143                 while (current[0] != '>' && current[0] != 0) {
144                         parse_attribute (attribute, out value);
145                         skip_whitespace ();
146                 }
147                 // Skip the closing '>' bracket
148                 if (current[0] != 0)
149                         current++;
150
151                 return value;
152         }
153
154         public string unescape_unicode (string s) {
155                 string result = "";
156                 int i, j;
157                 long l = s.length;
158
159                 for (i = 0; i < l; i++) {
160                         if (s[i] == '&' && s[i + 1] == '#') {
161                                 for (j = i + 2; j < l; j++) {
162                                         if (!s[j].isdigit ())
163                                                 break;
164                                         if (s[j] == ';')
165                                                 break;
166                                 }
167                                 if (s[j] == ';') {
168                                         int codepoint = s.substring (i + 2, j - i - 2).to_int ();
169                                         char[] buf = new char[6];
170                                         ((unichar) codepoint).to_utf8 ((string) buf);
171                                         result += (string) buf;
172                                         i = j;
173                                         continue;
174                                 }
175                         }
176                         if (s.offset (i).has_prefix ("&amp;")) {
177                                 result += "&";
178                                 i += 4;
179                                 continue;
180                         }
181                         if (s.offset (i).has_prefix ("&quot;")) {
182                                 result += "\"";
183                                 i += 5;
184                                 continue;
185                         }
186                         result += s.substring (i, 1);
187                 }
188
189                 return result;
190         }
191
192         public void parse_movie () throws Error {
193                 expect_tag ("div"); // class=movie
194                 expect_tag ("div"); // class=name
195                 expect_tag ("a"); // href="/movies?near=city&amp;mid=..."
196                 expect_tag ("span"); // dir=ltr
197                 var title = unescape_unicode (convert (parse_text (), -1, "utf-8", "iso-8859-1")); // FIXME
198                 expect_tag ("/span");
199                 expect_tag ("/a");
200                 expect_tag ("/div");
201                 expect_tag ("span"); // class=info
202                 string[] runtime_and_fsk = {};
203                 double rating = 0.0;
204                 var tag = parse_tag ();
205                 if (tag == "a") {
206                         // Trailer
207                         expect_tag ("/a");
208                         tag = parse_tag ();
209                 }
210                 if (tag == "a") {
211                         // IMDb
212                         expect_tag ("/a");
213                         tag = parse_tag ();
214                 }
215                 if (tag == "nobr") {
216                         expect_tag ("nobr");
217                         string rating_string = parse_tag_attribute ("img", "alt").offset (6); // "Rated " ->"0.0 out of 5.0"
218                         rating = rating_string.to_double ();
219                         expect_tag ("img");
220                         expect_tag ("img");
221                         expect_tag ("img");
222                         expect_tag ("img");
223                         expect_tag ("/nobr");
224                         expect_tag ("/nobr");
225                         runtime_and_fsk = parse_text ().replace ("&#8206;", "").offset (3).split (" - ");
226                         if (parse_tag () == "a") {
227                                 // Trailer
228                                 expect_tag ("/a");
229                                 if (parse_tag () == "a") {
230                                         // IMDb link
231                                         expect_tag ("/a");
232                                         expect_tag ("/span");
233                                 }
234                         }
235                 }
236                 expect_tag ("div"); // class=times
237                 var showtimes = parse_text ().replace ("&nbsp;", ",");
238                 while (parse_tag () == "a") {
239                         showtimes += parse_text () + ",";
240                         expect_tag ("/a");
241                 }
242
243                 if (pattern == null) {
244                         if (!title.has_prefix (_filter.title))
245                                 return;
246                 } else {
247                         if (!pattern.match ((uint) title.length, title, null))
248                                 return;
249                 }
250
251                 var movie = new GoogleMovie ();
252
253                 movie.title = strip_tags (title);
254                 movie.year = 0;
255                 movie.rating = (int) (rating * 10);
256
257                 movie.cinema = last_cinema;
258                 if (runtime_and_fsk.length >= 2) {
259                         movie.runtime = runtime_and_fsk[0];
260                         movie.fsk = runtime_and_fsk[1];
261                 }
262                 movie.showtimes = showtimes;
263
264                 // TODO - could be configurable by settings
265                 if (movie.runtime != null)
266                         movie.secondary = "%s - %s - %s".printf (movie.runtime, last_cinema.name, showtimes);
267                 else
268                         movie.secondary = "%s - %s".printf (last_cinema.name, showtimes);
269
270                 _get_callback (movie);
271         }
272
273         // FIXME - this is specific for Germany
274         private string strip_tags (string title) {
275                 string tag_suffix = " (OmU)"; // original audio with subtitles
276                 if (title.has_suffix (tag_suffix))
277                         return title.substring (0, title.length - tag_suffix.length);
278                 tag_suffix = " (OV)"; // original audio
279                 if (title.has_suffix (tag_suffix))
280                         return title.substring (0, title.length - tag_suffix.length);
281                 return title.dup ();
282         }
283
284         public void parse_cinema () throws Error {
285                 expect_tag ("div"); // class=theater
286                 expect_tag ("div"); // class=desc id=theater_...
287                 expect_tag ("h2"); // class=name
288                 expect_tag ("a"); // href="/movies?near=city&amp;tid=..."
289                 expect_tag ("span"); // dir=ltr
290                 var name = unescape_unicode (convert (parse_text (), -1, "utf-8", "iso-8859-1")); // FIXME
291                 expect_tag ("/span");
292                 expect_tag ("/a");
293                 expect_tag ("/h2");
294                 expect_tag ("div"); // class=info
295                 var address_and_phone = parse_text ().replace ("&nbsp;", " ").split (" - ");
296                 string address = null;
297                 string phone = null;
298                 if (address_and_phone.length >= 2) {
299                         address = address_and_phone[0];
300                         phone = address_and_phone[1].replace (" ", "").replace ("-", "");
301                 }
302                 expect_tag ("a"); // target=_top
303                 expect_tag ("/a");
304                 expect_tag ("/div");
305                 expect_tag ("/div");
306
307                 last_cinema = new Cinema (name);
308                 last_cinema.address = address;
309                 last_cinema.phone = phone;
310         }
311
312         public int parse (ref char[] buf) throws Error {
313                 int movies = 0;
314
315                 current = buf;
316                 next_tag ();
317                 while (location == null && current[0] != 0) {
318                         int i = 1;
319                         while (current[i++] != '>');
320                         if (((string) current).has_prefix ("<a href=\"/movies?near=")) {
321                                 string href = parse_tag_attribute ("a", "href");
322                                 char* p = (char*) href.offset (13); // skip "/movies?near="
323                                 int j = -1;
324
325                                 while (p[++j] != '&' && p[j] != 0);
326                                 p[0] = p[0].toupper ();
327                                 location = ((string) p).ndup (j);
328                         }
329                         current += i;
330                         next_tag ();
331                 }
332                 while (current[0] != 0) {
333                         int i = 1;
334                         while (current[i++] != '>');
335                         if (((string) current).has_prefix ("<div class=movie>")) {
336                                 parse_movie ();
337                                 movies++;
338                         } else if (((string) current).has_prefix("<div class=theater>")) {
339                                 parse_cinema ();
340                         } else {
341                                 current += i;
342                         }
343                         next_tag ();
344                 }
345
346                 return movies;
347         }
348
349         public async int query (MovieFilter filter, string? location, MovieSource.ReceiveMovieFunction callback, Cancellable? cancellable) {
350                 _get_callback = callback;
351                 _filter = filter;
352                 if (filter.title.chr(filter.title.length, '*') != null) {
353                         pattern = new PatternSpec (filter.title);
354                 } else {
355                         pattern = null;
356                 }
357                 try {
358                         // TODO - use google.de in Germany, also provides genres
359                         string uri = "http://google.com/movies";
360                         if (location != null && location != "")
361                                 uri += "?near=" + location;
362
363                         stdout.printf ("GET: %s\n", uri);
364
365                         File file = File.new_for_uri (uri);
366                         InputStream stream = yield file.read_async (Priority.DEFAULT_IDLE, null);
367
368                         char[] buf = new char[256*1024];
369                         size_t nread;
370                         size_t total = 0;
371                         while (total < 256*1024) {
372                                 nread = yield stream.read_async ((char *)buf + total, 256*1024 - total, Priority.DEFAULT_IDLE, cancellable);
373                                 total += nread;
374                                 if (cancellable.is_cancelled ())
375                                         return 0;
376                                 if (nread == 0)
377                                         break;
378                         }
379                         buf[total] = 0;
380                         return parse (ref buf);
381                 } catch (Error e) {
382                         stderr.printf ("Error: %s\n", e.message);
383                 }
384
385                 return 0;
386         }
387 }