e18a00ba28592bea2fb62355f6a8eae701a94328
[cinaest] / src / backends / google / google-parser.vala
1 /* This file is part of Cinaest.
2  *
3  * Copyright (C) 2009 Philipp Zabel
4  *
5  * Cinaest is free software: you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation, either version 3 of the License, or
8  * (at your option) any later version.
9  *
10  * Cinaest is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with Cinaest. If not, see <http://www.gnu.org/licenses/>.
17  */
18
/**
 * Errors raised by GoogleParser while walking the HTML tag stream.
 */
errordomain ParserError {
        // A tag other than the one the parser asked for was found.
        WRONG_TAG,
        // The input ended while a tag name was being read.
        EOF
}
23
/**
 * A cinema (theater) entry taken from the showtimes page.
 */
public class Cinema {
        // Display name, always set by the constructor.
        public string name;
        // Street address; stays null until the parser could extract one.
        public string? address;
        // Phone number; stays null until the parser could extract one.
        public string? phone;

        /**
         * Creates a cinema that only has a name so far.
         *
         * @param _name display name of the cinema
         */
        public Cinema (string _name) {
                name = _name;
        }
}
33
/**
 * One movie listing (a movie shown at one cinema) as produced by
 * GoogleParser.parse_movie ().
 */
public class GoogleMovie {
        // Movie title.
        public string title;
        // Rating scaled by ten (the parser stores "rating * 10" as an integer).
        public int rating;
        // Cinema showing the movie; null when no cinema was parsed before the movie.
        public Cinema? cinema;
        // Running time in seconds, 0 when unknown.
        public int runtime;
        // Age rating text; null when the info line did not contain one.
        public string? fsk;
        // Comma separated show times.
        public string showtimes;
}
42
43 public class GoogleParser : Object {
44         char *current;
45         Cinema last_cinema;
46         public string location;
47         string _title;
48         PatternSpec pattern;
49
50         public delegate void ReceiveMovie (GoogleMovie movie);
51         public ReceiveMovie _get_callback;
52
53         public int next_tag_offset () {
54                 int i = -1;
55                 while (current[++i] != '<' && current[i] != 0);
56                 return i;
57         }
58
59         public void next_tag () {
60                 if (current[0] == 0)
61                         return;
62                 current += next_tag_offset ();
63         }
64
65         public void finish_tag () {
66                 while (current[0] != '>' && current[0] != 0)
67                         current++;
68                 if (current[0] == '>')
69                         current++;
70         }
71
72         public unowned string parse_tag (bool finish = true) throws Error {
73                 unowned string tag;
74                 next_tag ();
75                 int i = 1;
76                 while (current[++i].isalnum ());
77                 if (current[i] == 0)
78                         throw new ParserError.EOF ("EOF in tag");
79                 if (current[i] == '>')
80                         finish = false;
81                 current[i] = 0;
82                 tag = (string) (current + 1);
83                 current += i + 1;
84                 if (finish)
85                         finish_tag ();
86                 return tag;
87         }
88
89         public void expect_tag (string tag) throws Error {
90                 var found = parse_tag (true);
91                 if (tag != found) {
92                         throw new ParserError.WRONG_TAG ("Wrong tag \"%s\", expected \"%s\"",
93                                                          found, tag);
94                 }
95         }
96
97         public string parse_text () {
98                 string text = ((string) current).ndup (next_tag_offset ());
99                 next_tag ();
100                 return text;
101         }
102
103         public void parse_attribute (string _attr, out string value) {
104                 string attr;
105                 if (current[0] == 0)
106                         return;
107                 int i = -1;
108                 while (current[++i] != '=' && current[i] != '>' && current[i] != 0) {
109
110                 }
111                 attr = ((string) current).ndup (i);
112                 current += i;
113                 if (current[0] == 0)
114                         return;
115                 current++;
116                 i = -1;
117                 while (!current[++i].isspace () && current[i] != '>' && current[i] != 0) {
118                         if (current[i] == '"')
119                                 while (current[++i] != '"' && current[i] != 0);
120                 }
121                 if (attr == _attr) {
122                         if (current[0] == '"')
123                                 value = ((string) current).substring (1, i - 2);
124                         else
125                                 value = ((string) current).ndup (i);
126                 }
127                 current += i;
128         }
129
130         public void skip_whitespace () {
131                 if (current[0] == 0)
132                         return;
133                 int i = -1;
134                 while (current[++i].isspace () && current[i] != 0);
135                 current += i;
136         }
137
138         public string? parse_tag_attribute (string tag, string attribute) throws Error {
139                 var found = parse_tag (false);
140                 if (tag != found) {
141                         throw new ParserError.WRONG_TAG ("Wrong tag \"%s\", expected \"%s\"",
142                                                          found, tag);
143                 }
144
145                 string? value = null;
146                 skip_whitespace ();
147                 while (current[0] != '>' && current[0] != 0) {
148                         parse_attribute (attribute, out value);
149                         skip_whitespace ();
150                 }
151                 // Skip the closing '>' bracket
152                 if (current[0] != 0)
153                         current++;
154
155                 return value;
156         }
157
158         public string unescape_unicode (string s) {
159                 string result = "";
160                 int i, j;
161                 long l = s.length;
162
163                 for (i = 0; i < l; i++) {
164                         if (s[i] == '&' && s[i + 1] == '#') {
165                                 for (j = i + 2; j < l; j++) {
166                                         if (!s[j].isdigit ())
167                                                 break;
168                                         if (s[j] == ';')
169                                                 break;
170                                 }
171                                 if (s[j] == ';') {
172                                         int codepoint = s.substring (i + 2, j - i - 2).to_int ();
173                                         char[] buf = new char[6];
174                                         ((unichar) codepoint).to_utf8 ((string) buf);
175                                         result += (string) buf;
176                                         i = j;
177                                         continue;
178                                 }
179                         }
180                         if (s.offset (i).has_prefix ("&amp;")) {
181                                 result += "&";
182                                 i += 4;
183                                 continue;
184                         }
185                         if (s.offset (i).has_prefix ("&quot;")) {
186                                 result += "\"";
187                                 i += 5;
188                                 continue;
189                         }
190                         result += s.substring (i, 1);
191                 }
192
193                 return result;
194         }
195
196         public void parse_movie () throws Error {
197                 expect_tag ("div"); // class=movie
198                 expect_tag ("div"); // class=name
199                 expect_tag ("a"); // href="/movies?near=city&amp;mid=..."
200                 expect_tag ("span"); // dir=ltr
201                 var title = unescape_unicode (convert (parse_text (), -1, "utf-8", "iso-8859-1")); // FIXME
202                 expect_tag ("/span");
203                 expect_tag ("/a");
204                 expect_tag ("/div");
205                 expect_tag ("span"); // class=info
206                 string info_text = parse_text ().replace ("&#8206;", "");
207                 string[] runtime_and_fsk = {};
208                 double rating = 0.0;
209                 var tag = parse_tag ();
210                 if (tag == "a") {
211                         // Trailer
212                         expect_tag ("/a");
213                         tag = parse_tag ();
214                 }
215                 if (tag == "a") {
216                         // IMDb
217                         expect_tag ("/a");
218                         tag = parse_tag ();
219                 }
220                 if (tag == "nobr") {
221                         expect_tag ("nobr");
222                         string rating_string = parse_tag_attribute ("img", "alt").offset (6); // "Rated " ->"0.0 out of 5.0"
223                         rating = rating_string.to_double ();
224                         expect_tag ("img");
225                         expect_tag ("img");
226                         expect_tag ("img");
227                         expect_tag ("img");
228                         expect_tag ("/nobr");
229                         expect_tag ("/nobr");
230                         info_text = parse_text ().replace ("&#8206;", "").offset (3);
231                         if (parse_tag () == "a") {
232                                 // Trailer
233                                 expect_tag ("/a");
234                                 if (parse_tag () == "a") {
235                                         // IMDb link
236                                         expect_tag ("/a");
237                                         expect_tag ("/span");
238                                 }
239                         }
240                 }
241                 runtime_and_fsk = info_text.split (" - ");
242                 expect_tag ("div"); // class=times
243                 var showtimes = parse_text ().replace ("&nbsp;", ",");
244                 while (parse_tag () == "a") {
245                         showtimes += parse_text () + ",";
246                         expect_tag ("/a");
247                 }
248
249                 if (pattern == null) {
250                         if (!title.has_prefix (_title))
251                                 return;
252                 } else {
253                         if (!pattern.match ((uint) title.length, title, null))
254                                 return;
255                 }
256
257                 var movie = new GoogleMovie ();
258
259                 movie.title = strip_tags (title).replace ("\"", "\\\"");
260                 movie.rating = (int) (rating * 10);
261
262                 movie.cinema = last_cinema;
263                 movie.runtime = 0;
264                 if (runtime_and_fsk.length >= 2) {
265                         unowned string runtime = runtime_and_fsk[0];
266                         movie.runtime = 3600 * runtime.to_int ();
267                         runtime = runtime.str ("hr ");
268                         if (runtime != null)
269                                 movie.runtime += 60 * runtime.offset (3).to_int ();
270                         movie.fsk = runtime_and_fsk[1];
271                 }
272                 movie.showtimes = showtimes;
273                 _get_callback (movie);
274         }
275
276         // FIXME - this is specific for Germany
277         private string strip_tags (string title) {
278                 string tag_suffix = " (OmU)"; // original audio with subtitles
279                 if (title.has_suffix (tag_suffix))
280                         return title.substring (0, title.length - tag_suffix.length);
281                 tag_suffix = " (OV)"; // original audio
282                 if (title.has_suffix (tag_suffix))
283                         return title.substring (0, title.length - tag_suffix.length);
284                 return title.dup ();
285         }
286
287         public void parse_cinema () throws Error {
288                 expect_tag ("div"); // class=theater
289                 expect_tag ("div"); // class=desc id=theater_...
290                 expect_tag ("h2"); // class=name
291                 expect_tag ("a"); // href="/movies?near=city&amp;tid=..."
292                 expect_tag ("span"); // dir=ltr
293                 var name = unescape_unicode (convert (parse_text (), -1, "utf-8", "iso-8859-1")); // FIXME
294                 expect_tag ("/span");
295                 expect_tag ("/a");
296                 expect_tag ("/h2");
297                 expect_tag ("div"); // class=info
298                 var address_and_phone = parse_text ().replace ("&nbsp;", " ").split (" - ");
299                 string address = null;
300                 string phone = null;
301                 if (address_and_phone.length >= 2) {
302                         address = address_and_phone[0];
303                         phone = address_and_phone[1].replace (" ", "").replace ("-", "");
304                 }
305                 expect_tag ("a"); // target=_top
306                 expect_tag ("/a");
307                 expect_tag ("/div");
308                 expect_tag ("/div");
309
310                 last_cinema = new Cinema (name);
311                 last_cinema.address = address;
312                 last_cinema.phone = phone;
313         }
314
315         public int parse (ref char[] buf) throws Error {
316                 int movies = 0;
317
318                 current = buf;
319                 next_tag ();
320                 while (location == null && current[0] != 0) {
321                         int i = 1;
322                         while (current[i++] != '>');
323                         if (((string) current).has_prefix ("<a href=\"/movies?near=")) {
324                                 string href = parse_tag_attribute ("a", "href");
325                                 char* p = (char*) href.offset (13); // skip "/movies?near="
326                                 int j = -1;
327
328                                 while (p[++j] != '&' && p[j] != 0);
329                                 p[0] = p[0].toupper ();
330                                 location = ((string) p).ndup (j);
331                         }
332                         current += i;
333                         next_tag ();
334                 }
335                 while (current[0] != 0) {
336                         int i = 1;
337                         while (current[i++] != '>');
338                         if (((string) current).has_prefix ("<div class=movie>")) {
339                                 parse_movie ();
340                                 movies++;
341                         } else if (((string) current).has_prefix("<div class=theater>")) {
342                                 parse_cinema ();
343                         } else {
344                                 current += i;
345                         }
346                         next_tag ();
347                 }
348
349                 return movies;
350         }
351
352         public async int query (string title, string? location, ReceiveMovie callback, Cancellable? cancellable = null) {
353                 _get_callback = callback;
354                 _title = title;
355                 if (title.chr(title.length, '*') != null) {
356                         pattern = new PatternSpec (title);
357                 } else {
358                         pattern = null;
359                 }
360                 try {
361                         // TODO - use google.de in Germany, also provides genres
362                         string uri = "http://google.com/movies";
363                         if (location != null && location != "")
364                                 uri += "?near=" + location;
365
366                         stdout.printf ("GET: %s\n", uri);
367
368                         File file = File.new_for_uri (uri);
369                         InputStream stream = yield file.read_async (Priority.DEFAULT_IDLE, null);
370
371                         char[] buf = new char[256*1024];
372                         size_t nread;
373                         size_t total = 0;
374                         while (total < 256*1024) {
375                                 nread = yield stream.read_async ((char *)buf + total, 256*1024 - total, Priority.DEFAULT_IDLE, cancellable);
376                                 total += nread;
377                                 if (cancellable.is_cancelled ())
378                                         return 0;
379                                 if (nread == 0)
380                                         break;
381                         }
382                         buf[total] = 0;
383                         return parse (ref buf);
384                 } catch (Error e) {
385                         stderr.printf ("Error: %s\n", e.message);
386                 }
387
388                 return 0;
389         }
390 }