Google plugin: parse &quot; to quotation marks in movie titles
[cinaest] / src / plugins / google-parser.vala
1 /* This file is part of Cinaest.
2  *
3  * Copyright (C) 2009 Philipp Zabel
4  *
5  * Cinaest is free software: you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation, either version 3 of the License, or
8  * (at your option) any later version.
9  *
10  * Cinaest is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with Cinaest. If not, see <http://www.gnu.org/licenses/>.
17  */
18
/**
 * Errors raised while walking the HTML markup.
 */
errordomain ParserError {
        /* A tag other than the expected one was encountered. */
        WRONG_TAG,
        /* The buffer ended in the middle of a tag. */
        EOF
}
23
/**
 * Plain data holder describing one cinema.
 */
public class Cinema {
        // Display name, set by the constructor.
        public string name;
        // Street address; not touched by the constructor.
        public string address;
        // Phone number; not touched by the constructor.
        public string phone;

        /**
         * Creates a cinema with only its name filled in.
         *
         * @param _name display name of the cinema
         */
        public Cinema (string _name) {
                this.name = _name;
        }
}
33
/**
 * A movie showing at a particular cinema, extending Movie with the
 * showtime details that GoogleParser.parse_movie () fills in.
 */
public class GoogleMovie : Movie {
        // Cinema this showing belongs to.
        public Cinema cinema;
        // Runtime text exactly as it appeared in the page.
        public string runtime;
        // Second field of the info line; presumably the German FSK age rating - TODO confirm.
        public string fsk;
        // Showtimes text with "&nbsp;" separators replaced by commas.
        public string showtimes;
}
40
41 public class GoogleParser : Object {
42         private MovieSource.ReceiveMovieFunction _get_callback;
43         char *current;
44         Cinema last_cinema;
45         public string location;
46         MovieFilter _filter;
47         PatternSpec pattern;
48
49         public int next_tag_offset () {
50                 int i = -1;
51                 while (current[++i] != '<' && current[i] != 0);
52                 return i;
53         }
54
55         public void next_tag () {
56                 if (current[0] == 0)
57                         return;
58                 current += next_tag_offset ();
59         }
60
61         public void finish_tag () {
62                 while (current[0] != '>' && current[0] != 0)
63                         current++;
64                 if (current[0] == '>')
65                         current++;
66         }
67
68         public weak string parse_tag (bool finish = true) throws Error {
69                 weak string tag;
70                 next_tag ();
71                 int i = 1;
72                 while (current[++i].isalnum ());
73                 if (current[i] == 0)
74                         throw new ParserError.EOF ("EOF in tag");
75                 if (current[i] == '>')
76                         finish = false;
77                 current[i] = 0;
78                 tag = (string) (current + 1);
79                 current += i + 1;
80                 if (finish)
81                         finish_tag ();
82                 return tag;
83         }
84
85         public void expect_tag (string tag) throws Error {
86                 var found = parse_tag (true);
87                 if (tag != found) {
88                         throw new ParserError.WRONG_TAG ("Wrong tag \"%s\", expected \"%s\"",
89                                                          found, tag);
90                 }
91         }
92
93         public string parse_text () {
94                 string text = ((string) current).ndup (next_tag_offset ());
95                 next_tag ();
96                 return text;
97         }
98
99         public void parse_attribute (string _attr, out string value) {
100                 string attr;
101                 if (current[0] == 0)
102                         return;
103                 int i = -1;
104                 while (current[++i] != '=' && current[i] != '>' && current[i] != 0) {
105                         
106                 }
107                 attr = ((string) current).ndup (i);
108                 current += i;
109                 if (current[0] == 0)
110                         return;
111                 current++;
112                 i = -1;
113                 while (!current[++i].isspace () && current[i] != '>' && current[i] != 0) {
114                         if (current[i] == '"')
115                                 while (current[++i] != '"' && current[i] != 0);
116                 }
117                 if (attr == _attr) {
118                         if (current[0] == '"')
119                                 value = ((string) current).substring (1, i - 2);
120                         else
121                                 value = ((string) current).ndup (i);
122                 }
123                 current += i;
124         }
125
126         public void skip_whitespace () {
127                 if (current[0] == 0)
128                         return;
129                 int i = -1;
130                 while (current[++i].isspace () && current[i] != 0);
131                 current += i;
132         }
133
134         public string? parse_tag_attribute (string tag, string attribute) throws Error {
135                 var found = parse_tag (false);
136                 if (tag != found) {
137                         throw new ParserError.WRONG_TAG ("Wrong tag \"%s\", expected \"%s\"",
138                                                          found, tag);
139                 }
140
141                 string? value = null;
142                 skip_whitespace ();
143                 while (current[0] != '>' && current[0] != 0) {
144                         parse_attribute (attribute, out value);
145                         skip_whitespace ();
146                 }
147                 // Skip the closing '>' bracket
148                 if (current[0] != 0)
149                         current++;
150
151                 return value;
152         }
153
154         public string unescape_unicode (string s) {
155                 string result = "";
156                 int i, j;
157                 long l = s.length;
158
159                 for (i = 0; i < l; i++) {
160                         if (s[i] == '&' && s[i + 1] == '#') {
161                                 for (j = i + 2; j < l; j++) {
162                                         if (!s[j].isdigit ())
163                                                 break;
164                                         if (s[j] == ';')
165                                                 break;
166                                 }
167                                 if (s[j] == ';') {
168                                         int codepoint = s.substring (i + 2, j - i - 2).to_int ();
169                                         char[] buf = new char[6];
170                                         ((unichar) codepoint).to_utf8 ((string) buf);
171                                         result += (string) buf;
172                                         i = j;
173                                         continue;
174                                 }
175                         }
176                         if (s.offset (i).has_prefix ("&amp;")) {
177                                 result += "&";
178                                 i += 4;
179                                 continue;
180                         }
181                         if (s.offset (i).has_prefix ("&quot;")) {
182                                 result += "\"";
183                                 i += 5;
184                                 continue;
185                         }
186                         result += s.substring (i, 1);
187                 }
188
189                 return result;
190         }
191
192         public void parse_movie () throws Error {
193                 expect_tag ("div"); // class=movie
194                 expect_tag ("div"); // class=name
195                 expect_tag ("a"); // href="/movies?near=city&amp;mid=..."
196                 expect_tag ("span"); // dir=ltr
197                 var title = unescape_unicode (convert (parse_text (), -1, "utf-8", "iso-8859-1")); // FIXME
198                 expect_tag ("/span");
199                 expect_tag ("/a");
200                 expect_tag ("/div");
201                 expect_tag ("span"); // class=info
202                 string[] runtime_and_fsk = {};
203                 double rating = 0.0;
204                 if (parse_tag () == "nobr") {
205                         expect_tag ("nobr");
206                         string rating_string = parse_tag_attribute ("img", "alt").offset (6); // "Rated " ->"0.0 out of 5.0"
207                         rating = rating_string.to_double ();
208                         expect_tag ("img");
209                         expect_tag ("img");
210                         expect_tag ("img");
211                         expect_tag ("img");
212                         expect_tag ("/nobr");
213                         expect_tag ("/nobr");
214                         runtime_and_fsk = parse_text ().replace ("&#8206;", "").offset (3).split (" - ");
215                         expect_tag ("/span");
216                 }
217                 expect_tag ("div"); // class=times
218                 var showtimes = parse_text ().replace ("&nbsp;", ",");
219                 expect_tag ("/div");
220                 expect_tag ("/div");
221
222                 if (pattern == null) {
223                         if (!title.has_prefix (_filter.title))
224                                 return;
225                 } else {
226                         if (!pattern.match ((uint) title.length, title, null))
227                                 return;
228                 }
229
230                 var movie = new GoogleMovie ();
231
232                 movie.title = strip_tags (title);
233                 movie.year = 0;
234                 movie.rating = (int) (rating * 10);
235
236                 movie.cinema = last_cinema;
237                 if (runtime_and_fsk.length >= 2) {
238                         movie.runtime = runtime_and_fsk[0];
239                         movie.fsk = runtime_and_fsk[1];
240                 }
241                 movie.showtimes = showtimes;
242
243                 // TODO - could be configurable by settings
244                 if (movie.runtime != null)
245                         movie.secondary = "%s - %s - %s".printf (movie.runtime, last_cinema.name, showtimes);
246                 else
247                         movie.secondary = "%s - %s".printf (last_cinema.name, showtimes);
248
249                 _get_callback (movie);
250         }
251
252         // FIXME - this is specific for Germany
253         private string strip_tags (string title) {
254                 string tag_suffix = " (OmU)"; // original audio with subtitles
255                 if (title.has_suffix (tag_suffix))
256                         return title.substring (0, title.length - tag_suffix.length);
257                 tag_suffix = " (OV)"; // original audio
258                 if (title.has_suffix (tag_suffix))
259                         return title.substring (0, title.length - tag_suffix.length);
260                 return title.dup ();
261         }
262
263         public void parse_cinema () throws Error {
264                 expect_tag ("div"); // class=theater
265                 expect_tag ("div"); // class=desc id=theater_...
266                 expect_tag ("h2"); // class=name
267                 expect_tag ("a"); // href="/movies?near=city&amp;tid=..."
268                 expect_tag ("span"); // dir=ltr
269                 var name = unescape_unicode (convert (parse_text (), -1, "utf-8", "iso-8859-1")); // FIXME
270                 expect_tag ("/span");
271                 expect_tag ("/a");
272                 expect_tag ("/h2");
273                 expect_tag ("div"); // class=info
274                 var address_and_phone = parse_text ().replace ("&nbsp;", " ").split (" - ");
275                 string address = null;
276                 string phone = null;
277                 if (address_and_phone.length >= 2) {
278                         address = address_and_phone[0];
279                         phone = address_and_phone[1].replace (" ", "").replace ("-", "");
280                 }
281                 expect_tag ("a"); // target=_top
282                 expect_tag ("/a");
283                 expect_tag ("/div");
284                 expect_tag ("/div");
285
286                 last_cinema = new Cinema (name);
287                 last_cinema.address = address;
288                 last_cinema.phone = phone;
289         }
290
291         public void parse (ref char[] buf) throws Error {
292                 current = buf;
293                 next_tag ();
294                 while (location == null && current[0] != 0) {
295                         int i = 1;
296                         while (current[i++] != '>');
297                         if (((string) current).has_prefix ("<a href=\"/movies?near=")) {
298                                 string href = parse_tag_attribute ("a", "href");
299                                 char* p = (char*) href.offset (13); // skip "/movies?near="
300                                 int j = -1;
301
302                                 while (p[++j] != '&' && p[j] != 0);
303                                 p[0] = p[0].toupper ();
304                                 location = ((string) p).ndup (j);
305                         }
306                         current += i;
307                         next_tag ();
308                 }
309                 while (current[0] != 0) {
310                         int i = 1;
311                         while (current[i++] != '>');
312                         if (((string) current).has_prefix ("<div class=movie>")) {
313                                 parse_movie ();
314                         } else if (((string) current).has_prefix("<div class=theater>")) {
315                                 parse_cinema ();
316                         } else {
317                                 current += i;
318                         }
319                         next_tag ();
320                 }
321         }
322
323         public async void query (MovieFilter filter, string? location, MovieSource.ReceiveMovieFunction callback, Cancellable? cancellable) {
324                 _get_callback = callback;
325                 _filter = filter;
326                 if (filter.title.chr(filter.title.length, '*') != null) {
327                         pattern = new PatternSpec (filter.title);
328                 } else {
329                         pattern = null;
330                 }
331                 try {
332                         // TODO - use google.de in Germany, also provides genres
333                         string uri = "http://google.com/movies";
334                         if (location != null && location != "")
335                                 uri += "?near=" + location;
336
337                         stdout.printf ("GET: %s\n", uri);
338
339                         File file = File.new_for_uri (uri);
340                         InputStream stream = yield file.read_async (Priority.DEFAULT_IDLE, null);
341
342                         char[] buf = new char[256*1024];
343                         size_t nread;
344                         size_t total = 0;
345                         while (total < 256*1024) {
346                                 nread = yield stream.read_async ((char *)buf + total, 256*1024 - total, Priority.DEFAULT_IDLE, cancellable);
347                                 total += nread;
348                                 if (cancellable.is_cancelled ())
349                                         return;
350                                 if (nread == 0)
351                                         break;
352                         }
353                         buf[total] = 0;
354                         parse (ref buf);
355                 } catch (Error e) {
356                         stderr.printf ("Error: %s\n", e.message);
357                 }
358         }
359 }