IMDb line parser: add parser for actors and actresses lists
[cinaest] / src / imdb / imdb-line-parser.vala
/**
 * Common base class of the IMDb list parsers in this file.
 *
 * It keeps a borrowed (unowned) reference to the database wrapper that the
 * subclasses report their results to, and offers the title filter that
 * several of them share.
 */
abstract class LineParser {
        // Not owned by the parser; the creator has to keep it alive.
        internal unowned IMDbSqlite sqlite;

        /**
         * @param _sqlite database wrapper the parsed data is handed to
         */
        public LineParser (IMDbSqlite _sqlite) {
                sqlite = _sqlite;
        }

        /**
         * Processes one line of input.
         *
         * @param line the line to process
         */
        public abstract void parse_line (string line);

        /**
         * Checks whether a title should be left out.
         *
         * A title is rejected when it ends in "(TV)", "(V)" or "(VG)"
         * (presumably IMDb's markers for TV, video and video game entries --
         * TODO confirm against the list format documentation).
         *
         * @param title the title to examine
         * @return true if the title carries one of the rejected suffixes
         */
        internal bool skip_title (string title) {
                return title.has_suffix ("(TV)")
                        || title.has_suffix ("(V)")
                        || title.has_suffix ("(VG)");
        }
}
23
/**
 * Parser for movie list lines of the form: title, one or more tabs, and a
 * year made of digits only.
 */
class MovieLineParser : LineParser {
        Regex re_movie;

        /**
         * Compiles the line pattern once so every parse_line () call can reuse it.
         *
         * @param _sqlite database wrapper the movies are added to
         */
        public MovieLineParser (IMDbSqlite _sqlite) {
                base (_sqlite);
                try {
                        re_movie = new Regex ("^([^\t]+)\t+([0-9]+)$");
                } catch (RegexError e) {
                        critical ("Failed to initialize regex: %s\n", e.message);
                }
        }

        /**
         * Adds the movie described by the line to the database.
         *
         * Lines that start with a double quote (series episodes), lines that
         * do not match the pattern, titles that cannot be converted from
         * Latin-1 to UTF-8 and titles rejected by skip_title () are ignored.
         *
         * @param line one line of the movie list, Latin-1 encoded
         */
        public override void parse_line (string line) {
                // A leading double quote marks a series episode
                bool is_episode = (line[0] == '"');
                if (is_episode) {
                        return;
                }

                MatchInfo fields;
                if (!re_movie.match (line, 0, out fields)) {
                        return;
                }

                string utf8_title;
                try {
                        utf8_title = convert (fields.fetch (1), -1, "utf-8", "latin1");
                } catch (ConvertError e) {
                        return;
                }

                if (!skip_title (utf8_title)) {
                        int year = fields.fetch (2).to_int ();
                        sqlite.add_movie (utf8_title, year);
                }
        }
}
60
/**
 * Parser for genre list lines of the form: title, one or more tabs, and a
 * genre name consisting of ASCII letters and hyphens.
 */
class GenreLineParser : LineParser {
        Regex re_genre;

        /**
         * Compiles the line pattern once so every parse_line () call can reuse it.
         *
         * @param _sqlite database wrapper the genres are reported to
         */
        public GenreLineParser (IMDbSqlite _sqlite) {
                base (_sqlite);
                try {
                        re_genre = new Regex ("^([^\t]+)\t+([A-Za-z-]+)$");
                } catch (RegexError e) {
                        critical ("Failed to initialize regex: %s\n", e.message);
                }
        }

        /**
         * Reports the genre found on the line for the movie it belongs to.
         *
         * Lines that start with a double quote (series episodes), lines that
         * do not match the pattern, titles that cannot be converted from
         * Latin-1 to UTF-8 and titles rejected by skip_title () are ignored.
         *
         * @param line one line of the genre list, Latin-1 encoded
         */
        public override void parse_line (string line) {
                MatchInfo matchinfo;

                // Skip series episodes
                if (line[0] == '"') {
                        return;
                }

                if (!re_genre.match (line, 0, out matchinfo)) {
                        return;
                }

                string title;
                string genre = matchinfo.fetch (2);
                try {
                        title = convert (matchinfo.fetch (1), -1, "utf-8", "latin1");
                } catch (ConvertError e) {
                        return;
                }

                // Apply the same title filter as the other parsers: the movie
                // parser never adds these titles, so there is nothing to
                // attach a genre to.
                if (skip_title (title)) {
                        return;
                }

                sqlite.movie_add_genre (title, genre);
        }
}
94
/**
 * Parser for the ratings list.
 *
 * Everything up to and including the "MOVIE RATINGS REPORT" line is treated
 * as header. After that, lines made of six leading spaces, an arbitrary
 * field, a vote count, a rating and a title are evaluated.
 */
class RatingLineParser : LineParser {
        enum RatingState {
                HEADER,
                NONE
        }
        RatingState state;
        Regex re_rating;

        /**
         * Starts in the header state and compiles the line pattern once.
         *
         * @param _sqlite database wrapper the ratings are reported to
         */
        public RatingLineParser (IMDbSqlite _sqlite) {
                base (_sqlite);
                state = RatingState.HEADER;
                try {
                        re_rating = new Regex ("^      .+ +([0-9]+) +([0-9.]+) +(.+)$");
                } catch (RegexError e) {
                        critical ("Failed to initialize regex: %s\n", e.message);
                }
        }

        /**
         * Stores rating and vote count for the movie named on the line.
         *
         * The rating is passed on multiplied by ten and truncated to an
         * integer. Series episodes (title starting with a double quote),
         * titles that cannot be converted from Latin-1 to UTF-8 and titles
         * rejected by skip_title () are ignored.
         *
         * @param line one line of the ratings list, Latin-1 encoded
         */
        public override void parse_line (string line) {
                switch (state) {
                case RatingState.HEADER:
                        // Wait for the line that announces the report
                        if (line == "MOVIE RATINGS REPORT") {
                                state = RatingState.NONE;
                        }
                        return;
                case RatingState.NONE:
                        break;
                default:
                        return;
                }

                // Skip series episodes
                if (line[0] == '"') {
                        return;
                }

                MatchInfo fields;
                if (!re_rating.match (line, 0, out fields)) {
                        return;
                }

                string utf8_title;
                try {
                        utf8_title = convert (fields.fetch (3), -1, "utf-8", "latin1");
                } catch (ConvertError e) {
                        return;
                }

                // Neither series episodes nor filtered titles get a rating
                if (utf8_title[0] == '"' || skip_title (utf8_title)) {
                        return;
                }

                int rating_times_ten = (int) (fields.fetch (2).to_double () * 10);
                int vote_count = fields.fetch (1).to_int ();
                sqlite.movie_set_rating (utf8_title, rating_times_ten, vote_count);
        }
}
151
/**
 * Parser for the alternative titles ("aka") list.
 *
 * The header ends with the "AKA TITLES LIST" line followed by its
 * "===============" underline. After that, every block consists of a title
 * line, any number of "   (aka ...)" lines and a terminating empty line.
 */
class AkaLineParser : LineParser {
        enum AkaState {
                HEADER,
                NONE,
                TITLE
        }

        // Text that introduces an alternative title below its movie
        const string AKA_PREFIX = "   (aka ";

        AkaState state;
        // Set once the "AKA TITLES LIST" header line has been seen
        bool list_name_seen;
        // Title (UTF-8) the following aka lines belong to
        string? title;

        /**
         * @param _sqlite database wrapper the alternative titles are added to
         */
        public AkaLineParser (IMDbSqlite _sqlite) {
                base (_sqlite);
                state = AkaState.HEADER;
                list_name_seen = false;
                title = null;
        }

        /**
         * Feeds one line of the aka list into the state machine.
         *
         * The passed string is never modified; the alternative title is
         * copied out of it before conversion.
         *
         * @param line one line of the list, Latin-1 encoded
         */
        public override void parse_line (string line) {
                if (state == AkaState.HEADER) {
                        if (line == "AKA TITLES LIST") {
                                list_name_seen = true;
                        } else if (line == "===============" && list_name_seen) {
                                state = AkaState.NONE;
                        }
                        return;
                }

                if (state == AkaState.NONE) {
                        // Skip empty lines
                        if (line == "") {
                                return;
                        }

                        // Skip series episodes
                        if (line[0] == '"') {
                                return;
                        }

                        // Parse error: an indented line without a preceding title
                        if (line[0] == ' ') {
                                return;
                        }

                        try {
                                title = convert (line, -1, "utf-8", "latin1");
                        } catch (ConvertError e) {
                                title = null;
                                return;
                        }

                        if (skip_title (title)) {
                                return;
                        }

                        state = AkaState.TITLE;
                }

                if (state == AkaState.TITLE) {
                        // Empty lines mark end of title
                        if (line == "") {
                                state = AkaState.NONE;
                                return;
                        }

                        if (!line.has_prefix (AKA_PREFIX)) {
                                return;
                        }

                        // Work on a copy: the alternative title ends with the
                        // first ')' of the first "))" or, if there is none,
                        // at the end of the line.
                        string raw_aka = line.substring (AKA_PREFIX.length);
                        int closing = raw_aka.index_of ("))");
                        if (closing >= 0) {
                                raw_aka = raw_aka.substring (0, closing + 1);
                        }

                        string aka;
                        try {
                                aka = convert (raw_aka, -1, "utf-8", "latin1");
                        } catch (ConvertError e) {
                                return;
                        }

                        sqlite.add_aka (title, aka);
                }
        }
}
229
/**
 * Parser for the plot summaries list.
 *
 * The header ends with the "PLOT SUMMARIES LIST" line followed by its
 * "===================" underline. Afterwards "MV: " lines name a title,
 * "PL: " lines carry the plot text and a "BY: " line names the author and
 * completes the plot.
 */
class PlotLineParser : LineParser {
        enum PlotState {
                HEADER,
                NONE,
                TITLE
        }
        string title;
        string plot;
        PlotState state;

        /**
         * @param _sqlite database wrapper the plots are added to
         */
        public PlotLineParser (IMDbSqlite _sqlite) {
                base (_sqlite);
                state = PlotState.HEADER;
                title = null;
        }

        /**
         * Feeds one line of the plot list into the state machine.
         *
         * @param line one line of the list, Latin-1 encoded
         */
        public override void parse_line (string line) {
                if (state == PlotState.HEADER) {
                        // While in the header, title only remembers that the
                        // list name has been seen
                        if (line == "PLOT SUMMARIES LIST") {
                                title = line;
                        }
                        if (line == "===================" && title != null) {
                                state = PlotState.NONE;
                        }
                        return;
                }

                // Empty lines carry no information in any other state
                if (line == "") {
                        return;
                }

                switch (state) {
                case PlotState.NONE:
                        start_entry (line);
                        break;
                case PlotState.TITLE:
                        continue_entry (line);
                        break;
                default:
                        break;
                }
        }

        // Looks for an "MV: " line and, if its title is accepted, starts
        // collecting the plot for it.
        private void start_entry (string line) {
                if (!line.has_prefix ("MV: ")) {
                        return;
                }

                // Skip series episodes
                if (line[4] == '"') {
                        return;
                }

                try {
                        title = convert (line.offset (4), -1, "utf-8", "latin1");
                } catch (ConvertError e) {
                        stderr.printf ("Error converting title to UTF-8\n");
                        title = null;
                        return;
                }

                if (skip_title (title)) {
                        return;
                }

                plot = "";
                state = PlotState.TITLE;
        }

        // Appends "PL: " text to the current plot; a "BY: " line stores the
        // plot and returns to waiting for the next title.
        private void continue_entry (string line) {
                if (line.has_prefix ("PL: ")) {
                        if (skip_title (title)) {
                                return;
                        }

                        string piece;
                        try {
                                piece = convert (line.offset (4), -1, "utf-8", "latin1");
                        } catch (ConvertError e) {
                                stderr.printf ("Error converting plot for \"%s\" to UTF-8\n", title);
                                plot = "";
                                return;
                        }

                        // Plot lines are joined with single spaces
                        if (plot != "") {
                                plot += " ";
                        }
                        plot += piece;
                        return;
                }

                // BY: tag marks end of plot
                if (line.has_prefix ("BY: ")) {
                        string author;
                        try {
                                author = convert (line.offset (4), -1, "utf-8", "latin1");
                        } catch (ConvertError e) {
                                stderr.printf ("Error converting plot author for \"%s\" to UTF-8\n", title);
                                author = null;
                        }

                        sqlite.add_plot (title, plot, author);
                        state = PlotState.NONE;
                }
        }
}
315
/**
 * Parser for the person lists (actors, actresses, directors, writers).
 *
 * The header names the list ("THE ACTORS LIST", ...) and ends with the
 * "----\t\t\t------" column underline. After that, a person block starts
 * with a line holding the name, tabs and a first title; further titles
 * follow on lines that begin with three tabs, and an empty line ends the
 * block.
 *
 * Only the actor and actress lists produce role entries; for the other
 * lists just the persons themselves are added.
 */
class PersonParser : LineParser {
        enum PersonState {
                HEADER,
                NONE,
                PERSON
        }
        enum PersonType {
                NONE,
                ACTOR,
                ACTRESS,
                DIRECTOR,
                WRITER
        }
        PersonState state;
        PersonType type;
        // Name (UTF-8) of the current person, null if it could not be converted
        string? name;

        /**
         * @param _sqlite database wrapper persons and roles are added to
         */
        public PersonParser (IMDbSqlite _sqlite) {
                base (_sqlite);
                reset ();
        }

        /**
         * Returns the parser to its initial state so another list can be
         * parsed from its header on.
         */
        public void reset () {
                state = PersonState.HEADER;
                type = PersonType.NONE;
                name = null;
        }

        /**
         * Feeds one line of a person list into the state machine.
         *
         * The passed string is never modified. Malformed lines are reported
         * on stderr and skipped.
         *
         * @param line one line of the list, Latin-1 encoded
         */
        public override void parse_line (string line) {
                if (state == PersonState.HEADER) {
                        if (line == "THE ACTORS LIST") {
                                type = PersonType.ACTOR;
                        } else if (line == "THE ACTRESSES LIST") {
                                type = PersonType.ACTRESS;
                        } else if (line == "THE DIRECTORS LIST") {
                                type = PersonType.DIRECTOR;
                        } else if (line == "THE WRITERS LIST") {
                                type = PersonType.WRITER;
                        } else if (line == "----\t\t\t------" && type != PersonType.NONE) {
                                state = PersonState.NONE;
                        }

                        return;
                }

                // Empty lines end the current person block
                if (line == "") {
                        state = PersonState.NONE;
                        name = null;

                        return;
                }

                if (state == PersonState.NONE) {
                        begin_person (line);
                        return;
                }

                if (state == PersonState.PERSON) {
                        if (line.has_prefix ("\t\t\t")) {
                                add_raw_title (line.substring (3));
                        } else {
                                stderr.printf ("\t???: %s\n", line);
                        }
                }
        }

        // Converts Latin-1 text to UTF-8; returns null if the conversion fails.
        private static string? to_utf8 (string latin1) {
                try {
                        return convert (latin1, -1, "utf-8", "latin1");
                } catch (ConvertError e) {
                        return null;
                }
        }

        // Handles the first line of a person block: name, tabs, first title.
        private void begin_person (string line) {
                if (line.has_prefix ("\t")) {
                        // A title line without a person; report it and go on
                        // instead of aborting the whole import.
                        stderr.printf ("Invalid person entry: %s\n", line);
                        return;
                }

                // The name ends at the first tab, the title starts after the last
                int first_tab = line.index_of ("\t");
                if (first_tab < 0) {
                        return;
                }
                int last_tab = line.last_index_of ("\t");

                // The lines that follow belong to this person even if the
                // name or the first title turns out to be unusable.
                state = PersonState.PERSON;

                name = to_utf8 (line.substring (0, first_tab));
                if (name == null) {
                        stderr.printf ("Error converting name to UTF-8\n");
                        return;
                }

                sqlite.add_person (name);
                add_raw_title (line.substring (last_tab + 1));
        }

        // Converts a Latin-1 title entry to UTF-8 and passes it to parse_title ().
        // Nothing is done while there is no valid current person.
        private void add_raw_title (string raw) {
                if (name == null) {
                        return;
                }

                string? entry = to_utf8 (raw);
                if (entry == null) {
                        stderr.printf ("Error converting title for \"%s\" to UTF-8\n", name);
                        return;
                }

                parse_title (entry);
        }

        // Splits a UTF-8 title entry into the title and its optional
        // "[character]", "<number>" and "(info)" fields, which are separated
        // by two spaces, and adds the role for actor and actress lists.
        private void parse_title (string title) {
                // Skip series episodes
                if (title[0] == '"') {
                        return;
                }

                // NOTE(review): entries that consist of a title only (no
                // double-space separated fields) are dropped here, as before.
                // Confirm that this is intended.
                int separator = title.index_of ("  ");
                if (separator < 0) {
                        return;
                }

                string movie = title.substring (0, separator);
                if (skip_title (movie)) {
                        return;
                }

                if (type != PersonType.ACTOR && type != PersonType.ACTRESS) {
                        return;
                }

                string? character = null;
                int number = 0;
                string? info = null;

                string[] fields = title.substring (separator + 2).split ("  ");
                foreach (unowned string field in fields) {
                        if (field.has_prefix ("[")) {
                                // Strip the brackets; tolerate a missing closing one
                                long inner = field.has_suffix ("]") ? field.length - 2 : -1;
                                character = field.substring (1, inner);
                        } else if (field.has_prefix ("<")) {
                                // Parsing stops at the closing '>'
                                number = int.parse (field.substring (1));
                        } else if (field.has_prefix ("(")) {
                                info = field;
                        }
                }

                sqlite.add_actor (name, movie, info, character, number);
        }
}
447