Now the example parses the file almost completely and extracts all theaters and all...
[maevies] / examples / gmovies.c
1 /* Parses a Google Movies web page (previously downloaded to a file)
2  * using libxml2. Example Google Movies pages can be
3  * found in the gmovies_data directory.
4  */ 
5
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <libxml/HTMLparser.h>
#include <libxml/tree.h>
9
10
11 xmlNodePtr getSiblingByName(xmlNodePtr node, xmlChar* name, int nameLen)
12 {
13         
14         xmlNodePtr sibling = node->next;
15         while((sibling != NULL) && (strncmp(sibling->name, name, nameLen) != 0)) {
16                 sibling = sibling->next;
17         }
18         
19         return sibling;
20         
21
22
23 xmlNodePtr getChildByName(xmlNodePtr node, xmlChar* name, int nameLen)
24 {
25         return getSiblingByName(node->children, name, nameLen);
26 }
27
28 xmlNodePtr jumpXSiblings(xmlNodePtr node, int siblings)
29 {
30         xmlNodePtr r = node;
31         
32         int i = 0;
33         for(; i<siblings; i++) {
34                 r = r->next;
35         }
36         
37         return r;
38 }
39
40 int isSeparatorTR(xmlNodePtr node)
41 {
42         return ((node != NULL) && (childrenCount(node) == 1));
43 }
44
45 int childrenCount(xmlNodePtr node)
46 {
47         int i=0;
48         xmlNodePtr nav = node->children;
49         while(nav != NULL) {
50                 i++;
51                 nav = nav->next;
52         }
53         
54         return i;
55 }
56
57
58 int startsTheatherData(xmlNodePtr node)
59 {
60         
61         if (strncmp(node->name, "tr", 2) == 0) {
62                 xmlNodePtr td = node->children;
63                 if ((td != NULL) && (strncmp(td->name, "td", 2) == 0)) {
64                         xmlChar* value = xmlGetProp(td, "colspan");
65                         return ((value != NULL) && (strncmp(value, "4", 1)) == 0);
66                 }
67         }
68         
69         return -1;
70 }
71
72
73 int main (int argc, char ** argv)
74 {
75
76         if (argc != 2) {
77                 printf("usage: gmovies file.html\n");
78                 exit(-1);
79         }
80
81         htmlDocPtr doc = htmlReadFile(argv[1], "UTF-8", 0);
82
83         xmlNodePtr root = xmlDocGetRootElement(doc);//html
84         
85         //get the body node
86         xmlNodePtr rootChild = getSiblingByName(root->children, "body", 4);
87
88         //get the form node inside body, the data is in the next node (a table)
89         xmlNodePtr dataTable = getChildByName(rootChild, "form", 4)->next;
90
91         //tbody
92         xmlNodePtr elem = dataTable;
93         xmlNodePtr nav = dataTable->children;
94         int i = 0;
95         while(nav != NULL) {
96                 elem = nav;
97                 if (startsTheatherData(elem)) {
98                         elem = elem->children; //td
99                         elem = elem->children; //a
100                         elem = elem->children; //b
101
102                         printf("Theather %d = %s\n", i++, xmlNodeGetContent(elem));
103                         printf("-------------------------------------------------\n");
104                         
105                         xmlNodePtr n1 = nav->next; //in this tr there is 4 td with 2 film data
106                         while(!startsTheatherData(n1) && !isSeparatorTR(n1)) {
107                                 elem = n1->children->next; //the first td is for rating
108                                 printf("%s\n", xmlNodeGetContent(elem->children->children));
109                                 if (childrenCount(n1->children) > 2) {
110                                         elem = elem->next->next; //the first td is for rating
111                                         printf("%s\n", xmlNodeGetContent(elem->children->children));
112                                 }
113                                 n1 = n1->next;
114                         }
115                         
116                         printf("\n\n");
117                 }
118                 nav = nav->next;
119         }
120
121 }