1 /* Parses a Google Movies web page (previously downloaded to a file)
2 * using libxml2. Examples of Google Movies web page files can be
3 * found in the gmovies_data directory.
6 #include <libxml/HTMLparser.h>
7 #include <libxml/tree.h>
8 #include <rest/rest/rest-proxy.h>
/*
 * Walks the sibling list that follows `node`, looking for the first node
 * whose name matches `name`.
 *
 *   node    - node whose following siblings are scanned; it is dereferenced
 *             without a NULL check, and `node` itself is never compared.
 *   name    - element name to look for.
 *   nameLen - number of bytes handed to strncmp, so this is a prefix match
 *             unless nameLen also covers the terminating NUL.
 *
 * NOTE(review): the return statement is not visible in this excerpt;
 * presumably the function returns `sibling` (the match, or NULL when the
 * list is exhausted) -- confirm against the full file.
 * NOTE(review): the arguments are xmlChar* but are compared with strncmp;
 * libxml2 provides xmlStrncmp for xmlChar strings -- consider it.
 */
14 xmlNodePtr getSiblingByName(xmlNodePtr node, xmlChar* name, int nameLen)
// The scan starts at the NEXT sibling, not at `node`.
17 xmlNodePtr sibling = node->next;
// Advance until the list ends or the first nameLen bytes of the name match.
18 while((sibling != NULL) && (strncmp(sibling->name, name, nameLen) != 0)) {
19 sibling = sibling->next;
/*
 * Looks for a child of `node` by name by delegating to getSiblingByName,
 * passing node->children as the starting point.
 *
 * NOTE(review): getSiblingByName begins comparing at ->next of the node it
 * receives, so the FIRST child of `node` is never tested here, and a NULL
 * node->children would be dereferenced there. Confirm this is intended.
 * `node` itself is dereferenced without a NULL check.
 */
26 xmlNodePtr getChildByName(xmlNodePtr node, xmlChar* name, int nameLen)
28 return getSiblingByName(node->children, name, nameLen);
/*
 * Walks a sibling list, starting at `sibling` itself, and tests each node's
 * attribute `attr` against `attrValue`.
 *
 *   sibling      - first node to examine (may be NULL: the loop is skipped).
 *   attr         - attribute name passed to xmlGetProp.
 *   attrValue    - value to compare against.
 *   attrValueLen - number of bytes handed to strncmp; a prefix match unless
 *                  it also covers the terminating NUL.
 *
 * NOTE(review): the statement executed on a match and the final return are
 * not visible in this excerpt; presumably the matching node is returned, or
 * NULL when none matches -- confirm against the full file.
 * NOTE(review): per the libxml2 documentation the string returned by
 * xmlGetProp must be released with xmlFree; no xmlFree call is visible here.
 */
31 xmlNodePtr getFirstSiblingByAttributeValue(
32 xmlNodePtr sibling, xmlChar* attr, xmlChar * attrValue, int attrValueLen)
34 xmlNodePtr tempNode = sibling;
36 while(tempNode != NULL) {
// Nodes without the attribute yield NULL and are skipped by the check below.
37 xmlChar* value = xmlGetProp(tempNode, attr);
38 if ((value != NULL) && (strncmp(value, attrValue, attrValueLen)) == 0)
40 tempNode = tempNode->next;
/*
 * Searches the children of `node` for an attribute match by delegating to
 * getFirstSiblingByAttributeValue, starting at node->children (so the first
 * child IS included in the search).
 *
 * `node` is dereferenced without a NULL check, so callers must not pass the
 * NULL result of a previous failed lookup.
 */
47 xmlNodePtr getFirstChildByAttributeValue(
48 xmlNodePtr node, xmlChar* attr, xmlChar * attrValue, int attrValueLen)
50 return getFirstSiblingByAttributeValue(node->children, attr, attrValue, attrValueLen);
/*
 * Takes a node and a sibling count and loops while i < siblings.
 *
 * NOTE(review): only the signature and the loop header are visible in this
 * excerpt -- the declaration/initialisation of `i`, the loop body and the
 * return statement are not shown. Presumably it advances `node` along ->next
 * `siblings` times and returns the result; confirm against the full file,
 * including how a NULL ->next is handled mid-walk.
 */
54 xmlNodePtr jumpXSiblings(xmlNodePtr node, int siblings)
59 for(; i<siblings; i++) {
/*
 * Returns non-zero when `node` is non-NULL and childrenCount(node) is
 * exactly 1; returns 0 otherwise (including for a NULL node).
 *
 * NOTE(review): childrenCount is defined after this function in the visible
 * source; unless a prototype exists on a line not shown here, this call is
 * an implicit declaration (invalid since C99) -- confirm.
 */
66 int isSeparatorTR(xmlNodePtr node)
68 return ((node != NULL) && (childrenCount(node) == 1));
/*
 * Starts a traversal of the children of `node` (node->children); `node` is
 * dereferenced without a NULL check.
 *
 * NOTE(review): the counting loop and the return statement are not visible
 * in this excerpt. Presumably it walks nav->next and returns the number of
 * child nodes (which, for parsed HTML, may include text nodes) -- confirm
 * against the full file.
 */
71 int childrenCount(xmlNodePtr node)
74 xmlNodePtr nav = node->children;
/*
 * Tests whether `node` looks like the row that opens a theater section:
 * its name starts with "tr", its first child's name starts with "td", and
 * that child's "colspan" attribute starts with "4".
 *
 * All three comparisons use strncmp with a fixed length, so they are prefix
 * matches (e.g. a colspan of "40" would also pass).
 *
 * NOTE(review): `node` is dereferenced without a NULL check.
 * NOTE(review): the value returned on the non-matching paths is not visible
 * in this excerpt; presumably 0 -- confirm.
 * NOTE(review): per the libxml2 documentation the string returned by
 * xmlGetProp must be released with xmlFree; no xmlFree call is visible here.
 */
84 int startsTheatherData(xmlNodePtr node)
87 if (strncmp(node->name, "tr", 2) == 0) {
// Only the FIRST child is inspected; it must itself be a td.
88 xmlNodePtr td = node->children;
89 if ((td != NULL) && (strncmp(td->name, "td", 2) == 0)) {
90 xmlChar* value = xmlGetProp(td, "colspan");
91 return ((value != NULL) && (strncmp(value, "4", 1)) == 0);
99 int main (int argc, char ** argv)
103 printf("usage: gmovies file.html\n");
109 const gchar *payload;
110 const char *city = argv[1];
116 proxy = rest_proxy_new(
117 "http://www.google.com/movies",
119 call = rest_proxy_new_call(proxy);
121 rest_proxy_call_add_params(call,
124 rest_proxy_call_run(call, NULL, NULL);
126 payload = rest_proxy_call_get_payload(call);
127 len = rest_proxy_call_get_payload_length(call);
129 //write(1, payload, len);
132 htmlDocPtr doc = htmlReadMemory(payload, len, "http://movies.google.com", "UTF-8", HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
134 xmlNodePtr root = xmlDocGetRootElement(doc);//html
137 xmlNodePtr body = getSiblingByName(root->children, "body", 4);
139 xmlNodePtr tempNode = getFirstChildByAttributeValue(body, "id", "results", 8); //the data is a div with id = results
141 if (tempNode == NULL) {
142 printf("results div not found.\n");
146 tempNode = getFirstChildByAttributeValue(tempNode, "id", "movie_results", 14);
148 if (tempNode == NULL) {
149 printf("movie_results div not found.\n");
153 tempNode = getFirstChildByAttributeValue(tempNode, "class", "movie_results", 14);
155 if (tempNode == NULL) {
156 printf("movie_results class not found.\n");
161 xmlNodePtr nav = tempNode->children;
163 tempNode = getFirstSiblingByAttributeValue(nav, "class", "theater", 7);
164 tempNode = getFirstChildByAttributeValue(tempNode, "class", "desc", 4);
165 if (tempNode != NULL) {
166 tempNode = getFirstChildByAttributeValue(tempNode, "class", "name", 4);
167 printf("Info = %s\n", xmlNodeGetContent(tempNode->children->children));
174 //get the form node inside body, the data is in the next node (a table)
175 xmlNodePtr dataTable = getChildByName(body, "form", 4)->next;
179 xmlNodePtr elem = dataTable;
180 //xmlNodePtr nav = dataTable->children;
184 if (startsTheatherData(elem)) {
185 elem = elem->children; //td
186 elem = elem->children; //a
187 elem = elem->children; //b
189 printf("Theather %d = %s\n", i++, xmlNodeGetContent(elem));
190 printf("-------------------------------------------------\n");
192 xmlNodePtr n1 = nav->next; //in this tr there is 4 td with 2 film data
193 while(!startsTheatherData(n1) && !isSeparatorTR(n1)) {
194 elem = n1->children->next; //the first td is for rating
195 printf("%s\n", xmlNodeGetContent(elem->children->children));
196 if (childrenCount(n1->children) > 2) {
197 elem = elem->next->next; //the first td is for rating
198 printf("%s\n", xmlNodeGetContent(elem->children->children));