1 /* Parses a Google Movies web page (previously downloaded to a file)
2 * using libxml2. Example Google Movies pages can be
3 * found in the gmovies_data directory.
6 #include <libxml/HTMLparser.h>
7 #include <libxml/tree.h>
/* Walks the sibling chain that follows `node`, looking for a node whose
 * name matches `name`.
 *
 * node    - starting point; it is dereferenced without a NULL check, and
 *           the scan begins at node->next, so `node` itself is never tested.
 * name    - element name to look for.
 * nameLen - number of bytes handed to strncmp. Only that many bytes are
 *           compared, so this is a prefix match: a longer name that begins
 *           with `name` also stops the scan.
 *
 * NOTE(review): the statements after the loop are not visible in this
 * listing; presumably `sibling` is returned, which would be NULL when the
 * chain ends without a match -- confirm. */
11 xmlNodePtr getSiblingByName(xmlNodePtr node, xmlChar* name, int nameLen)
14 xmlNodePtr sibling = node->next;
/* Advance until the chain runs out or the first nameLen bytes match. */
15 while((sibling != NULL) && (strncmp(sibling->name, name, nameLen) != 0)) {
16 sibling = sibling->next;
/* Looks for a child of `node` by name by delegating to getSiblingByName,
 * using node->children as the starting point.
 *
 * node    - parent node; dereferenced without a NULL check.
 * name    - element name to look for.
 * nameLen - number of bytes of `name` to compare (passed through as-is).
 *
 * NOTE(review): getSiblingByName starts scanning at the node AFTER the one
 * it receives, so the first child is never compared against `name`, and a
 * parent with no children hands it NULL, which it dereferences -- confirm
 * whether skipping the first child is intended. */
23 xmlNodePtr getChildByName(xmlNodePtr node, xmlChar* name, int nameLen)
25 return getSiblingByName(node->children, name, nameLen);
/* Takes a node and a sibling count and returns a node.
 *
 * NOTE(review): only the signature and the loop header are visible in this
 * listing. Presumably the loop steps `node` forward through ->next
 * `siblings` times and the resulting node is returned -- confirm against
 * the full body, including where `i` is declared and initialised and
 * whether a NULL ->next is handled. */
28 xmlNodePtr jumpXSiblings(xmlNodePtr node, int siblings)
/* Iterates while i < siblings; `i` is set up on a line not shown here. */
33 for(; i<siblings; i++) {
/* Reports whether `node` is treated as a separator row: true (non-zero)
 * when `node` is non-NULL and childrenCount(node) is exactly 1, otherwise 0.
 * A NULL `node` is accepted and yields 0 because the NULL test
 * short-circuits before childrenCount is called.
 *
 * NOTE(review): childrenCount is defined further down the file; unless a
 * prototype exists in a part of the file not shown here, this call relies
 * on an implicit declaration -- confirm. */
40 int isSeparatorTR(xmlNodePtr node)
42 return ((node != NULL) && (childrenCount(node) == 1));
/* Takes a node and returns an int; the work starts from the node's first
 * child (node->children). `node` is dereferenced without a NULL check, so
 * callers must pass a valid node.
 *
 * NOTE(review): the rest of the body is not visible in this listing.
 * Presumably it counts the direct children by following nav->next until
 * NULL and returns that count -- confirm. */
45 int childrenCount(xmlNodePtr node)
48 xmlNodePtr nav = node->children;
/* Tests whether `node` is the row that opens a theater section: a node
 * whose name begins with "tr", whose first child has a name beginning with
 * "td", and whose "colspan" attribute begins with "4". When all of that
 * holds the visible return yields non-zero.
 *
 * node - row to inspect; dereferenced without a NULL check.
 *
 * NOTE(review): the result for rows that fail the "tr"/"td" tests is
 * produced on lines not visible here -- presumably 0, confirm.
 * NOTE(review): main passes nav->next to this function before any NULL
 * test, so a NULL argument looks reachable -- confirm. */
58 int startsTheatherData(xmlNodePtr node)
/* strncmp with length 2 only checks the first two bytes of the name. */
61 if (strncmp(node->name, "tr", 2) == 0) {
/* Only the first child is examined. */
62 xmlNodePtr td = node->children;
63 if ((td != NULL) && (strncmp(td->name, "td", 2) == 0)) {
/* NOTE(review): libxml2 documents that the string returned by xmlGetProp
 * must be released by the caller with xmlFree; no release is visible in
 * this listing -- confirm whether `value` is leaked. */
64 xmlChar* value = xmlGetProp(td, "colspan");
/* Only the first byte is compared, so "4", "40", "4x" ... all match. */
65 return ((value != NULL) && (strncmp(value, "4", 1)) == 0);
73 int main (int argc, char ** argv)
77 printf("usage: gmovies file.html\n");
81 htmlDocPtr doc = htmlReadFile(argv[1], "UTF-8", 0);
83 xmlNodePtr root = xmlDocGetRootElement(doc);//html
86 xmlNodePtr rootChild = getSiblingByName(root->children, "body", 4);
88 //get the form node inside body, the data is in the next node (a table)
89 xmlNodePtr dataTable = getChildByName(rootChild, "form", 4)->next;
92 xmlNodePtr elem = dataTable;
93 xmlNodePtr nav = dataTable->children;
97 if (startsTheatherData(elem)) {
98 elem = elem->children; //td
99 elem = elem->children; //a
100 elem = elem->children; //b
102 printf("Theather %d = %s\n", i++, xmlNodeGetContent(elem));
103 printf("-------------------------------------------------\n");
105 xmlNodePtr n1 = nav->next; //in this tr there is 4 td with 2 film data
106 while(!startsTheatherData(n1) && !isSeparatorTR(n1)) {
107 elem = n1->children->next; //the first td is for rating
108 printf("%s\n", xmlNodeGetContent(elem->children->children));
109 if (childrenCount(n1->children) > 2) {
110 elem = elem->next->next; //the first td is for rating
111 printf("%s\n", xmlNodeGetContent(elem->children->children));