#include <stdio.h>
+/**
+ * Find the first sibling after `node` whose name starts with `name`.
+ *
+ * The search begins at node->next, so `node` itself is never returned.
+ * At most `nameLen` bytes are compared (strncmp semantics), which makes
+ * this a prefix match when `nameLen` is shorter than the sibling's name.
+ *
+ * @param node     node whose following siblings are scanned; may be NULL
+ * @param name     name (or name prefix) to look for; may be NULL
+ * @param nameLen  maximum number of bytes to compare
+ * @return the first matching sibling, or NULL when nothing matches or
+ *         when `node` or `name` is NULL
+ */
+xmlNodePtr getSiblingByName(xmlNodePtr node, xmlChar* name, int nameLen)
+{
+    xmlNodePtr sibling;
+
+    /* Callers hand in values such as node->children, which can be NULL. */
+    if (node == NULL || name == NULL) {
+        return NULL;
+    }
+
+    for (sibling = node->next; sibling != NULL; sibling = sibling->next) {
+        /* A nameless node can never match; skip it rather than pass a
+         * NULL pointer to strncmp. */
+        if (sibling->name == NULL) {
+            continue;
+        }
+        /* xmlChar is unsigned char, so cast for the char-based strncmp. */
+        if (strncmp((const char *)sibling->name, (const char *)name,
+                    (size_t)nameLen) == 0) {
+            return sibling;
+        }
+    }
+
+    return NULL;
+}
+
+/**
+ * Find the first child of `node` whose name starts with `name`.
+ *
+ * At most `nameLen` bytes are compared (strncmp semantics).
+ *
+ * @param node     parent node; may be NULL
+ * @param name     name (or name prefix) to look for; may be NULL
+ * @param nameLen  maximum number of bytes to compare
+ * @return the first matching child, or NULL when nothing matches, when
+ *         `node` has no children, or when `node` or `name` is NULL
+ */
+xmlNodePtr getChildByName(xmlNodePtr node, xmlChar* name, int nameLen)
+{
+    xmlNodePtr child;
+
+    if (node == NULL || name == NULL) {
+        return NULL;
+    }
+
+    /* Scan from the very first child. getSiblingByName starts its search
+     * after the node it is given, so delegating with node->children would
+     * never consider the first child. */
+    for (child = node->children; child != NULL; child = child->next) {
+        if (child->name != NULL &&
+            strncmp((const char *)child->name, (const char *)name,
+                    (size_t)nameLen) == 0) {
+            return child;
+        }
+    }
+
+    return NULL;
+}
+
+/**
+ * Advance `siblings` steps along the ->next chain starting at `node`.
+ *
+ * @param node      starting node; may be NULL
+ * @param siblings  number of ->next hops to take; values <= 0 return `node`
+ * @return the node reached, or NULL when the chain ends before `siblings`
+ *         hops were taken (or when `node` is NULL)
+ */
+xmlNodePtr jumpXSiblings(xmlNodePtr node, int siblings)
+{
+    xmlNodePtr r = node;
+    int i;
+
+    /* Stop as soon as the chain runs out instead of dereferencing NULL. */
+    for (i = 0; i < siblings && r != NULL; i++) {
+        r = r->next;
+    }
+
+    return r;
+}
+
+/* childrenCount is defined later in this file; declare it here so the call
+ * below does not rely on an implicit function declaration. */
+int childrenCount(xmlNodePtr node);
+
+/**
+ * Tell whether `node` is a separator row, i.e. a node with exactly one child.
+ *
+ * @param node  node to inspect; may be NULL
+ * @return 1 when `node` is non-NULL and has exactly one child, 0 otherwise
+ */
+int isSeparatorTR(xmlNodePtr node)
+{
+    return ((node != NULL) && (childrenCount(node) == 1));
+}
+
+/**
+ * Count the direct children of `node`.
+ *
+ * Every node on the children list is counted, whatever its type.
+ *
+ * @param node  parent node; must not be NULL
+ * @return number of nodes reachable from node->children through ->next
+ */
+int childrenCount(xmlNodePtr node)
+{
+    int total = 0;
+    xmlNodePtr child;
+
+    for (child = node->children; child != NULL; child = child->next) {
+        ++total;
+    }
+
+    return total;
+}
+
+
+/**
+ * Tell whether `node` is a row that opens a theater section.
+ *
+ * A row qualifies when its name starts with "tr", its first child's name
+ * starts with "td", and that td has a "colspan" attribute whose value
+ * starts with "4".
+ *
+ * @param node  node to inspect; may be NULL
+ * @return 1 when the row qualifies, 0 otherwise. The result is always a
+ *         plain boolean so callers can use it directly in conditions.
+ */
+int startsTheatherData(xmlNodePtr node)
+{
+    xmlNodePtr td;
+    xmlChar *value;
+    int matches;
+
+    if (node == NULL || node->name == NULL ||
+        strncmp((const char *)node->name, "tr", 2) != 0) {
+        return 0;
+    }
+
+    td = node->children;
+    if (td == NULL || td->name == NULL ||
+        strncmp((const char *)td->name, "td", 2) != 0) {
+        return 0;
+    }
+
+    value = xmlGetProp(td, (const xmlChar *)"colspan");
+    matches = (value != NULL) && (strncmp((const char *)value, "4", 1) == 0);
+
+    /* xmlGetProp hands back a copy owned by the caller; release it. */
+    if (value != NULL) {
+        xmlFree(value);
+    }
+
+    return matches;
+}
+
+
int main (int argc, char ** argv)
{
htmlDocPtr doc = htmlReadFile(argv[1], "UTF-8", 0);
xmlNodePtr root = xmlDocGetRootElement(doc);//html
+ // Walks the parsed HTML tree and prints each theater name followed by the entries listed in the rows under it.
+ // Look up "body" among the siblings that follow root's first child.
+ xmlNodePtr rootChild = getSiblingByName(root->children, "body", 4);
- xmlNodePtr body = xmlLastElementChild(root);//body
-
- xmlNodePtr elem = xmlFirstElementChild(body);
- int i =0;
- for(i=0; i<6; i++) {
- elem = xmlNextElementSibling(elem);
- }
+ // Look up the "form" node under body; the node right after it is taken as the data table. NOTE(review): the lookup result is dereferenced without a NULL check.
+ xmlNodePtr dataTable = getChildByName(rootChild, "form", 4)->next;
//tbody
- elem = xmlFirstElementChild(elem); //tr
- elem = xmlFirstElementChild(elem); //td
- elem = xmlFirstElementChild(elem); //a
- elem = xmlFirstElementChild(elem); //b
+ xmlNodePtr elem = dataTable;
+ xmlNodePtr nav = dataTable->children; // cursor over the table's children
+ int i = 0; // running index printed with each theater
+ while(nav != NULL) {
+ elem = nav;
+ if (startsTheatherData(elem)) { // used as a boolean: any non-zero result enters the block
+ elem = elem->children; // first child of the row (expected: td)
+ elem = elem->children; // first child of the td (expected: a)
+ elem = elem->children; // first child of the a (expected: b)
- printf("First theater = %s\n", xmlNodeGetContent(elem));
+ printf("Theather %d = %s\n", i++, xmlNodeGetContent(elem)); // NOTE(review): the string returned by xmlNodeGetContent is never released here - confirm ownership
+ printf("-------------------------------------------------\n");
+
+ xmlNodePtr n1 = nav->next; // row after the header; per the original note such a row has 4 td cells covering 2 films
+ while(!startsTheatherData(n1) && !isSeparatorTR(n1)) { // continue until the next theater header or a separator row; NOTE(review): n1 is dereferenced in the body with no NULL check
+ elem = n1->children->next; // second td of the row; the first td is for the rating
+ printf("%s\n", xmlNodeGetContent(elem->children->children));
+ if (childrenCount(n1->children) > 2) { // NOTE(review): this counts the children of the row's first td, not of the row itself - confirm that is intended
+ elem = elem->next->next; // move two cells forward (presumably past the second film's rating td - TODO confirm)
+ printf("%s\n", xmlNodeGetContent(elem->children->children));
+ }
+ n1 = n1->next;
+ }
+
+ printf("\n\n");
+ }
+ nav = nav->next;
+ }
}