# Registry of the supported comics, keyed by the short id used throughout
# this module.  Every entry carries:
#   name   - human readable title
#   link   - home page of the comic
#   start  - presumably the id fetching starts from when the database is
#            empty (verify against the caller); a 10-character
#            "YYYY-MM-DD" value is handled as a date id by the fetch code
#   dbfile - path of the per-comic CSV database file
comics = {
    "xkcd": {
        "name": "xkcd",
        "link": "http://xkcd.org/",
        "start": 666,
        "dbfile": dbdir + "comicdb.xkcd.csv",
    },
    "sinfest": {
        "name": "Sinfest",
        "link": "http://sinfest.com/",
        "start": 3400,
        "dbfile": dbdir + "comicdb.sinfest.csv",
    },
    "phd": {
        "name": "PHD Comics",
        "link": "http://www.phdcomics.com/",
        "start": 1240,
        "dbfile": dbdir + "comicdb.phd.csv",
    },
    "dilbert": {
        "name": "Dilbert",
        "link": "http://dilbert.com/",
        "start": "2009-01-01",
        "dbfile": dbdir + "comicdb.dilbert.csv",
    },
    "cyanide": {
        "name": "Cyanide&Happiness",
        # Same site the scraper requests its pages from (www.explosm.net);
        # the previous value pointed at explosm.com.
        "link": "http://explosm.net/",
        "start": "1920",
        "dbfile": dbdir + "comicdb.cyanide.csv",
    },
}

# NOTE(review): module-level defaults; their consumers are not visible in
# this part of the file.  `next` shadows the builtin of the same name for
# the whole module - kept as-is for compatibility, consider renaming.
previous = False
next = False
# handling of the comics
class ComicDb():
for row in dbr:
self.db.insert(0,row)
dbf.close()
+ if len(self.db) == 0:
+ self.refresh()
self.currentcomic = 0
dbf = open(self.dbfile, 'w')
dbf.write('comic,id,link,url,filename,title\n')
dbf.close()
+
if os.path.isfile(self.dbfile) == True:
dbf = open(self.dbfile, 'r')
return dbf
def refresh(self):
if len(self.db) < 1:
+ self.currentcomic = -1
self.fetch_latest_std(self.comic, self.start)
elif self.currentcomic == 0 or self.currentcomic < 0:
self.fetch_latest_std(self.comic, self.db[0]['id'])
def fetch_latest_std(self, comic, latest):
print "fetching new after " + str(comic) + " " + str(latest)
+ next = False
dateid = False
- if len(str(latest)) == 10:
- # date id.
- dateid = True
- dt = string.split(latest, "-")
- d = datetime.date(int(dt[0]),int(dt[1]),int(dt[2]))
- newer = d + datetime.timedelta( 1 )
- comicid = newer.isoformat()
- else:
- comicid = int(latest) + 1
+ if comic == 'cyanide':
+ next = self.get_next_id(comic, latest)
+ if not next:
+ return
+ else:
+ comicid = next
+ else:
+ if len(str(latest)) == 10:
+ # date id.
+ dateid = True
+ dt = string.split(latest, "-")
+ d = datetime.date(int(dt[0]),int(dt[1]),int(dt[2]))
+ newer = d + datetime.timedelta( 1 )
+ comicid = newer.isoformat()
+ else:
+ comicid = int(latest) + 1
- lasturl = self.db[0]['url']
+ if len(self.db) > 0:
+ lasturl = self.db[0]['url']
+ else:
+ lasturl = "http"
while True:
irow = self.get_irow(comic, comicid)
if irow:
print "got irow: " + str(irow)
- if irow[3] == lasturl:
- print "Looping the same, break break break!"
- break
- lasturl = irow[3]
- print "inserting..."
- self.insert_row(irow)
+ if str(irow[0]) == 'skip':
+ print "skipping this one..."
+ next = int(irow[1])
+ else:
+ if irow[3] == lasturl:
+ print "Looping the same, break break break!"
+ break
+ lasturl = irow[3]
+ print "inserting..."
+ self.insert_row([irow[0],irow[1],irow[2],irow[3],irow[4],irow[5]])
+ if len(irow) > 6:
+ next = irow[6]
+ if not next:
+ break
if dateid:
dt = string.split(comicid, "-")
d = datetime.date(int(dt[0]),int(dt[1]),int(dt[2]))
newer = d + datetime.timedelta( 1 )
comicid = newer.isoformat()
+ elif next:
+ comicid = next
else:
comicid += 1
else:
break
+
+
def get_next_id(self, comic, number):
    """Return the id of the comic that follows ``number``, or False.

    Only ``comic == 'cyanide'`` is supported: the page of comic
    ``number`` is downloaded from www.explosm.net and the id is cut out
    of the target of its "Next" link.

    Args:
        comic: short comic id.
        number: id of the comic whose successor is wanted.

    Returns:
        The next id as a string, or False when the comic is not
        ``'cyanide'``, the page could not be fetched (any failure is
        reported as status 404), the status is not 200, or no "Next"
        link is present.
    """
    if comic != 'cyanide':
        return False

    link = "http://www.explosm.net/comics/" + str(number) + "/"
    print("link: " + link)

    f = None
    try:
        f = urllib2.urlopen(link)
        hcode = f.code
    except Exception:
        # Best effort: a failed request is handled like a missing page.
        # (Unlike a bare except this lets KeyboardInterrupt/SystemExit
        # propagate.)
        hcode = 404
    print("got hcode = " + str(hcode) + "\n")

    if hcode != 200:
        if f is not None:
            f.close()
        return False

    print("Cyanide & Happiness is unreliable, so we need to track next and prev")
    try:
        s = f.read()
    finally:
        f.close()

    # The "Next" link directly follows the "Previous" link in the
    # navigation bar; the id sits between the two markers below.
    splt = s.split('Previous</a> | <a href="/comics/', 1)
    if len(splt) < 2:
        print("no 'next' found")
        return False
    splt = splt[1].split('/">Next >', 1)
    if len(splt) < 2:
        print("no 'next' found")
        return False
    return splt[0]
+
+
+
+
+
+
def get_irow(self, comic, number):
dateid = False
if len(str(number)) == 10:
link = "http://dilbert.com/" + str(number) + "/"
elif comic == 'phd':
link = "http://www.phdcomics.com/comics/archive.php?comicid=" + str(number)
+ elif comic == 'cyanide':
+ link = "http://www.explosm.net/comics/" + str(number) + "/"
else:
return False
print "Fake 404! Break break break!"
return False
-
+ elif comic == 'cyanide':
+ s = f.read()
+ f.close()
+ # title:
+ splt = string.split(s, ' First</a> | <a href="/comics/', 1)
+ if len(splt) < 2:
+ print "first comic?"
+ prev = False
+ else:
+ splt = string.split(splt[1], '/">< ', 1)
+ prev = splt[0];
+ splt = string.split(s, 'Previous</a> | <a href="/comics/', 1)
+ if len(splt) < 2:
+ print "last comic?"
+ next = False
+ else:
+ splt = string.split(splt[1], '/">Next ></a>', 1)
+ if len(splt[0]) > 10:
+ next = False
+ else:
+ next = splt[0]
+
+ splt = string.split(s, '</tr><tr><td colspan=2>', 1)
+ if len(splt) < 2:
+ print "no comic?"
+ return False
+ splt = string.split(splt[1], ' <b>by <a href="', 1)
+ title = splt[0];
+ splt = string.split(splt[1], 'a daily webcomic" src="http://www.explosm', 1)
+ if len(splt) < 2:
+ print "a video? Try skipping"
+ return ['skip',next,prev]
+
+ splt = string.split(splt[1], '"></div><br />', 1)
+ url = "http://www.explosm" + splt[0]
+ splt2 = string.rsplit(url, "/", 1)
+ filename = splt2[1]
+ irow = [comic,number,link,url,filename,title,next,prev]
+ return irow
+
splt2 = string.rsplit(url, "/", 1)
filename = splt2[1]
irow = [comic,number,link,url,filename,title]