self.key = key
self.configdir = configdir
self.dir = "%s/%s.d" %(self.configdir, self.key)
- self.tls = threading.local ()
+ self.tls = threading.local()
if not isdir(self.dir):
mkdir(self.dir)
- if not isfile("%s/%s.db" %(self.dir, self.key)):
- self.db.execute("CREATE TABLE feed (id text, title text, contentLink text, date float, updated float, link text, read int);")
+ filename = "%s/%s.db" % (self.dir, self.key)
+ if not isfile(filename):
+ self.db.execute("CREATE TABLE feed (id text, title text, contentLink text, contentHash text, date float, updated float, link text, read int);")
self.db.execute("CREATE TABLE images (id text, imagePath text);")
self.db.commit()
+ else:
+ try:
+ self.db.execute("ALTER TABLE feed ADD COLUMN contentHash text")
+ self.db.commit()
+ except sqlite3.OperationalError, e:
+ if 'duplicate column name' in str(e):
+ pass
+ else:
+ logger.exception("Add column contentHash to %s", filename)
def addImage(self, configdir, key, baseurl, url, proxy=None, opener=None):
filename = configdir+key+".d/"+getId(url)
if(not(entry.has_key("id"))):
entry["id"] = None
content = self.extractContent(entry)
+ contentHash = getId(content)
object_size = len (content)
tmpEntry = {"title":entry["title"], "content":content,
"date":date, "link":entry["link"], "author":entry["author"], "id":entry["id"]}
id = self.generateUniqueId(tmpEntry)
current_version = self.db.execute(
- 'select date, ROWID from feed where id=?',
+ 'select date, ROWID, contentHash from feed where id=?',
(id,)).fetchone()
if (current_version is not None
- and current_version[0] == date):
+ # To detect updates, don't compare by date:
+ # compare by content.
+ #
+ # - If an article update is just a date change
+ # and the content remains the same, we don't
+ # want to register an update.
+ #
+ # - If an article's content changes but not the
+ # date, we want to recognize an update.
+ and current_version[2] == contentHash):
logger.debug("ALREADY DOWNLOADED %s (%s)"
% (entry["title"], entry["link"]))
## This article is already present in the feed listing. Update the "updated" time, so it doesn't expire
values = {'id': id,
'title': tmpEntry["title"],
'contentLink': tmpEntry["contentLink"],
+ 'contentHash': contentHash,
'date': tmpEntry["date"],
'updated': currentTime,
'link': tmpEntry["link"],
def getContentLink(self, id):
    """Return the contentLink column of the feed row whose id matches."""
    query = "SELECT contentLink FROM feed WHERE id=?;"
    row = self.db.execute(query, (id,)).fetchone()
    # Subscript the row directly so an unmatched id fails exactly as before.
    return row[0]
+ def getContentHash(self, id):  # Fetch the contentHash column of the feed row whose id matches.
+ return self.db.execute("SELECT contentHash FROM feed WHERE id=?;", (id,) ).fetchone()[0]  # NOTE(review): fetchone() presumably yields None for an unknown id, so [0] would raise TypeError -- confirm callers only pass existing ids
+
def getExternalLink(self, id):
    """Look up the link column of the feed row matching the given id."""
    cursor = self.db.execute("SELECT link FROM feed WHERE id=?;", (id,))
    first_row = cursor.fetchone()
    # No guard for a missing row: the subscript raises just as the one-liner did.
    return first_row[0]