From 7f2133b566843a83028a944e69c708655cf27c41 Mon Sep 17 00:00:00 2001
From: "Neal H. Walfield"
Date: Sat, 10 Dec 2011 14:52:25 +0000
Subject: [PATCH] When checking if an article has been updated, compare the content.

 - Don't rely on an article's date to determine whether it has been updated.
 - Store the hash of the original content in the database.
 - Add a new column, contentHash to the feed database.
 - When adding a new article, set it appropriately.
 - Use the hash to determine whether the text has been updated.
---
 src/rss_sqlite.py | 34 +++++++++++++++++++++++++++++-----
 1 file changed, 29 insertions(+), 5 deletions(-)

diff --git a/src/rss_sqlite.py b/src/rss_sqlite.py
index e64c8cc..b1ddad9 100644
--- a/src/rss_sqlite.py
+++ b/src/rss_sqlite.py
@@ -229,14 +229,24 @@ class Feed(BaseObject):
         self.key = key
         self.configdir = configdir
         self.dir = "%s/%s.d" %(self.configdir, self.key)
-        self.tls = threading.local ()
+        self.tls = threading.local()
 
         if not isdir(self.dir):
             mkdir(self.dir)
-        if not isfile("%s/%s.db" %(self.dir, self.key)):
-            self.db.execute("CREATE TABLE feed (id text, title text, contentLink text, date float, updated float, link text, read int);")
+        filename = "%s/%s.db" % (self.dir, self.key)
+        if not isfile(filename):
+            self.db.execute("CREATE TABLE feed (id text, title text, contentLink text, contentHash text, date float, updated float, link text, read int);")
             self.db.execute("CREATE TABLE images (id text, imagePath text);")
             self.db.commit()
+        else:
+            try:
+                self.db.execute("ALTER TABLE feed ADD COLUMN contentHash text")
+                self.db.commit()
+            except sqlite3.OperationalError, e:
+                if 'duplicate column name' in str(e):
+                    pass
+                else:
+                    logger.exception("Add column contentHash to %s", filename)
 
     def addImage(self, configdir, key, baseurl, url, proxy=None, opener=None):
         filename = configdir+key+".d/"+getId(url)
@@ -489,16 +499,26 @@ class Feed(BaseObject):
                     if(not(entry.has_key("id"))):
                         entry["id"] = None
                     content = self.extractContent(entry)
+                    contentHash = getId(content)
                     object_size = len (content)
                     tmpEntry = {"title":entry["title"], "content":content,
                                 "date":date, "link":entry["link"], "author":entry["author"], "id":entry["id"]}
                     id = self.generateUniqueId(tmpEntry)
 
                     current_version = self.db.execute(
-                        'select date, ROWID from feed where id=?',
+                        'select date, ROWID, contentHash from feed where id=?',
                         (id,)).fetchone()
                     if (current_version is not None
-                        and current_version[0] == date):
+                        # To detect updates, don't compare by date:
+                        # compare by content.
+                        #
+                        # - If an article update is just a date change
+                        #   and the content remains the same, we don't
+                        #   want to register an update.
+                        #
+                        # - If an article's content changes but not the
+                        #   date, we want to recognize an update.
+                        and current_version[2] == contentHash):
                         logger.debug("ALREADY DOWNLOADED %s (%s)"
                                      % (entry["title"], entry["link"]))
                         ## This article is already present in the feed listing. Update the "updated" time, so it doesn't expire
@@ -566,6 +586,7 @@ class Feed(BaseObject):
                         values = {'id': id,
                                   'title': tmpEntry["title"],
                                   'contentLink': tmpEntry["contentLink"],
+                                  'contentHash': contentHash,
                                   'date': tmpEntry["date"],
                                   'updated': currentTime,
                                   'link': tmpEntry["link"],
@@ -739,6 +760,9 @@ class Feed(BaseObject):
     def getContentLink(self, id):
         return self.db.execute("SELECT contentLink FROM feed WHERE id=?;", (id,) ).fetchone()[0]
 
+    def getContentHash(self, id):
+        return self.db.execute("SELECT contentHash FROM feed WHERE id=?;", (id,) ).fetchone()[0]
+
     def getExternalLink(self, id):
         return self.db.execute("SELECT link FROM feed WHERE id=?;", (id,) ).fetchone()[0]
 
-- 
1.7.9.5