When checking if an article has been updated, compare the content.
author: Neal H. Walfield <neal@walfield.org>
Sat, 10 Dec 2011 14:52:25 +0000 (14:52 +0000)
committer: Neal H. Walfield <neal@walfield.org>
Sat, 10 Dec 2011 14:52:25 +0000 (14:52 +0000)
 - Don't rely on an article's date to determine whether it has been
   updated.
 - Store the hash of the original content in the database.
   - Add a new column, contentHash, to the feed database.
   - When adding a new article, set it appropriately.
 - Use the hash to determine whether the text has been updated.

src/rss_sqlite.py

index e64c8cc..b1ddad9 100644 (file)
@@ -229,14 +229,24 @@ class Feed(BaseObject):
         self.key = key
         self.configdir = configdir
         self.dir = "%s/%s.d" %(self.configdir, self.key)
-        self.tls = threading.local ()
+        self.tls = threading.local()
 
         if not isdir(self.dir):
             mkdir(self.dir)
-        if not isfile("%s/%s.db" %(self.dir, self.key)):
-            self.db.execute("CREATE TABLE feed (id text, title text, contentLink text, date float, updated float, link text, read int);")
+        filename = "%s/%s.db" % (self.dir, self.key)
+        if not isfile(filename):
+            self.db.execute("CREATE TABLE feed (id text, title text, contentLink text, contentHash text, date float, updated float, link text, read int);")
             self.db.execute("CREATE TABLE images (id text, imagePath text);")
             self.db.commit()
+        else:
+            try:
+                self.db.execute("ALTER TABLE feed ADD COLUMN contentHash text")
+                self.db.commit()
+            except sqlite3.OperationalError, e:
+                if 'duplicate column name' in str(e):
+                    pass
+                else:
+                    logger.exception("Add column contentHash to %s", filename)
 
     def addImage(self, configdir, key, baseurl, url, proxy=None, opener=None):
         filename = configdir+key+".d/"+getId(url)
@@ -489,16 +499,26 @@ class Feed(BaseObject):
                    if(not(entry.has_key("id"))):
                        entry["id"] = None
                    content = self.extractContent(entry)
+                   contentHash = getId(content)
                    object_size = len (content)
                    tmpEntry = {"title":entry["title"], "content":content,
                                 "date":date, "link":entry["link"], "author":entry["author"], "id":entry["id"]}
                    id = self.generateUniqueId(tmpEntry)
                    
                    current_version = self.db.execute(
-                       'select date, ROWID from feed where id=?',
+                       'select date, ROWID, contentHash from feed where id=?',
                        (id,)).fetchone()
                    if (current_version is not None
-                       and current_version[0] == date):
+                       # To detect updates, don't compare by date:
+                       # compare by content.
+                       #
+                       # - If an article update is just a date change
+                       #   and the content remains the same, we don't
+                       #   want to register an update.
+                       #
+                       # - If an article's content changes but not the
+                       #   date, we want to recognize an update.
+                       and current_version[2] == contentHash):
                        logger.debug("ALREADY DOWNLOADED %s (%s)"
                                     % (entry["title"], entry["link"]))
                        ## This article is already present in the feed listing. Update the "updated" time, so it doesn't expire 
@@ -566,6 +586,7 @@ class Feed(BaseObject):
                    values = {'id': id,
                              'title': tmpEntry["title"],
                              'contentLink': tmpEntry["contentLink"],
+                             'contentHash': contentHash,
                              'date': tmpEntry["date"],
                              'updated': currentTime,
                              'link': tmpEntry["link"],
@@ -739,6 +760,9 @@ class Feed(BaseObject):
     def getContentLink(self, id):
         return self.db.execute("SELECT contentLink FROM feed WHERE id=?;", (id,) ).fetchone()[0]
     
+    def getContentHash(self, id):
+        return self.db.execute("SELECT contentHash FROM feed WHERE id=?;", (id,) ).fetchone()[0]
+    
     def getExternalLink(self, id):
         return self.db.execute("SELECT link FROM feed WHERE id=?;", (id,) ).fetchone()[0]