psa: Full width articles
diff --git a/psa_harmattan/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit/rss_sqlite.py b/psa_harmattan/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit/rss_sqlite.py
index bcefd42..867e1af 100644
--- a/psa_harmattan/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit/rss_sqlite.py
+++ b/psa_harmattan/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit/rss_sqlite.py
@@ -54,6 +54,9 @@ import logging
 logger = logging.getLogger(__name__)
 
 def getId(string):
+    if isinstance(string, unicode):
+        string = string.encode('utf8', 'replace')
+
     return md5.new(string).hexdigest()
 
 def download_callback(connection):
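The encode guard matters because, in Python 2, hashing a unicode string that
contains non-ASCII characters raises UnicodeEncodeError.  A minimal sketch of
the failure and the fix, using only the stdlib md5 module this file already
relies on:

    import md5

    title = u'caf\xe9'
    try:
        md5.new(title)                    # raises UnicodeEncodeError
    except UnicodeEncodeError:
        pass
    print md5.new(title.encode('utf8', 'replace')).hexdigest()  # works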
@@ -226,14 +229,24 @@ class Feed(BaseObject):
         self.key = key
         self.configdir = configdir
         self.dir = "%s/%s.d" %(self.configdir, self.key)
-        self.tls = threading.local ()
+        self.tls = threading.local()
 
         if not isdir(self.dir):
             mkdir(self.dir)
-        if not isfile("%s/%s.db" %(self.dir, self.key)):
-            self.db.execute("CREATE TABLE feed (id text, title text, contentLink text, date float, updated float, link text, read int);")
+        filename = "%s/%s.db" % (self.dir, self.key)
+        if not isfile(filename):
+            self.db.execute("CREATE TABLE feed (id text, title text, contentLink text, contentHash text, date float, updated float, link text, read int);")
             self.db.execute("CREATE TABLE images (id text, imagePath text);")
             self.db.commit()
+        else:
+            try:
+                self.db.execute("ALTER TABLE feed ADD COLUMN contentHash text")
+                self.db.commit()
+            except sqlite3.OperationalError, e:
+                if 'duplicate column name' in str(e):
+                    pass
+                else:
+                    logger.exception("Adding column contentHash to %s failed", filename)
 
     def addImage(self, configdir, key, baseurl, url, proxy=None, opener=None):
         filename = configdir+key+".d/"+getId(url)
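The ALTER TABLE branch above is the usual idempotent-migration idiom for
SQLite, which has no "ADD COLUMN IF NOT EXISTS": attempt the ALTER and treat a
"duplicate column name" error as success.  A self-contained sketch of the
pattern (ensure_column is illustrative, not part of this file):

    import sqlite3

    def ensure_column(db, table, column, coltype):
        try:
            db.execute("ALTER TABLE %s ADD COLUMN %s %s"
                       % (table, column, coltype))
            db.commit()
        except sqlite3.OperationalError, e:
            if 'duplicate column name' not in str(e):
                raise

    db = sqlite3.connect(':memory:')
    db.execute("CREATE TABLE feed (id text)")
    ensure_column(db, 'feed', 'contentHash', 'text')
    ensure_column(db, 'feed', 'contentHash', 'text')  # second call is a no-op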
@@ -486,33 +499,43 @@ class Feed(BaseObject):
                    if(not(entry.has_key("id"))):
                        entry["id"] = None
                    content = self.extractContent(entry)
+                   contentHash = getId(content)
                    object_size = len (content)
                    tmpEntry = {"title":entry["title"], "content":content,
                                 "date":date, "link":entry["link"], "author":entry["author"], "id":entry["id"]}
                    id = self.generateUniqueId(tmpEntry)
                    
                    current_version = self.db.execute(
-                       'select date, ROWID from feed where id=?',
+                       'select date, ROWID, contentHash from feed where id=?',
                        (id,)).fetchone()
                    if (current_version is not None
-                       and current_version[0] == date):
+                       # To detect updates, don't compare by date:
+                       # compare by content.
+                       #
+                       # - If an article update is just a date change
+                       #   and the content remains the same, we don't
+                       #   want to register an update.
+                       #
+                       # - If an article's content changes but not the
+                       #   date, we want to recognize an update.
+                       and current_version[2] == contentHash):
                        logger.debug("ALREADY DOWNLOADED %s (%s)"
                                     % (entry["title"], entry["link"]))
-                       ## This article is already present in the feed listing. Update the "updated" time, so it doesn't expire                                                      
-                       self.db.execute("UPDATE feed SET updated=? WHERE id=?;",(currentTime,id))                                                                                    
-                       try:                                                                                                                                                         
-                           logger.debug("Updating already downloaded files for %s" %(id))                                                                                           
-                           filename = configdir+self.key+".d/"+id+".html"                                                                                                           
-                           file = open(filename,"a")                                                                                                                                
-                           utime(filename, None)                                                                                                                                    
-                           file.close()                                                                                                                                             
-                           images = self.db.execute("SELECT imagePath FROM images where id=?;", (id, )).fetchall()                                                                  
-                           for image in images:                                                                                                                                     
-                                file = open(image[0],"a")                                                                                                                           
-                                utime(image[0], None)                                                                                                                               
-                                file.close()                                                                                                                                        
-                       except:                                                                                                                                                      
-                           logger.debug("Error in refreshing images for %s" % (id))                                                                                                 
+                       ## This article is already present in the feed listing. Update the "updated" time, so it doesn't expire 
+                       self.db.execute("UPDATE feed SET updated=? WHERE id=?;",(currentTime,id))
+                       try: 
+                           logger.debug("Updating already downloaded files for %s" %(id))
+                           filename = configdir+self.key+".d/"+id+".html"
+                           file = open(filename,"a")
+                           utime(filename, None)
+                           file.close()
+                           images = self.db.execute("SELECT imagePath FROM images where id=?;", (id, )).fetchall()
+                           for image in images:
+                                file = open(image[0],"a")
+                                utime(image[0], None)
+                                file.close()
+                       except:
+                           logger.debug("Error in refreshing images for %s" % (id))
                        self.db.commit()
                        continue                       
 
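The new comparison amounts to a three-way classification that keys on content
rather than date.  A sketch of that logic (article_state is a hypothetical
helper; the real code inlines this above):

    def article_state(db, entry_id, content_hash):
        row = db.execute('SELECT contentHash FROM feed WHERE id=?',
                         (entry_id,)).fetchone()
        if row is None:
            return 'new'
        if row[0] == content_hash:
            return 'unchanged'   # a date-only change is not an update
        return 'updated'         # content changed even if the date did not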
@@ -520,7 +543,6 @@ class Feed(BaseObject):
                        # The version was updated.  Mark it as unread.
                        logger.debug("UPDATED: %s (%s)"
                                     % (entry["title"], entry["link"]))
-                       self.setEntryUnread(id)
                        updated_objects += 1
                    else:
                        logger.debug("NEW: %s (%s)"
@@ -531,7 +553,6 @@ class Feed(BaseObject):
                    soup = BeautifulSoup(self.getArticle(tmpEntry)) #tmpEntry["content"])
                    images = soup('img')
                    baseurl = tmpEntry["link"]
-                   #if not id in ids:
                    if imageCache and len(images) > 0:
                        self.serial_execution_lock.release ()
                        have_serial_execution_lock = False
@@ -565,6 +586,7 @@ class Feed(BaseObject):
                    values = {'id': id,
                              'title': tmpEntry["title"],
                              'contentLink': tmpEntry["contentLink"],
+                             'contentHash': contentHash,
                              'date': tmpEntry["date"],
                              'updated': currentTime,
                              'link': tmpEntry["link"],
@@ -738,6 +760,9 @@ class Feed(BaseObject):
     def getContentLink(self, id):
         return self.db.execute("SELECT contentLink FROM feed WHERE id=?;", (id,) ).fetchone()[0]
     
+    def getContentHash(self, id):
+        return self.db.execute("SELECT contentHash FROM feed WHERE id=?;", (id,) ).fetchone()[0]
+    
     def getExternalLink(self, id):
         return self.db.execute("SELECT link FROM feed WHERE id=?;", (id,) ).fetchone()[0]
     
@@ -833,13 +858,18 @@ class Feed(BaseObject):
         return text
    
     def getContent(self, id):
-        contentLink = self.db.execute("SELECT contentLink FROM feed WHERE id=?;", (id,)).fetchone()[0]
+        """
+        Return the content of the article with the specified ID.  If
+        the content is not available, returns None.
+        """
+        contentLink = self.getContentLink(id)
         try:
-            file = open(self.entries[id]["contentLink"])
-            content = file.read()
-            file.close()
-        except:
-            content = "Content unavailable"
+            with open(contentLink, 'r') as file:
+                content = file.read()
+        except Exception:
+            logger.exception("Failed get content for %s: reading %s failed",
+                             id, contentLink)
+            content = None
         return content
     
     def extractDate(self, entry):
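Because getContent now returns None instead of the old "Content unavailable"
placeholder string, callers must supply their own fallback.  A usage sketch
(feed and article_id are illustrative names):

    content = feed.getContent(article_id)
    if content is None:
        content = "<p>Content unavailable</p>"   # caller-chosen fallback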
@@ -889,33 +919,41 @@ class ArchivedArticles(Feed):
         self.db.execute("INSERT INTO feed (id, title, contentLink, date, updated, link, read) VALUES (?, ?, ?, ?, ?, ?, ?);", values)
         self.db.commit()
 
-    def updateFeed(self, configdir, url, etag, modified, expiryTime=24, proxy=None, imageCache=False):
+    # Feed.updateFeed calls this function.
+    def _updateFeed(self, configdir, url, etag, modified, expiryTime=24, proxy=None, imageCache=False, priority=0, postFeedUpdateFunc=None, *postFeedUpdateFuncArgs):
         currentTime = 0
         rows = self.db.execute("SELECT id, link FROM feed WHERE updated=0;")
         for row in rows:
-            currentTime = time.time()
-            id = row[0]
-            link = row[1]
-            f = urllib2.urlopen(link)
-            #entry["content"] = f.read()
-            html = f.read()
-            f.close()
-            soup = BeautifulSoup(html)
-            images = soup('img')
-            baseurl = link
-            for img in images:
-                filename = self.addImage(configdir, self.key, baseurl, img['src'], proxy=proxy)
-                img['src']=filename
-                self.db.execute("INSERT INTO images (id, imagePath) VALUES (?, ?);", (id, filename) )
+            try:
+                currentTime = time.time()
+                id = row[0]
+                link = row[1]
+                f = urllib2.urlopen(link)
+                #entry["content"] = f.read()
+                html = f.read()
+                f.close()
+                soup = BeautifulSoup(html)
+                images = soup('img')
+                baseurl = link
+                for img in images:
+                    filename = self.addImage(configdir, self.key, baseurl, img['src'], proxy=proxy)
+                    img['src']=filename
+                    self.db.execute("INSERT INTO images (id, imagePath) VALUES (?, ?);", (id, filename) )
+                    self.db.commit()
+                contentLink = configdir+self.key+".d/"+id+".html"
+                file = open(contentLink, "w")
+                file.write(soup.prettify())
+                file.close()
+                
+                self.db.execute("UPDATE feed SET read=0, contentLink=?, updated=? WHERE id=?;", (contentLink, time.time(), id) )
                 self.db.commit()
-            contentLink = configdir+self.key+".d/"+id+".html"
-            file = open(contentLink, "w")
-            file.write(soup.prettify())
-            file.close()
-            
-            self.db.execute("UPDATE feed SET read=0, contentLink=?, updated=? WHERE id=?;", (contentLink, time.time(), id) )
-            self.db.commit()
-        return (currentTime, None, None)
+            except:
+                logger.error("Error updating Archived Article: %s %s"
+                             % (link,traceback.format_exc(),))
+
+        if postFeedUpdateFunc is not None:
+            postFeedUpdateFunc (self.key, currentTime, None, None, None,
+                                *postFeedUpdateFuncArgs)
     
     def purgeReadArticles(self):
         rows = self.db.execute("SELECT id FROM feed WHERE read=1;")
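_updateFeed no longer returns the (currentTime, etag, modified) tuple; results
are delivered through postFeedUpdateFunc instead, which lets the update run on
a worker thread.  A sketch of the callback contract as called above (the three
None arguments likely correspond to etag, modified, and a title, though this
diff does not show the receiving side; on_feed_updated is illustrative):

    def on_feed_updated(key, update_time, etag, modified, title, *args):
        print "feed %s updated at %s" % (key, update_time)

    feed._updateFeed(configdir, url, None, None,
                     postFeedUpdateFunc=on_feed_updated)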
@@ -986,7 +1024,7 @@ class Listing(BaseObject):
         # state.
         try:
             updater = os.path.basename(sys.argv[0]) == 'update_feeds.py'
-            wc_init (self, True if updater else False)
+            wc_init(config, self, updater)
             if wc().available() and updater:
                 # The list of known streams.
                 streams = wc().streams_list ()
@@ -1002,19 +1040,22 @@ class Listing(BaseObject):
                         logger.debug(
                             "Registering previously unknown channel: %s (%s)"
                             % (key, title,))
-                        # Use a default refresh interval of 6 hours.
-                        wc().stream_register (key, title, 6 * 60 * 60)
+                        wc().stream_register(
+                            key, title,
+                            self.config.getUpdateInterval() * 60 * 60)
                     else:
                         # Make sure the human readable name is up to date.
                         if wc()[key].human_readable_name != title:
                             wc()[key].human_readable_name = title
                         stream_ids.remove (key)
+                        wc()[key].freshness \
+                            = self.config.getUpdateInterval() * 60 * 60
                         
     
                 # Unregister any streams that are no longer subscribed to.
                 for id in stream_ids:
                     logger.debug("Unregistering %s" % (id,))
-                    w.stream_unregister (id)
+                    wc().stream_unregister (id)
         except Exception:
             logger.exception("Registering streams with Woodchuck")
 
@@ -1223,7 +1264,7 @@ class Listing(BaseObject):
     
     def getCategoryTitle(self, id):
         return self.lookup('categories', 'title', id)
-
+    
     def getCategoryUnread(self, id):
         count = 0
         for key in self.getListOfFeeds(category=id):
@@ -1294,6 +1335,7 @@ class Listing(BaseObject):
                                      human_readable_name=title,
                                      freshness=6*60*60)
 
+            self.cache_invalidate('feeds')
             return True
         else:
             return False
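The cache_invalidate call keeps the cached feed list (maintained by
BaseObject, which this diff does not show) from serving stale results after
the insert.  A minimal sketch of that pattern, with illustrative names:

    class FeedsCache:
        def __init__(self):
            self._cache = {}

        def get_feeds(self, db):
            if 'feeds' not in self._cache:
                self._cache['feeds'] = db.execute(
                    "SELECT id FROM feeds;").fetchall()
            return self._cache['feeds']

        def cache_invalidate(self, table):
            self._cache.pop(table, None)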
@@ -1313,7 +1355,7 @@ class Listing(BaseObject):
         if wc().available ():
             try:
                 del wc()[key]
-            except KeyError:
+            except (KeyError, woodchuck.Error):
                 logger.debug("Removing unregistered feed %s failed" % (key,))
 
         rank = self.db.execute("SELECT rank FROM feeds WHERE id=?;", (key,) ).fetchone()[0]
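The parenthesized except clause avoids a classic Python 2 pitfall:
"except KeyError, woodchuck.Error:" does not catch two exception types; it
catches only KeyError and rebinds the name woodchuck.Error to the caught
instance.  Catching several types requires a tuple:

    try:
        {}['missing']
    except (KeyError, ValueError):   # catches either exception type
        pass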