Fix implementation of feed.getContent.
diff --git a/src/rss_sqlite.py b/src/rss_sqlite.py
index a1f5d2d..9cbeb82 100644
@@ -392,13 +392,13 @@ class Feed(BaseObject):
                 logger.debug("%s: No changes to feed." % (self.key,))
                 mainthread.execute(wc_success, async=True)
                 success = True
-            elif len(tmp["entries"])==0 and not tmp.version:
+            elif len(tmp["entries"])==0 and not tmp.get('version', None):
                 # An error occurred fetching or parsing the feed.  (Version
                 # will be either None if e.g. the connection timed out or
                 # '' if the data is not a proper feed)
                 logger.error(
                     "Error fetching %s: version is: %s: error: %s"
-                    % (url, str (tmp.version),
+                    % (url, str (tmp.get('version', 'unset')),
                        str (tmp.get ('bozo_exception', 'Unknown error'))))
                 logger.debug(tmp)
                 def register_stream_update_failed(http_status):
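The change above swaps attribute access (tmp.version) for dict-style .get(): when the fetch or parse fails, the result may carry no usable "version" key at all. A minimal sketch of that failure mode, assuming the feedparser package and a deliberately unreachable URL:

    import feedparser

    # Hypothetical dead URL; on failure feedparser still returns a result
    # object with an empty "entries" list.
    tmp = feedparser.parse("http://example.invalid/feed")
    if len(tmp["entries"]) == 0 and not tmp.get('version', None):
        # .get() degrades gracefully where tmp.version could blow up.
        print("fetch/parse failed: %s"
              % tmp.get('bozo_exception', 'Unknown error'))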
@@ -456,8 +456,6 @@ class Feed(BaseObject):
                #reversedEntries = self.getEntries()
                #reversedEntries.reverse()
     
-               ids = self.getIds()
-    
                tmp["entries"].reverse()
                for entry in tmp["entries"]:
                    # Yield so as to make the main thread a bit more
@@ -493,20 +491,35 @@ class Feed(BaseObject):
                                 "date":date, "link":entry["link"], "author":entry["author"], "id":entry["id"]}
                    id = self.generateUniqueId(tmpEntry)
                    
-                   current_version \
-                       = self.db.execute('select date from feed where id=?',
-                                         (id,)).fetchone()
+                   current_version = self.db.execute(
+                       'select date, ROWID from feed where id=?',
+                       (id,)).fetchone()
                    if (current_version is not None
                        and current_version[0] == date):
                        logger.debug("ALREADY DOWNLOADED %s (%s)"
                                     % (entry["title"], entry["link"]))
+                       # This article is already in the feed listing.
+                       # Update the "updated" time so it doesn't expire.
+                       self.db.execute("UPDATE feed SET updated=? WHERE id=?;",
+                                       (currentTime, id))
+                       try:
+                           logger.debug("Updating already downloaded files for %s" % (id,))
+                           filename = configdir+self.key+".d/"+id+".html"
+                           file = open(filename, "a")
+                           utime(filename, None)
+                           file.close()
+                           images = self.db.execute("SELECT imagePath FROM images WHERE id=?;", (id,)).fetchall()
+                           for image in images:
+                               file = open(image[0], "a")
+                               utime(image[0], None)
+                               file.close()
+                       except Exception:
+                           logger.exception("Error refreshing images for %s" % (id,))
+                       self.db.commit()
                        continue                       
 
                    if current_version is not None:
                        # The version was updated.  Mark it as unread.
                        logger.debug("UPDATED: %s (%s)"
                                     % (entry["title"], entry["link"]))
-                       self.setEntryUnread(id)
                        updated_objects += 1
                    else:
                        logger.debug("NEW: %s (%s)"
@@ -517,12 +530,11 @@ class Feed(BaseObject):
                    soup = BeautifulSoup(self.getArticle(tmpEntry)) #tmpEntry["content"])
                    images = soup('img')
                    baseurl = tmpEntry["link"]
-                   #if not id in ids:
                    if imageCache and len(images) > 0:
                        self.serial_execution_lock.release ()
                        have_serial_execution_lock = False
                        for img in images:
-                           if not 'src' in img:
+                           if not img.has_key('src'):
                                continue
 
                            filename = self.addImage(
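The has_key() change matters under BeautifulSoup 3 (the Python 2 library this code targets): on a Tag, the `in` operator searches the tag's contents, not its attributes, so `'src' in img` was the wrong test. A small sketch:

    from BeautifulSoup import BeautifulSoup

    soup = BeautifulSoup('<img src="a.png"/><img alt="no source"/>')
    for img in soup('img'):
        # has_key() tests attributes; "'src' in img" would search children.
        if img.has_key('src'):
            print img['src']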
@@ -547,29 +559,27 @@ class Feed(BaseObject):
                    file = open(tmpEntry["contentLink"], "w")
                    file.write(soup.prettify())
                    file.close()
-                   if id in ids:
-                       self.db.execute("UPDATE feed SET updated=? WHERE id=?;", (currentTime, id) )
-                       self.db.commit()
-                   else:
-                       values = (id, tmpEntry["title"], tmpEntry["contentLink"], tmpEntry["date"], currentTime, tmpEntry["link"], 0)
-                       self.db.execute("INSERT INTO feed (id, title, contentLink, date, updated, link, read) VALUES (?, ?, ?, ?, ?, ?, ?);", values)
-                       self.db.commit()
-#                   else:
-#                       try:
-#                           self.db.execute("UPDATE feed SET updated=? WHERE id=?;", (currentTime, id) )
-#                           self.db.commit()
-#                           filename = configdir+self.key+".d/"+id+".html"
-#                           file = open(filename,"a")
-#                           utime(filename, None)
-#                           file.close()
-#                           images = self.db.execute("SELECT imagePath FROM images where id=?;", (id, )).fetchall()
-#                           for image in images:
-#                                file = open(image[0],"a")
-#                                utime(image[0], None)
-#                                file.close()
-#                       except:
-#                           pass
-    
+
+                   values = {'id': id,
+                             'title': tmpEntry["title"],
+                             'contentLink': tmpEntry["contentLink"],
+                             'date': tmpEntry["date"],
+                             'updated': currentTime,
+                             'link': tmpEntry["link"],
+                             'read': 0}
+
+                   if current_version is not None:
+                       # This is an update.  Ensure that the existing
+                       # entry is replaced.
+                       values['ROWID'] = current_version[1]
+
+                   cols, values = zip(*values.items())
+                   self.db.execute(
+                       "INSERT OR REPLACE INTO feed (%s) VALUES (%s);"
+                       % (','.join(cols), ','.join(('?',) * len(values))),
+                       values)
+                   self.db.commit()
+
                    # Register the object with Woodchuck and mark it as
                    # downloaded.
                    def register_object_transferred(
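The INSERT OR REPLACE above carries the stored ROWID so an updated article overwrites its existing row instead of inserting a duplicate. A self-contained sketch of the idiom against a reduced schema (sqlite3 from the standard library; column names follow the patch):

    import sqlite3

    db = sqlite3.connect(':memory:')
    db.execute("CREATE TABLE feed (id TEXT, title TEXT, updated INTEGER)")

    values = {'id': 'abc', 'title': 'Updated title', 'updated': 1}
    row = db.execute("SELECT ROWID FROM feed WHERE id=?",
                     ('abc',)).fetchone()
    if row is not None:
        # Reusing the old ROWID makes REPLACE overwrite in place.
        values['ROWID'] = row[0]
    cols, vals = zip(*values.items())
    db.execute("INSERT OR REPLACE INTO feed (%s) VALUES (%s);"
               % (','.join(cols), ','.join('?' * len(vals))),
               vals)
    db.commit()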
@@ -821,13 +831,18 @@ class Feed(BaseObject):
         return text
    
     def getContent(self, id):
-        contentLink = self.db.execute("SELECT contentLink FROM feed WHERE id=?;", (id,)).fetchone()[0]
+        """
+        Return the content of the article with the specified ID.  If
+        the content is not available, returns None.
+        """
+        contentLink = self.getContentLink(id)
         try:
-            file = open(self.entries[id]["contentLink"])
-            content = file.read()
-            file.close()
-        except:
-            content = "Content unavailable"
+            with open(contentLink, 'r') as file:
+                content = file.read()
+        except Exception:
+            logger.exception("Failed get content for %s: reading %s failed",
+                             id, contentLink)
+            content = None
         return content
     
     def extractDate(self, entry):
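Note the contract change: getContent() now returns None on failure rather than the old "Content unavailable" sentinel string, so any fallback text becomes the caller's job. A hypothetical caller:

    def render_article(feed, article_id):
        content = feed.getContent(article_id)
        if content is None:
            # The sentinel string moves from the library to the caller.
            content = "Content unavailable"
        return content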
@@ -877,33 +892,41 @@ class ArchivedArticles(Feed):
         self.db.execute("INSERT INTO feed (id, title, contentLink, date, updated, link, read) VALUES (?, ?, ?, ?, ?, ?, ?);", values)
         self.db.commit()
 
-    def updateFeed(self, configdir, url, etag, modified, expiryTime=24, proxy=None, imageCache=False):
+    # Feed.updateFeed calls this function.
+    def _updateFeed(self, configdir, url, etag, modified, expiryTime=24, proxy=None, imageCache=False, priority=0, postFeedUpdateFunc=None, *postFeedUpdateFuncArgs):
         currentTime = 0
         rows = self.db.execute("SELECT id, link FROM feed WHERE updated=0;")
         for row in rows:
-            currentTime = time.time()
-            id = row[0]
-            link = row[1]
-            f = urllib2.urlopen(link)
-            #entry["content"] = f.read()
-            html = f.read()
-            f.close()
-            soup = BeautifulSoup(html)
-            images = soup('img')
-            baseurl = link
-            for img in images:
-                filename = self.addImage(configdir, self.key, baseurl, img['src'], proxy=proxy)
-                img['src']=filename
-                self.db.execute("INSERT INTO images (id, imagePath) VALUES (?, ?);", (id, filename) )
+            try:
+                currentTime = time.time()
+                id = row[0]
+                link = row[1]
+                f = urllib2.urlopen(link)
+                #entry["content"] = f.read()
+                html = f.read()
+                f.close()
+                soup = BeautifulSoup(html)
+                images = soup('img')
+                baseurl = link
+                for img in images:
+                    filename = self.addImage(configdir, self.key, baseurl, img['src'], proxy=proxy)
+                    img['src']=filename
+                    self.db.execute("INSERT INTO images (id, imagePath) VALUES (?, ?);", (id, filename) )
+                    self.db.commit()
+                contentLink = configdir+self.key+".d/"+id+".html"
+                file = open(contentLink, "w")
+                file.write(soup.prettify())
+                file.close()
+                
+                self.db.execute("UPDATE feed SET read=0, contentLink=?, updated=? WHERE id=?;", (contentLink, time.time(), id) )
                 self.db.commit()
-            contentLink = configdir+self.key+".d/"+id+".html"
-            file = open(contentLink, "w")
-            file.write(soup.prettify())
-            file.close()
-            
-            self.db.execute("UPDATE feed SET read=0, contentLink=?, updated=? WHERE id=?;", (contentLink, time.time(), id) )
-            self.db.commit()
-        return (currentTime, None, None)
+            except Exception:
+                logger.error("Error updating archived article %s: %s"
+                             % (link, traceback.format_exc()))
+
+        if postFeedUpdateFunc is not None:
+            postFeedUpdateFunc (self.key, currentTime, None, None, None,
+                                *postFeedUpdateFuncArgs)
     
     def purgeReadArticles(self):
         rows = self.db.execute("SELECT id FROM feed WHERE read=1;")
@@ -972,36 +995,42 @@ class Listing(BaseObject):
 
         # Check that Woodchuck's state is up to date with respect our
         # state.
-        updater = os.path.basename(sys.argv[0]) == 'update_feeds.py'
-        wc_init (self, True if updater else False)
-        if wc().available() and updater:
-            # The list of known streams.
-            streams = wc().streams_list ()
-            stream_ids = [s.identifier for s in streams]
-
-            # Register any unknown streams.  Remove known streams from
-            # STREAMS_IDS.
-            for key in self.getListOfFeeds():
-                title = self.getFeedTitle(key)
-                # XXX: We should also check whether the list of
-                # articles/objects in each feed/stream is up to date.
-                if key not in stream_ids:
-                    logger.debug(
-                        "Registering previously unknown channel: %s (%s)"
-                        % (key, title,))
-                    # Use a default refresh interval of 6 hours.
-                    wc().stream_register (key, title, 6 * 60 * 60)
-                else:
-                    # Make sure the human readable name is up to date.
-                    if wc()[key].human_readable_name != title:
-                        wc()[key].human_readable_name = title
-                    stream_ids.remove (key)
-                    
-
-            # Unregister any streams that are no longer subscribed to.
-            for id in stream_ids:
-                logger.debug("Unregistering %s" % (id,))
-                w.stream_unregister (id)
+        try:
+            updater = os.path.basename(sys.argv[0]) == 'update_feeds.py'
+            wc_init(config, self, True if updater else False)
+            if wc().available() and updater:
+                # The list of known streams.
+                streams = wc().streams_list ()
+                stream_ids = [s.identifier for s in streams]
+    
+                # Register any unknown streams.  Remove known streams from
+                # STREAMS_IDS.
+                for key in self.getListOfFeeds():
+                    title = self.getFeedTitle(key)
+                    # XXX: We should also check whether the list of
+                    # articles/objects in each feed/stream is up to date.
+                    if key not in stream_ids:
+                        logger.debug(
+                            "Registering previously unknown channel: %s (%s)"
+                            % (key, title,))
+                        wc().stream_register(
+                            key, title,
+                            self.config.getUpdateInterval() * 60 * 60)
+                    else:
+                        # Make sure the human readable name is up to date.
+                        if wc()[key].human_readable_name != title:
+                            wc()[key].human_readable_name = title
+                        stream_ids.remove (key)
+                        wc()[key].freshness \
+                            = self.config.getUpdateInterval() * 60 * 60
+
+                # Unregister any streams that are no longer subscribed to.
+                for id in stream_ids:
+                    logger.debug("Unregistering %s" % (id,))
+                    wc().stream_unregister(id)
+        except Exception:
+            logger.exception("Registering streams with Woodchuck")
 
     def importOldFormatFeeds(self):
         """This function loads feeds that are saved in an outdated format, and converts them to sqlite"""
@@ -1288,7 +1317,7 @@ class Listing(BaseObject):
         if wc().available ():
             try:
                 del wc()[key]
-            except KeyError:
+            except (KeyError, woodchuck.Error):
                 logger.debug("Removing unregistered feed %s failed" % (key,))
 
         rank = self.db.execute("SELECT rank FROM feeds WHERE id=?;", (key,) ).fetchone()[0]
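The corrected except clause needs the tuple form: in Python 2, `except KeyError, woodchuck.Error:` catches only KeyError and tries to bind the caught instance to the name woodchuck.Error. A minimal demonstration:

    def risky():
        raise KeyError('missing')

    try:
        risky()
    except (KeyError, ValueError):  # a tuple catches either type
        print("handled")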