Fix img detection.

[feedingit] / src / rss_sqlite.py
diff --git a/src/rss_sqlite.py b/src/rss_sqlite.py

index 5735de2..129f500 100644 (file)
--- a/src/rss_sqlite.py
+++ b/src/rss_sqlite.py
@@ -73,6 +73,27 @@ def downloader(progress_handler=None, proxy=None):
  
      return urllib2.build_opener(*openers)
  
+def transfer_stats(sent, received, **kwargs):
+    """
+    Snapshot the transfer counters and return a continuation.
+
+    sent and received are the byte counts so far; extra keyword
+    arguments are accepted and ignored.  The continuation takes the
+    same arguments and returns a tuple: bytes sent and bytes received
+    since the snapshot (current minus starting counts), and the
+    seconds elapsed since transfer_stats was called.
+    """
+    start_time = time.time()
+    start_sent = sent
+    start_received = received
+
+    def e(sent, received, **kwargs):
+        return (sent - start_sent,
+                received - start_received,
+                time.time() - start_time)
+
+    return e
+
  # If not None, a subprocess.Popen object corresponding to a
  # update_feeds.py process.
  update_feed_process = None
@@ -118,7 +139,9 @@ class BaseObject(object):
              cache = self.cache[table]
  
              if time.time() - cache[None] > 60:
-                self.cache[table].clear()
+                # logger.debug("%s: Cache too old: clearing" % (table,))
+                del self.cache[table]
+                cache = None
          except KeyError:
              cache = None
  
@@ -127,6 +150,8 @@ class BaseObject(object):
              # The cache is empty or the caller wants a column that we
              # don't cache.
              if (table, column) in self.cached_columns:
+                # logger.debug("%s: Rebuilding cache" % (table,))
+
                  do_cache = True
  
                  self.cache[table] = cache = {}
@@ -172,10 +197,13 @@ class BaseObject(object):
  
          try:
              if id is not None:
-                return cache[column][id]
+                value = cache[column][id]
+                # logger.debug("%s.%s:%s -> %s" % (table, column, id, value))
+                return value
              else:
                  return cache[column].values()
          except KeyError:
+            # logger.debug("%s.%s:%s -> Not found" % (table, column, id))
              return None
  
  class Feed(BaseObject):
@@ -236,7 +264,7 @@ class Feed(BaseObject):
                  except OSError:
                      pass
  
-                raise exception
+                return None
          else:
              #open(filename,"a").close()  # "Touch" the file
              file = open(filename,"a")
@@ -297,10 +325,12 @@ class Feed(BaseObject):
                  time.sleep(1)
  
      def _updateFeed(self, configdir, url, etag, modified, expiryTime=24, proxy=None, imageCache=False, postFeedUpdateFunc=None, *postFeedUpdateFuncArgs):
+        logger.debug("Updating %s" % url)
+
          success = False
          have_serial_execution_lock = False
          try:
-            download_start = time.time ()
+            update_start = time.time ()
  
              progress_handler = HTTPProgressHandler(download_callback)
  
@@ -309,9 +339,11 @@ class Feed(BaseObject):
                  openers.append (proxy)
              kwargs = {'handlers':openers}
              
+            feed_transfer_stats = transfer_stats(0, 0)
+
              tmp=feedparser.parse(url, etag=etag, modified=modified, **kwargs)
-            download_duration = time.time () - download_start
-    
+            download_duration = time.time () - update_start
+
              opener = downloader(progress_handler, proxy)
  
              if JobManager().do_quit:
@@ -323,7 +355,10 @@ class Feed(BaseObject):
              expiry = float(expiryTime) * 3600.
      
              currentTime = 0
-    
+            
+            updated_objects = 0
+            new_objects = 0
+
              def wc_success():
                  try:
                      wc().stream_register (self.key, "", 6 * 60 * 60)
@@ -335,10 +370,11 @@ class Feed(BaseObject):
                                     |woodchuck.Indicator.StreamWide),
                          transferred_down=progress_handler.stats['received'],
                          transferred_up=progress_handler.stats['sent'],
-                        transfer_time=download_start,
+                        transfer_time=update_start,
                          transfer_duration=download_duration,
-                        new_objects=len (tmp.entries),
-                        objects_inline=len (tmp.entries))
+                        new_objects=new_objects,
+                        updated_objects=updated_objects,
+                        objects_inline=new_objects + updated_objects)
                  except KeyError:
                      logger.warn(
                          "Failed to register update of %s with woodchuck!"
@@ -356,13 +392,13 @@ class Feed(BaseObject):
                  logger.debug("%s: No changes to feed." % (self.key,))
                  mainthread.execute(wc_success, async=True)
                  success = True
-            elif len(tmp["entries"])==0 and not tmp.version:
+            elif len(tmp["entries"])==0 and not tmp.get('version', None):
                  # An error occured fetching or parsing the feed.  (Version
                  # will be either None if e.g. the connection timed our or
                  # '' if the data is not a proper feed)
                  logger.error(
                      "Error fetching %s: version is: %s: error: %s"
-                    % (url, str (tmp.version),
+                    % (url, str (tmp.get('version', 'unset')),
                         str (tmp.get ('bozo_exception', 'Unknown error'))))
                  logger.debug(tmp)
                  def register_stream_update_failed(http_status):
@@ -420,19 +456,18 @@ class Feed(BaseObject):
                 #reversedEntries = self.getEntries()
                 #reversedEntries.reverse()
      
-               ids = self.getIds()
-    
                 tmp["entries"].reverse()
                 for entry in tmp["entries"]:
                     # Yield so as to make the main thread a bit more
                     # responsive.
                     time.sleep(0)
      
+                   entry_transfer_stats = transfer_stats(
+                       *feed_transfer_stats(**progress_handler.stats)[0:2])
+
                     if JobManager().do_quit:
                         raise KeyboardInterrupt
  
-                   received_base = progress_handler.stats['received']
-                   sent_base = progress_handler.stats['sent']
                     object_size = 0
  
                     date = self.extractDate(entry)
@@ -452,20 +487,40 @@ class Feed(BaseObject):
                         entry["id"] = None
                     content = self.extractContent(entry)
                     object_size = len (content)
-                   received_base -= len (content)
                     tmpEntry = {"title":entry["title"], "content":content,
                                  "date":date, "link":entry["link"], "author":entry["author"], "id":entry["id"]}
                     id = self.generateUniqueId(tmpEntry)
                     
+                   current_version = self.db.execute(
+                       'select date, ROWID from feed where id=?',
+                       (id,)).fetchone()
+                   if (current_version is not None
+                       and current_version[0] == date):
+                       logger.debug("ALREADY DOWNLOADED %s (%s)"
+                                    % (entry["title"], entry["link"]))
+                       continue                       
+
+                   if current_version is not None:
+                       # The version was updated.  Mark it as unread.
+                       logger.debug("UPDATED: %s (%s)"
+                                    % (entry["title"], entry["link"]))
+                       updated_objects += 1
+                   else:
+                       logger.debug("NEW: %s (%s)"
+                                    % (entry["title"], entry["link"]))
+                       new_objects += 1
+
                     #articleTime = time.mktime(self.entries[id]["dateTuple"])
                     soup = BeautifulSoup(self.getArticle(tmpEntry)) #tmpEntry["content"])
                     images = soup('img')
                     baseurl = tmpEntry["link"]
-                   #if not id in ids:
                     if imageCache and len(images) > 0:
                         self.serial_execution_lock.release ()
                         have_serial_execution_lock = False
                         for img in images:
+                           if not img.has_key('src'):
+                               continue
+
                             filename = self.addImage(
                                 configdir, self.key, baseurl, img['src'],
                                 opener=opener)
@@ -488,29 +543,27 @@ class Feed(BaseObject):
                     file = open(tmpEntry["contentLink"], "w")
                     file.write(soup.prettify())
                     file.close()
-                   if id in ids:
-                       self.db.execute("UPDATE feed SET updated=? WHERE id=?;", (currentTime, id) )
-                       self.db.commit()
-                   else:
-                       values = (id, tmpEntry["title"], tmpEntry["contentLink"], tmpEntry["date"], currentTime, tmpEntry["link"], 0)
-                       self.db.execute("INSERT INTO feed (id, title, contentLink, date, updated, link, read) VALUES (?, ?, ?, ?, ?, ?, ?);", values)
-                       self.db.commit()
-#                   else:
-#                       try:
-#                           self.db.execute("UPDATE feed SET updated=? WHERE id=?;", (currentTime, id) )
-#                           self.db.commit()
-#                           filename = configdir+self.key+".d/"+id+".html"
-#                           file = open(filename,"a")
-#                           utime(filename, None)
-#                           file.close()
-#                           images = self.db.execute("SELECT imagePath FROM images where id=?;", (id, )).fetchall()
-#                           for image in images:
-#                                file = open(image[0],"a")
-#                                utime(image[0], None)
-#                                file.close()
-#                       except:
-#                           pass
-    
+
+                   values = {'id': id,
+                             'title': tmpEntry["title"],
+                             'contentLink': tmpEntry["contentLink"],
+                             'date': tmpEntry["date"],
+                             'updated': currentTime,
+                             'link': tmpEntry["link"],
+                             'read': 0}
+
+                   if current_version is not None:
+                       # This is an update.  Ensure that the existing
+                       # entry is replaced.
+                       values['ROWID'] = current_version[1]
+
+                   cols, values = zip(*values.items())
+                   self.db.execute(
+                       "INSERT OR REPLACE INTO feed (%s) VALUES (%s);"
+                       % (','.join(cols), ','.join(('?',) * len(values))),
+                       values)
+                   self.db.commit()
+
                     # Register the object with Woodchuck and mark it as
                     # downloaded.
                     def register_object_transferred(
@@ -544,9 +597,14 @@ class Feed(BaseObject):
                         else:
                             publication_time = None
  
-                       sent = progress_handler.stats['sent'] - sent_base
-                       received = (progress_handler.stats['received']
-                                   - received_base)
+                       sent, received, _ \
+                           = entry_transfer_stats(**progress_handler.stats)
+                       # sent and received are for objects (in
+                       # particular, images) associated with this
+                       # item.  We also want to attribute the data
+                       # transferred for the item's content.  This is
+                       # a good first approximation.
+                       received += len(content)
  
                         mainthread.execute(
                             register_object_transferred(
@@ -558,12 +616,11 @@ class Feed(BaseObject):
                             async=True)
                 self.db.commit()
  
+               sent, received, _ \
+                   = feed_transfer_stats(**progress_handler.stats)
                 logger.debug (
                     "%s: Update successful: transferred: %d/%d; objects: %d)"
-                   % (self.key,
-                      progress_handler.stats['sent'],
-                      progress_handler.stats['received'],
-                      len (tmp.entries)))
+                   % (url, sent, received, len (tmp.entries)))
                 mainthread.execute (wc_success, async=True)
                 success = True
  
@@ -909,36 +966,39 @@ class Listing(BaseObject):
  
          # Check that Woodchuck's state is up to date with respect our
          # state.
-        updater = os.path.basename(sys.argv[0]) == 'update_feeds.py'
-        wc_init (self, True if updater else False)
-        if wc().available() and updater:
-            # The list of known streams.
-            streams = wc().streams_list ()
-            stream_ids = [s.identifier for s in streams]
-
-            # Register any unknown streams.  Remove known streams from
-            # STREAMS_IDS.
-            for key in self.getListOfFeeds():
-                title = self.getFeedTitle(key)
-                # XXX: We should also check whether the list of
-                # articles/objects in each feed/stream is up to date.
-                if key not in stream_ids:
-                    logger.debug(
-                        "Registering previously unknown channel: %s (%s)"
-                        % (key, title,))
-                    # Use a default refresh interval of 6 hours.
-                    wc().stream_register (key, title, 6 * 60 * 60)
-                else:
-                    # Make sure the human readable name is up to date.
-                    if wc()[key].human_readable_name != title:
-                        wc()[key].human_readable_name = title
-                    stream_ids.remove (key)
-                    
-
-            # Unregister any streams that are no longer subscribed to.
-            for id in stream_ids:
-                logger.debug("Unregistering %s" % (id,))
-                w.stream_unregister (id)
+        try:
+            updater = os.path.basename(sys.argv[0]) == 'update_feeds.py'
+            wc_init (self, True if updater else False)
+            if wc().available() and updater:
+                # The list of known streams.
+                streams = wc().streams_list ()
+                stream_ids = [s.identifier for s in streams]
+    
+                # Register any unknown streams.  Remove known streams from
+                # STREAMS_IDS.
+                for key in self.getListOfFeeds():
+                    title = self.getFeedTitle(key)
+                    # XXX: We should also check whether the list of
+                    # articles/objects in each feed/stream is up to date.
+                    if key not in stream_ids:
+                        logger.debug(
+                            "Registering previously unknown channel: %s (%s)"
+                            % (key, title,))
+                        # Use a default refresh interval of 6 hours.
+                        wc().stream_register (key, title, 6 * 60 * 60)
+                    else:
+                        # Make sure the human readable name is up to date.
+                        if wc()[key].human_readable_name != title:
+                            wc()[key].human_readable_name = title
+                        stream_ids.remove (key)
+                        
+    
+                # Unregister any streams that are no longer subscribed to.
+                for id in stream_ids:
+                    logger.debug("Unregistering %s" % (id,))
+                    w.stream_unregister (id)
+        except Exception:
+            logger.exception("Registering streams with Woodchuck")
  
      def importOldFormatFeeds(self):
          """This function loads feeds that are saved in an outdated format, and converts them to sqlite"""
@@ -1047,8 +1107,11 @@ class Listing(BaseObject):
          in_progress = stats['jobs-in-progress']
          queued = stats['jobs-queued']
  
-        percent = (100 * ((completed + in_progress / 2.))
-                   / (completed + in_progress + queued))
+        try:
+            percent = (100 * ((completed + in_progress / 2.))
+                       / (completed + in_progress + queued))
+        except ZeroDivisionError:
+            percent = 100
  
          update_server_object().UpdateProgress(
              percent, completed, in_progress, queued, 0, 0, 0, key)