logger = logging.getLogger(__name__)
def getId(string):
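+ # md5 operates on byte strings: encode unicode input as UTF-8
+ # (replacing anything unencodable) before hashing.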
+ if isinstance(string, unicode):
+ string = string.encode('utf8', 'replace')
+
return md5.new(string).hexdigest()
def download_callback(connection):
self.key = key
self.configdir = configdir
self.dir = "%s/%s.d" %(self.configdir, self.key)
- self.tls = threading.local ()
+ self.tls = threading.local()
if not isdir(self.dir):
mkdir(self.dir)
- if not isfile("%s/%s.db" %(self.dir, self.key)):
- self.db.execute("CREATE TABLE feed (id text, title text, contentLink text, date float, updated float, link text, read int);")
+ filename = "%s/%s.db" % (self.dir, self.key)
+ if not isfile(filename):
+ self.db.execute("CREATE TABLE feed (id text, title text, contentLink text, contentHash text, date float, updated float, link text, read int);")
self.db.execute("CREATE TABLE images (id text, imagePath text);")
self.db.commit()
+ else:
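+ # The database already exists; upgrade it in place by
+ # adding the contentHash column if it is missing.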
+ try:
+ self.db.execute("ALTER TABLE feed ADD COLUMN contentHash text")
+ self.db.commit()
+ except sqlite3.OperationalError, e:
+ if 'duplicate column name' not in str(e):
+ logger.exception("Adding contentHash column to %s failed", filename)
def addImage(self, configdir, key, baseurl, url, proxy=None, opener=None):
filename = configdir+key+".d/"+getId(url)
if(not(entry.has_key("id"))):
entry["id"] = None
content = self.extractContent(entry)
+ contentHash = getId(content)
object_size = len (content)
tmpEntry = {"title":entry["title"], "content":content,
"date":date, "link":entry["link"], "author":entry["author"], "id":entry["id"]}
id = self.generateUniqueId(tmpEntry)
current_version = self.db.execute(
- 'select date, ROWID from feed where id=?',
+ 'select date, ROWID, contentHash from feed where id=?',
(id,)).fetchone()
if (current_version is not None
- and current_version[0] == date):
+ # To detect updates, compare by content rather than
+ # by date:
+ #
+ # - If an update only changes the date and the
+ #   content stays the same, we don't want to
+ #   register an update.
+ #
+ # - If the content changes but the date does not,
+ #   we do want to register an update.
+ and current_version[2] == contentHash):
logger.debug("ALREADY DOWNLOADED %s (%s)"
% (entry["title"], entry["link"]))
- ## This article is already present in the feed listing. Update the "updated" time, so it doesn't expire
- self.db.execute("UPDATE feed SET updated=? WHERE id=?;",(currentTime,id))
- try:
- logger.debug("Updating already downloaded files for %s" %(id))
- filename = configdir+self.key+".d/"+id+".html"
- file = open(filename,"a")
- utime(filename, None)
- file.close()
- images = self.db.execute("SELECT imagePath FROM images where id=?;", (id, )).fetchall()
- for image in images:
- file = open(image[0],"a")
- utime(image[0], None)
- file.close()
- except:
- logger.debug("Error in refreshing images for %s" % (id))
+ ## This article is already present in the feed listing. Update the "updated" time, so it doesn't expire
+ self.db.execute("UPDATE feed SET updated=? WHERE id=?;",(currentTime,id))
+ try:
+ logger.debug("Updating already downloaded files for %s" %(id))
+ filename = configdir+self.key+".d/"+id+".html"
+ file = open(filename,"a")
+ utime(filename, None)
+ file.close()
+ images = self.db.execute("SELECT imagePath FROM images where id=?;", (id, )).fetchall()
+ for image in images:
+ file = open(image[0],"a")
+ utime(image[0], None)
+ file.close()
+ except Exception:
+ logger.debug("Error refreshing images for %s" % (id,))
self.db.commit()
continue
# The version was updated. Mark it as unread.
logger.debug("UPDATED: %s (%s)"
% (entry["title"], entry["link"]))
- self.setEntryUnread(id)
updated_objects += 1
else:
logger.debug("NEW: %s (%s)"
soup = BeautifulSoup(self.getArticle(tmpEntry)) #tmpEntry["content"])
images = soup('img')
baseurl = tmpEntry["link"]
- #if not id in ids:
if imageCache and len(images) > 0:
self.serial_execution_lock.release ()
have_serial_execution_lock = False
values = {'id': id,
'title': tmpEntry["title"],
'contentLink': tmpEntry["contentLink"],
+ 'contentHash': contentHash,
'date': tmpEntry["date"],
'updated': currentTime,
'link': tmpEntry["link"],
def getContentLink(self, id):
return self.db.execute("SELECT contentLink FROM feed WHERE id=?;", (id,) ).fetchone()[0]
+ def getContentHash(self, id):
+ return self.db.execute("SELECT contentHash FROM feed WHERE id=?;", (id,) ).fetchone()[0]
+
def getExternalLink(self, id):
return self.db.execute("SELECT link FROM feed WHERE id=?;", (id,) ).fetchone()[0]
return text
def getContent(self, id):
- contentLink = self.db.execute("SELECT contentLink FROM feed WHERE id=?;", (id,)).fetchone()[0]
+ """
+ Return the content of the article with the specified ID. If
+ the content is not available, returns None.
+ """
+ contentLink = self.getContentLink(id)
try:
- file = open(self.entries[id]["contentLink"])
- content = file.read()
- file.close()
- except:
- content = "Content unavailable"
+ with open(contentLink, 'r') as file:
+ content = file.read()
+ except Exception:
+ logger.exception("Failed get content for %s: reading %s failed",
+ id, contentLink)
+ content = None
return content
def extractDate(self, entry):
self.db.execute("INSERT INTO feed (id, title, contentLink, date, updated, link, read) VALUES (?, ?, ?, ?, ?, ?, ?);", values)
self.db.commit()
- def updateFeed(self, configdir, url, etag, modified, expiryTime=24, proxy=None, imageCache=False):
+ # Feed.UpdateFeed calls this function.
+ def _updateFeed(self, configdir, url, etag, modified, expiryTime=24, proxy=None, imageCache=False, priority=0, postFeedUpdateFunc=None, *postFeedUpdateFuncArgs):
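+ # postFeedUpdateFunc, if given, is called once the update
+ # finishes with the feed key and update time (plus three
+ # placeholder arguments), followed by *postFeedUpdateFuncArgs;
+ # see the call at the end of this function.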
currentTime = 0
rows = self.db.execute("SELECT id, link FROM feed WHERE updated=0;")
for row in rows:
- currentTime = time.time()
- id = row[0]
- link = row[1]
- f = urllib2.urlopen(link)
- #entry["content"] = f.read()
- html = f.read()
- f.close()
- soup = BeautifulSoup(html)
- images = soup('img')
- baseurl = link
- for img in images:
- filename = self.addImage(configdir, self.key, baseurl, img['src'], proxy=proxy)
- img['src']=filename
- self.db.execute("INSERT INTO images (id, imagePath) VALUES (?, ?);", (id, filename) )
+ id = row[0]
+ link = row[1]
+ try:
+ currentTime = time.time()
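+ # Fetch the article, cache its images locally, and
+ # save the rendered HTML alongside the database.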
+ f = urllib2.urlopen(link)
+ #entry["content"] = f.read()
+ html = f.read()
+ f.close()
+ soup = BeautifulSoup(html)
+ images = soup('img')
+ baseurl = link
+ for img in images:
+ filename = self.addImage(configdir, self.key, baseurl, img['src'], proxy=proxy)
+ img['src']=filename
+ self.db.execute("INSERT INTO images (id, imagePath) VALUES (?, ?);", (id, filename) )
+ self.db.commit()
+ contentLink = configdir+self.key+".d/"+id+".html"
+ file = open(contentLink, "w")
+ file.write(soup.prettify())
+ file.close()
+
+ self.db.execute("UPDATE feed SET read=0, contentLink=?, updated=? WHERE id=?;", (contentLink, time.time(), id) )
self.db.commit()
- contentLink = configdir+self.key+".d/"+id+".html"
- file = open(contentLink, "w")
- file.write(soup.prettify())
- file.close()
-
- self.db.execute("UPDATE feed SET read=0, contentLink=?, updated=? WHERE id=?;", (contentLink, time.time(), id) )
- self.db.commit()
- return (currentTime, None, None)
+ except Exception:
+ logger.error("Error updating archived article: %s %s"
+ % (link, traceback.format_exc(),))
+
+ if postFeedUpdateFunc is not None:
+ postFeedUpdateFunc (self.key, currentTime, None, None, None,
+ *postFeedUpdateFuncArgs)
def purgeReadArticles(self):
rows = self.db.execute("SELECT id FROM feed WHERE read=1;")
# state.
try:
updater = os.path.basename(sys.argv[0]) == 'update_feeds.py'
- wc_init (self, True if updater else False)
+ wc_init(config, self, updater)
if wc().available() and updater:
# The list of known streams.
streams = wc().streams_list ()
logger.debug(
"Registering previously unknown channel: %s (%s)"
% (key, title,))
- # Use a default refresh interval of 6 hours.
- wc().stream_register (key, title, 6 * 60 * 60)
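+ # Register with the user's configured update interval
+ # rather than a hard-coded six hours.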
+ wc().stream_register(
+ key, title,
+ self.config.getUpdateInterval() * 60 * 60)
else:
# Make sure the human readable name is up to date.
if wc()[key].human_readable_name != title:
wc()[key].human_readable_name = title
stream_ids.remove (key)
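+ # Keep the stream's freshness in sync with the
+ # configured update interval.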
+ wc()[key].freshness \
+ = self.config.getUpdateInterval() * 60 * 60
# Unregister any streams that are no longer subscribed to.
for id in stream_ids:
logger.debug("Unregistering %s" % (id,))
- w.stream_unregister (id)
+ wc().stream_unregister (id)
except Exception:
logger.exception("Registering streams with Woodchuck")
def getCategoryTitle(self, id):
return self.lookup('categories', 'title', id)
-
+
def getCategoryUnread(self, id):
count = 0
for key in self.getListOfFeeds(category=id):
human_readable_name=title,
freshness=6*60*60)
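+ # The set of feeds changed; invalidate the cached list.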
+ self.cache_invalidate('feeds')
return True
else:
return False
if wc().available ():
try:
del wc()[key]
- except KeyError:
+ except (KeyError, woodchuck.Error):
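+ # The feed may never have been registered, or Woodchuck
+ # may fail while removing it; neither is fatal.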
logger.debug("Removing unregistered feed %s failed" % (key,))
rank = self.db.execute("SELECT rank FROM feeds WHERE id=?;", (key,) ).fetchone()[0]