logger = logging.getLogger(__name__)
def getId(string):
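+ # md5 operates on byte strings: encode unicode input as UTF-8
+ # (replacing anything unencodable) before hashing.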
+ if isinstance(string, unicode):
+ string = string.encode('utf8', 'replace')
+
return md5.new(string).hexdigest()
def download_callback(connection):
self.key = key
self.configdir = configdir
self.dir = "%s/%s.d" %(self.configdir, self.key)
- self.tls = threading.local ()
+ self.tls = threading.local()
if not isdir(self.dir):
mkdir(self.dir)
- if not isfile("%s/%s.db" %(self.dir, self.key)):
- self.db.execute("CREATE TABLE feed (id text, title text, contentLink text, date float, updated float, link text, read int);")
+ filename = "%s/%s.db" % (self.dir, self.key)
+ if not isfile(filename):
+ self.db.execute("CREATE TABLE feed (id text, title text, contentLink text, contentHash text, date float, updated float, link text, read int);")
self.db.execute("CREATE TABLE images (id text, imagePath text);")
self.db.commit()
+ else:
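+ # The database already exists; upgrade it in place by
+ # adding the contentHash column if it is missing.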
+ try:
+ self.db.execute("ALTER TABLE feed ADD COLUMN contentHash text")
+ self.db.commit()
+ except sqlite3.OperationalError, e:
+ if 'duplicate column name' not in str(e):
+ logger.exception("Adding contentHash column to %s failed", filename)
def addImage(self, configdir, key, baseurl, url, proxy=None, opener=None):
filename = configdir+key+".d/"+getId(url)
if(not(entry.has_key("id"))):
entry["id"] = None
content = self.extractContent(entry)
+ contentHash = getId(content)
object_size = len (content)
tmpEntry = {"title":entry["title"], "content":content,
"date":date, "link":entry["link"], "author":entry["author"], "id":entry["id"]}
id = self.generateUniqueId(tmpEntry)
current_version = self.db.execute(
- 'select date, ROWID from feed where id=?',
+ 'select date, ROWID, contentHash from feed where id=?',
(id,)).fetchone()
if (current_version is not None
- and current_version[0] == date):
+ # To detect updates, compare by content rather than
+ # by date:
+ #
+ # - If an update only changes the date and the
+ #   content stays the same, we don't want to
+ #   register an update.
+ #
+ # - If the content changes but the date does not,
+ #   we do want to register an update.
+ and current_version[2] == contentHash):
logger.debug("ALREADY DOWNLOADED %s (%s)"
% (entry["title"], entry["link"]))
- ## This article is already present in the feed listing. Update the "updated" time, so it doesn't expire
- self.db.execute("UPDATE feed SET updated=? WHERE id=?;",(currentTime,id))
- try:
- logger.debug("Updating already downloaded files for %s" %(id))
- filename = configdir+self.key+".d/"+id+".html"
- file = open(filename,"a")
- utime(filename, None)
- file.close()
- images = self.db.execute("SELECT imagePath FROM images where id=?;", (id, )).fetchall()
- for image in images:
- file = open(image[0],"a")
- utime(image[0], None)
- file.close()
- except:
- logger.debug("Error in refreshing images for %s" % (id))
+ ## This article is already present in the feed listing. Update the "updated" time, so it doesn't expire
+ self.db.execute("UPDATE feed SET updated=? WHERE id=?;",(currentTime,id))
+ try:
+ logger.debug("Updating already downloaded files for %s" %(id))
+ filename = configdir+self.key+".d/"+id+".html"
+ file = open(filename,"a")
+ utime(filename, None)
+ file.close()
+ images = self.db.execute("SELECT imagePath FROM images where id=?;", (id, )).fetchall()
+ for image in images:
+ file = open(image[0],"a")
+ utime(image[0], None)
+ file.close()
+ except Exception:
+ logger.debug("Error refreshing images for %s" % (id,))
self.db.commit()
continue
# The version was updated. Mark it as unread.
logger.debug("UPDATED: %s (%s)"
% (entry["title"], entry["link"]))
- self.setEntryUnread(id)
updated_objects += 1
else:
logger.debug("NEW: %s (%s)"
soup = BeautifulSoup(self.getArticle(tmpEntry)) #tmpEntry["content"])
images = soup('img')
baseurl = tmpEntry["link"]
- #if not id in ids:
if imageCache and len(images) > 0:
self.serial_execution_lock.release ()
have_serial_execution_lock = False
values = {'id': id,
'title': tmpEntry["title"],
'contentLink': tmpEntry["contentLink"],
+ 'contentHash': contentHash,
'date': tmpEntry["date"],
'updated': currentTime,
'link': tmpEntry["link"],
def getContentLink(self, id):
return self.db.execute("SELECT contentLink FROM feed WHERE id=?;", (id,) ).fetchone()[0]
+ def getContentHash(self, id):
+ return self.db.execute("SELECT contentHash FROM feed WHERE id=?;", (id,) ).fetchone()[0]
+
def getExternalLink(self, id):
return self.db.execute("SELECT link FROM feed WHERE id=?;", (id,) ).fetchone()[0]
return text
def getContent(self, id):
- contentLink = self.db.execute("SELECT contentLink FROM feed WHERE id=?;", (id,)).fetchone()[0]
+ """
+ Return the content of the article with the specified ID. If
+ the content is not available, returns None.
+ """
+ contentLink = self.getContentLink(id)
try:
- file = open(self.entries[id]["contentLink"])
- content = file.read()
- file.close()
- except:
- content = "Content unavailable"
+ with open(contentLink, 'r') as file:
+ content = file.read()
+ except Exception:
+ logger.exception("Failed get content for %s: reading %s failed",
+ id, contentLink)
+ content = None
return content
def extractDate(self, entry):
self.db.execute("INSERT INTO feed (id, title, contentLink, date, updated, link, read) VALUES (?, ?, ?, ?, ?, ?, ?);", values)
self.db.commit()
- def updateFeed(self, configdir, url, etag, modified, expiryTime=24, proxy=None, imageCache=False):
+ # Feed.UpdateFeed calls this function.
+ def _updateFeed(self, configdir, url, etag, modified, expiryTime=24, proxy=None, imageCache=False, priority=0, postFeedUpdateFunc=None, *postFeedUpdateFuncArgs):
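+ # postFeedUpdateFunc, if given, is called once the update
+ # finishes with the feed key and update time (plus three
+ # placeholder arguments), followed by *postFeedUpdateFuncArgs;
+ # see the call at the end of this function.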
currentTime = 0
rows = self.db.execute("SELECT id, link FROM feed WHERE updated=0;")
for row in rows:
- currentTime = time.time()
- id = row[0]
- link = row[1]
- f = urllib2.urlopen(link)
- #entry["content"] = f.read()
- html = f.read()
- f.close()
- soup = BeautifulSoup(html)
- images = soup('img')
- baseurl = link
- for img in images:
- filename = self.addImage(configdir, self.key, baseurl, img['src'], proxy=proxy)
- img['src']=filename
- self.db.execute("INSERT INTO images (id, imagePath) VALUES (?, ?);", (id, filename) )
+ id = row[0]
+ link = row[1]
+ try:
+ currentTime = time.time()
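+ # Fetch the article, cache its images locally, and
+ # save the rendered HTML alongside the database.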
+ f = urllib2.urlopen(link)
+ #entry["content"] = f.read()
+ html = f.read()
+ f.close()
+ soup = BeautifulSoup(html)
+ images = soup('img')
+ baseurl = link
+ for img in images:
+ filename = self.addImage(configdir, self.key, baseurl, img['src'], proxy=proxy)
+ img['src']=filename
+ self.db.execute("INSERT INTO images (id, imagePath) VALUES (?, ?);", (id, filename) )
+ self.db.commit()
+ contentLink = configdir+self.key+".d/"+id+".html"
+ file = open(contentLink, "w")
+ file.write(soup.prettify())
+ file.close()
+
+ self.db.execute("UPDATE feed SET read=0, contentLink=?, updated=? WHERE id=?;", (contentLink, time.time(), id) )
self.db.commit()
- contentLink = configdir+self.key+".d/"+id+".html"
- file = open(contentLink, "w")
- file.write(soup.prettify())
- file.close()
-
- self.db.execute("UPDATE feed SET read=0, contentLink=?, updated=? WHERE id=?;", (contentLink, time.time(), id) )
- self.db.commit()
- return (currentTime, None, None)
+ except Exception:
+ logger.error("Error updating archived article: %s %s"
+ % (link, traceback.format_exc(),))
+
+ if postFeedUpdateFunc is not None:
+ postFeedUpdateFunc (self.key, currentTime, None, None, None,
+ *postFeedUpdateFuncArgs)
def purgeReadArticles(self):
rows = self.db.execute("SELECT id FROM feed WHERE read=1;")
# state.
try:
updater = os.path.basename(sys.argv[0]) == 'update_feeds.py'
- wc_init (self, True if updater else False)
+ wc_init(config, self, updater)
if wc().available() and updater:
# The list of known streams.
streams = wc().streams_list ()
logger.debug(
"Registering previously unknown channel: %s (%s)"
% (key, title,))
- # Use a default refresh interval of 6 hours.
- wc().stream_register (key, title, 6 * 60 * 60)
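+ # Register with the user's configured update interval
+ # rather than a hard-coded six hours.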
+ wc().stream_register(
+ key, title,
+ self.config.getUpdateInterval() * 60 * 60)
else:
# Make sure the human readable name is up to date.
if wc()[key].human_readable_name != title:
wc()[key].human_readable_name = title
stream_ids.remove (key)
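+ # Keep the stream's freshness in sync with the
+ # configured update interval.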
+ wc()[key].freshness \
+ = self.config.getUpdateInterval() * 60 * 60
# Unregister any streams that are no longer subscribed to.
for id in stream_ids:
logger.debug("Unregistering %s" % (id,))
- w.stream_unregister (id)
+ wc().stream_unregister (id)
except Exception:
logger.exception("Registering streams with Woodchuck")
def getCategoryTitle(self, id):
return self.lookup('categories', 'title', id)
-
+
def getCategoryUnread(self, id):
count = 0
for key in self.getListOfFeeds(category=id):
human_readable_name=title,
freshness=6*60*60)
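+ # The set of feeds changed; invalidate the cached list.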
+ self.cache_invalidate('feeds')
return True
else:
return False
if wc().available ():
try:
del wc()[key]
- except KeyError:
+ except (KeyError, woodchuck.Error):
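+ # The feed may never have been registered, or Woodchuck
+ # may fail while removing it; neither is fatal.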
logger.debug("Removing unregistered feed %s failed" % (key,))
rank = self.db.execute("SELECT rank FROM feeds WHERE id=?;", (key,) ).fetchone()[0]