0.5.1-0: added image caching for webkit Archived Articles
[feedingit] / src / rss.py
#!/usr/bin/env python2.5


# Copyright (c) 2007-2008 INdT.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU Lesser General Public License for more details.
#
#  You should have received a copy of the GNU Lesser General Public License
#  along with this program.  If not, see <http://www.gnu.org/licenses/>.
#

# ============================================================================
# Name        : rss.py
# Author      : Yves Marcoz
# Version     : 0.5.0
# Description : Simple RSS Reader
# ============================================================================

from os.path import isfile
from os.path import isdir
from shutil import rmtree
from os import mkdir
import os                       # needed for os.remove() in ImageHandler and Feed.removeEntry
import pickle
import md5
import feedparser
import time
import urllib2
from BeautifulSoup import BeautifulSoup
from urlparse import urlparse

#CONFIGDIR="/home/user/.feedingit/"

def getId(string):
    return md5.new(string).hexdigest()

# Entry = {"title":XXX, "content":XXX, "date":XXX, "link":XXX, "images":[] }

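# ImageHandler keeps a reference-counted, on-disk cache of article images so
# that archived pages can be rendered offline in the WebKit view.  Each image
# is stored as <configdir>/<feedId>.d/<md5 of its URL> and handed back as a
# file:// URL, roughly (illustrative values, not actual output):
#   handler = ImageHandler("/home/user/.feedingit/")
#   local = handler.addImage(feedId, "http://example.com", "/images/logo.png")
#   # local == "file:///home/user/.feedingit/<feedId>.d/<md5>"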
class ImageHandler:
    def __init__(self, configdir):
        self.configdir = configdir
        self.images = {}

    def addImage(self, key, baseurl, url):
        filename = self.configdir+key+".d/"+getId(url)
        if not isfile(filename):
            try:
                if url.startswith("http"):
                    f = urllib2.urlopen(url)
                else:
                    f = urllib2.urlopen(baseurl+"/"+url)
                outf = open(filename, "wb")
                outf.write(f.read())
                f.close()
                outf.close()
            except:
                print "Could not download " + url
        if url in self.images:
            self.images[url] += 1
        else:
            self.images[url] = 1
        return "file://" + filename

    def removeImage(self, key, url):
        filename = self.configdir+key+".d/"+getId(url)
        self.images[url] -= 1
        if self.images[url] == 0:
            os.remove(filename)
            del self.images[url]

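# A Feed holds the parsed entries for one subscription.  The whole object is
# pickled to <configdir>/<uniqueId>.d/feed by saveFeed(), while the read/unread
# flags are kept in a separate pickle so they can be reloaded cheaply.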
class Feed:
    def __init__(self, uniqueId, name, url, imageHandler):
        self.titles = []
        self.entries = {}
        self.ids = []
        self.readItems = {}
        self.name = name
        self.url = url
        self.countUnread = 0
        self.updateTime = "Never"
        self.uniqueId = uniqueId
        self.imageHandler = imageHandler

    def editFeed(self, url):
        self.url = url

    def saveFeed(self, configdir):
        if not isdir(configdir+self.uniqueId+".d"):
            mkdir(configdir+self.uniqueId+".d")
        file = open(configdir+self.uniqueId+".d/feed", "w")
        pickle.dump(self, file)
        file.close()
        self.saveUnread(configdir)

    def saveUnread(self, configdir):
        if not isdir(configdir+self.uniqueId+".d"):
            mkdir(configdir+self.uniqueId+".d")
        file = open(configdir+self.uniqueId+".d/unread", "w")
        pickle.dump(self.readItems, file)
        file.close()

    def reloadUnread(self, configdir):
        try:
            file = open(configdir+self.uniqueId+".d/unread", "r")
            self.readItems = pickle.load(file)
            file.close()
            self.countUnread = 0
            for id in self.getIds():
                if self.readItems[id]==False:
                    self.countUnread = self.countUnread + 1
        except:
            pass
        return self.countUnread

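    # updateFeed() re-parses the feed and then merges the previous entries back
    # in: anything younger than expiryTime (hours) is kept, and unread entries
    # are given twice that long before they are dropped.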
    def updateFeed(self, configdir, expiryTime=24):
        # Expiry time is in hours
        tmp=feedparser.parse(self.url)
        # Check if the parse was successful (number of entries > 0, else do nothing)
        if len(tmp["entries"])>0:
            #reversedEntries = self.getEntries()
            #reversedEntries.reverse()
            tmpEntries = {}
            tmpIds = []
            for entry in tmp["entries"]:
                (dateTuple, date) = self.extractDate(entry)
                tmpEntry = {"title":entry["title"], "content":self.extractContent(entry),
                            "date":date, "dateTuple":dateTuple, "link":entry["link"], "images":[] }
                id = self.generateUniqueId(tmpEntry)
                tmpEntries[id] = tmpEntry
                tmpIds.append(id)
            for entryId in self.getIds():
                currentTime = time.time()
                expiry = float(expiryTime) * 3600.
                articleTime = time.mktime(self.entries[entryId]["dateTuple"])
                if currentTime - articleTime < expiry:
                    if entryId not in tmpIds:
                        tmpEntries[entryId] = self.entries[entryId]
                        tmpIds.append(entryId)
                else:
                    if (not self.isEntryRead(entryId)) and (currentTime - articleTime < 2*expiry):
                        tmpEntries[entryId] = self.entries[entryId]
                        tmpIds.append(entryId)

            self.entries = tmpEntries
            self.ids = tmpIds
            self.countUnread = 0
            # Initialize the new articles to unread
            tmpReadItems = self.readItems
            self.readItems = {}
            for id in self.getIds():
                if not tmpReadItems.has_key(id):
                    self.readItems[id] = False
                else:
                    self.readItems[id] = tmpReadItems[id]
                if self.readItems[id]==False:
                    self.countUnread = self.countUnread + 1
            del tmp
            self.updateTime = time.asctime()
            self.saveFeed(configdir)

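    # Content extraction prefers the full 'content' element when it is longer
    # than the summary, and falls back to 'description' when neither is set.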
    def extractContent(self, entry):
        content = ""
        if entry.has_key('summary'):
            content = entry.get('summary', '')
        if entry.has_key('content'):
            if len(entry.content[0].value) > len(content):
                content = entry.content[0].value
        if content == "":
            content = entry.get('description', '')
        return content

    def extractDate(self, entry):
        if entry.has_key("updated_parsed"):
            date1 = entry["updated_parsed"]
            date = time.strftime("%a, %d %b %Y %H:%M:%S", entry["updated_parsed"])
        elif entry.has_key("published_parsed"):
            date1 = entry["published_parsed"]
            date = time.strftime("%a, %d %b %Y %H:%M:%S", entry["published_parsed"])
        else:
            date1 = ""
            date = ""
        #print date1, date
        return (date1, date)

    def setEntryRead(self, id):
        if self.readItems[id]==False:
            self.countUnread = self.countUnread - 1
            self.readItems[id] = True

    def setEntryUnread(self, id):
        if self.readItems[id]==True:
            self.countUnread = self.countUnread + 1
            self.readItems[id] = False

    def isEntryRead(self, id):
        return self.readItems[id]

    def getTitle(self, id):
        return self.entries[id]["title"]

    def getLink(self, id):
        if self.entries[id].has_key("contentLink"):
            return self.entries[id]["contentLink"]
        return self.entries[id]["link"]

    def getDate(self, id):
        return self.entries[id]["date"]

    def getDateTuple(self, id):
        return self.entries[id]["dateTuple"]

    def getUniqueId(self, index):
        return self.ids[index]

    def generateUniqueId(self, entry):
        return getId(entry["date"] + entry["title"])

    def getUpdateTime(self):
        return self.updateTime

    def getEntries(self):
        return self.entries

    def getIds(self):
        return self.ids

    def getNextId(self, id):
        return self.ids[(self.ids.index(id)+1) % self.getNumberOfEntries()]

    def getPreviousId(self, id):
        return self.ids[(self.ids.index(id)-1) % self.getNumberOfEntries()]

    def getNumberOfUnreadItems(self):
        return self.countUnread

    def getNumberOfEntries(self):
        return len(self.ids)

    def getItem(self, id):
        try:
            return self.entries[id]
        except:
            return []

    def getContent(self, id):
        if self.entries[id].has_key("contentLink"):
            file = open(self.entries[id]["contentLink"])
            content = file.read()
            file.close()
            return content
        return self.entries[id]["content"]

    def removeEntry(self, id):
        entry = self.entries[id]
        for img in entry["images"]:
            self.imageHandler.removeImage(self.uniqueId, img)
        if entry.has_key("contentLink"):
            os.remove(entry["contentLink"])
        del self.entries[id]
        self.ids.remove(id)
        if self.readItems[id]==False:
            self.countUnread = self.countUnread - 1
        del self.readItems[id]

    def getArticle(self, id):
        self.setEntryRead(id)
        entry = self.entries[id]
        title = entry['title']
        #content = entry.get('content', entry.get('summary_detail', {}))
        content = entry["content"]

        link = entry['link']
        date = entry["date"]

        #text = '''<div style="color: black; background-color: white;">'''
        text = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">'
        text += "<html><head><title>" + title + "</title>"
        text += '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>\n'
        #text += '<style> body {-webkit-user-select: none;} </style>'
        text += '</head><body><div><a href=\"' + link + '\">' + title + "</a>"
        text += "<BR /><small><i>Date: " + date + "</i></small></div>"
        text += "<BR /><BR />"
        text += content
        text += "</body></html>"
        return text

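# ArchivedArticles is the pseudo-feed behind "Archived Articles": instead of
# re-parsing an RSS URL, its updateFeed() downloads each saved page, caches the
# images through ImageHandler, rewrites the <img> tags to point at the local
# file:// copies, and stores the result as <feedId>.d/<entryId>.html so the
# article can be read offline in the WebKit view.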
class ArchivedArticles(Feed):
    def addArchivedArticle(self, title, link, updated_parsed, configdir):
        entry = {}
        entry["title"] = title
        entry["link"] = link
        entry["summary"] = '<a href=\"' + link + '\">' + title + "</a>"
        entry["updated_parsed"] = updated_parsed
        entry["time"] = time.time()
        (dateTuple, date) = self.extractDate(entry)
        tmpEntry = {"title":entry["title"], "content":self.extractContent(entry),
                    "date":date, "dateTuple":dateTuple, "link":entry["link"], "images":[], "downloaded":False, "time":entry["time"] }
        id = self.generateUniqueId(tmpEntry)
        self.entries[id] = tmpEntry
        self.ids.append(id)
        self.readItems[id] = False
        self.countUnread = self.countUnread + 1
        self.saveFeed(configdir)
        self.saveUnread(configdir)

    def updateFeed(self, configdir, expiryTime=24):
        # Iterate over a copy of the id list: removeEntry() modifies self.ids.
        for id in self.getIds()[:]:
            entry = self.entries[id]
            if not entry["downloaded"]:
                try:
                    f = urllib2.urlopen(entry["link"])
                    html = f.read()
                    f.close()
                    soup = BeautifulSoup(html)
                    images = soup.body('img')
                    baseurl = ''.join(urlparse(entry["link"])[:-1])
                    # Cache every image locally and point the page at the
                    # file:// copies so the archived article works offline.
                    for img in images:
                        filename = self.imageHandler.addImage(self.uniqueId, baseurl, img['src'])
                        img['src'] = filename
                        entry["images"].append(filename)
                    entry["contentLink"] = configdir+self.uniqueId+".d/"+id+".html"
                    file = open(entry["contentLink"], "w")
                    file.write(soup.prettify())
                    file.close()
                    if len(entry["content"]) > 0:
                        entry["downloaded"] = True
                        entry["time"] = time.time()
                        self.setEntryUnread(id)
                except:
                    pass
            currentTime = time.time()
            expiry = float(expiryTime) * 3600
            if currentTime - entry["time"] > expiry:
                if self.isEntryRead(id):
                    self.removeEntry(id)
                else:
                    if currentTime - entry["time"] > 2*expiry:
                        self.removeEntry(id)
        self.updateTime = time.asctime()
        self.saveFeed(configdir)

    def getArticle(self, index):
        self.setEntryRead(index)
        content = self.getContent(index)
        return content


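# On-disk layout used by Listing (all paths relative to configdir):
#   feeds.pickle        - metadata for every feed, keyed by getId(title)
#   images.pickle       - the shared ImageHandler instance
#   <feedId>.d/feed     - pickled Feed object
#   <feedId>.d/unread   - pickled read/unread flags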
class Listing:
    # Lists all the feeds in a dictionary, and exposes the data
    def __init__(self, configdir):
        self.configdir = configdir
        #self.feeds = {}
        if isfile(self.configdir+"feeds.pickle"):
            file = open(self.configdir+"feeds.pickle")
            self.listOfFeeds = pickle.load(file)
            file.close()
        else:
            self.listOfFeeds = {getId("Slashdot"):{"title":"Slashdot", "url":"http://rss.slashdot.org/Slashdot/slashdot", "unread":0, "updateTime":"Never"}, }
        if isfile(self.configdir+"images.pickle"):
            file = open(self.configdir+"images.pickle")
            self.imageHandler = pickle.load(file)
            file.close()
        else:
            self.imageHandler = ImageHandler(self.configdir)
        if self.listOfFeeds.has_key("font"):
            del self.listOfFeeds["font"]
        if self.listOfFeeds.has_key("feedingit-order"):
            self.sortedKeys = self.listOfFeeds["feedingit-order"]
        else:
            self.sortedKeys = self.listOfFeeds.keys()
            if "font" in self.sortedKeys:
                self.sortedKeys.remove("font")
            self.sortedKeys.sort(key=lambda obj: self.getFeedTitle(obj))
        self.closeCurrentlyDisplayedFeed()
        #self.saveConfig()

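    # Archiving copies an article into the "ArchivedArticles" pseudo-feed,
    # which is created on first use and saved along with the feed order.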
    def addArchivedArticle(self, key, index):
        feed = self.getFeed(key)
        title = feed.getTitle(index)
        link = feed.getLink(index)
        date = feed.getDateTuple(index)
        if not self.listOfFeeds.has_key("ArchivedArticles"):
            self.listOfFeeds["ArchivedArticles"] = {"title":"Archived Articles", "url":"", "unread":0, "updateTime":"Never"}
            self.sortedKeys.append("ArchivedArticles")
            #self.feeds["Archived Articles"] = ArchivedArticles("Archived Articles", "")
            self.saveConfig()
        archFeed = self.getFeed("ArchivedArticles")
        archFeed.addArchivedArticle(title, link, date, self.configdir)
        self.listOfFeeds["ArchivedArticles"]["unread"] = archFeed.getNumberOfUnreadItems()

    def loadFeed(self, key):
        if isfile(self.configdir+key+".d/feed"):
            file = open(self.configdir+key+".d/feed")
            feed = pickle.load(file)
            file.close()
            try:
                feed.uniqueId
                feed.imageHandler
            except AttributeError:
                # Older pickles predate these attributes; fill them in.
                feed.uniqueId = getId(feed.name)
                feed.imageHandler = self.imageHandler
            #feed.reloadUnread(self.configdir)
        else:
            title = self.listOfFeeds[key]["title"]
            url = self.listOfFeeds[key]["url"]
            if key == "ArchivedArticles":
                feed = ArchivedArticles("ArchivedArticles", title, url, self.imageHandler)
            else:
                feed = Feed(getId(title), title, url, self.imageHandler)
        return feed

    def updateFeeds(self, expiryTime=24):
        for key in self.getListOfFeeds():
            feed = self.loadFeed(key)
            feed.updateFeed(self.configdir, expiryTime)
            self.listOfFeeds[key]["unread"] = feed.getNumberOfUnreadItems()
            self.listOfFeeds[key]["updateTime"] = feed.getUpdateTime()

    def updateFeed(self, key, expiryTime=24):
        feed = self.loadFeed(key)
        feed.updateFeed(self.configdir, expiryTime)
        self.listOfFeeds[key]["unread"] = feed.getNumberOfUnreadItems()
        self.listOfFeeds[key]["updateTime"] = feed.getUpdateTime()

    def editFeed(self, key, title, url):
        self.listOfFeeds[key]["title"] = title
        self.listOfFeeds[key]["url"] = url
        feed = self.loadFeed(key)
        feed.editFeed(url)

    def getFeed(self, key):
        feed = self.loadFeed(key)
        feed.reloadUnread(self.configdir)
        return feed

    def getFeedUpdateTime(self, key):
        #print self.listOfFeeds.has_key(key)
        if not self.listOfFeeds[key].has_key("updateTime"):
            self.listOfFeeds[key]["updateTime"] = "Never"
        return self.listOfFeeds[key]["updateTime"]

    def getFeedNumberOfUnreadItems(self, key):
        if not self.listOfFeeds[key].has_key("unread"):
            self.listOfFeeds[key]["unread"] = 0
        return self.listOfFeeds[key]["unread"]

    def updateUnread(self, key, unreadItems):
        self.listOfFeeds[key]["unread"] = unreadItems

    def getFeedTitle(self, key):
        return self.listOfFeeds[key]["title"]

    def getFeedUrl(self, key):
        return self.listOfFeeds[key]["url"]

    def getListOfFeeds(self):
        return self.sortedKeys

    #def getNumberOfUnreadItems(self, key):
    #    if self.listOfFeeds.has_key("unread"):
    #       return self.listOfFeeds[key]["unread"]
    #    else:
    #       return 0

    def addFeed(self, title, url):
        if not self.listOfFeeds.has_key(getId(title)):
            self.listOfFeeds[getId(title)] = {"title":title, "url":url, "unread":0, "updateTime":"Never"}
            self.sortedKeys.append(getId(title))
            self.saveConfig()
            #self.feeds[getId(title)] = Feed(title, url)
            return True
        else:
            return False

    def removeFeed(self, key):
        del self.listOfFeeds[key]
        self.sortedKeys.remove(key)
        #del self.feeds[key]
        if isdir(self.configdir+key+".d/"):
            rmtree(self.configdir+key+".d/")
        self.saveConfig()

    def saveConfig(self):
        self.listOfFeeds["feedingit-order"] = self.sortedKeys
        file = open(self.configdir+"feeds.pickle", "w")
        pickle.dump(self.listOfFeeds, file)
        file.close()
        file = open(self.configdir+"images.pickle", "w")
        pickle.dump(self.imageHandler, file)
        file.close()

    def moveUp(self, key):
        index = self.sortedKeys.index(key)
        self.sortedKeys[index] = self.sortedKeys[index-1]
        self.sortedKeys[index-1] = key

    def moveDown(self, key):
        index = self.sortedKeys.index(key)
        index2 = (index+1)%len(self.sortedKeys)
        self.sortedKeys[index] = self.sortedKeys[index2]
        self.sortedKeys[index2] = key

    def setCurrentlyDisplayedFeed(self, key):
        self.currentlyDisplayedFeed = key

    def closeCurrentlyDisplayedFeed(self):
        self.currentlyDisplayedFeed = False

    def getCurrentlyDisplayedFeed(self):
        return self.currentlyDisplayedFeed

if __name__ == "__main__":
    listing = Listing('/home/user/.feedingit/')
    list = listing.getListOfFeeds()[:]
    #list.reverse()
    for key in list:
        if key.startswith('d8'):
            print listing.getFeedUpdateTime(key)
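
# Typical usage from a frontend, as a sketch only (the GUI that actually
# drives this module lives elsewhere in FeedingIt):
#   listing = Listing("/home/user/.feedingit/")
#   listing.addFeed("Slashdot", "http://rss.slashdot.org/Slashdot/slashdot")
#   listing.updateFeeds(expiryTime=24)
#   feed = listing.getFeed(getId("Slashdot"))
#   for entryId in feed.getIds():
#       html = feed.getArticle(entryId)   # also marks the entry as read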