0ceb2362abb9d8c6e14ead77d720129f70ec1e58
[feedingit] / src / rss.py
1 #!/usr/bin/env python2.5
2
3
4 # Copyright (c) 2007-2008 INdT.
5 # This program is free software: you can redistribute it and/or modify
6 # it under the terms of the GNU Lesser General Public License as published by
7 # the Free Software Foundation, either version 3 of the License, or
8 # (at your option) any later version.
9 #
10 #  This program is distributed in the hope that it will be useful,
11 #  but WITHOUT ANY WARRANTY; without even the implied warranty of
12 #  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 #  GNU Lesser General Public License for more details.
14 #
15 #  You should have received a copy of the GNU Lesser General Public License
16 #  along with this program.  If not, see <http://www.gnu.org/licenses/>.
17 #
18
19 # ============================================================================
20 # Name        : FeedingIt.py
21 # Author      : Yves Marcoz
22 # Version     : 0.5.2
23 # Description : Simple RSS Reader
24 # ============================================================================
25
import hashlib
import os
import pickle
import time
import urllib2
from os import mkdir
from os.path import isdir, isfile
from shutil import rmtree
from urlparse import urlparse

import feedparser
import md5
from BeautifulSoup import BeautifulSoup
37
38 #CONFIGDIR="/home/user/.feedingit/"
39
def getId(string):
    """Return the hex MD5 digest of *string*, used as a filesystem-safe key.

    The ``md5`` module is deprecated since Python 2.5; ``hashlib`` produces
    byte-identical digests, so existing on-disk cache keys stay valid.
    """
    try:
        return hashlib.md5(string).hexdigest()
    except TypeError:
        # hashlib requires bytes for text input on newer Pythons.
        return hashlib.md5(string.encode("utf-8")).hexdigest()
42
43 # Entry = {"title":XXX, "content":XXX, "date":XXX, "link":XXX, images = [] }
44
class ImageHandler:
    """Downloads and reference-counts images cached per-feed on disk.

    Cached files live at ``<configdir><feed-key>.d/<md5-of-url>`` and are
    shared between entries via a url -> refcount map.
    """

    def __init__(self, configdir):
        self.configdir = configdir
        # url -> number of entries still referencing the cached image
        self.images = {}

    def addImage(self, key, baseurl, url):
        """Cache the image at *url* for feed *key* and return a file:// URI.

        Relative urls are resolved against *baseurl*.  Download failures are
        best-effort: the returned path may point at a missing file.
        """
        filename = self.configdir + key + ".d/" + getId(url)
        if not isfile(filename):
            try:
                if url.startswith("http"):
                    f = urllib2.urlopen(url)
                else:
                    f = urllib2.urlopen(baseurl + "/" + url)
                try:
                    # "wb": image payloads are binary data.
                    outf = open(filename, "wb")
                    try:
                        outf.write(f.read())
                    finally:
                        outf.close()
                finally:
                    f.close()
            except Exception:
                print("Could not download " + url)
        if url in self.images:
            self.images[url] += 1
        else:
            self.images[url] = 1
        return "file://" + filename

    def removeImage(self, key, url):
        """Drop one reference to *url*; delete the cached file at zero refs."""
        filename = self.configdir + key + ".d/" + getId(url)
        self.images[url] -= 1
        if self.images[url] == 0:
            os.remove(filename)
            del self.images[url]
76
class UnreadTracker:
    """Tracks per-entry read state and keeps a running unread count."""

    def __init__(self):
        # entry id -> True when read
        self.readItems = {}
        # BUG FIX: this was a bare "self.countUnread" expression, so the
        # attribute was never created and every method raised AttributeError.
        self.countUnread = 0

    def setEntryUnread(self, id):
        """Mark *id* unread, registering it on first sight."""
        if id in self.readItems:
            if self.readItems[id] == True:
                self.countUnread = self.countUnread + 1
                self.readItems[id] = False
        else:
            self.readItems[id] = False
            self.countUnread = self.countUnread + 1

    def setEntryRead(self, id):
        """Mark a known entry read (no-op if already read)."""
        if self.readItems[id] == False:
            self.countUnread = self.countUnread - 1
            self.readItems[id] = True

    def isRead(self, id):
        """Return True when *id* has been read."""
        return self.readItems[id]

    def removeEntry(self, id):
        """Forget *id*, adjusting the unread count if it was unread."""
        if self.readItems[id] == False:
            self.countUnread = self.countUnread - 1
        del self.readItems[id]
103
class Feed:
    """One RSS/Atom feed: parsed entries, read/unread state and persistence.

    State is pickled under ``<configdir><uniqueId>.d/`` ("feed" holds the
    whole object, "unread" just the read-state map).  Entries are dicts of
    the form {"title", "content", "date", "dateTuple", "link", "images"}.
    """

    def __init__(self, uniqueId, name, url, imageHandler):
        self.titles = []
        self.entries = {}      # entry id -> entry dict (see class docstring)
        self.ids = []          # entry ids in feed order
        self.readItems = {}    # entry id -> True when read
        self.name = name
        self.url = url
        self.countUnread = 0
        self.updateTime = "Never"
        self.uniqueId = uniqueId
        self.imageHandler = imageHandler

    def editFeed(self, url):
        """Point this feed at a new source URL."""
        self.url = url

    def saveFeed(self, configdir):
        """Pickle the whole feed object (and its unread map) to disk."""
        if not isdir(configdir + self.uniqueId + ".d"):
            mkdir(configdir + self.uniqueId + ".d")
        outf = open(configdir + self.uniqueId + ".d/feed", "w")
        pickle.dump(self, outf)
        outf.close()
        self.saveUnread(configdir)

    def saveUnread(self, configdir):
        """Pickle only the read/unread map to disk."""
        if not isdir(configdir + self.uniqueId + ".d"):
            mkdir(configdir + self.uniqueId + ".d")
        outf = open(configdir + self.uniqueId + ".d/unread", "w")
        pickle.dump(self.readItems, outf)
        outf.close()

    def reloadUnread(self, configdir):
        """Reload the read/unread map from disk and recompute the count.

        Best-effort: a missing/corrupt file (or stale ids) leaves the
        in-memory state untouched.  Returns the unread count.
        """
        try:
            inf = open(configdir + self.uniqueId + ".d/unread", "r")
            self.readItems = pickle.load(inf)
            inf.close()
            self.countUnread = 0
            for id in self.getIds():
                if self.readItems[id] == False:
                    self.countUnread = self.countUnread + 1
        except Exception:
            pass
        return self.countUnread

    def updateFeed(self, configdir, expiryTime=24, proxy=None):
        """Re-fetch the feed, merge still-fresh old entries, and persist.

        *expiryTime* is in hours; unread entries get a grace period of one
        extra expiry interval before they are dropped.
        """
        if proxy == None:
            tmp = feedparser.parse(self.url)
        else:
            tmp = feedparser.parse(self.url, handlers=[proxy])
        # Only update if the parse was successful (number of entries > 0).
        if len(tmp["entries"]) > 0:
            tmpEntries = {}
            tmpIds = []
            for entry in tmp["entries"]:
                (dateTuple, date) = self.extractDate(entry)
                tmpEntry = {"title": entry["title"], "content": self.extractContent(entry),
                            "date": date, "dateTuple": dateTuple, "link": entry["link"], "images": []}
                id = self.generateUniqueId(tmpEntry)
                tmpEntries[id] = tmpEntry
                tmpIds.append(id)
            # Carry over old entries that are absent from the new fetch but
            # have not expired yet (loop invariants hoisted out of the loop).
            currentTime = time.time()
            expiry = float(expiryTime) * 3600.
            for entryId in self.getIds():
                articleTime = time.mktime(self.entries[entryId]["dateTuple"])
                if currentTime - articleTime < expiry:
                    if not entryId in tmpIds:
                        tmpEntries[entryId] = self.entries[entryId]
                        tmpIds.append(entryId)
                else:
                    # Unread entries live for one extra expiry interval.
                    if (not self.isEntryRead(entryId)) and (currentTime - articleTime < 2 * expiry):
                        tmpEntries[entryId] = self.entries[entryId]
                        tmpIds.append(entryId)
            self.entries = tmpEntries
            self.ids = tmpIds
            # Initialize the new articles to unread, keep old read state.
            self.countUnread = 0
            tmpReadItems = self.readItems
            self.readItems = {}
            for id in self.getIds():
                if not id in tmpReadItems:
                    self.readItems[id] = False
                else:
                    self.readItems[id] = tmpReadItems[id]
                if self.readItems[id] == False:
                    self.countUnread = self.countUnread + 1
            del tmp
            self.updateTime = time.asctime()
            self.saveFeed(configdir)

    def extractContent(self, entry):
        """Return the longest of summary/content, else the description."""
        content = ""
        if 'summary' in entry:
            content = entry.get('summary', '')
        if 'content' in entry:
            if len(entry.content[0].value) > len(content):
                content = entry.content[0].value
        if content == "":
            content = entry.get('description', '')
        return content

    def extractDate(self, entry):
        """Return (time-tuple, display-string) for the entry's date.

        Prefers "updated" over "published"; ("", "") when neither exists.
        """
        if "updated_parsed" in entry:
            date1 = entry["updated_parsed"]
            date = time.strftime("%a, %d %b %Y %H:%M:%S", entry["updated_parsed"])
        elif "published_parsed" in entry:
            date1 = entry["published_parsed"]
            date = time.strftime("%a, %d %b %Y %H:%M:%S", entry["published_parsed"])
        else:
            date1 = ""
            date = ""
        return (date1, date)

    def setEntryRead(self, id):
        """Mark a known entry read (no-op if already read)."""
        if self.readItems[id] == False:
            self.countUnread = self.countUnread - 1
            self.readItems[id] = True

    def setEntryUnread(self, id):
        """Mark a known entry unread (no-op if already unread)."""
        if self.readItems[id] == True:
            self.countUnread = self.countUnread + 1
            self.readItems[id] = False

    def isEntryRead(self, id):
        return self.readItems[id]

    def getTitle(self, id):
        return self.entries[id]["title"]

    def getLink(self, id):
        """Prefer the locally downloaded copy (contentLink) when present."""
        if "contentLink" in self.entries[id]:
            return self.entries[id]["contentLink"]
        return self.entries[id]["link"]

    def getDate(self, id):
        return self.entries[id]["date"]

    def getDateTuple(self, id):
        return self.entries[id]["dateTuple"]

    def getUniqueId(self, index):
        """Return the entry id at positional *index*."""
        return self.ids[index]

    def generateUniqueId(self, entry):
        """Derive a stable id from the entry's date + title."""
        return getId(entry["date"] + entry["title"])

    def getUpdateTime(self):
        return self.updateTime

    def getEntries(self):
        return self.entries

    def getIds(self):
        return self.ids

    def getNextId(self, id):
        """Id after *id* in feed order, wrapping around at the end."""
        return self.ids[(self.ids.index(id) + 1) % self.getNumberOfEntries()]

    def getPreviousId(self, id):
        """Id before *id* in feed order, wrapping around at the start."""
        return self.ids[(self.ids.index(id) - 1) % self.getNumberOfEntries()]

    def getNumberOfUnreadItems(self):
        return self.countUnread

    def getNumberOfEntries(self):
        return len(self.ids)

    def getItem(self, id):
        """Return the entry dict for *id*, or [] when unknown (legacy API)."""
        try:
            return self.entries[id]
        except KeyError:
            return []

    def getContent(self, id):
        """Return the entry body, reading the downloaded file when present."""
        if "contentLink" in self.entries[id]:
            inf = open(self.entries[id]["contentLink"])
            content = inf.read()
            inf.close()
            return content
        return self.entries[id]["content"]

    def removeEntry(self, id):
        """Delete entry *id*, its cached images/file, and its read state."""
        entry = self.entries[id]
        for img in entry["images"]:
            self.imageHandler.removeImage(self.uniqueId, img)
        # BUG FIX: was entry.has_key["contentLink"] (subscripting a method ->
        # TypeError) and dict.remove() (no such method) on entries/readItems.
        if "contentLink" in entry:
            os.remove(entry["contentLink"])
        del self.entries[id]
        self.ids.remove(id)
        if self.readItems[id] == False:
            self.countUnread = self.countUnread - 1
        del self.readItems[id]

    def getArticle(self, id):
        """Mark *id* read and return a full XHTML page for it."""
        self.setEntryRead(id)
        entry = self.entries[id]
        title = entry['title']
        content = entry["content"]
        link = entry['link']
        date = entry["date"]
        text = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">'
        text += "<html><head><title>" + title + "</title>"
        text += '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>\n'
        text += '</head><body><div><a href=\"' + link + '\">' + title + "</a>"
        text += "<BR /><small><i>Date: " + date + "</i></small></div>"
        text += "<BR /><BR />"
        text += content
        text += "</body></html>"
        return text
322         
class ArchivedArticles(Feed):
    """A pseudo-feed of user-archived articles, downloaded for offline use.

    Articles are added as stubs; ``updateFeed`` fetches the page and its
    images and stores a prettified copy under the feed's cache directory.
    """

    def addArchivedArticle(self, title, link, updated_parsed, configdir):
        """Add an article stub; its body is fetched on the next updateFeed."""
        entry = {}
        entry["title"] = title
        entry["link"] = link
        entry["summary"] = '<a href=\"' + link + '\">' + title + "</a>"
        entry["updated_parsed"] = updated_parsed
        entry["time"] = time.time()
        (dateTuple, date) = self.extractDate(entry)
        tmpEntry = {"title": entry["title"], "content": self.extractContent(entry),
                    "date": date, "dateTuple": dateTuple, "link": entry["link"], "images": [],
                    "downloaded": False, "time": entry["time"]}
        id = self.generateUniqueId(tmpEntry)
        self.entries[id] = tmpEntry
        self.ids.append(id)
        self.readItems[id] = False
        self.countUnread = self.countUnread + 1
        self.saveFeed(configdir)
        self.saveUnread(configdir)

    def updateFeed(self, configdir, expiryTime=24, proxy=None):
        """Download pending article bodies and expire old articles.

        Read articles expire after *expiryTime* hours, unread ones after
        twice that.  *proxy* is accepted for interface compatibility with
        Feed.updateFeed but unused here.
        """
        # BUG FIX: iterate over a copy -- removeEntry() mutates self.ids,
        # and the original iterated the live list while removing from it,
        # silently skipping the entry after each removal.
        for id in self.getIds()[:]:
            entry = self.entries[id]
            if not entry["downloaded"]:
                # NOTE(review): no error handling -- a failed urlopen aborts
                # the whole update, matching the original behavior.
                f = urllib2.urlopen(entry["link"])
                html = f.read()
                f.close()
                soup = BeautifulSoup(html)
                images = soup.body('img')
                baseurl = ''.join(urlparse(entry["link"])[:-1])
                for img in images:
                    # Rewrite image refs to the locally cached copies.
                    filename = self.imageHandler.addImage(self.uniqueId, baseurl, img['src'])
                    img['src'] = filename
                    entry["images"].append(filename)
                entry["contentLink"] = configdir + self.uniqueId + ".d/" + id + ".html"
                outf = open(entry["contentLink"], "w")
                outf.write(soup.prettify())
                outf.close()
                if len(entry["content"]) > 0:
                    entry["downloaded"] = True
                    entry["time"] = time.time()
                    self.setEntryUnread(id)
            currentTime = time.time()
            expiry = float(expiryTime) * 3600
            if currentTime - entry["time"] > expiry:
                if self.isEntryRead(id):
                    self.removeEntry(id)
                else:
                    if currentTime - entry["time"] > 2 * expiry:
                        self.removeEntry(id)
        self.updateTime = time.asctime()
        self.saveFeed(configdir)

    def getArticle(self, index):
        """Mark the article read and return its stored (downloaded) content."""
        self.setEntryRead(index)
        content = self.getContent(index)
        return content
399
400
class Listing:
    """Directory of all feeds: loads/saves config, dispatches feed updates.

    Feed metadata lives in ``feeds.pickle`` (plus the special
    "feedingit-order" key for display order); the shared ImageHandler is
    pickled to ``images.pickle``.
    """

    def __init__(self, configdir):
        self.configdir = configdir
        # NOTE(review): pickle.load on local config files -- only safe as
        # long as configdir is trusted (never point it at downloaded data).
        if isfile(self.configdir + "feeds.pickle"):
            inf = open(self.configdir + "feeds.pickle")
            self.listOfFeeds = pickle.load(inf)
            inf.close()
        else:
            self.listOfFeeds = {getId("Slashdot"): {"title": "Slashdot", "url": "http://rss.slashdot.org/Slashdot/slashdot", "unread": 0, "updateTime": "Never"}, }
        if isfile(self.configdir + "images.pickle"):
            inf = open(self.configdir + "images.pickle")
            self.imageHandler = pickle.load(inf)
            inf.close()
        else:
            self.imageHandler = ImageHandler(self.configdir)
        # "font" is a legacy config key, not a feed -- drop it.
        if "font" in self.listOfFeeds:
            del self.listOfFeeds["font"]
        if "feedingit-order" in self.listOfFeeds:
            self.sortedKeys = self.listOfFeeds["feedingit-order"]
        else:
            self.sortedKeys = self.listOfFeeds.keys()
            if "font" in self.sortedKeys:
                self.sortedKeys.remove("font")
            self.sortedKeys.sort(key=lambda obj: self.getFeedTitle(obj))
        self.closeCurrentlyDisplayedFeed()

    def addArchivedArticle(self, key, index):
        """Copy entry *index* of feed *key* into the ArchivedArticles feed."""
        feed = self.getFeed(key)
        title = feed.getTitle(index)
        link = feed.getLink(index)
        date = feed.getDateTuple(index)
        if not "ArchivedArticles" in self.listOfFeeds:
            self.listOfFeeds["ArchivedArticles"] = {"title": "Archived Articles", "url": "", "unread": 0, "updateTime": "Never"}
            self.sortedKeys.append("ArchivedArticles")
            self.saveConfig()
        archFeed = self.getFeed("ArchivedArticles")
        archFeed.addArchivedArticle(title, link, date, self.configdir)
        # BUG FIX: the unread count belongs to the archive feed's record,
        # not to the source feed *key* as the original wrote it.
        self.listOfFeeds["ArchivedArticles"]["unread"] = archFeed.getNumberOfUnreadItems()

    def loadFeed(self, key):
        """Unpickle feed *key* from disk, or build a fresh Feed object."""
        if isfile(self.configdir + key + ".d/feed"):
            inf = open(self.configdir + key + ".d/feed")
            feed = pickle.load(inf)
            inf.close()
            try:
                feed.uniqueId
                feed.imageHandler
            except AttributeError:
                # Feed pickled by an older version: backfill new attributes.
                feed.uniqueId = getId(feed.name)
                feed.imageHandler = self.imageHandler
        else:
            title = self.listOfFeeds[key]["title"]
            url = self.listOfFeeds[key]["url"]
            if key == "ArchivedArticles":
                feed = ArchivedArticles("ArchivedArticles", title, url, self.imageHandler)
            else:
                feed = Feed(getId(title), title, url, self.imageHandler)
        return feed

    def updateFeeds(self, expiryTime=24, proxy=None):
        """Refresh every feed and record its unread count / update time."""
        for key in self.getListOfFeeds():
            feed = self.loadFeed(key)
            feed.updateFeed(self.configdir, expiryTime, proxy)
            self.listOfFeeds[key]["unread"] = feed.getNumberOfUnreadItems()
            self.listOfFeeds[key]["updateTime"] = feed.getUpdateTime()

    def updateFeed(self, key, expiryTime=24, proxy=None):
        """Refresh a single feed and record its unread count / update time."""
        feed = self.getFeed(key)
        feed.updateFeed(self.configdir, expiryTime, proxy)
        self.listOfFeeds[key]["unread"] = feed.getNumberOfUnreadItems()
        self.listOfFeeds[key]["updateTime"] = feed.getUpdateTime()

    def editFeed(self, key, title, url):
        """Rename/repoint feed *key* and persist the change."""
        self.listOfFeeds[key]["title"] = title
        self.listOfFeeds[key]["url"] = url
        feed = self.loadFeed(key)
        feed.editFeed(url)
        # BUG FIX: persist the edited feed; previously the modified object
        # was discarded, so a pickled feed kept updating from the old url.
        feed.saveFeed(self.configdir)

    def getFeed(self, key):
        """Load feed *key* with its on-disk unread state applied."""
        feed = self.loadFeed(key)
        feed.reloadUnread(self.configdir)
        return feed

    def getFeedUpdateTime(self, key):
        if not "updateTime" in self.listOfFeeds[key]:
            self.listOfFeeds[key]["updateTime"] = "Never"
        return self.listOfFeeds[key]["updateTime"]

    def getFeedNumberOfUnreadItems(self, key):
        if not "unread" in self.listOfFeeds[key]:
            self.listOfFeeds[key]["unread"] = 0
        return self.listOfFeeds[key]["unread"]

    def updateUnread(self, key, unreadItems):
        self.listOfFeeds[key]["unread"] = unreadItems

    def getFeedTitle(self, key):
        return self.listOfFeeds[key]["title"]

    def getFeedUrl(self, key):
        return self.listOfFeeds[key]["url"]

    def getListOfFeeds(self):
        """Return feed keys in display order (the live list, not a copy)."""
        return self.sortedKeys

    def addFeed(self, title, url):
        """Register a new feed; returns False when it already exists."""
        feedId = getId(title)
        if not feedId in self.listOfFeeds:
            self.listOfFeeds[feedId] = {"title": title, "url": url, "unread": 0, "updateTime": "Never"}
            self.sortedKeys.append(feedId)
            self.saveConfig()
            return True
        else:
            return False

    def removeFeed(self, key):
        """Delete feed *key* from config and wipe its cache directory."""
        del self.listOfFeeds[key]
        self.sortedKeys.remove(key)
        if isdir(self.configdir + key + ".d/"):
            rmtree(self.configdir + key + ".d/")
        self.saveConfig()

    def saveConfig(self):
        """Persist the feed list (with ordering) and the image handler."""
        self.listOfFeeds["feedingit-order"] = self.sortedKeys
        outf = open(self.configdir + "feeds.pickle", "w")
        pickle.dump(self.listOfFeeds, outf)
        outf.close()
        outf = open(self.configdir + "images.pickle", "w")
        pickle.dump(self.imageHandler, outf)
        outf.close()

    def moveUp(self, key):
        """Swap *key* with its predecessor (wraps first <-> last)."""
        index = self.sortedKeys.index(key)
        self.sortedKeys[index] = self.sortedKeys[index - 1]
        self.sortedKeys[index - 1] = key

    def moveDown(self, key):
        """Swap *key* with its successor (wraps last <-> first)."""
        index = self.sortedKeys.index(key)
        index2 = (index + 1) % len(self.sortedKeys)
        self.sortedKeys[index] = self.sortedKeys[index2]
        self.sortedKeys[index2] = key

    def setCurrentlyDisplayedFeed(self, key):
        self.currentlyDisplayedFeed = key

    def closeCurrentlyDisplayedFeed(self):
        # False means "no feed is open".
        self.currentlyDisplayedFeed = False

    def getCurrentlyDisplayedFeed(self):
        return self.currentlyDisplayedFeed
576     
if __name__ == "__main__":
    # Debug smoke test: print the update time of feeds whose id starts
    # with 'd8'.  (The original copied getListOfFeeds() into a local named
    # "list", shadowing the builtin for no benefit.)
    listing = Listing('/home/user/.feedingit/')
    for key in listing.getListOfFeeds():
        if key.startswith('d8'):
            print(listing.getFeedUpdateTime(key))