#!/usr/bin/env python2.5


# Copyright (c) 2007-2008 INdT.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU Lesser General Public License for more details.
#
#  You should have received a copy of the GNU Lesser General Public License
#  along with this program.  If not, see <http://www.gnu.org/licenses/>.
#

# ============================================================================
# Name        : FeedingIt.py
# Author      : Yves Marcoz
# Version     : 0.5.4
# Description : Simple RSS Reader
# ============================================================================

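# Overview: this module is FeedingIt's data layer. Feed stores and updates a
# single RSS feed, ArchivedArticles stores pages saved for offline reading,
# ImageHandler reference-counts cached images, and Listing manages the
# collection of feeds and their on-disk pickles.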
from os.path import isfile, isdir
from shutil import rmtree
from os import mkdir, remove
import pickle
import md5
import feedparser
import time
import urllib2
from BeautifulSoup import BeautifulSoup
from urlparse import urlparse

#CONFIGDIR="/home/user/.feedingit/"

def getId(string):
    # md5 hex digest (32 hex characters) of the given string; used as a
    # stable identifier for feeds, articles and cached images.
    return md5.new(string).hexdigest()

def getProxy():
    import gconf
    client = gconf.client_get_default()
    if client.get_bool('/system/http_proxy/use_http_proxy'):
        port = client.get_int('/system/http_proxy/port')
        http = client.get_string('/system/http_proxy/host')
        proxy = urllib2.ProxyHandler( {"http":"http://%s:%s/" % (http, port)} )
        return (True, proxy)
    return (False, None)

# Enable proxy support for images and ArchivedArticles
(proxy_support, proxy) = getProxy()
if proxy_support:
    opener = urllib2.build_opener(proxy)
    urllib2.install_opener(opener)

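# Note: install_opener() is global, so every urllib2.urlopen() call in this
# module (image caching, ArchivedArticles downloads) goes through the proxy.
# Feed downloads pass the handler to feedparser explicitly in updateFeed().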
# Entry = {"title":XXX, "content":XXX, "date":XXX, "link":XXX, "images":[] }

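# An illustrative (made-up) entry, as built by updateFeed() below:
#   {"title": "Example story",
#    "content": "<p>Story text...</p>",
#    "date": "Mon, 01 Jan 2007 12:00:00",
#    "dateTuple": <time.struct_time>,
#    "link": "http://example.com/story",
#    "images": ["/home/user/.feedingit/<feedId>.d/<imageId>"],
#    "contentLink": "/home/user/.feedingit/<feedId>.d/<entryId>.html"}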
class ImageHandler:
    def __init__(self, configdir):
        self.configdir = configdir
        self.images = {}    # filename -> reference count

    def addImage(self, key, baseurl, url):
        # Download the image into the feed's cache directory (unless it is
        # already there) and increment its reference count.
        filename = self.configdir+key+".d/"+getId(url)
        if not isfile(filename):
            try:
                if url.startswith("http"):
                    f = urllib2.urlopen(url)
                else:
                    f = urllib2.urlopen(baseurl+"/"+url)
                outf = open(filename, "w")
                outf.write(f.read())
                f.close()
                outf.close()
            except:
                print "Could not download " + url
        if filename in self.images:
            self.images[filename] += 1
        else:
            self.images[filename] = 1
        return filename

    def removeImage(self, key, filename):
        # Decrement the reference count; delete the file once no article
        # uses the image any more.
        try:
            self.images[filename] -= 1
        except:
            self.images[filename] = 0   # untracked image: mark for deletion
        try:
            if self.images[filename] == 0:
                remove(filename)
                del self.images[filename]
        except:
            print "Could not remove image %s" % filename

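# Hypothetical usage sketch (feedId and paths invented for illustration):
#   handler = ImageHandler("/home/user/.feedingit/")
#   fname = handler.addImage(feedId, "http://example.com", "/logo.png")
#   ...                                  # fname now has refcount 1
#   handler.removeImage(feedId, fname)   # refcount reaches 0: file deleted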
class Feed:
    def __init__(self, uniqueId, name, url, imageHandler):
        self.titles = []
        self.entries = {}       # id -> entry dictionary (see above)
        self.ids = []           # entry ids in feed order
        self.readItems = {}     # id -> True (read) / False (unread)
        self.name = name
        self.url = url
        self.countUnread = 0
        self.updateTime = "Never"
        self.uniqueId = uniqueId
        self.imageHandler = imageHandler

    def editFeed(self, url):
        self.url = url

    def saveFeed(self, configdir):
        if not isdir(configdir+self.uniqueId+".d"):
            mkdir(configdir+self.uniqueId+".d")
        f = open(configdir+self.uniqueId+".d/feed", "w")
        pickle.dump(self, f)
        f.close()
        self.saveUnread(configdir)

    def saveUnread(self, configdir):
        if not isdir(configdir+self.uniqueId+".d"):
            mkdir(configdir+self.uniqueId+".d")
        f = open(configdir+self.uniqueId+".d/unread", "w")
        pickle.dump(self.readItems, f)
        f.close()

    def reloadUnread(self, configdir):
        try:
            f = open(configdir+self.uniqueId+".d/unread", "r")
            self.readItems = pickle.load(f)
            f.close()
            self.countUnread = 0
            for id in self.getIds():
                if not self.readItems[id]:
                    self.countUnread += 1
        except:
            # No unread file yet (or it is unreadable): keep current counts.
            pass
        return self.countUnread

    def updateFeed(self, configdir, expiryTime=24, proxy=None, imageCache=False):
        # Expiry time is in hours
        if proxy is None:
            tmp = feedparser.parse(self.url)
        else:
            tmp = feedparser.parse(self.url, handlers=[proxy])
        expiry = float(expiryTime) * 3600.
        # Check if the parse was successful; if there are no entries, the
        # feed is left untouched.
        if len(tmp["entries"]) > 0:
            if not isdir(configdir+self.uniqueId+".d"):
                mkdir(configdir+self.uniqueId+".d")
            currentTime = time.time()
            tmpEntries = {}
            tmpIds = []
            for entry in tmp["entries"]:
                (dateTuple, date) = self.extractDate(entry)
                tmpEntry = {"title":entry["title"], "content":self.extractContent(entry),
                            "date":date, "dateTuple":dateTuple, "link":entry["link"], "images":[] }
                id = self.generateUniqueId(tmpEntry)
                if id not in self.ids:
                    # New article: render it to an html file, caching its
                    # images locally if requested.
                    soup = BeautifulSoup(self.getArticle(tmpEntry))
                    images = soup('img')
                    baseurl = ''.join(urlparse(tmpEntry["link"])[:-1])
                    if imageCache:
                        for img in images:
                            try:
                                filename = self.imageHandler.addImage(self.uniqueId, baseurl, img['src'])
                                img['src'] = filename
                                tmpEntry["images"].append(filename)
                            except:
                                print "Error downloading image %s" % img
                    tmpEntry["contentLink"] = configdir+self.uniqueId+".d/"+id+".html"
                    f = open(tmpEntry["contentLink"], "w")
                    f.write(soup.prettify())
                    f.close()
                    tmpEntries[id] = tmpEntry
                    tmpIds.append(id)
                    if id not in self.readItems:
                        self.readItems[id] = False
                else:
                    # Article already known: keep the stored copy.
                    tmpEntries[id] = self.entries[id]
                    tmpIds.append(id)

            # Keep old articles that dropped out of the feed until they
            # expire: read articles after expiry, all articles after 2*expiry.
            oldIds = self.ids[:]
            for entryId in oldIds:
                if entryId not in tmpIds:
                    try:
                        articleTime = time.mktime(self.entries[entryId]["dateTuple"])
                        if (currentTime - articleTime > 2*expiry):
                            self.removeEntry(entryId)
                            continue
                        if (currentTime - articleTime > expiry) and (self.isEntryRead(entryId)):
                            # Entry is older than the expiry time and has
                            # already been read.
                            self.removeEntry(entryId)
                            continue
                        tmpEntries[entryId] = self.entries[entryId]
                        tmpIds.append(entryId)
                    except:
                        print "Error purging old articles %s" % entryId
                        self.removeEntry(entryId)

            self.entries = tmpEntries
            self.ids = tmpIds
            tmpUnread = 0

            ids = self.ids[:]
            for id in ids:
                if id not in self.readItems:
                    self.readItems[id] = False
                if not self.readItems[id]:
                    tmpUnread += 1
            del tmp
            self.countUnread = tmpUnread
            self.updateTime = time.asctime()
            self.saveFeed(configdir)

    def extractContent(self, entry):
        content = ""
        if 'summary' in entry:
            content = entry.get('summary', '')
        if 'content' in entry:
            if len(entry.content[0].value) > len(content):
                content = entry.content[0].value
        if content == "":
            content = entry.get('description', '')
        return content

    def extractDate(self, entry):
        if "updated_parsed" in entry:
            dateTuple = entry["updated_parsed"]
            date = time.strftime("%a, %d %b %Y %H:%M:%S", entry["updated_parsed"])
        elif "published_parsed" in entry:
            dateTuple = entry["published_parsed"]
            date = time.strftime("%a, %d %b %Y %H:%M:%S", entry["published_parsed"])
        else:
            dateTuple = ""
            date = ""
        return (dateTuple, date)

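    # Example: a struct_time for 2007-01-01 12:00:00 (a Monday) formats as
    # "Mon, 01 Jan 2007 12:00:00" under the "%a, %d %b %Y %H:%M:%S" pattern.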
    def setEntryRead(self, id):
        if not self.readItems[id]:
            self.countUnread -= 1
            self.readItems[id] = True

    def setEntryUnread(self, id):
        if self.readItems[id]:
            self.countUnread += 1
            self.readItems[id] = False

    def isEntryRead(self, id):
        return self.readItems[id]

    def getTitle(self, id):
        return self.entries[id]["title"]

    def getContentLink(self, id):
        if "contentLink" in self.entries[id]:
            return self.entries[id]["contentLink"]
        return self.entries[id]["link"]

    def getExternalLink(self, id):
        return self.entries[id]["link"]

    def getDate(self, id):
        return self.entries[id]["date"]

    def getDateTuple(self, id):
        return self.entries[id]["dateTuple"]

    def getUniqueId(self, index):
        return self.ids[index]

    def generateUniqueId(self, entry):
        # An article id is the md5 of its date and title, so an article whose
        # title or date changes upstream is treated as a new article.
        return getId(entry["date"] + entry["title"])

    def getUpdateTime(self):
        return self.updateTime

    def getEntries(self):
        return self.entries

    def getIds(self):
        return self.ids

    def getNextId(self, id):
        return self.ids[(self.ids.index(id)+1) % self.getNumberOfEntries()]

    def getPreviousId(self, id):
        return self.ids[(self.ids.index(id)-1) % self.getNumberOfEntries()]

    def getNumberOfUnreadItems(self):
        return self.countUnread

    def getNumberOfEntries(self):
        return len(self.ids)

    def getItem(self, id):
        try:
            return self.entries[id]
        except:
            # Unknown id: return an empty value rather than raising.
            return []

    def getContent(self, id):
        if "contentLink" in self.entries[id]:
            f = open(self.entries[id]["contentLink"])
            content = f.read()
            f.close()
            return content
        return self.entries[id]["content"]

    def removeEntry(self, id):
        if id in self.entries:
            entry = self.entries[id]
            if "images" in entry:
                for img in entry["images"]:
                    self.imageHandler.removeImage(self.uniqueId, img)
            if "contentLink" in entry:
                try:
                    remove(entry["contentLink"])
                except:
                    print "File not found for deletion: %s" % entry["contentLink"]
            del self.entries[id]
        else:
            print "Entries has no %s key" % id
        if id in self.ids:
            self.ids.remove(id)
        else:
            print "Ids has no %s key" % id
        if id in self.readItems:
            if not self.readItems[id]:
                self.countUnread -= 1
            del self.readItems[id]
        else:
            print "ReadItems has no %s key" % id

    def getArticle(self, entry):
        # Render an entry as a standalone XHTML page.
        title = entry['title']
        content = entry["content"]
        link = entry['link']
        date = entry["date"]

        text = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">'
        text += "<html><head><title>" + title + "</title>"
        text += '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>\n'
        text += '</head><body><div><a href="' + link + '">' + title + "</a>"
        text += "<br /><small><i>Date: " + date + "</i></small></div>"
        text += "<br /><br />"
        text += content
        text += "</body></html>"
        return text

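# Hypothetical usage sketch for Feed (configdir from the default below):
#   feed = Feed(getId("Slashdot"), "Slashdot",
#               "http://rss.slashdot.org/Slashdot/slashdot", imageHandler)
#   feed.updateFeed("/home/user/.feedingit/", expiryTime=24, imageCache=True)
#   for id in feed.getIds():
#       print feed.getTitle(id), feed.getDate(id)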
class ArchivedArticles(Feed):
    def addArchivedArticle(self, title, link, updated_parsed, configdir):
        entry = {}
        entry["title"] = title
        entry["link"] = link
        entry["summary"] = '<a href="' + link + '">' + title + "</a>"
        entry["updated_parsed"] = updated_parsed
        entry["time"] = time.time()
        (dateTuple, date) = self.extractDate(entry)
        tmpEntry = {"title":entry["title"], "content":self.extractContent(entry),
                    "date":date, "dateTuple":dateTuple, "link":entry["link"], "images":[],
                    "downloaded":False, "time":entry["time"] }
        id = self.generateUniqueId(tmpEntry)
        self.entries[id] = tmpEntry
        self.ids.append(id)
        self.readItems[id] = False
        self.countUnread += 1
        self.saveFeed(configdir)
        self.saveUnread(configdir)

    def updateFeed(self, configdir, expiryTime=24, proxy=None, imageCache=False):
        # Download the pages of archived articles that have not been fetched
        # yet, then purge expired ones.
        for id in self.getIds()[:]:   # iterate a copy: removeEntry() mutates self.ids
            entry = self.entries[id]
            if not entry["downloaded"]:
                f = urllib2.urlopen(entry["link"])
                html = f.read()
                f.close()
                soup = BeautifulSoup(html)
                images = soup('img')
                baseurl = ''.join(urlparse(entry["link"])[:-1])
                for img in images:
                    filename = self.imageHandler.addImage(self.uniqueId, baseurl, img['src'])
                    img['src'] = filename
                    entry["images"].append(filename)
                entry["contentLink"] = configdir+self.uniqueId+".d/"+id+".html"
                f = open(entry["contentLink"], "w")
                f.write(soup.prettify())
                f.close()
                if len(entry["content"]) > 0:
                    entry["downloaded"] = True
                    entry["time"] = time.time()
                    self.setEntryUnread(id)
            currentTime = time.time()
            expiry = float(expiryTime) * 3600
            if currentTime - entry["time"] > expiry:
                if self.isEntryRead(id):
                    self.removeEntry(id)
                else:
                    if currentTime - entry["time"] > 2*expiry:
                        self.removeEntry(id)
        self.updateTime = time.asctime()
        self.saveFeed(configdir)

    def getArticle(self, id):
        self.setEntryRead(id)
        content = self.getContent(id)
        return content


class Listing:
    # Lists all the feeds in a dictionary, and exposes the data
    def __init__(self, configdir):
        self.configdir = configdir
        if isfile(self.configdir+"feeds.pickle"):
            f = open(self.configdir+"feeds.pickle")
            self.listOfFeeds = pickle.load(f)
            f.close()
        else:
            self.listOfFeeds = {getId("Slashdot"):{"title":"Slashdot", "url":"http://rss.slashdot.org/Slashdot/slashdot", "unread":0, "updateTime":"Never"}, }
        if isfile(self.configdir+"images.pickle"):
            f = open(self.configdir+"images.pickle")
            self.imageHandler = pickle.load(f)
            f.close()
        else:
            self.imageHandler = ImageHandler(self.configdir)
        # "font" and "feedingit-order" are configuration keys stored in the
        # same dictionary as the feeds, so they must not be treated as feeds.
        if "font" in self.listOfFeeds:
            del self.listOfFeeds["font"]
        if "feedingit-order" in self.listOfFeeds:
            self.sortedKeys = self.listOfFeeds["feedingit-order"]
        else:
            self.sortedKeys = self.listOfFeeds.keys()
            if "font" in self.sortedKeys:
                self.sortedKeys.remove("font")
            self.sortedKeys.sort(key=lambda obj: self.getFeedTitle(obj))
        self.closeCurrentlyDisplayedFeed()

    def addArchivedArticle(self, key, index):
        feed = self.getFeed(key)
        title = feed.getTitle(index)
        link = feed.getExternalLink(index)
        date = feed.getDateTuple(index)
        if "ArchivedArticles" not in self.listOfFeeds:
            self.listOfFeeds["ArchivedArticles"] = {"title":"Archived Articles", "url":"", "unread":0, "updateTime":"Never"}
            self.sortedKeys.append("ArchivedArticles")
            self.saveConfig()
        archFeed = self.getFeed("ArchivedArticles")
        archFeed.addArchivedArticle(title, link, date, self.configdir)
        # Update the unread count of the archive feed, not of the source feed.
        self.listOfFeeds["ArchivedArticles"]["unread"] = archFeed.getNumberOfUnreadItems()

    def loadFeed(self, key):
        if isfile(self.configdir+key+".d/feed"):
            f = open(self.configdir+key+".d/feed")
            feed = pickle.load(f)
            f.close()
            try:
                feed.uniqueId
                feed.imageHandler
            except AttributeError:
                # Migrate feeds pickled by older versions that lacked
                # these attributes.
                feed.uniqueId = getId(feed.name)
                feed.imageHandler = self.imageHandler
        else:
            title = self.listOfFeeds[key]["title"]
            url = self.listOfFeeds[key]["url"]
            if key == "ArchivedArticles":
                feed = ArchivedArticles("ArchivedArticles", title, url, self.imageHandler)
            else:
                feed = Feed(getId(title), title, url, self.imageHandler)
        return feed

    def updateFeeds(self, expiryTime=24, proxy=None, imageCache=False):
        for key in self.getListOfFeeds():
            feed = self.loadFeed(key)
            feed.updateFeed(self.configdir, expiryTime, proxy, imageCache)
            self.listOfFeeds[key]["unread"] = feed.getNumberOfUnreadItems()
            self.listOfFeeds[key]["updateTime"] = feed.getUpdateTime()

    def updateFeed(self, key, expiryTime=24, proxy=None, imageCache=False):
        feed = self.getFeed(key)
        feed.updateFeed(self.configdir, expiryTime, proxy, imageCache)
        self.listOfFeeds[key]["unread"] = feed.getNumberOfUnreadItems()
        self.listOfFeeds[key]["updateTime"] = feed.getUpdateTime()

    def editFeed(self, key, title, url):
        self.listOfFeeds[key]["title"] = title
        self.listOfFeeds[key]["url"] = url
        feed = self.loadFeed(key)
        feed.editFeed(url)
        # Persist the new url; otherwise the pickled feed keeps the old one.
        feed.saveFeed(self.configdir)

    def getFeed(self, key):
        feed = self.loadFeed(key)
        feed.reloadUnread(self.configdir)
        return feed

    def getFeedUpdateTime(self, key):
        if "updateTime" not in self.listOfFeeds[key]:
            self.listOfFeeds[key]["updateTime"] = "Never"
        return self.listOfFeeds[key]["updateTime"]

    def getFeedNumberOfUnreadItems(self, key):
        if "unread" not in self.listOfFeeds[key]:
            self.listOfFeeds[key]["unread"] = 0
        return self.listOfFeeds[key]["unread"]

    def updateUnread(self, key, unreadItems):
        self.listOfFeeds[key]["unread"] = unreadItems

    def getFeedTitle(self, key):
        return self.listOfFeeds[key]["title"]

    def getFeedUrl(self, key):
        return self.listOfFeeds[key]["url"]

    def getListOfFeeds(self):
        return self.sortedKeys

    def addFeed(self, title, url):
        if getId(title) not in self.listOfFeeds:
            self.listOfFeeds[getId(title)] = {"title":title, "url":url, "unread":0, "updateTime":"Never"}
            self.sortedKeys.append(getId(title))
            self.saveConfig()
            return True
        else:
            return False

    def removeFeed(self, key):
        del self.listOfFeeds[key]
        self.sortedKeys.remove(key)
        if isdir(self.configdir+key+".d/"):
            rmtree(self.configdir+key+".d/")
        self.saveConfig()

    def saveConfig(self):
        self.listOfFeeds["feedingit-order"] = self.sortedKeys
        f = open(self.configdir+"feeds.pickle", "w")
        pickle.dump(self.listOfFeeds, f)
        f.close()
        f = open(self.configdir+"images.pickle", "w")
        pickle.dump(self.imageHandler, f)
        f.close()

    def moveUp(self, key):
        index = self.sortedKeys.index(key)
        # Note: when index is 0, index-1 is -1, so the top feed swaps with
        # the bottom one (it wraps around, like moveDown).
        self.sortedKeys[index] = self.sortedKeys[index-1]
        self.sortedKeys[index-1] = key

    def moveDown(self, key):
        index = self.sortedKeys.index(key)
        index2 = (index+1) % len(self.sortedKeys)
        self.sortedKeys[index] = self.sortedKeys[index2]
        self.sortedKeys[index2] = key

    def setCurrentlyDisplayedFeed(self, key):
        self.currentlyDisplayedFeed = key

    def closeCurrentlyDisplayedFeed(self):
        self.currentlyDisplayedFeed = False

    def getCurrentlyDisplayedFeed(self):
        return self.currentlyDisplayedFeed

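# Hypothetical usage sketch for Listing (mirrors the __main__ block below):
#   listing = Listing("/home/user/.feedingit/")
#   listing.addFeed("Slashdot", "http://rss.slashdot.org/Slashdot/slashdot")
#   listing.updateFeeds(expiryTime=24)
#   for key in listing.getListOfFeeds():
#       print listing.getFeedTitle(key), listing.getFeedNumberOfUnreadItems(key)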
if __name__ == "__main__":
    listing = Listing('/home/user/.feedingit/')
    keys = listing.getListOfFeeds()[:]
    for key in keys:
        if key.startswith('d8'):
            print listing.getFeedUpdateTime(key)