Fixed etag value of None if a feed did not contain updates, plus parsing for gzipped...
[feedingit] / src / rss.py
#!/usr/bin/env python2.5


# Copyright (c) 2007-2008 INdT.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU Lesser General Public License for more details.
#
#  You should have received a copy of the GNU Lesser General Public License
#  along with this program.  If not, see <http://www.gnu.org/licenses/>.
#

# ============================================================================
# Name        : FeedingIt.py
# Author      : Yves Marcoz
# Version     : 0.5.4
# Description : Simple RSS Reader
# ============================================================================

from os.path import isfile, isdir
from shutil import rmtree
from os import mkdir, remove, utime
import pickle
import md5
import feedparser
import time
import urllib2
from BeautifulSoup import BeautifulSoup
from urlparse import urljoin

#CONFIGDIR="/home/user/.feedingit/"

def getId(string):
    return md5.new(string).hexdigest()

#def getProxy():
#    import gconf
#    if gconf.client_get_default().get_bool('/system/http_proxy/use_http_proxy'):
#        port = gconf.client_get_default().get_int('/system/http_proxy/port')
#        http = gconf.client_get_default().get_string('/system/http_proxy/host')
#        proxy = urllib2.ProxyHandler( {"http":"http://%s:%s/"% (http,port)} )
#        return (True, proxy)
#    return (False, None)

# Enable proxy support for images and ArchivedArticles
#(proxy_support, proxy) = getProxy()
#if proxy_support:
#    opener = urllib2.build_opener(proxy)
#    urllib2.install_opener(opener)

# Entry = {"title":XXX, "content":XXX, "date":XXX, "link":XXX, "images":[] }

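# ImageHandler appears to be a legacy stub; Listing.loadFeed() below still
# deletes a stale "imageHandler" attribute from feeds pickled by older versions.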
class ImageHandler:
    def __init__(self, configdir):
        pass

class Feed:
    def __init__(self, uniqueId, name, url):
        self.titles = []
        self.entries = {}
        self.ids = []
        self.readItems = {}
        self.name = name
        self.url = url
        self.countUnread = 0
        self.updateTime = "Never"
        self.uniqueId = uniqueId
        self.etag = None
        self.modified = None

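    # Download an image into the feed's cache directory (configdir/key.d/),
    # named after the MD5 hash of its URL; if it is already cached, just
    # update its modification time so the expiry sweep keeps it.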
    def addImage(self, configdir, key, baseurl, url):
        filename = configdir+key+".d/"+getId(url)
        if not isfile(filename):
            try:
                #if url.startswith("http"):
                #    f = urllib2.urlopen(url)
                #else:
                f = urllib2.urlopen(urljoin(baseurl,url))
                outf = open(filename, "w")
                outf.write(f.read())
                f.close()
                outf.close()
            except:
                print "Could not download " + url
        else:
            # "Touch" the file so the expiry sweep sees it as fresh
            file = open(filename, "a")
            utime(filename, None)
            file.close()
        return filename

    def editFeed(self, url):
        self.url = url

    def saveFeed(self, configdir):
        if not isdir(configdir+self.uniqueId+".d"):
            mkdir(configdir+self.uniqueId+".d")
        file = open(configdir+self.uniqueId+".d/feed", "w")
        pickle.dump(self, file)
        file.close()
        self.saveUnread(configdir)

    def saveUnread(self, configdir):
        if not isdir(configdir+self.uniqueId+".d"):
            mkdir(configdir+self.uniqueId+".d")
        file = open(configdir+self.uniqueId+".d/unread", "w")
        pickle.dump(self.readItems, file)
        file.close()

    def reloadUnread(self, configdir):
        try:
            file = open(configdir+self.uniqueId+".d/unread", "r")
            self.readItems = pickle.load(file)
            file.close()
            self.countUnread = 0
            for id in self.getIds():
                if self.readItems[id]==False:
                    self.countUnread = self.countUnread + 1
        except:
            pass
        return self.countUnread

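    # Fetch the feed and merge its entries with the cached ones. The stored
    # etag/modified values make the fetch a conditional GET, and they are only
    # refreshed when the parse actually returned entries. Entries older than
    # expiryTime hours (read) or twice that (unread) are purged.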
    def updateFeed(self, configdir, expiryTime=24, proxy=None, imageCache=False):
        # Expiry time is in hours
        if proxy == None:
            tmp = feedparser.parse(self.url, etag=self.etag, modified=self.modified)
        else:
            tmp = feedparser.parse(self.url, etag=self.etag, modified=self.modified, handlers=[proxy])
        expiry = float(expiryTime) * 3600.

        # Check if the parse was successful (number of entries > 0, else do nothing)
        if len(tmp["entries"])>0:
            # The etag and modified values should only be updated if the content was not null
            try:
                self.etag = tmp["etag"]
            except KeyError:
                self.etag = None
            try:
                self.modified = tmp["modified"]
            except KeyError:
                self.modified = None
            if not isdir(configdir+self.uniqueId+".d"):
                mkdir(configdir+self.uniqueId+".d")
            # Try to cache the site's favicon next to the feed data
            try:
                f = urllib2.urlopen(urljoin(tmp["feed"]["link"], "/favicon.ico"))
                data = f.read()
                f.close()
                outf = open(configdir+self.uniqueId+".d/favicon.ico", "w")
                outf.write(data)
                outf.close()
                del data
            except:
                #import traceback
                #traceback.print_exc()
                pass

            currentTime = time.time()
            tmpEntries = {}
            tmpIds = []
            for entry in tmp["entries"]:
                (dateTuple, date) = self.extractDate(entry)
                try:
                    entry["title"]
                except:
                    entry["title"] = "No Title"
                try:
                    entry["link"]
                except:
                    entry["link"] = ""
                tmpEntry = {"title":entry["title"], "content":self.extractContent(entry),
                            "date":date, "dateTuple":dateTuple, "link":entry["link"], "images":[] }
                id = self.generateUniqueId(tmpEntry)

                if not id in self.ids:
                    # New entry: render it to an HTML file, caching images if requested
                    soup = BeautifulSoup(self.getArticle(tmpEntry))
                    images = soup('img')
                    baseurl = tmpEntry["link"]
                    if imageCache:
                        for img in images:
                            try:
                                filename = self.addImage(configdir, self.uniqueId, baseurl, img['src'])
                                img['src'] = filename
                                tmpEntry["images"].append(filename)
                            except:
                                print "Error downloading image %s" % img
                    tmpEntry["contentLink"] = configdir+self.uniqueId+".d/"+id+".html"
                    file = open(tmpEntry["contentLink"], "w")
                    file.write(soup.prettify())
                    file.close()
                    tmpEntries[id] = tmpEntry
                    tmpIds.append(id)
                    if id not in self.readItems:
                        self.readItems[id] = False
                else:
                    # Known entry: "touch" its HTML file and cached images so the
                    # expiry sweep below keeps them
                    try:
                        filename = configdir+self.uniqueId+".d/"+id+".html"
                        file = open(filename, "a")
                        utime(filename, None)
                        file.close()
                        for image in self.entries[id]["images"]:
                            file = open(image, "a")
                            utime(image, None)
                            file.close()
                    except:
                        pass
                    tmpEntries[id] = self.entries[id]
                    tmpIds.append(id)

            # Carry over entries that dropped out of the feed, unless they have expired
            oldIds = self.ids[:]
            for entryId in oldIds:
                if not entryId in tmpIds:
                    try:
                        articleTime = time.mktime(self.entries[entryId]["dateTuple"])
                        if (currentTime - articleTime > 2*expiry):
                            self.removeEntry(entryId)
                            continue
                        if (currentTime - articleTime > expiry) and (self.isEntryRead(entryId)):
                            # Entry is older than the expiry time and has already been read
                            self.removeEntry(entryId)
                            continue
                        tmpEntries[entryId] = self.entries[entryId]
                        tmpIds.append(entryId)
                    except:
                        print "Error purging old articles %s" % entryId
                        self.removeEntry(entryId)

            self.entries = tmpEntries
            self.ids = tmpIds
            tmpUnread = 0

            # Recount unread items and drop read-state entries that no longer exist
            ids = self.ids[:]
            for id in ids:
                if not self.readItems.has_key(id):
                    self.readItems[id] = False
                if self.readItems[id]==False:
                    tmpUnread = tmpUnread + 1
            keys = self.readItems.keys()
            for id in keys:
                if not id in self.ids:
                    del self.readItems[id]
            del tmp
            self.countUnread = tmpUnread
            self.updateTime = time.asctime()
            self.saveFeed(configdir)
            # Remove cached files whose last-modified time is older than three
            # times the expiry period
            from glob import glob
            from os import stat
            for file in glob(configdir+self.uniqueId+".d/*"):
                stats = stat(file)
                lastmodDate = stats[8]
                expDate = time.time()-expiry*3
                if expDate > lastmodDate:
                    try:
                        remove(file)
                    except OSError:
                        print 'Could not remove', file

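    # Pick the richest text available for an entry: the longer of "summary"
    # and "content", falling back to "description" when both are empty.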
    def extractContent(self, entry):
        content = ""
        if entry.has_key('summary'):
            content = entry.get('summary', '')
        if entry.has_key('content'):
            if len(entry.content[0].value) > len(content):
                content = entry.content[0].value
        if content == "":
            content = entry.get('description', '')
        return content

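    # Return (time tuple, formatted string) for an entry's timestamp,
    # preferring the updated date over the published date.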
    def extractDate(self, entry):
        if entry.has_key("updated_parsed"):
            date1 = entry["updated_parsed"]
            date = time.strftime("%a, %d %b %Y %H:%M:%S", entry["updated_parsed"])
        elif entry.has_key("published_parsed"):
            date1 = entry["published_parsed"]
            date = time.strftime("%a, %d %b %Y %H:%M:%S", entry["published_parsed"])
        else:
            date1 = ""
            date = ""
        return (date1, date)

    def setEntryRead(self, id):
        if self.readItems[id]==False:
            self.countUnread = self.countUnread - 1
            self.readItems[id] = True

    def setEntryUnread(self, id):
        if self.readItems[id]==True:
            self.countUnread = self.countUnread + 1
            self.readItems[id] = False

    def isEntryRead(self, id):
        # Check if an entry is read; return False if the read
        # status of an entry is unknown (id not in readItems)
        return self.readItems.get(id, False)

    def getTitle(self, id):
        return self.entries[id]["title"]

    def getContentLink(self, id):
        if self.entries[id].has_key("contentLink"):
            return self.entries[id]["contentLink"]
        return self.entries[id]["link"]

    def getExternalLink(self, id):
        return self.entries[id]["link"]

    def getDate(self, id):
        return self.entries[id]["date"]

    def getDateTuple(self, id):
        return self.entries[id]["dateTuple"]

    def getUniqueId(self, index):
        return self.ids[index]

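    # An entry's id is the MD5 hash of its date string and title, so the
    # same article keeps the same id across feed updates.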
    def generateUniqueId(self, entry):
        return getId(entry["date"] + entry["title"])

    def getUpdateTime(self):
        return self.updateTime

    def getEntries(self):
        return self.entries

    def getIds(self):
        return self.ids

    def getNextId(self, id):
        return self.ids[(self.ids.index(id)+1) % self.getNumberOfEntries()]

    def getPreviousId(self, id):
        return self.ids[(self.ids.index(id)-1) % self.getNumberOfEntries()]

    def getNumberOfUnreadItems(self):
        return self.countUnread

    def getNumberOfEntries(self):
        return len(self.ids)

    def getItem(self, id):
        try:
            return self.entries[id]
        except:
            return []

    def getContent(self, id):
        if self.entries[id].has_key("contentLink"):
            file = open(self.entries[id]["contentLink"])
            content = file.read()
            file.close()
            return content
        return self.entries[id]["content"]

    def removeEntry(self, id):
        # Delete an entry's cached HTML and drop it from entries, ids and readItems
        if self.entries.has_key(id):
            entry = self.entries[id]
            if entry.has_key("contentLink"):
                try:
                    remove(entry["contentLink"])
                except:
                    print "File not found for deletion: %s" % entry["contentLink"]
            del self.entries[id]
        else:
            print "Entries has no %s key" % id
        if id in self.ids:
            self.ids.remove(id)
        else:
            print "Ids has no %s key" % id
        if self.readItems.has_key(id):
            if self.readItems[id]==False:
                self.countUnread = self.countUnread - 1
            del self.readItems[id]
        else:
            print "ReadItems has no %s key" % id

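    # Render an entry as a standalone XHTML page: a header with the linked
    # title and date, followed by the entry content.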
    def getArticle(self, entry):
        title = entry['title']
        content = entry["content"]
        link = entry['link']
        date = entry["date"]

        text = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">'
        text += "<html><head><title>" + title + "</title>"
        text += '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>\n'
        #text += '<style> body {-webkit-user-select: none;} </style>'
        text += '</head><body><div><a href=\"' + link + '\">' + title + "</a>"
        text += "<BR /><small><i>Date: " + date + "</i></small></div>"
        text += "<BR /><BR />"
        text += content
        text += "</body></html>"
        return text

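# Pseudo-feed backing the "Archived Articles" list: articles are added one at
# a time, and updateFeed() downloads their full pages (with images) instead of
# polling an RSS url.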
class ArchivedArticles(Feed):
    def addArchivedArticle(self, title, link, updated_parsed, configdir):
        entry = {}
        entry["title"] = title
        entry["link"] = link
        entry["summary"] = '<a href=\"' + link + '\">' + title + "</a>"
        entry["updated_parsed"] = updated_parsed
        entry["time"] = time.time()
        (dateTuple, date) = self.extractDate(entry)
        tmpEntry = {"title":entry["title"], "content":self.extractContent(entry),
                    "date":date, "dateTuple":dateTuple, "link":entry["link"], "images":[], "downloaded":False, "time":entry["time"] }
        id = self.generateUniqueId(tmpEntry)
        self.entries[id] = tmpEntry
        self.ids.append(id)
        self.readItems[id] = False
        self.countUnread = self.countUnread + 1
        self.saveFeed(configdir)
        self.saveUnread(configdir)

    def updateFeed(self, configdir, expiryTime=24, proxy=None, imageCache=False):
        # Download the full page of every article that has not been fetched yet
        for id in self.getIds():
            entry = self.entries[id]
            if not entry["downloaded"]:
                f = urllib2.urlopen(entry["link"])
                html = f.read()
                f.close()
                soup = BeautifulSoup(html)
                images = soup('img')
                baseurl = entry["link"]
                for img in images:
                    filename = self.addImage(configdir, self.uniqueId, baseurl, img['src'])
                    img['src'] = filename
                entry["contentLink"] = configdir+self.uniqueId+".d/"+id+".html"
                file = open(entry["contentLink"], "w")
                file.write(soup.prettify())
                file.close()
                if len(entry["content"]) > 0:
                    entry["downloaded"] = True
                    entry["time"] = time.time()
                    self.setEntryUnread(id)
            #currentTime = time.time()
            #expiry = float(expiryTime) * 3600
            #if currentTime - entry["time"] > expiry:
            #    if self.isEntryRead(id):
            #        self.removeEntry(id)
            #    else:
            #        if currentTime - entry["time"] > 2*expiry:
            #            self.removeEntry(id)
        self.updateTime = time.asctime()
        self.saveFeed(configdir)

    def purgeReadArticles(self):
        # Iterate over a copy, since removeEntry mutates self.ids
        ids = self.getIds()[:]
        for id in ids:
            if self.isEntryRead(id):
                self.removeEntry(id)

    def removeArticle(self, id):
        self.removeEntry(id)

    def getArticle(self, index):
        self.setEntryRead(index)
        content = self.getContent(index)
        return content


class Listing:
    # Lists all the feeds in a dictionary, and exposes the data
    def __init__(self, configdir):
        self.configdir = configdir
        if isfile(self.configdir+"feeds.pickle"):
            file = open(self.configdir+"feeds.pickle")
            self.listOfFeeds = pickle.load(file)
            file.close()
        else:
            self.listOfFeeds = {getId("Maemo News"):{"title":"Maemo News", "url":"http://maemo.org/news/items.xml", "unread":0, "updateTime":"Never"}, }
        # Drop legacy keys stored alongside the feeds, and restore the saved order
        if self.listOfFeeds.has_key("font"):
            del self.listOfFeeds["font"]
        if self.listOfFeeds.has_key("feedingit-order"):
            self.sortedKeys = self.listOfFeeds["feedingit-order"]
        else:
            self.sortedKeys = self.listOfFeeds.keys()
            if "font" in self.sortedKeys:
                self.sortedKeys.remove("font")
            self.sortedKeys.sort(key=lambda obj: self.getFeedTitle(obj))

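    # Copy an article from a regular feed into the "ArchivedArticles"
    # pseudo-feed, creating that feed on first use.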
    def addArchivedArticle(self, key, index):
        feed = self.getFeed(key)
        title = feed.getTitle(index)
        link = feed.getExternalLink(index)
        date = feed.getDateTuple(index)
        if not self.listOfFeeds.has_key("ArchivedArticles"):
            self.listOfFeeds["ArchivedArticles"] = {"title":"Archived Articles", "url":"", "unread":0, "updateTime":"Never"}
            self.sortedKeys.append("ArchivedArticles")
            self.saveConfig()
        archFeed = self.getFeed("ArchivedArticles")
        archFeed.addArchivedArticle(title, link, date, self.configdir)
        self.listOfFeeds["ArchivedArticles"]["unread"] = archFeed.getNumberOfUnreadItems()

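    # Unpickle a feed from disk, backfilling attributes (uniqueId, etag,
    # modified) that feeds saved by older versions may lack; if no file
    # exists yet, build a fresh Feed or ArchivedArticles object.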
    def loadFeed(self, key):
        if isfile(self.configdir+key+".d/feed"):
            file = open(self.configdir+key+".d/feed")
            feed = pickle.load(file)
            file.close()
            try:
                feed.uniqueId
            except AttributeError:
                feed.uniqueId = getId(feed.name)
            try:
                del feed.imageHandler
            except:
                pass
            try:
                feed.etag
            except AttributeError:
                feed.etag = None
            try:
                feed.modified
            except AttributeError:
                feed.modified = None
        else:
            title = self.listOfFeeds[key]["title"]
            url = self.listOfFeeds[key]["url"]
            if key == "ArchivedArticles":
                feed = ArchivedArticles("ArchivedArticles", title, url)
            else:
                feed = Feed(getId(title), title, url)
        return feed

    def updateFeeds(self, expiryTime=24, proxy=None, imageCache=False):
        for key in self.getListOfFeeds():
            feed = self.loadFeed(key)
            feed.updateFeed(self.configdir, expiryTime, proxy, imageCache)
            self.listOfFeeds[key]["unread"] = feed.getNumberOfUnreadItems()
            self.listOfFeeds[key]["updateTime"] = feed.getUpdateTime()

    def updateFeed(self, key, expiryTime=24, proxy=None, imageCache=False):
        feed = self.getFeed(key)
        feed.updateFeed(self.configdir, expiryTime, proxy, imageCache)
        self.listOfFeeds[key]["unread"] = feed.getNumberOfUnreadItems()
        self.listOfFeeds[key]["updateTime"] = feed.getUpdateTime()

    def editFeed(self, key, title, url):
        self.listOfFeeds[key]["title"] = title
        self.listOfFeeds[key]["url"] = url
        feed = self.loadFeed(key)
        feed.editFeed(url)

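    # Load a feed and its unread state. If the pickle is corrupted, notify
    # the user via D-Bus, wipe the feed's directory and reload it from scratch.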
    def getFeed(self, key):
        try:
            feed = self.loadFeed(key)
            feed.reloadUnread(self.configdir)
        except:
            # If the feed file gets corrupted, we need to reset the feed.
            import traceback
            traceback.print_exc()
            import dbus
            bus = dbus.SessionBus()
            remote_object = bus.get_object("org.freedesktop.Notifications", # Connection name
                               "/org/freedesktop/Notifications" # Object's path
                              )
            iface = dbus.Interface(remote_object, 'org.freedesktop.Notifications')
            iface.SystemNoteInfoprint("Error opening feed %s, it has been reset." % self.getFeedTitle(key))
            if isdir(self.configdir+key+".d/"):
                rmtree(self.configdir+key+".d/")
            feed = self.loadFeed(key)
        return feed

    def getFeedUpdateTime(self, key):
        if not self.listOfFeeds[key].has_key("updateTime"):
            self.listOfFeeds[key]["updateTime"] = "Never"
        return self.listOfFeeds[key]["updateTime"]

    def getFeedNumberOfUnreadItems(self, key):
        if not self.listOfFeeds[key].has_key("unread"):
            self.listOfFeeds[key]["unread"] = 0
        return self.listOfFeeds[key]["unread"]

    def updateUnread(self, key, unreadItems):
        self.listOfFeeds[key]["unread"] = unreadItems

    def getFeedTitle(self, key):
        return self.listOfFeeds[key]["title"]

    def getFeedUrl(self, key):
        return self.listOfFeeds[key]["url"]

    def getListOfFeeds(self):
        return self.sortedKeys

    def getFavicon(self, key):
        filename = self.configdir+key+".d/favicon.ico"
        if isfile(filename):
            return filename
        else:
            return False

    def addFeed(self, title, url):
        # Feeds are keyed by the MD5 hash of their title; adding a feed whose
        # title is already present fails and returns False
        if not self.listOfFeeds.has_key(getId(title)):
            self.listOfFeeds[getId(title)] = {"title":title, "url":url, "unread":0, "updateTime":"Never"}
            self.sortedKeys.append(getId(title))
            self.saveConfig()
            return True
        else:
            return False

    def removeFeed(self, key):
        del self.listOfFeeds[key]
        self.sortedKeys.remove(key)
        if isdir(self.configdir+key+".d/"):
            rmtree(self.configdir+key+".d/")
        self.saveConfig()

    def saveConfig(self):
        # The display order is stored in the same pickle, under the
        # "feedingit-order" key
        self.listOfFeeds["feedingit-order"] = self.sortedKeys
        file = open(self.configdir+"feeds.pickle", "w")
        pickle.dump(self.listOfFeeds, file)
        file.close()

    def moveUp(self, key):
        # Swap the feed with its predecessor (wraps to the end at index 0)
        index = self.sortedKeys.index(key)
        self.sortedKeys[index] = self.sortedKeys[index-1]
        self.sortedKeys[index-1] = key

    def moveDown(self, key):
        # Swap the feed with its successor (wraps to the start at the end)
        index = self.sortedKeys.index(key)
        index2 = (index+1)%len(self.sortedKeys)
        self.sortedKeys[index] = self.sortedKeys[index2]
        self.sortedKeys[index2] = key

if __name__ == "__main__":
    listing = Listing('/home/user/.feedingit/')
    list = listing.getListOfFeeds()[:]
    #list.reverse()
    for key in list:
        if key.startswith('d8'):
            print listing.getFeedUpdateTime(key)