Fix a bug in isEntryRead for unknown IDs
[feedingit] src/rss.py
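
Before this fix, isEntryRead presumably indexed self.readItems directly, so
querying an id whose read status was never recorded (e.g. an entry already
purged from readItems) raised a KeyError. The method now uses dict.get() with
a default, so unknown ids are simply reported as unread. A minimal sketch of
the fixed method body:

    def isEntryRead(self, id):
        # before (assumed): return self.readItems[id]  -> KeyError on unknown id
        return self.readItems.get(id, False)
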
#!/usr/bin/env python2.5

# Copyright (c) 2007-2008 INdT.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#

# ============================================================================
# Name        : FeedingIt.py
# Author      : Yves Marcoz
# Version     : 0.5.4
# Description : Simple RSS Reader
# ============================================================================

from os.path import isfile, isdir
from shutil import rmtree
from os import mkdir, remove, utime
import pickle
import md5
import feedparser
import time
import urllib2
from BeautifulSoup import BeautifulSoup
from urlparse import urljoin

#CONFIGDIR="/home/user/.feedingit/"

def getId(string):
    return md5.new(string).hexdigest()

#def getProxy():
#    import gconf
#    if gconf.client_get_default().get_bool('/system/http_proxy/use_http_proxy'):
#        port = gconf.client_get_default().get_int('/system/http_proxy/port')
#        http = gconf.client_get_default().get_string('/system/http_proxy/host')
#        proxy = urllib2.ProxyHandler( {"http":"http://%s:%s/" % (http, port)} )
#        return (True, proxy)
#    return (False, None)

# Enable proxy support for images and ArchivedArticles
#(proxy_support, proxy) = getProxy()
#if proxy_support:
#    opener = urllib2.build_opener(proxy)
#    urllib2.install_opener(opener)

# Entry = {"title":XXX, "content":XXX, "date":XXX, "link":XXX, "images":[]}

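# ImageHandler is an empty stub; presumably kept so that Feed objects pickled
# by older versions (whose imageHandler attribute is deleted in
# Listing.loadFeed) can still be unpickled.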
class ImageHandler:
    def __init__(self, configdir):
        pass

class Feed:
    def __init__(self, uniqueId, name, url):
        self.titles = []
        self.entries = {}
        self.ids = []
        self.readItems = {}
        self.name = name
        self.url = url
        self.countUnread = 0
        self.updateTime = "Never"
        self.uniqueId = uniqueId
        self.etag = None
        self.modified = None

    def addImage(self, configdir, key, baseurl, url):
        # Download an image into the feed's cache directory, or "touch" an
        # already-cached copy so the expiry sweep in updateFeed keeps it
        filename = configdir+key+".d/"+getId(url)
        if not isfile(filename):
            try:
                f = urllib2.urlopen(urljoin(baseurl, url))
                outf = open(filename, "w")
                outf.write(f.read())
                f.close()
                outf.close()
            except:
                print "Could not download " + url
        else:
            utime(filename, None)  # "Touch" the file
        return filename

    def editFeed(self, url):
        self.url = url

    def saveFeed(self, configdir):
        if not isdir(configdir+self.uniqueId+".d"):
            mkdir(configdir+self.uniqueId+".d")
        f = open(configdir+self.uniqueId+".d/feed", "w")
        pickle.dump(self, f)
        f.close()
        self.saveUnread(configdir)

    def saveUnread(self, configdir):
        if not isdir(configdir+self.uniqueId+".d"):
            mkdir(configdir+self.uniqueId+".d")
        f = open(configdir+self.uniqueId+".d/unread", "w")
        pickle.dump(self.readItems, f)
        f.close()

    def reloadUnread(self, configdir):
        try:
            f = open(configdir+self.uniqueId+".d/unread", "r")
            self.readItems = pickle.load(f)
            f.close()
            self.countUnread = 0
            for id in self.getIds():
                if self.readItems[id] == False:
                    self.countUnread = self.countUnread + 1
        except:
            pass
        return self.countUnread

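    # updateFeed re-parses the feed (honouring etag/modified) and rebuilds the
    # entry dict. Purge rules, taking the default expiryTime of 24 hours as a
    # worked example: an entry missing from the fresh parse is dropped once it
    # is older than 48 hours (2*expiry), or older than 24 hours *and* already
    # read; otherwise it is carried over.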
    def updateFeed(self, configdir, expiryTime=24, proxy=None, imageCache=False):
        # Expiry time is in hours
        if proxy == None:
            tmp = feedparser.parse(self.url, etag=self.etag, modified=self.modified)
        else:
            tmp = feedparser.parse(self.url, etag=self.etag, modified=self.modified, handlers=[proxy])
        try:
            self.etag = tmp["etag"]
        except KeyError:
            pass
        try:
            self.modified = tmp["modified"]
        except KeyError:
            pass
        expiry = float(expiryTime) * 3600.
        # Check if the parse was successful (number of entries > 0); otherwise do nothing
        if len(tmp["entries"]) > 0:
            if not isdir(configdir+self.uniqueId+".d"):
                mkdir(configdir+self.uniqueId+".d")
            try:
                f = urllib2.urlopen(urljoin(tmp["feed"]["link"], "/favicon.ico"))
                data = f.read()
                f.close()
                outf = open(configdir+self.uniqueId+".d/favicon.ico", "w")
                outf.write(data)
                outf.close()
                del data
            except:
                #import traceback
                #traceback.print_exc()
                pass

            currentTime = time.time()
            tmpEntries = {}
            tmpIds = []
            for entry in tmp["entries"]:
                (dateTuple, date) = self.extractDate(entry)
                try:
                    entry["title"]
                except KeyError:
                    entry["title"] = "No Title"
                try:
                    entry["link"]
                except KeyError:
                    entry["link"] = ""
                tmpEntry = {"title":entry["title"], "content":self.extractContent(entry),
                            "date":date, "dateTuple":dateTuple, "link":entry["link"], "images":[]}
                id = self.generateUniqueId(tmpEntry)

                if not id in self.ids:
                    # New entry: render it to an HTML file, caching images
                    # locally if requested
                    soup = BeautifulSoup(self.getArticle(tmpEntry))
                    images = soup('img')
                    baseurl = tmpEntry["link"]
                    if imageCache:
                        for img in images:
                            try:
                                filename = self.addImage(configdir, self.uniqueId, baseurl, img['src'])
                                img['src'] = filename
                                tmpEntry["images"].append(filename)
                            except:
                                print "Error downloading image %s" % img
                    tmpEntry["contentLink"] = configdir+self.uniqueId+".d/"+id+".html"
                    f = open(tmpEntry["contentLink"], "w")
                    f.write(soup.prettify())
                    f.close()
                    tmpEntries[id] = tmpEntry
                    tmpIds.append(id)
                    if id not in self.readItems:
                        self.readItems[id] = False
                else:
                    # Known entry: keep it and "touch" its cached files so the
                    # expiry sweep below does not remove them
                    try:
                        filename = configdir+self.uniqueId+".d/"+id+".html"
                        f = open(filename, "a")
                        utime(filename, None)
                        f.close()
                        for image in self.entries[id]["images"]:
                            f = open(image, "a")
                            utime(image, None)
                            f.close()
                    except:
                        pass
                    tmpEntries[id] = self.entries[id]
                    tmpIds.append(id)

            # Carry over old entries that have not expired yet
            oldIds = self.ids[:]
            for entryId in oldIds:
                if not entryId in tmpIds:
                    try:
                        articleTime = time.mktime(self.entries[entryId]["dateTuple"])
                        if (currentTime - articleTime > 2*expiry):
                            self.removeEntry(entryId)
                            continue
                        if (currentTime - articleTime > expiry) and (self.isEntryRead(entryId)):
                            # Entry is older than the expiry time, and already read
                            self.removeEntry(entryId)
                            continue
                        tmpEntries[entryId] = self.entries[entryId]
                        tmpIds.append(entryId)
                    except:
                        print "Error purging old articles %s" % entryId
                        self.removeEntry(entryId)

            self.entries = tmpEntries
            self.ids = tmpIds

            # Recount unread entries, and drop read-state for entries that
            # no longer exist
            tmpUnread = 0
            for id in self.ids[:]:
                if not self.readItems.has_key(id):
                    self.readItems[id] = False
                if self.readItems[id] == False:
                    tmpUnread = tmpUnread + 1
            for id in self.readItems.keys():
                if not id in self.ids:
                    del self.readItems[id]
            del tmp
            self.countUnread = tmpUnread
            self.updateTime = time.asctime()
            self.saveFeed(configdir)

            # Remove cached files (articles, images, favicon) that have not
            # been touched for three expiry periods
            from glob import glob
            from os import stat
            for cachedFile in glob(configdir+self.uniqueId+".d/*"):
                lastmodDate = stat(cachedFile)[8]  # st_mtime
                expDate = time.time() - expiry*3
                if expDate > lastmodDate:
                    try:
                        remove(cachedFile)
                    except OSError:
                        print 'Could not remove', cachedFile

    def extractContent(self, entry):
        content = ""
        if entry.has_key('summary'):
            content = entry.get('summary', '')
        if entry.has_key('content'):
            if len(entry.content[0].value) > len(content):
                content = entry.content[0].value
        if content == "":
            content = entry.get('description', '')
        return content

    def extractDate(self, entry):
        if entry.has_key("updated_parsed"):
            dateTuple = entry["updated_parsed"]
            date = time.strftime("%a, %d %b %Y %H:%M:%S", entry["updated_parsed"])
        elif entry.has_key("published_parsed"):
            dateTuple = entry["published_parsed"]
            date = time.strftime("%a, %d %b %Y %H:%M:%S", entry["published_parsed"])
        else:
            dateTuple = ""
            date = ""
        return (dateTuple, date)

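    # Note: setEntryRead and setEntryUnread still index readItems directly and
    # raise KeyError for an unknown id; only isEntryRead below was changed to
    # treat unknown ids as unread.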
    def setEntryRead(self, id):
        if self.readItems[id] == False:
            self.countUnread = self.countUnread - 1
            self.readItems[id] = True

    def setEntryUnread(self, id):
        if self.readItems[id] == True:
            self.countUnread = self.countUnread + 1
            self.readItems[id] = False

    def isEntryRead(self, id):
        # Check if an entry is read; return False if the read
        # status of an entry is unknown (id not in readItems)
        return self.readItems.get(id, False)

    def getTitle(self, id):
        return self.entries[id]["title"]

    def getContentLink(self, id):
        if self.entries[id].has_key("contentLink"):
            return self.entries[id]["contentLink"]
        return self.entries[id]["link"]

    def getExternalLink(self, id):
        return self.entries[id]["link"]

    def getDate(self, id):
        return self.entries[id]["date"]

    def getDateTuple(self, id):
        return self.entries[id]["dateTuple"]

    def getUniqueId(self, index):
        return self.ids[index]

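    # An entry's id is the md5 of its date string plus its title, so an entry
    # whose title or date changes upstream is treated as a brand-new entry.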
    def generateUniqueId(self, entry):
        return getId(entry["date"] + entry["title"])

    def getUpdateTime(self):
        return self.updateTime

    def getEntries(self):
        return self.entries

    def getIds(self):
        return self.ids

    def getNextId(self, id):
        return self.ids[(self.ids.index(id)+1) % self.getNumberOfEntries()]

    def getPreviousId(self, id):
        return self.ids[(self.ids.index(id)-1) % self.getNumberOfEntries()]

    def getNumberOfUnreadItems(self):
        return self.countUnread

    def getNumberOfEntries(self):
        return len(self.ids)

    def getItem(self, id):
        try:
            return self.entries[id]
        except KeyError:
            return []

    def getContent(self, id):
        # Return the article body, preferring the cached HTML file if present
        if self.entries[id].has_key("contentLink"):
            f = open(self.entries[id]["contentLink"])
            content = f.read()
            f.close()
            return content
        return self.entries[id]["content"]

    def removeEntry(self, id):
        if self.entries.has_key(id):
            entry = self.entries[id]
            if entry.has_key("contentLink"):
                try:
                    remove(entry["contentLink"])
                except OSError:
                    print "File not found for deletion: %s" % entry["contentLink"]
            del self.entries[id]
        else:
            print "Entries has no %s key" % id
        if id in self.ids:
            self.ids.remove(id)
        else:
            print "Ids has no %s key" % id
        if self.readItems.has_key(id):
            if self.readItems[id] == False:
                self.countUnread = self.countUnread - 1
            del self.readItems[id]
        else:
            print "ReadItems has no %s key" % id

    def getArticle(self, entry):
        # Wrap an entry in a minimal XHTML page
        title = entry['title']
        content = entry["content"]
        link = entry['link']
        date = entry["date"]

        text = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">'
        text += "<html><head><title>" + title + "</title>"
        text += '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>\n'
        text += '</head><body><div><a href="' + link + '">' + title + "</a>"
        text += "<BR /><small><i>Date: " + date + "</i></small></div>"
        text += "<BR /><BR />"
        text += content
        text += "</body></html>"
        return text

class ArchivedArticles(Feed):
    def addArchivedArticle(self, title, link, updated_parsed, configdir):
        entry = {}
        entry["title"] = title
        entry["link"] = link
        entry["summary"] = '<a href="' + link + '">' + title + "</a>"
        entry["updated_parsed"] = updated_parsed
        entry["time"] = time.time()
        (dateTuple, date) = self.extractDate(entry)
        tmpEntry = {"title":entry["title"], "content":self.extractContent(entry),
                    "date":date, "dateTuple":dateTuple, "link":entry["link"], "images":[],
                    "downloaded":False, "time":entry["time"]}
        id = self.generateUniqueId(tmpEntry)
        self.entries[id] = tmpEntry
        self.ids.append(id)
        self.readItems[id] = False
        self.countUnread = self.countUnread + 1
        self.saveFeed(configdir)
        self.saveUnread(configdir)

    def updateFeed(self, configdir, expiryTime=24, proxy=None, imageCache=False):
        # Download the full article for any archived entry not fetched yet
        for id in self.getIds():
            entry = self.entries[id]
            if not entry["downloaded"]:
                f = urllib2.urlopen(entry["link"])
                html = f.read()
                f.close()
                soup = BeautifulSoup(html)
                images = soup('img')
                baseurl = entry["link"]
                for img in images:
                    filename = self.addImage(configdir, self.uniqueId, baseurl, img['src'])
                    img['src'] = filename
                entry["contentLink"] = configdir+self.uniqueId+".d/"+id+".html"
                f = open(entry["contentLink"], "w")
                f.write(soup.prettify())
                f.close()
                if len(entry["content"]) > 0:
                    entry["downloaded"] = True
                    entry["time"] = time.time()
                    self.setEntryUnread(id)
            # Expiry of archived articles is currently disabled:
            #currentTime = time.time()
            #expiry = float(expiryTime) * 3600
            #if currentTime - entry["time"] > expiry:
            #    if self.isEntryRead(id):
            #        self.removeEntry(id)
            #    else:
            #        if currentTime - entry["time"] > 2*expiry:
            #            self.removeEntry(id)
        self.updateTime = time.asctime()
        self.saveFeed(configdir)

    def purgeReadArticles(self):
        # Iterate over a copy of the id list, since removeEntry mutates it
        ids = self.getIds()[:]
        for id in ids:
            if self.isEntryRead(id):
                self.removeEntry(id)

    def removeArticle(self, id):
        self.removeEntry(id)

    def getArticle(self, id):
        self.setEntryRead(id)
        content = self.getContent(id)
        return content

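# On-disk layout, as used by the code below: configdir/feeds.pickle holds the
# dict of feed metadata, and each feed gets a configdir/<uniqueId>.d/ directory
# containing its pickled "feed" object, its "unread" state, a cached
# favicon.ico, and one <entryId>.html file per rendered article.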
class Listing:
    # Lists all the feeds in a dictionary, and exposes the data
    def __init__(self, configdir):
        self.configdir = configdir
        if isfile(self.configdir+"feeds.pickle"):
            f = open(self.configdir+"feeds.pickle")
            self.listOfFeeds = pickle.load(f)
            f.close()
        else:
            self.listOfFeeds = {getId("Maemo News"):{"title":"Maemo News", "url":"http://maemo.org/news/items.xml", "unread":0, "updateTime":"Never"}, }
        if self.listOfFeeds.has_key("font"):
            del self.listOfFeeds["font"]
        if self.listOfFeeds.has_key("feedingit-order"):
            self.sortedKeys = self.listOfFeeds["feedingit-order"]
        else:
            self.sortedKeys = self.listOfFeeds.keys()
            if "font" in self.sortedKeys:
                self.sortedKeys.remove("font")
            self.sortedKeys.sort(key=lambda obj: self.getFeedTitle(obj))

    def addArchivedArticle(self, key, index):
        feed = self.getFeed(key)
        title = feed.getTitle(index)
        link = feed.getExternalLink(index)
        date = feed.getDateTuple(index)
        if not self.listOfFeeds.has_key("ArchivedArticles"):
            self.listOfFeeds["ArchivedArticles"] = {"title":"Archived Articles", "url":"", "unread":0, "updateTime":"Never"}
            self.sortedKeys.append("ArchivedArticles")
            self.saveConfig()
        archFeed = self.getFeed("ArchivedArticles")
        archFeed.addArchivedArticle(title, link, date, self.configdir)
        # Update the unread count of the archive feed, not of the source feed
        self.listOfFeeds["ArchivedArticles"]["unread"] = archFeed.getNumberOfUnreadItems()

    def loadFeed(self, key):
        if isfile(self.configdir+key+".d/feed"):
            f = open(self.configdir+key+".d/feed")
            feed = pickle.load(f)
            f.close()
            # Fill in attributes that may be missing from feeds pickled by
            # older versions
            try:
                feed.uniqueId
            except AttributeError:
                feed.uniqueId = getId(feed.name)
            try:
                del feed.imageHandler
            except AttributeError:
                pass
            try:
                feed.etag
            except AttributeError:
                feed.etag = None
            try:
                feed.modified
            except AttributeError:
                feed.modified = None
        else:
            title = self.listOfFeeds[key]["title"]
            url = self.listOfFeeds[key]["url"]
            if key == "ArchivedArticles":
                feed = ArchivedArticles("ArchivedArticles", title, url)
            else:
                feed = Feed(getId(title), title, url)
        return feed

    def updateFeeds(self, expiryTime=24, proxy=None, imageCache=False):
        for key in self.getListOfFeeds():
            feed = self.loadFeed(key)
            feed.updateFeed(self.configdir, expiryTime, proxy, imageCache)
            self.listOfFeeds[key]["unread"] = feed.getNumberOfUnreadItems()
            self.listOfFeeds[key]["updateTime"] = feed.getUpdateTime()

    def updateFeed(self, key, expiryTime=24, proxy=None, imageCache=False):
        feed = self.getFeed(key)
        feed.updateFeed(self.configdir, expiryTime, proxy, imageCache)
        self.listOfFeeds[key]["unread"] = feed.getNumberOfUnreadItems()
        self.listOfFeeds[key]["updateTime"] = feed.getUpdateTime()

    def editFeed(self, key, title, url):
        self.listOfFeeds[key]["title"] = title
        self.listOfFeeds[key]["url"] = url
        feed = self.loadFeed(key)
        feed.editFeed(url)

    def getFeed(self, key):
        try:
            feed = self.loadFeed(key)
            feed.reloadUnread(self.configdir)
        except:
            # If the feed file gets corrupted, we need to reset the feed.
            import traceback
            traceback.print_exc()
            import dbus
            bus = dbus.SessionBus()
            remote_object = bus.get_object("org.freedesktop.Notifications", # Connection name
                                           "/org/freedesktop/Notifications" # Object's path
                                          )
            iface = dbus.Interface(remote_object, 'org.freedesktop.Notifications')
            iface.SystemNoteInfoprint("Error opening feed %s, it has been reset." % self.getFeedTitle(key))
            if isdir(self.configdir+key+".d/"):
                rmtree(self.configdir+key+".d/")
            feed = self.loadFeed(key)
        return feed

    def getFeedUpdateTime(self, key):
        if not self.listOfFeeds[key].has_key("updateTime"):
            self.listOfFeeds[key]["updateTime"] = "Never"
        return self.listOfFeeds[key]["updateTime"]

    def getFeedNumberOfUnreadItems(self, key):
        if not self.listOfFeeds[key].has_key("unread"):
            self.listOfFeeds[key]["unread"] = 0
        return self.listOfFeeds[key]["unread"]

    def updateUnread(self, key, unreadItems):
        self.listOfFeeds[key]["unread"] = unreadItems

    def getFeedTitle(self, key):
        return self.listOfFeeds[key]["title"]

    def getFeedUrl(self, key):
        return self.listOfFeeds[key]["url"]

    def getListOfFeeds(self):
        return self.sortedKeys

    def getFavicon(self, key):
        filename = self.configdir+key+".d/favicon.ico"
        if isfile(filename):
            return filename
        else:
            return False

    def addFeed(self, title, url):
        if not self.listOfFeeds.has_key(getId(title)):
            self.listOfFeeds[getId(title)] = {"title":title, "url":url, "unread":0, "updateTime":"Never"}
            self.sortedKeys.append(getId(title))
            self.saveConfig()
            return True
        else:
            return False

    def removeFeed(self, key):
        del self.listOfFeeds[key]
        self.sortedKeys.remove(key)
        if isdir(self.configdir+key+".d/"):
            rmtree(self.configdir+key+".d/")
        self.saveConfig()

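    # The display order is persisted inside feeds.pickle itself, under the
    # sentinel key "feedingit-order"; iteration goes through sortedKeys, so the
    # sentinel never appears as a feed. "font", deleted in __init__, looks like
    # a leftover sentinel from an older version.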
    def saveConfig(self):
        self.listOfFeeds["feedingit-order"] = self.sortedKeys
        f = open(self.configdir+"feeds.pickle", "w")
        pickle.dump(self.listOfFeeds, f)
        f.close()

    def moveUp(self, key):
        index = self.sortedKeys.index(key)
        self.sortedKeys[index] = self.sortedKeys[index-1]
        self.sortedKeys[index-1] = key

    def moveDown(self, key):
        index = self.sortedKeys.index(key)
        index2 = (index+1) % len(self.sortedKeys)
        self.sortedKeys[index] = self.sortedKeys[index2]
        self.sortedKeys[index2] = key

if __name__ == "__main__":
    listing = Listing('/home/user/.feedingit/')
    keys = listing.getListOfFeeds()[:]
    for key in keys:
        if key.startswith('d8'):
            print listing.getFeedUpdateTime(key)
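
# Usage sketch (hypothetical path and feed, for illustration only): a client
# such as the FeedingIt UI would drive this module roughly as follows:
#
#     listing = Listing('/home/user/.feedingit/')
#     listing.addFeed("Example", "http://example.com/rss.xml")
#     listing.updateFeeds(expiryTime=24)
#     for key in listing.getListOfFeeds():
#         print listing.getFeedTitle(key), listing.getFeedNumberOfUnreadItems(key)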