Added THP's patches (#5841) and Nelson's patch (#5782)
[feedingit] / src / rss.py
#!/usr/bin/env python2.5


# Copyright (c) 2007-2008 INdT.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU Lesser General Public License for more details.
#
#  You should have received a copy of the GNU Lesser General Public License
#  along with this program.  If not, see <http://www.gnu.org/licenses/>.
#

# ============================================================================
# Name        : FeedingIt.py
# Author      : Yves Marcoz
# Version     : 0.5.4
# Description : Simple RSS Reader
# ============================================================================

from os.path import isfile, isdir
from shutil import rmtree
from os import mkdir, remove, utime
import pickle
import md5
import feedparser
import time
import urllib2
from BeautifulSoup import BeautifulSoup
from urlparse import urljoin

#CONFIGDIR="/home/user/.feedingit/"

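# Helper: stable identifier from a string (md5 hex digest); used to name
# feed directories, entry files and cached images throughout this module.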
def getId(string):
    return md5.new(string).hexdigest()

#def getProxy():
#    import gconf
#    if gconf.client_get_default().get_bool('/system/http_proxy/use_http_proxy'):
#        port = gconf.client_get_default().get_int('/system/http_proxy/port')
#        http = gconf.client_get_default().get_string('/system/http_proxy/host')
#        proxy = urllib2.ProxyHandler( {"http":"http://%s:%s/"% (http,port)} )
#        return (True, proxy)
#    return (False, None)

# Enable proxy support for images and ArchivedArticles
#(proxy_support, proxy) = getProxy()
#if proxy_support:
#    opener = urllib2.build_opener(proxy)
#    urllib2.install_opener(opener)

# Entry = {"title":XXX, "content":XXX, "date":XXX, "link":XXX, "images":[] }

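# Illustrative example of an entry as built by Feed.updateFeed below
# (values are hypothetical):
# { "title": "Some headline", "content": "<p>HTML body</p>",
#   "date": "Mon, 01 Feb 2010 12:00:00", "dateTuple": <time.struct_time>,
#   "link": "http://example.com/story", "images": [],
#   "contentLink": configdir + feedId + ".d/" + entryId + ".html" }
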
class ImageHandler:
    def __init__(self, configdir):
        pass

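# (ImageHandler above is an empty stub; it seems to be kept only so that
# older pickled Feed objects, which referenced it, still load; see
# Listing.loadFeed, which deletes any leftover imageHandler attribute.)
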
class Feed:
    def __init__(self, uniqueId, name, url):
        self.titles = []
        self.entries = {}
        self.ids = []
        self.readItems = {}
        self.name = name
        self.url = url
        self.countUnread = 0
        self.updateTime = "Never"
        self.uniqueId = uniqueId

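    # Download an image into this feed's cache directory, or refresh its
    # mtime if it is already cached, and return the local filename.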
    def addImage(self, configdir, key, baseurl, url):
        filename = configdir+key+".d/"+getId(url)
        if not isfile(filename):
            try:
                #if url.startswith("http"):
                #    f = urllib2.urlopen(url)
                #else:
                f = urllib2.urlopen(urljoin(baseurl,url))
                outf = open(filename, "w")
                outf.write(f.read())
                f.close()
                outf.close()
            except:
                print "Could not download " + url
        else:
            #open(filename,"a").close()  # "Touch" the file
            file = open(filename,"a")
            utime(filename, None)
            file.close()
        return filename

    def editFeed(self, url):
        self.url = url

    def saveFeed(self, configdir):
        if not isdir(configdir+self.uniqueId+".d"):
            mkdir(configdir+self.uniqueId+".d")
        file = open(configdir+self.uniqueId+".d/feed", "w")
        pickle.dump(self, file)
        file.close()
        self.saveUnread(configdir)

    def saveUnread(self, configdir):
        if not isdir(configdir+self.uniqueId+".d"):
            mkdir(configdir+self.uniqueId+".d")
        file = open(configdir+self.uniqueId+".d/unread", "w")
        pickle.dump(self.readItems, file)
        file.close()

    def reloadUnread(self, configdir):
        try:
            file = open(configdir+self.uniqueId+".d/unread", "r")
            self.readItems = pickle.load(file)
            file.close()
            self.countUnread = 0
            for id in self.getIds():
               if self.readItems[id]==False:
                  self.countUnread = self.countUnread + 1
        except:
            pass
        return self.countUnread

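    # Re-parse the feed and rebuild the entry cache: new entries are written
    # to <configdir><uniqueId>.d/ (optionally with their images), entries that
    # disappeared upstream are kept until they expire, and cache files left
    # untouched for three expiry periods are swept at the end.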
    def updateFeed(self, configdir, expiryTime=24, proxy=None, imageCache=False):
        # Expiry time is in hours
        if proxy == None:
            tmp = feedparser.parse(self.url)
        else:
            tmp = feedparser.parse(self.url, handlers = [proxy])
        expiry = float(expiryTime) * 3600.
        # Check if the parse was successful (number of entries > 0); otherwise do nothing
        if len(tmp["entries"])>0:
           if not isdir(configdir+self.uniqueId+".d"):
               mkdir(configdir+self.uniqueId+".d")
           try:
               f = urllib2.urlopen(urljoin(tmp["feed"]["link"],"/favicon.ico"))
               data = f.read()
               f.close()
               outf = open(configdir+self.uniqueId+".d/favicon.ico", "w")
               outf.write(data)
               outf.close()
               del data
           except:
                import traceback
                traceback.print_exc()

           #reversedEntries = self.getEntries()
           #reversedEntries.reverse()

           currentTime = time.time()
           tmpEntries = {}
           tmpIds = []
           for entry in tmp["entries"]:
               (dateTuple, date) = self.extractDate(entry)
               try:
                   entry["title"]
               except:
                   entry["title"] = "No Title"
               try:
                   entry["link"]
               except:
                   entry["link"] = ""
               tmpEntry = {"title":entry["title"], "content":self.extractContent(entry),
                            "date":date, "dateTuple":dateTuple, "link":entry["link"], "images":[] }
               id = self.generateUniqueId(tmpEntry)

               #articleTime = time.mktime(self.entries[id]["dateTuple"])
               if not id in self.ids:
                   soup = BeautifulSoup(self.getArticle(tmpEntry)) #tmpEntry["content"]
                   images = soup('img')
                   baseurl = tmpEntry["link"]
                   if imageCache:
                      for img in images:
                          try:
                            filename = self.addImage(configdir, self.uniqueId, baseurl, img['src'])
                            img['src'] = filename
                            tmpEntry["images"].append(filename)
                          except:
                              print "Error downloading image %s" % img
                   tmpEntry["contentLink"] = configdir+self.uniqueId+".d/"+id+".html"
                   file = open(tmpEntry["contentLink"], "w")
                   file.write(soup.prettify())
                   file.close()
                   tmpEntries[id] = tmpEntry
                   tmpIds.append(id)
                   if id not in self.readItems:
                       self.readItems[id] = False
               else:
                   try:
                       # "Touch" the cached article and its images so the
                       # stale-file sweep below keeps them
                       filename = configdir+self.uniqueId+".d/"+id+".html"
                       file = open(filename,"a")
                       utime(filename, None)
                       file.close()
                       for image in self.entries[id]["images"]:
                            file = open(image,"a")
                            utime(image, None)
                            file.close()
                   except:
                       pass
                   tmpEntries[id] = self.entries[id]
                   tmpIds.append(id)

           oldIds = self.ids[:]
           for entryId in oldIds:
                if not entryId in tmpIds:
                    try:
                        articleTime = time.mktime(self.entries[entryId]["dateTuple"])
                        if (currentTime - articleTime > 2*expiry):
                            self.removeEntry(entryId)
                            continue
                        if (currentTime - articleTime > expiry) and (self.isEntryRead(entryId)):
                            # Entry is older than the expiry time and already read
                            self.removeEntry(entryId)
                            continue
                        tmpEntries[entryId] = self.entries[entryId]
                        tmpIds.append(entryId)
                    except:
                        print "Error purging old articles %s" % entryId
                        self.removeEntry(entryId)

           self.entries = tmpEntries
           self.ids = tmpIds
           tmpUnread = 0

           ids = self.ids[:]
           for id in ids:
               if not self.readItems.has_key(id):
                   self.readItems[id] = False
               if self.readItems[id]==False:
                  tmpUnread = tmpUnread + 1
           keys = self.readItems.keys()
           for id in keys:
               if not id in self.ids:
                   del self.readItems[id]
           del tmp
           self.countUnread = tmpUnread
           self.updateTime = time.asctime()
           self.saveFeed(configdir)
           from glob import glob
           from os import stat
           for file in glob(configdir+self.uniqueId+".d/*"):
                stats = stat(file)
                # put the two dates into matching format
                lastmodDate = stats[8]
                expDate = time.time()-expiry*3
                # check if the file's last-modified date is outdated
                if expDate > lastmodDate:
                    try:
                        #print 'Removing', file
                        remove(file)
                    except OSError:
                        print 'Could not remove', file

    def extractContent(self, entry):
        content = ""
        if entry.has_key('summary'):
            content = entry.get('summary', '')
        if entry.has_key('content'):
            if len(entry.content[0].value) > len(content):
                content = entry.content[0].value
        if content == "":
            content = entry.get('description', '')
        return content

    def extractDate(self, entry):
        if entry.has_key("updated_parsed"):
            date1 = entry["updated_parsed"]
            date = time.strftime("%a, %d %b %Y %H:%M:%S", entry["updated_parsed"])
        elif entry.has_key("published_parsed"):
            date1 = entry["published_parsed"]
            date = time.strftime("%a, %d %b %Y %H:%M:%S", entry["published_parsed"])
        else:
            date1 = ""
            date = ""
        #print date1, date
        return (date1, date)

    def setEntryRead(self, id):
        if self.readItems[id]==False:
            self.countUnread = self.countUnread - 1
            self.readItems[id] = True

    def setEntryUnread(self, id):
        if self.readItems[id]==True:
            self.countUnread = self.countUnread + 1
            self.readItems[id] = False

    def isEntryRead(self, id):
        return self.readItems[id]

    def getTitle(self, id):
        return self.entries[id]["title"]

    def getContentLink(self, id):
        if self.entries[id].has_key("contentLink"):
            return self.entries[id]["contentLink"]
        return self.entries[id]["link"]

    def getExternalLink(self, id):
        return self.entries[id]["link"]

    def getDate(self, id):
        return self.entries[id]["date"]

    def getDateTuple(self, id):
        return self.entries[id]["dateTuple"]

    def getUniqueId(self, index):
        return self.ids[index]

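    # Entry ids are the md5 of date + title, so an entry whose date or
    # title changes upstream is treated as a brand-new article.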
    def generateUniqueId(self, entry):
        return getId(entry["date"] + entry["title"])

    def getUpdateTime(self):
        return self.updateTime

    def getEntries(self):
        return self.entries

    def getIds(self):
        return self.ids

    def getNextId(self, id):
        return self.ids[(self.ids.index(id)+1) % self.getNumberOfEntries()]

    def getPreviousId(self, id):
        return self.ids[(self.ids.index(id)-1) % self.getNumberOfEntries()]

    def getNumberOfUnreadItems(self):
        return self.countUnread

    def getNumberOfEntries(self):
        return len(self.ids)

    def getItem(self, id):
        try:
            return self.entries[id]
        except:
            return []

    def getContent(self, id):
        if self.entries[id].has_key("contentLink"):
            file = open(self.entries[id]["contentLink"])
            content = file.read()
            file.close()
            return content
        return self.entries[id]["content"]

    def removeEntry(self, id):
        #try:
        if self.entries.has_key(id):
            entry = self.entries[id]

            if entry.has_key("contentLink"):
                try:
                    remove(entry["contentLink"])  #os.remove
                except:
                    print "File not found for deletion: %s" % entry["contentLink"]
            del self.entries[id]
        else:
            print "Entries has no %s key" % id
        if id in self.ids:
            self.ids.remove(id)
        else:
            print "Ids has no %s key" % id
        if self.readItems.has_key(id):
            if self.readItems[id]==False:
                self.countUnread = self.countUnread - 1
            del self.readItems[id]
        else:
            print "ReadItems has no %s key" % id
        #except:
        #    print "Error removing entry %s" % id

    def getArticle(self, entry):
        #self.setEntryRead(id)
        #entry = self.entries[id]
        title = entry['title']
        #content = entry.get('content', entry.get('summary_detail', {}))
        content = entry["content"]

        link = entry['link']
        date = entry["date"]

        #text = '''<div style="color: black; background-color: white;">'''
        text = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">'
        text += "<html><head><title>" + title + "</title>"
        text += '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>\n'
        #text += '<style> body {-webkit-user-select: none;} </style>'
        text += '</head><body><div><a href="' + link + '">' + title + "</a>"
        text += "<BR /><small><i>Date: " + date + "</i></small></div>"
        text += "<BR /><BR />"
        text += content
        text += "</body></html>"
        return text

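# Minimal usage sketch for Feed (hypothetical feed URL and config dir;
# assumes the config dir exists and is writable):
#   feed = Feed(getId("Example"), "Example", "http://example.com/rss")
#   feed.updateFeed("/home/user/.feedingit/")
#   for id in feed.getIds():
#       print feed.getTitle(id), feed.getDate(id)
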
class ArchivedArticles(Feed):
    def addArchivedArticle(self, title, link, updated_parsed, configdir):
        entry = {}
        entry["title"] = title
        entry["link"] = link
        entry["summary"] = '<a href="' + link + '">' + title + "</a>"
        entry["updated_parsed"] = updated_parsed
        entry["time"] = time.time()
        #print entry
        (dateTuple, date) = self.extractDate(entry)
        tmpEntry = {"title":entry["title"], "content":self.extractContent(entry),
                            "date":date, "dateTuple":dateTuple, "link":entry["link"], "images":[], "downloaded":False, "time":entry["time"] }
        id = self.generateUniqueId(tmpEntry)
        self.entries[id] = tmpEntry
        self.ids.append(id)
        self.readItems[id] = False
        self.countUnread = self.countUnread + 1
        self.saveFeed(configdir)
        self.saveUnread(configdir)

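    # For archived articles, updating means downloading the page behind each
    # saved link (plus its images) into the cache, rather than re-parsing a
    # feed; the expiryTime, proxy and imageCache arguments are accepted for
    # interface compatibility but are unused here.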
    def updateFeed(self, configdir, expiryTime=24, proxy=None, imageCache=False):
        for id in self.getIds():
            entry = self.entries[id]
            if not entry["downloaded"]:
                #try:
                    f = urllib2.urlopen(entry["link"])
                    #entry["content"] = f.read()
                    html = f.read()
                    f.close()
                    soup = BeautifulSoup(html)
                    images = soup('img')
                    baseurl = entry["link"]
                    for img in images:
                        filename = self.addImage(configdir, self.uniqueId, baseurl, img['src'])
                        img['src'] = filename
                    entry["contentLink"] = configdir+self.uniqueId+".d/"+id+".html"
                    file = open(entry["contentLink"], "w")
                    file.write(soup.prettify())
                    file.close()
                    if len(entry["content"]) > 0:
                        entry["downloaded"] = True
                        entry["time"] = time.time()
                        self.setEntryUnread(id)
                #except:
                #    pass
            #currentTime = time.time()
            #expiry = float(expiryTime) * 3600
            #if currentTime - entry["time"] > expiry:
            #    if self.isEntryRead(id):
            #        self.removeEntry(id)
            #    else:
            #        if currentTime - entry["time"] > 2*expiry:
            #            self.removeEntry(id)
        self.updateTime = time.asctime()
        self.saveFeed(configdir)

    def purgeReadArticles(self):
        ids = self.getIds()[:]    # copy: removeEntry mutates self.ids
        for id in ids:
            entry = self.entries[id]
            if self.isEntryRead(id):
                self.removeEntry(id)

    def removeArticle(self, id):
        self.removeEntry(id)

    def getArticle(self, index):
        self.setEntryRead(index)
        content = self.getContent(index)
        return content


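# On-disk layout used by Listing (all under configdir):
#   feeds.pickle        - listOfFeeds dict plus the "feedingit-order" key
#   <feedId>.d/feed     - pickled Feed object
#   <feedId>.d/unread   - pickled readItems dict
#   <feedId>.d/*.html   - cached article content (plus favicon.ico and images)
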
class Listing:
    # Lists all the feeds in a dictionary, and exposes the data
    def __init__(self, configdir):
        self.configdir = configdir
        #self.feeds = {}
        if isfile(self.configdir+"feeds.pickle"):
            file = open(self.configdir+"feeds.pickle")
            self.listOfFeeds = pickle.load(file)
            file.close()
        else:
            self.listOfFeeds = {getId("Slashdot"):{"title":"Slashdot", "url":"http://rss.slashdot.org/Slashdot/slashdot", "unread":0, "updateTime":"Never"}, }
        if self.listOfFeeds.has_key("font"):
            del self.listOfFeeds["font"]
        if self.listOfFeeds.has_key("feedingit-order"):
            self.sortedKeys = self.listOfFeeds["feedingit-order"]
        else:
            self.sortedKeys = self.listOfFeeds.keys()
            if "font" in self.sortedKeys:
                self.sortedKeys.remove("font")
            self.sortedKeys.sort(key=lambda obj: self.getFeedTitle(obj))
        #self.closeCurrentlyDisplayedFeed()

    def addArchivedArticle(self, key, index):
        feed = self.getFeed(key)
        title = feed.getTitle(index)
        link = feed.getExternalLink(index)
        date = feed.getDateTuple(index)
        if not self.listOfFeeds.has_key("ArchivedArticles"):
            self.listOfFeeds["ArchivedArticles"] = {"title":"Archived Articles", "url":"", "unread":0, "updateTime":"Never"}
            self.sortedKeys.append("ArchivedArticles")
            #self.feeds["Archived Articles"] = ArchivedArticles("Archived Articles", "")
            self.saveConfig()
        archFeed = self.getFeed("ArchivedArticles")
        archFeed.addArchivedArticle(title, link, date, self.configdir)
        self.listOfFeeds["ArchivedArticles"]["unread"] = archFeed.getNumberOfUnreadItems()

    def loadFeed(self, key):
        if isfile(self.configdir+key+".d/feed"):
            file = open(self.configdir+key+".d/feed")
            feed = pickle.load(file)
            file.close()
            try:
                feed.uniqueId
            except AttributeError:
                feed.uniqueId = getId(feed.name)
            try:
                del feed.imageHandler
            except:
                pass
            #feed.reloadUnread(self.configdir)
        else:
            #print key
            title = self.listOfFeeds[key]["title"]
            url = self.listOfFeeds[key]["url"]
            if key == "ArchivedArticles":
                feed = ArchivedArticles("ArchivedArticles", title, url)
            else:
                feed = Feed(getId(title), title, url)
        return feed

    def updateFeeds(self, expiryTime=24, proxy=None, imageCache=False):
        for key in self.getListOfFeeds():
            feed = self.loadFeed(key)
            feed.updateFeed(self.configdir, expiryTime, proxy, imageCache)
            self.listOfFeeds[key]["unread"] = feed.getNumberOfUnreadItems()
            self.listOfFeeds[key]["updateTime"] = feed.getUpdateTime()

    def updateFeed(self, key, expiryTime=24, proxy=None, imageCache=False):
        feed = self.getFeed(key)
        feed.updateFeed(self.configdir, expiryTime, proxy, imageCache)
        self.listOfFeeds[key]["unread"] = feed.getNumberOfUnreadItems()
        self.listOfFeeds[key]["updateTime"] = feed.getUpdateTime()

    def editFeed(self, key, title, url):
        self.listOfFeeds[key]["title"] = title
        self.listOfFeeds[key]["url"] = url
        feed = self.loadFeed(key)
        feed.editFeed(url)

    def getFeed(self, key):
        try:
            feed = self.loadFeed(key)
            feed.reloadUnread(self.configdir)
        except:
            # If the feed file gets corrupted, we need to reset the feed.
            import traceback
            traceback.print_exc()
            import dbus
            bus = dbus.SessionBus()
            remote_object = bus.get_object("org.freedesktop.Notifications", # Connection name
                               "/org/freedesktop/Notifications" # Object's path
                              )
            iface = dbus.Interface(remote_object, 'org.freedesktop.Notifications')
            iface.SystemNoteInfoprint("Error opening feed %s, it has been reset." % self.getFeedTitle(key))
            if isdir(self.configdir+key+".d/"):
                rmtree(self.configdir+key+".d/")
            feed = self.loadFeed(key)
        return feed

    def getFeedUpdateTime(self, key):
        #print self.listOfFeeds.has_key(key)
        if not self.listOfFeeds[key].has_key("updateTime"):
            self.listOfFeeds[key]["updateTime"] = "Never"
        return self.listOfFeeds[key]["updateTime"]

    def getFeedNumberOfUnreadItems(self, key):
        if not self.listOfFeeds[key].has_key("unread"):
            self.listOfFeeds[key]["unread"] = 0
        return self.listOfFeeds[key]["unread"]

    def updateUnread(self, key, unreadItems):
        self.listOfFeeds[key]["unread"] = unreadItems

    def getFeedTitle(self, key):
        return self.listOfFeeds[key]["title"]

    def getFeedUrl(self, key):
        return self.listOfFeeds[key]["url"]

    def getListOfFeeds(self):
        return self.sortedKeys

    def getFavicon(self, key):
        filename = self.configdir+key+".d/favicon.ico"
        if isfile(filename):
            return filename
        else:
            return False

    def addFeed(self, title, url):
        if not self.listOfFeeds.has_key(getId(title)):
            self.listOfFeeds[getId(title)] = {"title":title, "url":url, "unread":0, "updateTime":"Never"}
            self.sortedKeys.append(getId(title))
            self.saveConfig()
            #self.feeds[getId(title)] = Feed(title, url)
            return True
        else:
            return False

    def removeFeed(self, key):
        del self.listOfFeeds[key]
        self.sortedKeys.remove(key)
        #del self.feeds[key]
        if isdir(self.configdir+key+".d/"):
           rmtree(self.configdir+key+".d/")
        self.saveConfig()

    def saveConfig(self):
        self.listOfFeeds["feedingit-order"] = self.sortedKeys
        file = open(self.configdir+"feeds.pickle", "w")
        pickle.dump(self.listOfFeeds, file)
        file.close()

    def moveUp(self, key):
        index = self.sortedKeys.index(key)
        self.sortedKeys[index] = self.sortedKeys[index-1]
        self.sortedKeys[index-1] = key

    def moveDown(self, key):
        index = self.sortedKeys.index(key)
        index2 = (index+1)%len(self.sortedKeys)
        self.sortedKeys[index] = self.sortedKeys[index2]
        self.sortedKeys[index2] = key

if __name__ == "__main__":
    listing = Listing('/home/user/.feedingit/')
    list = listing.getListOfFeeds()[:]
    #list.reverse()
    for key in list:
        if key.startswith('d8'):
            print listing.getFeedUpdateTime(key)