0.6.1-1 Added dbus locking mechanism and widget changes
[feedingit] / src / rss.py
#!/usr/bin/env python2.5


# Copyright (c) 2007-2008 INdT.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU Lesser General Public License for more details.
#
#  You should have received a copy of the GNU Lesser General Public License
#  along with this program.  If not, see <http://www.gnu.org/licenses/>.
#

# ============================================================================
# Name        : FeedingIt.py
# Author      : Yves Marcoz
# Version     : 0.5.4
# Description : Simple RSS Reader
# ============================================================================
from os.path import isfile, isdir
from shutil import rmtree
from os import mkdir, remove
import pickle
import md5
import feedparser
import time
import urllib2
from BeautifulSoup import BeautifulSoup
from urlparse import urlparse

#CONFIGDIR="/home/user/.feedingit/"

def getId(string):
    return md5.new(string).hexdigest()

#def getProxy():
#    import gconf
#    if gconf.client_get_default().get_bool('/system/http_proxy/use_http_proxy'):
#        port = gconf.client_get_default().get_int('/system/http_proxy/port')
#        http = gconf.client_get_default().get_string('/system/http_proxy/host')
#        proxy = urllib2.ProxyHandler( {"http":"http://%s:%s/"% (http,port)} )
#        return (True, proxy)
#    return (False, None)

# Enable proxy support for images and ArchivedArticles
#(proxy_support, proxy) = getProxy()
#if proxy_support:
#    opener = urllib2.build_opener(proxy)
#    urllib2.install_opener(opener)

# Entry = {"title":XXX, "content":XXX, "date":XXX, "link":XXX, "images":[] }

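# Illustrative example of a filled-in entry as produced by Feed.updateFeed
# (commented out, not executed; all values below are hypothetical):
#entry = {"title":"An article", "content":"<p>Body HTML</p>",
#         "date":"Sat, 01 May 2010 12:00:00", "dateTuple":time.localtime(),
#         "link":"http://example.com/article", "images":[]}
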
class ImageHandler:
    def __init__(self, configdir):
        self.configdir = configdir
        self.images = {}

    def addImage(self, key, baseurl, url):
        # Download the image into the feed's cache directory (if not already
        # there) and reference-count it, so it can be shared between entries.
        filename = self.configdir+key+".d/"+getId(url)
        if not isfile(filename):
            try:
                if url.startswith("http"):
                    f = urllib2.urlopen(url)
                else:
                    f = urllib2.urlopen(baseurl+"/"+url)
                outf = open(filename, "w")
                outf.write(f.read())
                f.close()
                outf.close()
            except:
                print "Could not download " + url
        if filename in self.images:
            self.images[filename] += 1
        else:
            self.images[filename] = 1
        return filename

    def removeImage(self, key, filename):
        # Decrement the reference count; delete the file once no entry uses it.
        try:
            self.images[filename] -= 1
        except:
            self.images[filename] = 0 # Unknown image: mark it for deletion
        try:
            if self.images[filename] == 0:
                remove(filename) #os.remove
                del self.images[filename]
        except:
            print "Could not remove image %s" % filename

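# Minimal usage sketch for ImageHandler (commented out, untested; the
# "example" key, URLs, and config directory are hypothetical, and the
# example.d/ cache directory is assumed to exist):
#handler = ImageHandler("/home/user/.feedingit/")
#filename = handler.addImage("example", "http://example.com", "/logo.png")
#handler.removeImage("example", filename)
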
class Feed:
    def __init__(self, uniqueId, name, url, imageHandler):
        self.titles = []
        self.entries = {}
        self.ids = []
        self.readItems = {}
        self.name = name
        self.url = url
        self.countUnread = 0
        self.updateTime = "Never"
        self.uniqueId = uniqueId
        self.imageHandler = imageHandler

    def editFeed(self, url):
        self.url = url

    def saveFeed(self, configdir):
        if not isdir(configdir+self.uniqueId+".d"):
            mkdir(configdir+self.uniqueId+".d")
        file = open(configdir+self.uniqueId+".d/feed", "w")
        pickle.dump(self, file )
        file.close()
        self.saveUnread(configdir)

    def saveUnread(self, configdir):
        if not isdir(configdir+self.uniqueId+".d"):
            mkdir(configdir+self.uniqueId+".d")
        file = open(configdir+self.uniqueId+".d/unread", "w")
        pickle.dump(self.readItems, file )
        file.close()

    def reloadUnread(self, configdir):
        try:
            file = open(configdir+self.uniqueId+".d/unread", "r")
            self.readItems = pickle.load( file )
            file.close()
            self.countUnread = 0
            for id in self.getIds():
                if self.readItems[id]==False:
                    self.countUnread = self.countUnread + 1
        except:
            pass
        return self.countUnread

    def updateFeed(self, configdir, expiryTime=24, proxy=None, imageCache=False):
        # Expiry time is in hours
        if proxy == None:
            tmp=feedparser.parse(self.url)
        else:
            tmp=feedparser.parse(self.url, handlers = [proxy])
        expiry = float(expiryTime) * 3600.
        # Check if the parse was successful (number of entries > 0, else do nothing)
        if len(tmp["entries"])>0:
            #reversedEntries = self.getEntries()
            #reversedEntries.reverse()
            if not isdir(configdir+self.uniqueId+".d"):
                mkdir(configdir+self.uniqueId+".d")
            currentTime = time.time()
            tmpEntries = {}
            tmpIds = []
            for entry in tmp["entries"]:
                (dateTuple, date) = self.extractDate(entry)
                try:
                    entry["title"]
                except:
                    entry["title"] = "No Title"
                try:
                    entry["link"]
                except:
                    entry["link"] = ""
                tmpEntry = {"title":entry["title"], "content":self.extractContent(entry),
                            "date":date, "dateTuple":dateTuple, "link":entry["link"], "images":[] }
                id = self.generateUniqueId(tmpEntry)

                #articleTime = time.mktime(self.entries[id]["dateTuple"])
                if not id in self.ids:
                    # New entry: render it to a local HTML file, caching its
                    # images if requested
                    soup = BeautifulSoup(self.getArticle(tmpEntry)) #tmpEntry["content"])
                    images = soup('img')
                    baseurl = ''.join(urlparse(tmpEntry["link"])[:-1])
                    if imageCache:
                        for img in images:
                            try:
                                filename = self.imageHandler.addImage(self.uniqueId, baseurl, img['src'])
                                img['src']=filename
                                tmpEntry["images"].append(filename)
                            except:
                                print "Error downloading image %s" % img
                    tmpEntry["contentLink"] = configdir+self.uniqueId+".d/"+id+".html"
                    file = open(tmpEntry["contentLink"], "w")
                    file.write(soup.prettify())
                    file.close()
                    tmpEntries[id] = tmpEntry
                    tmpIds.append(id)
                    if id not in self.readItems:
                        self.readItems[id] = False
                else:
                    tmpEntries[id] = self.entries[id]
                    tmpIds.append(id)

            oldIds = self.ids[:]
            for entryId in oldIds:
                if not entryId in tmpIds:
                    try:
                        articleTime = time.mktime(self.entries[entryId]["dateTuple"])
                        if (currentTime - articleTime > 2*expiry):
                            self.removeEntry(entryId)
                            continue
                        if (currentTime - articleTime > expiry) and (self.isEntryRead(entryId)):
                            # Entry is older than the expiry time and already read
                            self.removeEntry(entryId)
                            continue
                        tmpEntries[entryId] = self.entries[entryId]
                        tmpIds.append(entryId)
                    except:
                        print "Error purging old articles %s" % entryId
                        self.removeEntry(entryId)

            self.entries = tmpEntries
            self.ids = tmpIds
            tmpUnread = 0

            ids = self.ids[:]
            for id in ids:
                if not self.readItems.has_key(id):
                    self.readItems[id] = False
                if self.readItems[id]==False:
                    tmpUnread = tmpUnread + 1
            del tmp
            self.countUnread = tmpUnread
            self.updateTime = time.asctime()
            self.saveFeed(configdir)

    def extractContent(self, entry):
        content = ""
        if entry.has_key('summary'):
            content = entry.get('summary', '')
        if entry.has_key('content'):
            if len(entry.content[0].value) > len(content):
                content = entry.content[0].value
        if content == "":
            content = entry.get('description', '')
        return content

    def extractDate(self, entry):
        if entry.has_key("updated_parsed"):
            date1 = entry["updated_parsed"]
            date = time.strftime("%a, %d %b %Y %H:%M:%S",entry["updated_parsed"])
        elif entry.has_key("published_parsed"):
            date1 = entry["published_parsed"]
            date = time.strftime("%a, %d %b %Y %H:%M:%S", entry["published_parsed"])
        else:
            date1 = ""
            date = ""
        #print date1, date
        return (date1, date)

    def setEntryRead(self, id):
        if self.readItems[id]==False:
            self.countUnread = self.countUnread - 1
            self.readItems[id] = True

    def setEntryUnread(self, id):
        if self.readItems[id]==True:
            self.countUnread = self.countUnread + 1
            self.readItems[id] = False

    def isEntryRead(self, id):
        return self.readItems[id]

    def getTitle(self, id):
        return self.entries[id]["title"]

    def getContentLink(self, id):
        if self.entries[id].has_key("contentLink"):
            return self.entries[id]["contentLink"]
        return self.entries[id]["link"]

    def getExternalLink(self, id):
        return self.entries[id]["link"]

    def getDate(self, id):
        return self.entries[id]["date"]

    def getDateTuple(self, id):
        return self.entries[id]["dateTuple"]

    def getUniqueId(self, index):
        return self.ids[index]

    def generateUniqueId(self, entry):
        return getId(entry["date"] + entry["title"])

    def getUpdateTime(self):
        return self.updateTime

    def getEntries(self):
        return self.entries

    def getIds(self):
        return self.ids

    def getNextId(self, id):
        return self.ids[(self.ids.index(id)+1) % self.getNumberOfEntries()]

    def getPreviousId(self, id):
        return self.ids[(self.ids.index(id)-1) % self.getNumberOfEntries()]

    def getNumberOfUnreadItems(self):
        return self.countUnread

    def getNumberOfEntries(self):
        return len(self.ids)

    def getItem(self, id):
        try:
            return self.entries[id]
        except:
            return []

    def getContent(self, id):
        if self.entries[id].has_key("contentLink"):
            file = open(self.entries[id]["contentLink"])
            content = file.read()
            file.close()
            return content
        return self.entries[id]["content"]

    def removeEntry(self, id):
        #try:
        if self.entries.has_key(id):
            entry = self.entries[id]
            if entry.has_key("images"):
                for img in entry["images"]:
                    self.imageHandler.removeImage(self.uniqueId, img)

            if entry.has_key("contentLink"):
                try:
                    remove(entry["contentLink"])  #os.remove
                except:
                    print "File not found for deletion: %s" % entry["contentLink"]
            del self.entries[id]
        else:
            print "Entries has no %s key" % id
        if id in self.ids:
            self.ids.remove(id)
        else:
            print "Ids has no %s key" % id
        if self.readItems.has_key(id):
            if self.readItems[id]==False:
                self.countUnread = self.countUnread - 1
            del self.readItems[id]
        else:
            print "ReadItems has no %s key" % id
        #except:
        #    print "Error removing entry %s" %id

    def getArticle(self, entry):
        #self.setEntryRead(id)
        #entry = self.entries[id]
        title = entry['title']
        #content = entry.get('content', entry.get('summary_detail', {}))
        content = entry["content"]

        link = entry['link']
        date = entry["date"]

        #text = '''<div style="color: black; background-color: white;">'''
        text = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">'
        text += "<html><head><title>" + title + "</title>"
        text += '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>\n'
        #text += '<style> body {-webkit-user-select: none;} </style>'
        text += '</head><body><div><a href=\"' + link + '\">' + title + "</a>"
        text += "<BR /><small><i>Date: " + date + "</i></small></div>"
        text += "<BR /><BR />"
        text += content
        text += "</body></html>"
        return text

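# Minimal usage sketch for Feed (commented out, untested; the config
# directory is hypothetical and assumed to exist, the feed URL is the
# Slashdot default used below in Listing):
#feed = Feed(getId("Slashdot"), "Slashdot",
#            "http://rss.slashdot.org/Slashdot/slashdot",
#            ImageHandler("/home/user/.feedingit/"))
#feed.updateFeed("/home/user/.feedingit/", expiryTime=24)
#for id in feed.getIds():
#    print feed.getTitle(id)
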
class ArchivedArticles(Feed):
    def addArchivedArticle(self, title, link, updated_parsed, configdir):
        entry = {}
        entry["title"] = title
        entry["link"] = link
        entry["summary"] = '<a href=\"' + link + '\">' + title + "</a>"
        entry["updated_parsed"] = updated_parsed
        entry["time"] = time.time()
        #print entry
        (dateTuple, date) = self.extractDate(entry)
        tmpEntry = {"title":entry["title"], "content":self.extractContent(entry),
                    "date":date, "dateTuple":dateTuple, "link":entry["link"], "images":[], "downloaded":False, "time":entry["time"] }
        id = self.generateUniqueId(tmpEntry)
        self.entries[id] = tmpEntry
        self.ids.append(id)
        self.readItems[id] = False
        self.countUnread = self.countUnread + 1
        self.saveFeed(configdir)
        self.saveUnread(configdir)

    def updateFeed(self, configdir, expiryTime=24, proxy=None, imageCache=False):
        # For archived articles, updating means downloading the full page for
        # any entry not yet fetched. Iterate over a copy of the id list, since
        # removeEntry() below mutates it.
        for id in self.getIds()[:]:
            entry = self.entries[id]
            if not entry["downloaded"]:
                #try:
                    f = urllib2.urlopen(entry["link"])
                    #entry["content"] = f.read()
                    html = f.read()
                    f.close()
                    soup = BeautifulSoup(html)
                    images = soup('img')
                    baseurl = ''.join(urlparse(entry["link"])[:-1])
                    for img in images:
                        filename = self.imageHandler.addImage(self.uniqueId, baseurl, img['src'])
                        img['src']=filename
                        entry["images"].append(filename)
                    entry["contentLink"] = configdir+self.uniqueId+".d/"+id+".html"
                    file = open(entry["contentLink"], "w")
                    file.write(soup.prettify())
                    file.close()
                    if len(entry["content"]) > 0:
                        entry["downloaded"] = True
                        entry["time"] = time.time()
                        self.setEntryUnread(id)
                #except:
                #    pass
            currentTime = time.time()
            expiry = float(expiryTime) * 3600
            if currentTime - entry["time"] > expiry:
                if self.isEntryRead(id):
                    self.removeEntry(id)
                else:
                    if currentTime - entry["time"] > 2*expiry:
                        self.removeEntry(id)
        self.updateTime = time.asctime()
        self.saveFeed(configdir)

    def getArticle(self, id):
        # Unlike Feed.getArticle, this takes an entry id: the archived page
        # has already been rendered to disk, so mark it read and return it.
        self.setEntryRead(id)
        content = self.getContent(id)
        return content


class Listing:
    # Lists all the feeds in a dictionary and exposes the data
    def __init__(self, configdir):
        self.configdir = configdir
        #self.feeds = {}
        if isfile(self.configdir+"feeds.pickle"):
            file = open(self.configdir+"feeds.pickle")
            self.listOfFeeds = pickle.load(file)
            file.close()
        else:
            self.listOfFeeds = {getId("Slashdot"):{"title":"Slashdot", "url":"http://rss.slashdot.org/Slashdot/slashdot", "unread":0, "updateTime":"Never"}, }
        try:
            file = open(self.configdir+"images.pickle")
            self.imageHandler = pickle.load(file)
            file.close()
        except:
            self.imageHandler = ImageHandler(self.configdir)
        if self.listOfFeeds.has_key("font"):
            del self.listOfFeeds["font"]
        if self.listOfFeeds.has_key("feedingit-order"):
            self.sortedKeys = self.listOfFeeds["feedingit-order"]
        else:
            self.sortedKeys = self.listOfFeeds.keys()
            if "font" in self.sortedKeys:
                self.sortedKeys.remove("font")
            self.sortedKeys.sort(key=lambda obj: self.getFeedTitle(obj))
        self.closeCurrentlyDisplayedFeed()

    def addArchivedArticle(self, key, index):
        feed = self.getFeed(key)
        title = feed.getTitle(index)
        link = feed.getExternalLink(index)
        date = feed.getDateTuple(index)
        if not self.listOfFeeds.has_key("ArchivedArticles"):
            self.listOfFeeds["ArchivedArticles"] = {"title":"Archived Articles", "url":"", "unread":0, "updateTime":"Never"}
            self.sortedKeys.append("ArchivedArticles")
            #self.feeds["Archived Articles"] = ArchivedArticles("Archived Articles", "")
            self.saveConfig()
        archFeed = self.getFeed("ArchivedArticles")
        archFeed.addArchivedArticle(title, link, date, self.configdir)
        # Update the archive feed's unread count (not the source feed's)
        self.listOfFeeds["ArchivedArticles"]["unread"] = archFeed.getNumberOfUnreadItems()

    def loadFeed(self, key):
        if isfile(self.configdir+key+".d/feed"):
            file = open(self.configdir+key+".d/feed")
            feed = pickle.load(file)
            file.close()
            try:
                feed.uniqueId
                feed.imageHandler
            except AttributeError:
                # Feed pickled by an older version: fill in the new attributes
                feed.uniqueId = getId(feed.name)
                feed.imageHandler = self.imageHandler
            #feed.reloadUnread(self.configdir)
        else:
            #print key
            title = self.listOfFeeds[key]["title"]
            url = self.listOfFeeds[key]["url"]
            if key == "ArchivedArticles":
                feed = ArchivedArticles("ArchivedArticles", title, url, self.imageHandler)
            else:
                feed = Feed(getId(title), title, url, self.imageHandler)
        return feed

    def updateFeeds(self, expiryTime=24, proxy=None, imageCache=False):
        for key in self.getListOfFeeds():
            feed = self.loadFeed(key)
            feed.updateFeed(self.configdir, expiryTime, proxy, imageCache)
            self.listOfFeeds[key]["unread"] = feed.getNumberOfUnreadItems()
            self.listOfFeeds[key]["updateTime"] = feed.getUpdateTime()

    def updateFeed(self, key, expiryTime=24, proxy=None, imageCache=False):
        feed = self.getFeed(key)
        feed.updateFeed(self.configdir, expiryTime, proxy, imageCache)
        self.listOfFeeds[key]["unread"] = feed.getNumberOfUnreadItems()
        self.listOfFeeds[key]["updateTime"] = feed.getUpdateTime()

    def editFeed(self, key, title, url):
        self.listOfFeeds[key]["title"] = title
        self.listOfFeeds[key]["url"] = url
        feed = self.loadFeed(key)
        feed.editFeed(url)
        # Persist the change; otherwise the edited URL is lost when the
        # in-memory feed object is discarded
        feed.saveFeed(self.configdir)
        self.saveConfig()

    def getFeed(self, key):
        feed = self.loadFeed(key)
        feed.reloadUnread(self.configdir)
        return feed

    def getFeedUpdateTime(self, key):
        #print self.listOfFeeds.has_key(key)
        if not self.listOfFeeds[key].has_key("updateTime"):
            self.listOfFeeds[key]["updateTime"] = "Never"
        return self.listOfFeeds[key]["updateTime"]

    def getFeedNumberOfUnreadItems(self, key):
        if not self.listOfFeeds[key].has_key("unread"):
            self.listOfFeeds[key]["unread"] = 0
        return self.listOfFeeds[key]["unread"]

    def updateUnread(self, key, unreadItems):
        self.listOfFeeds[key]["unread"] = unreadItems

    def getFeedTitle(self, key):
        return self.listOfFeeds[key]["title"]

    def getFeedUrl(self, key):
        return self.listOfFeeds[key]["url"]

    def getListOfFeeds(self):
        return self.sortedKeys

    def addFeed(self, title, url):
        if not self.listOfFeeds.has_key(getId(title)):
            self.listOfFeeds[getId(title)] = {"title":title, "url":url, "unread":0, "updateTime":"Never"}
            self.sortedKeys.append(getId(title))
            self.saveConfig()
            #self.feeds[getId(title)] = Feed(title, url)
            return True
        else:
            return False

    def removeFeed(self, key):
        del self.listOfFeeds[key]
        self.sortedKeys.remove(key)
        #del self.feeds[key]
        if isdir(self.configdir+key+".d/"):
            rmtree(self.configdir+key+".d/")
        self.saveConfig()

    def saveConfig(self):
        self.listOfFeeds["feedingit-order"] = self.sortedKeys
        file = open(self.configdir+"feeds.pickle", "w")
        pickle.dump(self.listOfFeeds, file)
        file.close()
        file = open(self.configdir+"images.pickle", "w")
        pickle.dump(self.imageHandler, file)
        file.close()

    def moveUp(self, key):
        index = self.sortedKeys.index(key)
        self.sortedKeys[index] = self.sortedKeys[index-1]
        self.sortedKeys[index-1] = key

    def moveDown(self, key):
        index = self.sortedKeys.index(key)
        index2 = (index+1)%len(self.sortedKeys)
        self.sortedKeys[index] = self.sortedKeys[index2]
        self.sortedKeys[index2] = key

    def setCurrentlyDisplayedFeed(self, key):
        self.currentlyDisplayedFeed = key

    def closeCurrentlyDisplayedFeed(self):
        self.currentlyDisplayedFeed = False

    def getCurrentlyDisplayedFeed(self):
        return self.currentlyDisplayedFeed

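# Minimal usage sketch for Listing (commented out, untested; the config
# directory must exist, and the feed title/URL below mirror the Slashdot
# default used in __init__):
#listing = Listing('/home/user/.feedingit/')
#listing.addFeed("Slashdot", "http://rss.slashdot.org/Slashdot/slashdot")
#listing.updateFeeds()
#for key in listing.getListOfFeeds():
#    print listing.getFeedTitle(key), listing.getFeedNumberOfUnreadItems(key)
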
if __name__ == "__main__":
    listing = Listing('/home/user/.feedingit/')
    list = listing.getListOfFeeds()[:]
    #list.reverse()
    for key in list:
        if key.startswith('d8'):
            print listing.getFeedUpdateTime(key)