#!/usr/bin/env python2.5


# Copyright (c) 2007-2008 INdT.
# Copyright (c) 2011 Neal H. Walfield
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU Lesser General Public License for more details.
#
#  You should have received a copy of the GNU Lesser General Public License
#  along with this program.  If not, see <http://www.gnu.org/licenses/>.
#

# ============================================================================
# Name        : FeedingIt.py
# Author      : Yves Marcoz
# Version     : 0.5.4
# Description : Simple RSS Reader
# ============================================================================

from __future__ import with_statement

import sqlite3
from os.path import isfile, isdir
from shutil import rmtree
from os import mkdir, remove, utime
import os
import md5
import feedparser
import time
import urllib2
from BeautifulSoup import BeautifulSoup
from urlparse import urljoin
from calendar import timegm
import threading
import traceback
from wc import wc, wc_init, woodchuck
import subprocess
import dbus
from updatedbus import update_server_object

from jobmanager import JobManager
import mainthread
from httpprogresshandler import HTTPProgressHandler
import random
import sys
import logging
logger = logging.getLogger(__name__)

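# Map an arbitrary string (typically a URL) to a stable hex digest that is
# safe to use as a filename or database key.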
def getId(string):
    return md5.new(string).hexdigest()

def download_callback(connection):
    if JobManager().do_quit:
        raise KeyboardInterrupt

def downloader(progress_handler=None, proxy=None):
    openers = []

    if progress_handler is not None:
        openers.append(progress_handler)
    else:
        openers.append(HTTPProgressHandler(download_callback))

    if proxy:
        openers.append(proxy)

    return urllib2.build_opener(*openers)

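# A minimal usage sketch (the URL is hypothetical): build an opener and
# fetch with it; progress is reported through the default
# HTTPProgressHandler, which aborts cleanly when the JobManager is quitting.
#
#     opener = downloader(proxy=None)
#     f = opener.open("http://example.com/feed.xml")
#     data = f.read()
#     f.close()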
def transfer_stats(sent, received, **kwargs):
    """
    This function takes two arguments: sent is the number of bytes
    sent so far, received is the number of bytes received.  The
    function returns a continuation that you can call later.

    The continuation takes the same two arguments.  It returns a tuple
    of the number of bytes sent, the number of bytes received and the
    time since the original function was invoked.
    """
    start_time = time.time()
    start_sent = sent
    start_received = received

    def e(sent, received, **kwargs):
        return (sent - start_sent,
                received - start_received,
                time.time() - start_time)

    return e

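# Example (a sketch): snapshot the counters, do some work, then ask the
# continuation for the deltas.  The sent/received keywords mirror the keys
# of HTTPProgressHandler.stats, which is how this module invokes it below.
#
#     snapshot = transfer_stats(0, 0)
#     ...                                    # transfer some data
#     sent, received, elapsed = snapshot(sent=1024, received=65536)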
# If not None, a subprocess.Popen object corresponding to an
# update_feeds.py process.
update_feed_process = None

update_feeds_iface = None

jobs_at_start = 0

class BaseObject(object):
    # Columns to cache.  Classes that inherit from this and use the
    # cache mechanism should set this to a list of tuples, each of
    # which contains two entries: the table and the column.  Note that
    # both are case sensitive.
    cached_columns = ()

    def cache_invalidate(self, table=None):
        """
        Invalidate the cache.

        If table is not None, invalidate only the specified table.
        Otherwise, drop the whole cache.
        """
        if not hasattr(self, 'cache'):
            return

        if table is None:
            del self.cache
        else:
            if table in self.cache:
                del self.cache[table]

    def lookup(self, table, column, id=None):
        """
        Look up a column or value.  Uses a cache for columns in
        cached_columns.  Note: the column is returned unsorted.
        """
        if not hasattr(self, 'cache'):
            self.cache = {}

        # Cache data for at most 60 seconds.
        now = time.time()
        try:
            cache = self.cache[table]

            if time.time() - cache[None] > 60:
                self.cache[table].clear()
        except KeyError:
            cache = None

        if (cache is None
            or (table, column) not in self.cached_columns):
            # The cache is empty or the caller wants a column that we
            # don't cache.
            if (table, column) in self.cached_columns:
                do_cache = True

                self.cache[table] = cache = {}
                columns = []
                for t, c in self.cached_columns:
                    if table == t:
                        cache[c] = {}
                        columns.append(c)

                columns.append('id')
                where = ""
            else:
                do_cache = False

                columns = (column,)
                if id is not None:
                    where = "where id = '%s'" % id
                else:
                    where = ""

            results = self.db.execute(
                "SELECT %s FROM %s %s" % (','.join(columns), table, where))

            if do_cache:
                for r in results:
                    values = list(r)
                    i = values.pop()
                    for index, value in enumerate(values):
                        cache[columns[index]][i] = value

                cache[None] = now
            else:
                # Not cached: collect the single requested column
                # directly from the result set.
                values = []
                for r in results:
                    if id is not None:
                        return r[0]

                    values.append(r[0])

                return values
        else:
            cache = self.cache[table]

        try:
            if id is not None:
                return cache[column][id]
            else:
                return cache[column].values()
        except KeyError:
            return None

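# Usage sketch: a subclass declares which (table, column) pairs to cache and
# must provide a `db` attribute (an sqlite3 connection).  Feed and Listing
# below are the two real examples in this module.
#
#     class Feed(BaseObject):
#         cached_columns = (('feed', 'read'),
#                           ('feed', 'title'))
#
#     feed.lookup('feed', 'read', some_id)   # served from the cache when hot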
class Feed(BaseObject):
    # Columns to cache.
    cached_columns = (('feed', 'read'),
                      ('feed', 'title'))

    serial_execution_lock = threading.Lock()

    def _getdb(self):
        try:
            db = self.tls.db
        except AttributeError:
            db = sqlite3.connect("%s/%s.db" % (self.dir, self.key), timeout=120)
            self.tls.db = db
        return db
    db = property(_getdb)

    def __init__(self, configdir, key):
        self.key = key
        self.configdir = configdir
        self.dir = "%s/%s.d" % (self.configdir, self.key)
        self.tls = threading.local()

        if not isdir(self.dir):
            mkdir(self.dir)
        if not isfile("%s/%s.db" % (self.dir, self.key)):
            self.db.execute("CREATE TABLE feed (id text, title text, contentLink text, date float, updated float, link text, read int);")
            self.db.execute("CREATE TABLE images (id text, imagePath text);")
            self.db.commit()

    def addImage(self, configdir, key, baseurl, url, proxy=None, opener=None):
        filename = configdir + key + ".d/" + getId(url)
        if not isfile(filename):
            try:
                if not opener:
                    opener = downloader(proxy=proxy)

                abs_url = urljoin(baseurl, url)
                f = opener.open(abs_url)
                try:
                    with open(filename, "w") as outf:
                        for data in f:
                            outf.write(data)
                finally:
                    f.close()
            except (urllib2.HTTPError, urllib2.URLError, IOError), exception:
                logger.info("Could not download image %s: %s"
                            % (abs_url, str(exception)))
                return None
            except:
                logger.info("Downloading image %s: %s" %
                            (abs_url, traceback.format_exc()))
                try:
                    remove(filename)
                except OSError:
                    pass

                # Re-raise the original exception with its traceback.
                raise
        else:
            # "Touch" the file so that it is not expired prematurely.
            file = open(filename, "a")
            utime(filename, None)
            file.close()
        return filename

    def updateFeed(self, configdir, url, etag, modified, expiryTime=24, proxy=None, imageCache=False, priority=0, postFeedUpdateFunc=None, *postFeedUpdateFuncArgs):
        if os.path.basename(sys.argv[0]) == 'update_feeds.py':
            def doit():
                def it():
                    self._updateFeed(configdir, url, etag, modified, expiryTime, proxy, imageCache, postFeedUpdateFunc, *postFeedUpdateFuncArgs)
                return it
            JobManager().execute(doit(), self.key, priority=priority)
        else:
            def send_update_request():
                global update_feeds_iface
                if update_feeds_iface is None:
                    bus = dbus.SessionBus()
                    remote_object = bus.get_object(
                        "org.marcoz.feedingit", # Connection name
                        "/org/marcoz/feedingit/update" # Object's path
                        )
                    update_feeds_iface = dbus.Interface(
                        remote_object, 'org.marcoz.feedingit')

                try:
                    update_feeds_iface.Update(self.key)
                except Exception, e:
                    logger.error("Invoking org.marcoz.feedingit.Update: %s"
                                 % str(e))
                    update_feeds_iface = None
                else:
                    return True

            if send_update_request():
                # Success!  It seems we were able to start the update
                # daemon via dbus (or it was already running).
                return

            global update_feed_process
            if (update_feed_process is None
                or update_feed_process.poll() is not None):
                # The update_feeds process is not running.  Start it.
                update_feeds = os.path.join(os.path.dirname(__file__),
                                            'update_feeds.py')
                argv = ['/usr/bin/env', 'python', update_feeds, '--daemon']
                logger.debug("Starting update_feeds: running %s"
                             % (str(argv),))
                update_feed_process = subprocess.Popen(argv)
                # Make sure the dbus calls go to the right process:
                # rebind.
                update_feeds_iface = None

            for _ in xrange(5):
                if send_update_request():
                    break
                time.sleep(1)

    def _updateFeed(self, configdir, url, etag, modified, expiryTime=24, proxy=None, imageCache=False, postFeedUpdateFunc=None, *postFeedUpdateFuncArgs):
        logger.debug("Updating %s" % url)

        success = False
        have_serial_execution_lock = False
        try:
            update_start = time.time()

            progress_handler = HTTPProgressHandler(download_callback)

            openers = [progress_handler]
            if proxy:
                openers.append(proxy)
            kwargs = {'handlers': openers}

            feed_transfer_stats = transfer_stats(0, 0)

            tmp = feedparser.parse(url, etag=etag, modified=modified, **kwargs)
            download_duration = time.time() - update_start

            opener = downloader(progress_handler, proxy)

            if JobManager().do_quit:
                raise KeyboardInterrupt

            process_start = time.time()

            # Expiry time is in hours.
            expiry = float(expiryTime) * 3600.

            currentTime = 0

            def wc_success():
                try:
                    wc().stream_register(self.key, "", 6 * 60 * 60)
                except woodchuck.ObjectExistsError:
                    pass
                try:
                    wc()[self.key].updated(
                        indicator=(woodchuck.Indicator.ApplicationVisual
                                   | woodchuck.Indicator.StreamWide),
                        transferred_down=progress_handler.stats['received'],
                        transferred_up=progress_handler.stats['sent'],
                        transfer_time=update_start,
                        transfer_duration=download_duration,
                        new_objects=len(tmp.entries),
                        objects_inline=len(tmp.entries))
                except KeyError:
                    logger.warn(
                        "Failed to register update of %s with woodchuck!"
                        % (self.key))

            http_status = tmp.get('status', 200)

            # Check if the parse was successful.  If the http status code
            # is 304, then the download was successful, but there is
            # nothing new.  Indeed, no content is returned.  This makes a
            # 304 look like an error because there are no entries and the
            # parse fails.  But really, everything went great!  Check for
            # this first.
            if http_status == 304:
                logger.debug("%s: No changes to feed." % (self.key,))
                mainthread.execute(wc_success, async=True)
                success = True
            elif len(tmp["entries"]) == 0 and not tmp.version:
                # An error occurred fetching or parsing the feed.  (version
                # will be None if e.g. the connection timed out, or ''
                # if the data is not a proper feed.)
                logger.error(
                    "Error fetching %s: version is: %s: error: %s"
                    % (url, str(tmp.version),
                       str(tmp.get('bozo_exception', 'Unknown error'))))
                logger.debug(tmp)
                def register_stream_update_failed(http_status):
                    def doit():
                        logger.debug("%s: stream update failed!" % self.key)

                        try:
                            # It's not easy to get the feed's title from here.
                            # At the latest, the next time the application is
                            # started, we'll fix up the human readable name.
                            wc().stream_register(self.key, "", 6 * 60 * 60)
                        except woodchuck.ObjectExistsError:
                            pass
                        ec = woodchuck.TransferStatus.TransientOther
                        if 300 <= http_status < 400:
                            ec = woodchuck.TransferStatus.TransientNetwork
                        if 400 <= http_status < 500:
                            ec = woodchuck.TransferStatus.FailureGone
                        if 500 <= http_status < 600:
                            ec = woodchuck.TransferStatus.TransientNetwork
                        wc()[self.key].update_failed(ec)
                    return doit
                if wc().available():
                    mainthread.execute(
                        register_stream_update_failed(
                            http_status=http_status),
                        async=True)
            else:
                currentTime = time.time()
                # The etag and modified values should only be updated if
                # the content was not null.
                try:
                    etag = tmp["etag"]
                except KeyError:
                    etag = None
                try:
                    modified = tmp["modified"]
                except KeyError:
                    modified = None
                try:
                    abs_url = urljoin(tmp["feed"]["link"], "/favicon.ico")
                    f = opener.open(abs_url)
                    data = f.read()
                    f.close()
                    outf = open(self.dir + "/favicon.ico", "w")
                    outf.write(data)
                    outf.close()
                    del data
                except (urllib2.HTTPError, urllib2.URLError), exception:
                    logger.debug("Could not download favicon %s: %s"
                                 % (abs_url, str(exception)))

                self.serial_execution_lock.acquire()
                have_serial_execution_lock = True

                ids = self.getIds()

                tmp["entries"].reverse()
                for entry in tmp["entries"]:
                    # Yield so as to make the main thread a bit more
                    # responsive.
                    time.sleep(0)

                    entry_transfer_stats = transfer_stats(
                        *feed_transfer_stats(**progress_handler.stats)[0:2])

                    if JobManager().do_quit:
                        raise KeyboardInterrupt

                    object_size = 0

                    date = self.extractDate(entry)
                    try:
                        entry["title"]
                    except KeyError:
                        entry["title"] = "No Title"
                    try:
                        entry["link"]
                    except KeyError:
                        entry["link"] = ""
                    try:
                        entry["author"]
                    except KeyError:
                        entry["author"] = None
                    if not entry.has_key("id"):
                        entry["id"] = None
                    content = self.extractContent(entry)
                    object_size = len(content)
                    tmpEntry = {"title": entry["title"], "content": content,
                                "date": date, "link": entry["link"],
                                "author": entry["author"], "id": entry["id"]}
                    id = self.generateUniqueId(tmpEntry)

                    soup = BeautifulSoup(self.getArticle(tmpEntry))
                    images = soup('img')
                    baseurl = tmpEntry["link"]
                    if imageCache and len(images) > 0:
                        self.serial_execution_lock.release()
                        have_serial_execution_lock = False
                        for img in images:
                            filename = self.addImage(
                                configdir, self.key, baseurl, img['src'],
                                opener=opener)
                            if filename:
                                img['src'] = "file://%s" % filename
                                count = self.db.execute("SELECT count(1) FROM images WHERE id=? AND imagePath=?;", (id, filename)).fetchone()[0]
                                if count == 0:
                                    self.db.execute("INSERT INTO images (id, imagePath) VALUES (?, ?);", (id, filename))
                                    self.db.commit()

                                try:
                                    object_size += os.path.getsize(filename)
                                except os.error, exception:
                                    logger.error("Error getting size of %s: %s"
                                                 % (filename, exception))
                        self.serial_execution_lock.acquire()
                        have_serial_execution_lock = True

                    tmpEntry["contentLink"] = configdir + self.key + ".d/" + id + ".html"
                    file = open(tmpEntry["contentLink"], "w")
                    file.write(soup.prettify())
                    file.close()
                    if id in ids:
                        self.db.execute("UPDATE feed SET updated=? WHERE id=?;", (currentTime, id))
                        self.db.commit()
                    else:
                        values = (id, tmpEntry["title"], tmpEntry["contentLink"], tmpEntry["date"], currentTime, tmpEntry["link"], 0)
                        self.db.execute("INSERT INTO feed (id, title, contentLink, date, updated, link, read) VALUES (?, ?, ?, ?, ?, ?, ?);", values)
                        self.db.commit()
#                   else:
#                       try:
#                           self.db.execute("UPDATE feed SET updated=? WHERE id=?;", (currentTime, id) )
#                           self.db.commit()
#                           filename = configdir+self.key+".d/"+id+".html"
#                           file = open(filename,"a")
#                           utime(filename, None)
#                           file.close()
#                           images = self.db.execute("SELECT imagePath FROM images where id=?;", (id, )).fetchall()
#                           for image in images:
#                                file = open(image[0],"a")
#                                utime(image[0], None)
#                                file.close()
#                       except:
#                           pass

                    # Register the object with Woodchuck and mark it as
                    # downloaded.
                    def register_object_transferred(
                            id, title, publication_time,
                            sent, received, object_size):
                        def doit():
                            logger.debug("Registering transfer of object %s"
                                         % title)
                            try:
                                obj = wc()[self.key].object_register(
                                    object_identifier=id,
                                    human_readable_name=title)
                            except woodchuck.ObjectExistsError:
                                obj = wc()[self.key][id]
                            else:
                                obj.publication_time = publication_time
                                obj.transferred(
                                    indicator=(
                                        woodchuck.Indicator.ApplicationVisual
                                        | woodchuck.Indicator.StreamWide),
                                    transferred_down=received,
                                    transferred_up=sent,
                                    object_size=object_size)
                        return doit
                    if wc().available():
                        # If the entry does not contain a publication
                        # time, the attribute won't exist.
                        pubtime = entry.get('date_parsed', None)
                        if pubtime:
                            publication_time = time.mktime(pubtime)
                        else:
                            publication_time = None

                        sent, received, _ \
                            = entry_transfer_stats(**progress_handler.stats)
                        # sent and received are for objects (in
                        # particular, images) associated with this
                        # item.  We also want to attribute the data
                        # transferred for the item's content.  This is
                        # a good first approximation.
                        received += len(content)

                        mainthread.execute(
                            register_object_transferred(
                                id=id,
                                title=tmpEntry["title"],
                                publication_time=publication_time,
                                sent=sent, received=received,
                                object_size=object_size),
                            async=True)
                self.db.commit()

                sent, received, _ \
                    = feed_transfer_stats(**progress_handler.stats)
                logger.debug(
                    "%s: Update successful: transferred: %d/%d; objects: %d"
                    % (url, sent, received, len(tmp.entries)))
                mainthread.execute(wc_success, async=True)
                success = True

            # Expire entries: unread entries older than twice the expiry
            # time, read entries older than the expiry time.
            rows = self.db.execute("SELECT id FROM feed WHERE (read=0 AND updated<?) OR (read=1 AND updated<?);", (currentTime - 2 * expiry, currentTime - expiry))
            for row in rows:
                self.removeEntry(row[0])

            from glob import glob
            from os import stat
            for file in glob(configdir + self.key + ".d/*"):
                stats = stat(file)
                # stats[8] is the file's last-modification time.
                lastmodDate = stats[8]
                expDate = time.time() - expiry * 3
                # Check whether the file's last-modified date is outdated.
                if expDate > lastmodDate:
                    try:
                        # XXX: Tell woodchuck.
                        remove(file)
                    except OSError, exception:
                        logger.error('Could not remove %s: %s'
                                     % (file, str(exception)))
            logger.debug("updated %s: %fs in download, %fs in processing"
                         % (self.key, download_duration,
                            time.time() - process_start))
        except:
            logger.error("Updating %s: %s" % (self.key, traceback.format_exc()))
        finally:
            self.db.commit()

            if have_serial_execution_lock:
                self.serial_execution_lock.release()

            updateTime = 0
            try:
                rows = self.db.execute("SELECT MAX(date) FROM feed;")
                for row in rows:
                    updateTime = row[0]
            except Exception, e:
                logger.error("Fetching update time: %s: %s"
                             % (str(e), traceback.format_exc()))
            finally:
                if not success:
                    etag = None
                    modified = None
                title = None
                try:
                    title = tmp.feed.title
                except (AttributeError, UnboundLocalError):
                    pass
                if postFeedUpdateFunc is not None:
                    postFeedUpdateFunc(self.key, updateTime, etag, modified,
                                       title, *postFeedUpdateFuncArgs)

        self.cache_invalidate()

    def setEntryRead(self, id):
        self.db.execute("UPDATE feed SET read=1 WHERE id=?;", (id,))
        self.db.commit()

        def doit():
            try:
                wc()[self.key][id].used()
            except KeyError:
                pass
        if wc().available():
            mainthread.execute(doit, async=True)
        self.cache_invalidate('feed')

    def setEntryUnread(self, id):
        self.db.execute("UPDATE feed SET read=0 WHERE id=?;", (id,))
        self.db.commit()
        self.cache_invalidate('feed')

    def markAllAsRead(self):
        self.db.execute("UPDATE feed SET read=1 WHERE read=0;")
        self.db.commit()
        self.cache_invalidate('feed')

    def isEntryRead(self, id):
        return self.lookup('feed', 'read', id) == 1

    def getTitle(self, id):
        return self.lookup('feed', 'title', id)

    def getContentLink(self, id):
        return self.db.execute("SELECT contentLink FROM feed WHERE id=?;", (id,)).fetchone()[0]

    def getExternalLink(self, id):
        return self.db.execute("SELECT link FROM feed WHERE id=?;", (id,)).fetchone()[0]

    def getDate(self, id):
        dateStamp = self.db.execute("SELECT date FROM feed WHERE id=?;", (id,)).fetchone()[0]
        return time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime(dateStamp))

    def getDateTuple(self, id):
        dateStamp = self.db.execute("SELECT date FROM feed WHERE id=?;", (id,)).fetchone()[0]
        return time.localtime(dateStamp)

    def getDateStamp(self, id):
        return self.db.execute("SELECT date FROM feed WHERE id=?;", (id,)).fetchone()[0]

    def generateUniqueId(self, entry):
        """
        Generate a stable identifier for the article.  For the same
        entry, this should result in the same identifier.  If
        possible, the identifier should remain the same even if the
        article is updated.
        """
        # Prefer the entry's id, which is supposed to be globally
        # unique.
        key = entry.get('id', None)
        if not key:
            # Next, try the link to the content.
            key = entry.get('link', None)
        if not key:
            # Ok, the title and the date concatenated are likely to be
            # relatively stable.  (Guard against either being missing:
            # the date may be a float and the title may be absent.)
            key = (entry.get('title') or '') + str(entry.get('date') or '')
        if not key:
            # Hmm, the article's content will at least guarantee no
            # false negatives (i.e., missing articles).
            key = entry.get('content', None)
        if not key:
            # If all else fails, just use a random number.
            key = str(random.random())
        return getId(key)

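    # For example, an entry {'id': 'urn:x:1', 'title': 'Hello'} always hashes
    # the same 'urn:x:1' key, so re-fetching the feed does not duplicate the
    # article even if its title changes.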
    def getIds(self, onlyUnread=False):
        if onlyUnread:
            rows = self.db.execute("SELECT id FROM feed WHERE read=0 ORDER BY date DESC;").fetchall()
        else:
            rows = self.db.execute("SELECT id FROM feed ORDER BY date DESC;").fetchall()
        ids = []
        for row in rows:
            ids.append(row[0])
        return ids

    def getNextId(self, id, forward=True):
        if forward:
            delta = 1
        else:
            delta = -1
        ids = self.getIds()
        index = ids.index(id)
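        # The modulo makes navigation wrap around: the entry after the
        # last is the first, and vice versa.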
        return ids[(index + delta) % len(ids)]

    def getPreviousId(self, id):
        return self.getNextId(id, forward=False)

    def getNumberOfUnreadItems(self):
        return self.db.execute("SELECT count(*) FROM feed WHERE read=0;").fetchone()[0]

    def getNumberOfEntries(self):
        return self.db.execute("SELECT count(*) FROM feed;").fetchone()[0]

    def getArticle(self, entry):
        title = entry['title']
        content = entry["content"]

        link = entry['link']
        author = entry['author']
        date = time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime(entry["date"]))

        text = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">'
        text += "<html><head><title>" + title + "</title>"
        text += '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>\n'
        text += '</head><body bgcolor="#ffffff"><div><a href="' + link + '">' + title + "</a>"
        if author is not None:
            text += "<BR /><small><i>Author: " + author + "</i></small>"
        text += "<BR /><small><i>Date: " + date + "</i></small></div>"
        text += "<BR /><BR />"
        text += content
        text += "</body></html>"
        return text

    def getContent(self, id):
        contentLink = self.db.execute("SELECT contentLink FROM feed WHERE id=?;", (id,)).fetchone()[0]
        try:
            file = open(contentLink)
            content = file.read()
            file.close()
        except:
            content = "Content unavailable"
        return content

    def extractDate(self, entry):
        if entry.has_key("updated_parsed"):
            return timegm(entry["updated_parsed"])
        elif entry.has_key("published_parsed"):
            return timegm(entry["published_parsed"])
        else:
            return time.time()

    def extractContent(self, entry):
        content = ""
        if entry.has_key('summary'):
            content = entry.get('summary', '')
        if entry.has_key('content'):
            if len(entry.content[0].value) > len(content):
                content = entry.content[0].value
        if content == "":
            content = entry.get('description', '')
        return content

    def removeEntry(self, id):
        contentLink = self.db.execute("SELECT contentLink FROM feed WHERE id=?;", (id,)).fetchone()[0]
        if contentLink:
            try:
                remove(contentLink)
            except OSError, exception:
                logger.error("Deleting %s: %s" % (contentLink, str(exception)))
        self.db.execute("DELETE FROM feed WHERE id=?;", (id,))
        self.db.execute("DELETE FROM images WHERE id=?;", (id,))
        self.db.commit()

        def doit():
            try:
                wc()[self.key][id].files_deleted(
                    woodchuck.DeletionResponse.Deleted)
                del wc()[self.key][id]
            except KeyError:
                pass
        if wc().available():
            mainthread.execute(doit, async=True)

class ArchivedArticles(Feed):
    def addArchivedArticle(self, title, link, date, configdir):
        id = self.generateUniqueId({"date": date, "title": title})
        values = (id, title, link, date, 0, link, 0)
        self.db.execute("INSERT INTO feed (id, title, contentLink, date, updated, link, read) VALUES (?, ?, ?, ?, ?, ?, ?);", values)
        self.db.commit()

    def updateFeed(self, configdir, url, etag, modified, expiryTime=24, proxy=None, imageCache=False):
        currentTime = 0
        rows = self.db.execute("SELECT id, link FROM feed WHERE updated=0;")
        for row in rows:
            currentTime = time.time()
            id = row[0]
            link = row[1]
            f = urllib2.urlopen(link)
            html = f.read()
            f.close()
            soup = BeautifulSoup(html)
            images = soup('img')
            baseurl = link
            for img in images:
                filename = self.addImage(configdir, self.key, baseurl, img['src'], proxy=proxy)
                img['src'] = filename
                self.db.execute("INSERT INTO images (id, imagePath) VALUES (?, ?);", (id, filename))
                self.db.commit()
            contentLink = configdir + self.key + ".d/" + id + ".html"
            file = open(contentLink, "w")
            file.write(soup.prettify())
            file.close()

            self.db.execute("UPDATE feed SET read=0, contentLink=?, updated=? WHERE id=?;", (contentLink, time.time(), id))
            self.db.commit()
        return (currentTime, None, None)

    def purgeReadArticles(self):
        rows = self.db.execute("SELECT id FROM feed WHERE read=1;")
        for row in rows:
            self.removeArticle(row[0])

    def removeArticle(self, id):
        rows = self.db.execute("SELECT imagePath FROM images WHERE id=?;", (id,))
        for row in rows:
            try:
                count = self.db.execute("SELECT count(*) FROM images WHERE id!=? AND imagePath=?;", (id, row[0])).fetchone()[0]
                if count == 0:
                    os.remove(row[0])
            except:
                pass
        self.removeEntry(id)

class Listing(BaseObject):
    # Columns to cache.
    cached_columns = (('feeds', 'updateTime'),
                      ('feeds', 'unread'),
                      ('feeds', 'title'),
                      ('categories', 'title'))

    def _getdb(self):
        try:
            db = self.tls.db
        except AttributeError:
            db = sqlite3.connect("%s/feeds.db" % self.configdir, timeout=120)
            self.tls.db = db
        return db
    db = property(_getdb)

    # Lists all the feeds in a dictionary, and exposes the data.
    def __init__(self, config, configdir):
        self.config = config
        self.configdir = configdir

        self.tls = threading.local()

        try:
            table = self.db.execute("SELECT sql FROM sqlite_master").fetchone()
            if table is None:
                self.db.execute("CREATE TABLE feeds(id text, url text, title text, unread int, updateTime float, rank int, etag text, modified text, widget int, category int);")
                self.db.execute("CREATE TABLE categories(id text, title text, unread int, rank int);")
                self.addCategory("Default Category")
                if isfile(self.configdir + "feeds.pickle"):
                    self.importOldFormatFeeds()
                else:
                    self.addFeed("Maemo News", "http://maemo.org/news/items.xml")
            else:
                if "WIDGET" not in table[0].upper():
                    self.db.execute("ALTER TABLE feeds ADD COLUMN widget int;")
                    self.db.execute("UPDATE feeds SET widget=1;")
                    self.db.commit()
                if "CATEGORY" not in table[0].upper():
                    self.db.execute("CREATE TABLE categories(id text, title text, unread int, rank int);")
                    self.addCategory("Default Category")
                    self.db.execute("ALTER TABLE feeds ADD COLUMN category int;")
                    self.db.execute("UPDATE feeds SET category=1;")
            self.db.commit()
        except:
            pass

        # Check that Woodchuck's state is up to date with respect to our
        # state.
        updater = os.path.basename(sys.argv[0]) == 'update_feeds.py'
        wc_init(self, updater)
        if wc().available() and updater:
            # The list of known streams.
            streams = wc().streams_list()
            stream_ids = [s.identifier for s in streams]

            # Register any unknown streams.  Remove known streams from
            # stream_ids.
            for key in self.getListOfFeeds():
                title = self.getFeedTitle(key)
                # XXX: We should also check whether the list of
                # articles/objects in each feed/stream is up to date.
                if key not in stream_ids:
                    logger.debug(
                        "Registering previously unknown channel: %s (%s)"
                        % (key, title,))
                    # Use a default refresh interval of 6 hours.
                    wc().stream_register(key, title, 6 * 60 * 60)
                else:
                    # Make sure the human readable name is up to date.
                    if wc()[key].human_readable_name != title:
                        wc()[key].human_readable_name = title
                    stream_ids.remove(key)

            # Unregister any streams that are no longer subscribed to.
            for id in stream_ids:
                logger.debug("Unregistering %s" % (id,))
                wc().stream_unregister(id)

    def importOldFormatFeeds(self):
        """This function loads feeds that are saved in an outdated format, and converts them to sqlite."""
        import rss
        listing = rss.Listing(self.configdir)
        rank = 0
        for id in listing.getListOfFeeds():
            try:
                rank += 1
                values = (id, listing.getFeedTitle(id), listing.getFeedUrl(id), 0, time.time(), rank, None, "None", 1)
                self.db.execute("INSERT INTO feeds (id, title, url, unread, updateTime, rank, etag, modified, widget, category) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, 1);", values)
                self.db.commit()

                feed = listing.getFeed(id)
                new_feed = self.getFeed(id)

                items = feed.getIds()[:]
                items.reverse()
                for item in items:
                    if feed.isEntryRead(item):
                        read_status = 1
                    else:
                        read_status = 0
                    date = timegm(feed.getDateTuple(item))
                    title = feed.getTitle(item)
                    newId = new_feed.generateUniqueId({"date": date, "title": title})
                    values = (newId, title, feed.getContentLink(item), date, time.time(), feed.getExternalLink(item), read_status)
                    new_feed.db.execute("INSERT INTO feed (id, title, contentLink, date, updated, link, read) VALUES (?, ?, ?, ?, ?, ?, ?);", values)
                    new_feed.db.commit()
                    try:
                        images = feed.getImages(item)
                        for image in images:
                            new_feed.db.execute("INSERT INTO images (id, imagePath) VALUES (?, ?);", (item, image))
                            new_feed.db.commit()
                    except:
                        pass
                self.updateUnread(id)
            except:
                logger.error("importOldFormatFeeds: %s"
                             % (traceback.format_exc(),))
        remove(self.configdir + "feeds.pickle")

    def addArchivedArticle(self, key, index):
        feed = self.getFeed(key)
        title = feed.getTitle(index)
        link = feed.getExternalLink(index)
        date = feed.getDate(index)
        count = self.db.execute("SELECT count(*) FROM feeds WHERE id=?;", ("ArchivedArticles",)).fetchone()[0]
        if count == 0:
            self.addFeed("Archived Articles", "", id="ArchivedArticles")

        archFeed = self.getFeed("ArchivedArticles")
        archFeed.addArchivedArticle(title, link, date, self.configdir)
        self.updateUnread("ArchivedArticles")

    def updateFeed(self, key, expiryTime=None, proxy=None, imageCache=None,
                   priority=0):
        if expiryTime is None:
            expiryTime = self.config.getExpiry()
        if not expiryTime:
            # Default to 24 hours.
            expiryTime = 24
        if proxy is None:
            (use_proxy, proxy) = self.config.getProxy()
            if not use_proxy:
                proxy = None
        if imageCache is None:
            imageCache = self.config.getImageCache()

        feed = self.getFeed(key)
        (url, etag, modified) = self.db.execute("SELECT url, etag, modified FROM feeds WHERE id=?;", (key,)).fetchone()
        try:
            modified = time.struct_time(eval(modified))
        except:
            modified = None
        feed.updateFeed(
            self.configdir, url, etag, modified, expiryTime, proxy, imageCache,
            priority, postFeedUpdateFunc=self._queuePostFeedUpdate)

    def _queuePostFeedUpdate(self, *args, **kwargs):
        mainthread.execute(self._postFeedUpdate, async=True, *args, **kwargs)

    def _postFeedUpdate(self, key, updateTime, etag, modified, title):
        if modified is None:
            modified = "None"
        else:
            modified = str(tuple(modified))
        if updateTime > 0:
            self.db.execute("UPDATE feeds SET updateTime=?, etag=?, modified=? WHERE id=?;", (updateTime, etag, modified, key))
        else:
            self.db.execute("UPDATE feeds SET etag=?, modified=? WHERE id=?;", (etag, modified, key))

        if title is not None:
            self.db.execute("UPDATE feeds SET title=(CASE WHEN title=='' THEN ? ELSE title END) WHERE id=?;",
                            (title, key))
        self.db.commit()
        self.cache_invalidate('feeds')
        self.updateUnread(key)

        update_server_object().ArticleCountUpdated()

        stats = JobManager().stats()
        global jobs_at_start
        completed = stats['jobs-completed'] - jobs_at_start
        in_progress = stats['jobs-in-progress']
        queued = stats['jobs-queued']

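        # A job in progress counts as half done; e.g. completed=3,
        # in_progress=2 and queued=5 gives 100 * (3 + 1.0) / 10 = 40%.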
        percent = (100 * (completed + in_progress / 2.)
                   / (completed + in_progress + queued))

        update_server_object().UpdateProgress(
            percent, completed, in_progress, queued, 0, 0, 0, key)

        if in_progress == 0 and queued == 0:
            jobs_at_start = stats['jobs-completed']

    def getFeed(self, key):
        if key == "ArchivedArticles":
            return ArchivedArticles(self.configdir, key)
        return Feed(self.configdir, key)

    def editFeed(self, key, title, url, category=None):
        if category:
            self.db.execute("UPDATE feeds SET title=?, url=?, category=? WHERE id=?;", (title, url, category, key))
        else:
            self.db.execute("UPDATE feeds SET title=?, url=? WHERE id=?;", (title, url, key))
        self.db.commit()
        self.cache_invalidate('feeds')

        if wc().available():
            try:
                wc()[key].human_readable_name = title
            except KeyError:
                logger.debug("Feed %s (%s) unknown." % (key, title))

    def getFeedUpdateTime(self, key):
        update_time = self.lookup('feeds', 'updateTime', key)

        if not update_time:
            return "Never"

        delta = time.time() - update_time

        delta_hours = delta / (60. * 60.)
        if delta_hours < .1:
            return "A few minutes ago"
        if delta_hours < .75:
            return "Less than an hour ago"
        if delta_hours < 1.5:
            return "About an hour ago"
        if delta_hours < 18:
            return "About %d hours ago" % (int(delta_hours + 0.5),)

        delta_days = delta_hours / 24.
        if delta_days < 1.5:
            return "About a day ago"
        if delta_days < 18:
            return "%d days ago" % (int(delta_days + 0.5),)

        delta_weeks = delta_days / 7.
        if delta_weeks <= 8:
            return "%d weeks ago" % int(delta_weeks + 0.5)

        delta_months = delta_days / 30.
        if delta_months <= 30:
            return "%d months ago" % int(delta_months + 0.5)

        return time.strftime("%x", time.gmtime(update_time))

    def getFeedNumberOfUnreadItems(self, key):
        return self.lookup('feeds', 'unread', key)

    def getFeedTitle(self, key):
        title = self.lookup('feeds', 'title', key)
        if title:
            return title

        return self.getFeedUrl(key)

    def getFeedUrl(self, key):
        return self.db.execute("SELECT url FROM feeds WHERE id=?;", (key,)).fetchone()[0]

    def getFeedCategory(self, key):
        return self.db.execute("SELECT category FROM feeds WHERE id=?;", (key,)).fetchone()[0]

    def getListOfFeeds(self, category=None):
        if category:
            rows = self.db.execute("SELECT id FROM feeds WHERE category=? ORDER BY rank;", (category,))
        else:
            rows = self.db.execute("SELECT id FROM feeds ORDER BY rank;")
        keys = []
        for row in rows:
            if row[0]:
                keys.append(row[0])
        return keys

    def getListOfCategories(self):
        return list(row[0] for row in self.db.execute(
                "SELECT id FROM categories ORDER BY rank;"))

    def getCategoryTitle(self, id):
        return self.lookup('categories', 'title', id)

    def getSortedListOfKeys(self, order, onlyUnread=False, category=1):
        if order == "Most unread":
            tmp = "ORDER BY unread DESC"
        elif order == "Least unread":
            tmp = "ORDER BY unread"
        elif order == "Most recent":
            tmp = "ORDER BY updateTime DESC"
        elif order == "Least recent":
            tmp = "ORDER BY updateTime"
        else: # order == "Manual" or invalid value...
            tmp = "ORDER BY rank"
        if onlyUnread:
            sql = "SELECT id FROM feeds WHERE unread>0 AND category=%s " % category + tmp
        else:
            sql = "SELECT id FROM feeds WHERE category=%s " % category + tmp
        rows = self.db.execute(sql)
        keys = []
        for row in rows:
            if row[0]:
                keys.append(row[0])
        return keys

    def getFavicon(self, key):
        filename = "%s%s.d/favicon.ico" % (self.configdir, key)
        if isfile(filename):
            return filename
        else:
            return False

    def updateUnread(self, key):
        feed = self.getFeed(key)
        self.db.execute("UPDATE feeds SET unread=? WHERE id=?;", (feed.getNumberOfUnreadItems(), key))
        self.db.commit()
        self.cache_invalidate('feeds')

    def addFeed(self, title, url, id=None, category=1):
        if not id:
            id = getId(url)
        count = self.db.execute("SELECT count(*) FROM feeds WHERE id=?;", (id,)).fetchone()[0]
        if count == 0:
            max_rank = self.db.execute("SELECT MAX(rank) FROM feeds;").fetchone()[0]
            if max_rank is None:
                max_rank = 0
            values = (id, title, url, 0, 0, max_rank + 1, None, "None", 1, category)
            self.db.execute("INSERT INTO feeds (id, title, url, unread, updateTime, rank, etag, modified, widget, category) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?);", values)
            self.db.commit()
            # Ask for the feed object; it will create the necessary tables.
            self.getFeed(id)

            if wc().available():
                # Register the stream with Woodchuck.  Update approximately
                # every 6 hours.
                wc().stream_register(stream_identifier=id,
                                     human_readable_name=title,
                                     freshness=6*60*60)

            return True
        else:
            return False

    def addCategory(self, title):
        rank = self.db.execute("SELECT MAX(rank)+1 FROM categories;").fetchone()[0]
        if rank is None:
            rank = 1
        id = self.db.execute("SELECT MAX(id)+1 FROM categories;").fetchone()[0]
        if id is None:
            id = 1
        self.db.execute("INSERT INTO categories (id, title, unread, rank) VALUES (?, ?, 0, ?)", (id, title, rank))
        self.db.commit()

    def removeFeed(self, key):
        if wc().available():
            try:
                del wc()[key]
            except KeyError:
                logger.debug("Removing unregistered feed %s failed" % (key,))

        rank = self.db.execute("SELECT rank FROM feeds WHERE id=?;", (key,)).fetchone()[0]
        self.db.execute("DELETE FROM feeds WHERE id=?;", (key,))
        self.db.execute("UPDATE feeds SET rank=rank-1 WHERE rank>?;", (rank,))
        self.db.commit()

        if isdir(self.configdir + key + ".d/"):
            rmtree(self.configdir + key + ".d/")

    def removeCategory(self, key):
        if self.db.execute("SELECT count(*) FROM categories;").fetchone()[0] > 1:
            rank = self.db.execute("SELECT rank FROM categories WHERE id=?;", (key,)).fetchone()[0]
            self.db.execute("DELETE FROM categories WHERE id=?;", (key,))
            self.db.execute("UPDATE categories SET rank=rank-1 WHERE rank>?;", (rank,))
            self.db.execute("UPDATE feeds SET category=1 WHERE category=?;", (key,))
            self.db.commit()

    #def saveConfig(self):
    #    self.listOfFeeds["feedingit-order"] = self.sortedKeys
    #    file = open(self.configdir+"feeds.pickle", "w")
    #    pickle.dump(self.listOfFeeds, file)
    #    file.close()

    def moveUp(self, key):
        rank = self.db.execute("SELECT rank FROM feeds WHERE id=?;", (key,)).fetchone()[0]
        if rank > 0:
            self.db.execute("UPDATE feeds SET rank=? WHERE rank=?;", (rank, rank - 1))
            self.db.execute("UPDATE feeds SET rank=? WHERE id=?;", (rank - 1, key))
            self.db.commit()

    def moveCategoryUp(self, key):
        rank = self.db.execute("SELECT rank FROM categories WHERE id=?;", (key,)).fetchone()[0]
        if rank > 0:
            self.db.execute("UPDATE categories SET rank=? WHERE rank=?;", (rank, rank - 1))
            self.db.execute("UPDATE categories SET rank=? WHERE id=?;", (rank - 1, key))
            self.db.commit()

    def moveDown(self, key):
        rank = self.db.execute("SELECT rank FROM feeds WHERE id=?;", (key,)).fetchone()[0]
        max_rank = self.db.execute("SELECT MAX(rank) FROM feeds;").fetchone()[0]
        if rank < max_rank:
            self.db.execute("UPDATE feeds SET rank=? WHERE rank=?;", (rank, rank + 1))
            self.db.execute("UPDATE feeds SET rank=? WHERE id=?;", (rank + 1, key))
            self.db.commit()

    def moveCategoryDown(self, key):
        rank = self.db.execute("SELECT rank FROM categories WHERE id=?;", (key,)).fetchone()[0]
        max_rank = self.db.execute("SELECT MAX(rank) FROM categories;").fetchone()[0]
        if rank < max_rank:
            self.db.execute("UPDATE categories SET rank=? WHERE rank=?;", (rank, rank + 1))
            self.db.execute("UPDATE categories SET rank=? WHERE id=?;", (rank + 1, key))
            self.db.commit()