When computing the progress, avoid division by zero.
[feedingit] / src / rss_sqlite.py
1 #!/usr/bin/env python2.5
2
3
4 # Copyright (c) 2007-2008 INdT.
5 # Copyright (c) 2011 Neal H. Walfield
6 # This program is free software: you can redistribute it and/or modify
7 # it under the terms of the GNU Lesser General Public License as published by
8 # the Free Software Foundation, either version 3 of the License, or
9 # (at your option) any later version.
10 #
11 #  This program is distributed in the hope that it will be useful,
12 #  but WITHOUT ANY WARRANTY; without even the implied warranty of
13 #  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 #  GNU Lesser General Public License for more details.
15 #
16 #  You should have received a copy of the GNU Lesser General Public License
17 #  along with this program.  If not, see <http://www.gnu.org/licenses/>.
18 #
19
20 # ============================================================================
21 # Name        : FeedingIt.py
22 # Author      : Yves Marcoz
23 # Version     : 0.5.4
24 # Description : Simple RSS Reader
25 # ============================================================================
26
27 import sqlite3
28 from os.path import isfile, isdir
29 from shutil import rmtree
30 from os import mkdir, remove, utime
31 import os
32 import md5
33 import feedparser
34 import time
35 import urllib2
36 from BeautifulSoup import BeautifulSoup
37 from urlparse import urljoin
38 from calendar import timegm
39 import threading
40 import traceback
41 from wc import wc, wc_init, woodchuck
42 import subprocess
43 import dbus
44 from updatedbus import update_server_object
45
46 from jobmanager import JobManager
47 import mainthread
48 from httpprogresshandler import HTTPProgressHandler
49 import random
50 import sys
51 import logging
52 logger = logging.getLogger(__name__)
53
def getId(string):
    """Return a stable identifier for an article/image: the MD5 hex digest
    of *string*.

    Uses hashlib instead of the md5 module, which is deprecated since
    Python 2.5 (hashlib.md5 produces the identical digest, so existing
    cache filenames remain valid).
    """
    import hashlib
    return hashlib.md5(string).hexdigest()
56
def download_callback(connection):
    """Abort an in-progress download when the job manager is shutting down.

    Installed as the progress callback of HTTPProgressHandler; raising
    KeyboardInterrupt unwinds the transfer cleanly.
    """
    if not JobManager().do_quit:
        return
    raise KeyboardInterrupt
60
def downloader(progress_handler=None, proxy=None):
    """Build a urllib2 opener, optionally with a progress handler and proxy.

    When no progress handler is supplied, a default one is installed that
    honours the job manager's quit flag (see download_callback).
    """
    handlers = []

    if progress_handler:
        handlers.append(progress_handler)
    else:
        handlers.append(HTTPProgressHandler(download_callback))

    if proxy:
        handlers.append(proxy)

    return urllib2.build_opener(*handlers)
73
# If not None, a subprocess.Popen object corresponding to a
# update_feeds.py process.
update_feed_process = None

# Cached dbus interface to the update_feeds daemon.  Reset to None whenever
# a call fails or the daemon is (re)spawned so the next call rebinds.
update_feeds_iface = None

# NOTE(review): presumably the number of jobs pending when the process
# started; its consumers are not visible in this chunk — confirm elsewhere.
jobs_at_start = 0
81
class Feed:
    """A single RSS/Atom feed backed by a per-feed sqlite database.

    Articles live in table `feed`, cached image paths in table `images`;
    rendered HTML and image files live under <configdir>/<key>.d/.
    """

    # Serializes the database-heavy phase of feed updates across worker
    # threads (released around parallel image downloads in _updateFeed).
    serial_execution_lock = threading.Lock()

    def _getdb(self):
        """Return this thread's sqlite connection to the feed's database.

        sqlite3 connections must not be shared across threads, so one
        connection is lazily created and cached per thread in self.tls.
        """
        try:
            db = self.tls.db
        except AttributeError:
            db = sqlite3.connect("%s/%s.db" % (self.dir, self.key), timeout=120)
            self.tls.db = db
        return db
    db = property(_getdb)
93
94     def __init__(self, configdir, key):
95         self.key = key
96         self.configdir = configdir
97         self.dir = "%s/%s.d" %(self.configdir, self.key)
98         self.tls = threading.local ()
99
100         if not isdir(self.dir):
101             mkdir(self.dir)
102         if not isfile("%s/%s.db" %(self.dir, self.key)):
103             self.db.execute("CREATE TABLE feed (id text, title text, contentLink text, date float, updated float, link text, read int);")
104             self.db.execute("CREATE TABLE images (id text, imagePath text);")
105             self.db.commit()
106
107     def addImage(self, configdir, key, baseurl, url, proxy=None, opener=None):
108         filename = configdir+key+".d/"+getId(url)
109         if not isfile(filename):
110             try:
111                 if not opener:
112                     opener = downloader(proxy=proxy)
113
114                 abs_url = urljoin(baseurl,url)
115                 f = opener.open(abs_url)
116                 outf = open(filename, "w")
117                 outf.write(f.read())
118                 f.close()
119                 outf.close()
120             except (urllib2.HTTPError, urllib2.URLError, IOError), exception:
121                 logger.info("Could not download image %s: %s"
122                             % (abs_url, str (exception)))
123                 return None
124             except:
125                 exception = sys.exc_info()[0]
126
127                 logger.info("Downloading image %s: %s" %
128                             (abs_url, traceback.format_exc()))
129                 try:
130                     remove(filename)
131                 except OSError:
132                     pass
133
134                 raise exception
135         else:
136             #open(filename,"a").close()  # "Touch" the file
137             file = open(filename,"a")
138             utime(filename, None)
139             file.close()
140         return filename
141
    def updateFeed(self, configdir, url, etag, modified, expiryTime=24, proxy=None, imageCache=False, priority=0, postFeedUpdateFunc=None, *postFeedUpdateFuncArgs):
        """Schedule an update of this feed.

        Inside the update_feeds.py daemon the work is queued directly on
        the JobManager; in any other process the daemon is asked via dbus
        to perform the update, spawning the daemon first if it is not
        running.  *postFeedUpdateFunc* (plus extra args) is forwarded to
        _updateFeed and invoked when the update finishes.
        """
        if (os.path.basename(sys.argv[0]) == 'update_feeds.py'):
            # We *are* the update daemon: run the update in-process.
            def doit():
                def it():
                    self._updateFeed(configdir, url, etag, modified, expiryTime, proxy, imageCache, postFeedUpdateFunc, *postFeedUpdateFuncArgs)
                return it
            JobManager().execute(doit(), self.key, priority=priority)
        else:
            def send_update_request():
                # Ask the daemon (over dbus) to update this feed.  Returns
                # True on success; on failure drops the cached interface so
                # the next attempt rebinds to a fresh daemon.
                global update_feeds_iface
                if update_feeds_iface is None:
                    bus=dbus.SessionBus()
                    remote_object = bus.get_object(
                        "org.marcoz.feedingit", # Connection name
                        "/org/marcoz/feedingit/update" # Object's path
                        )
                    update_feeds_iface = dbus.Interface(
                        remote_object, 'org.marcoz.feedingit')

                try:
                    update_feeds_iface.Update(self.key)
                except Exception, e:
                    logger.error("Invoking org.marcoz.feedingit.Update: %s"
                                 % str(e))
                    update_feeds_iface = None
                else:
                    return True

            if send_update_request():
                # Success!  It seems we were able to start the update
                # daemon via dbus (or, it was already running).
                return

            global update_feed_process
            if (update_feed_process is None
                or update_feed_process.poll() is not None):
                # The update_feeds process is not running.  Start it.
                update_feeds = os.path.join(os.path.dirname(__file__),
                                            'update_feeds.py')
                argv = ['/usr/bin/env', 'python', update_feeds, '--daemon' ]
                logger.debug("Starting update_feeds: running %s"
                             % (str(argv),))
                update_feed_process = subprocess.Popen(argv)
                # Make sure the dbus calls go to the right process:
                # rebind.
                update_feeds_iface = None

            # Give the freshly spawned daemon a few seconds to come up,
            # retrying the dbus request once per second.
            for _ in xrange(5):
                if send_update_request():
                    break
                time.sleep(1)
193
    def _updateFeed(self, configdir, url, etag, modified, expiryTime=24, proxy=None, imageCache=False, postFeedUpdateFunc=None, *postFeedUpdateFuncArgs):
        """Fetch, parse and store the feed (runs inside the update daemon).

        Downloads the feed at *url* (honouring *etag*/*modified* for
        conditional GETs), writes new and updated articles — and, when
        *imageCache* is set, their images — into the per-feed database and
        cache directory, expires old entries and cache files, and reports
        the transfer to woodchuck when available.  *postFeedUpdateFunc*,
        if given, is always invoked at the end with (key, updateTime, etag,
        modified, title, *postFeedUpdateFuncArgs).
        """
        success = False
        have_serial_execution_lock = False
        try:
            download_start = time.time ()

            progress_handler = HTTPProgressHandler(download_callback)

            openers = [progress_handler]
            if proxy:
                openers.append (proxy)
            kwargs = {'handlers':openers}

            # Conditional GET: feedparser reports status 304 (and returns
            # no entries) when nothing changed since etag/modified.
            tmp=feedparser.parse(url, etag=etag, modified=modified, **kwargs)
            download_duration = time.time () - download_start

            opener = downloader(progress_handler, proxy)

            if JobManager().do_quit:
                raise KeyboardInterrupt

            process_start = time.time()

            # Expiry time is in hours
            expiry = float(expiryTime) * 3600.

            currentTime = 0

            have_woodchuck = mainthread.execute (wc().available)

            def wc_success():
                # Report a successful stream update to woodchuck.  Runs on
                # the main thread (dbus is not thread-safe here).
                try:
                    wc().stream_register (self.key, "", 6 * 60 * 60)
                except woodchuck.ObjectExistsError:
                    pass
                try:
                    wc()[self.key].updated (
                        indicator=(woodchuck.Indicator.ApplicationVisual
                                   |woodchuck.Indicator.StreamWide),
                        transferred_down=progress_handler.stats['received'],
                        transferred_up=progress_handler.stats['sent'],
                        transfer_time=download_start,
                        transfer_duration=download_duration,
                        new_objects=len (tmp.entries),
                        objects_inline=len (tmp.entries))
                except KeyError:
                    logger.warn(
                        "Failed to register update of %s with woodchuck!"
                        % (self.key))

            http_status = tmp.get ('status', 200)

            # Check if the parse was succesful.  If the http status code
            # is 304, then the download was successful, but there is
            # nothing new.  Indeed, no content is returned.  This make a
            # 304 look like an error because there are no entries and the
            # parse fails.  But really, everything went great!  Check for
            # this first.
            if http_status == 304:
                logger.debug("%s: No changes to feed." % (self.key,))
                mainthread.execute (wc_success, async=True)
                success = True
            elif len(tmp["entries"])==0 and not tmp.version:
                # An error occured fetching or parsing the feed.  (Version
                # will be either None if e.g. the connection timed our or
                # '' if the data is not a proper feed)
                logger.error(
                    "Error fetching %s: version is: %s: error: %s"
                    % (url, str (tmp.version),
                       str (tmp.get ('bozo_exception', 'Unknown error'))))
                logger.debug(tmp)
                if have_woodchuck:
                    def e():
                        # Map the http status onto a woodchuck failure
                        # code and record the failed update.
                        logger.debug("%s: stream update failed!" % self.key)

                        try:
                            # It's not easy to get the feed's title from here.
                            # At the latest, the next time the application is
                            # started, we'll fix up the human readable name.
                            wc().stream_register (self.key, "", 6 * 60 * 60)
                        except woodchuck.ObjectExistsError:
                            pass
                        ec = woodchuck.TransferStatus.TransientOther
                        if 300 <= http_status and http_status < 400:
                            ec = woodchuck.TransferStatus.TransientNetwork
                        if 400 <= http_status and http_status < 500:
                            ec = woodchuck.TransferStatus.FailureGone
                        if 500 <= http_status and http_status < 600:
                            ec = woodchuck.TransferStatus.TransientNetwork
                        wc()[self.key].update_failed(ec)
                    mainthread.execute (e, async=True)
            else:
                currentTime = time.time()
                # The etag and modified value should only be updated if the content was not null
                try:
                    etag = tmp["etag"]
                except KeyError:
                    etag = None
                try:
                    modified = tmp["modified"]
                except KeyError:
                    modified = None
                # Best-effort favicon refresh; failure is only logged.
                try:
                    abs_url = urljoin(tmp["feed"]["link"],"/favicon.ico")
                    f = opener.open(abs_url)
                    data = f.read()
                    f.close()
                    outf = open(self.dir+"/favicon.ico", "w")
                    outf.write(data)
                    outf.close()
                    del data
                except (urllib2.HTTPError, urllib2.URLError), exception:
                    logger.debug("Could not download favicon %s: %s"
                                 % (abs_url, str (exception)))

                self.serial_execution_lock.acquire ()
                have_serial_execution_lock = True

                ids = self.getIds()

                # Process oldest entries first.
                tmp["entries"].reverse()
                for entry in tmp["entries"]:
                    # Yield so as to make the main thread a bit more
                    # responsive.
                    time.sleep(0)

                    if JobManager().do_quit:
                        raise KeyboardInterrupt

                    # Snapshot the transfer counters so this entry's share
                    # of the traffic can be reported to woodchuck below.
                    received_base = progress_handler.stats['received']
                    sent_base = progress_handler.stats['sent']
                    object_size = 0

                    date = self.extractDate(entry)
                    # Fill in defaults for fields the feed may omit.
                    try:
                        entry["title"]
                    except KeyError:
                        entry["title"] = "No Title"
                    try :
                        entry["link"]
                    except KeyError:
                        entry["link"] = ""
                    try:
                        entry["author"]
                    except KeyError:
                        entry["author"] = None
                    if(not(entry.has_key("id"))):
                        entry["id"] = None
                    content = self.extractContent(entry)
                    object_size = len (content)
                    received_base -= len (content)
                    tmpEntry = {"title":entry["title"], "content":content,
                                 "date":date, "link":entry["link"], "author":entry["author"], "id":entry["id"]}
                    id = self.generateUniqueId(tmpEntry)

                    soup = BeautifulSoup(self.getArticle(tmpEntry)) #tmpEntry["content"])
                    images = soup('img')
                    baseurl = tmpEntry["link"]
                    if imageCache and len(images) > 0:
                        # Image downloads may run concurrently with other
                        # feeds: drop the serialization lock while fetching.
                        self.serial_execution_lock.release ()
                        have_serial_execution_lock = False
                        for img in images:
                             filename = self.addImage(configdir, self.key, baseurl, img['src'], proxy=proxy)
                             if filename:
                                 # Point the article at the cached copy.
                                 img['src']="file://%s" %filename
                                 count = self.db.execute("SELECT count(1) FROM images where id=? and imagePath=?;", (id, filename )).fetchone()[0]
                                 if count == 0:
                                     self.db.execute("INSERT INTO images (id, imagePath) VALUES (?, ?);", (id, filename) )
                                     self.db.commit()

                                 try:
                                     object_size += os.path.getsize (filename)
                                 except os.error, exception:
                                     logger.error ("Error getting size of %s: %s"
                                                   % (filename, exception))
                        self.serial_execution_lock.acquire ()
                        have_serial_execution_lock = True

                    # Write the rendered article to disk and record it in
                    # the database (update the timestamp if already known).
                    tmpEntry["contentLink"] = configdir+self.key+".d/"+id+".html"
                    file = open(tmpEntry["contentLink"], "w")
                    file.write(soup.prettify())
                    file.close()
                    if id in ids:
                        self.db.execute("UPDATE feed SET updated=? WHERE id=?;", (currentTime, id) )
                        self.db.commit()
                    else:
                        values = (id, tmpEntry["title"], tmpEntry["contentLink"], tmpEntry["date"], currentTime, tmpEntry["link"], 0)
                        self.db.execute("INSERT INTO feed (id, title, contentLink, date, updated, link, read) VALUES (?, ?, ?, ?, ?, ?, ?);", values)
                        self.db.commit()

                    # Register the object with Woodchuck and mark it as
                    # downloaded.
                    if have_woodchuck:
                        def e():
                            try:
                                obj = wc()[self.key].object_register(
                                    object_identifier=id,
                                    human_readable_name=tmpEntry["title"])
                            except woodchuck.ObjectExistsError:
                                obj = wc()[self.key][id]
                            else:
                                # If the entry does not contain a publication
                                # time, the attribute won't exist.
                                pubtime = entry.get ('date_parsed', None)
                                if pubtime:
                                    obj.publication_time = time.mktime (pubtime)

                                received = (progress_handler.stats['received']
                                            - received_base)
                                sent = progress_handler.stats['sent'] - sent_base
                                obj.transferred (
                                    indicator=(woodchuck.Indicator.ApplicationVisual
                                               |woodchuck.Indicator.StreamWide),
                                    transferred_down=received,
                                    transferred_up=sent,
                                    object_size=object_size)
                        mainthread.execute(e, async=True)
                self.db.commit()

                logger.debug (
                    "%s: Update successful: transferred: %d/%d; objects: %d)"
                    % (self.key,
                       progress_handler.stats['sent'],
                       progress_handler.stats['received'],
                       len (tmp.entries)))
                mainthread.execute (wc_success, async=True)
                success = True

            # Expire stale articles: unread ones after two expiry periods,
            # read ones after one.
            rows = self.db.execute("SELECT id FROM feed WHERE (read=0 AND updated<?) OR (read=1 AND updated<?);", (currentTime-2*expiry, currentTime-expiry))
            for row in rows:
               self.removeEntry(row[0])

            from glob import glob
            from os import stat
            # Garbage-collect cache files (images etc.) whose mtime is
            # older than three expiry periods.
            for file in glob(configdir+self.key+".d/*"):
                stats = stat(file)
                # put the two dates into matching format
                lastmodDate = stats[8]
                expDate = time.time()-expiry*3
                # check if image-last-modified-date is outdated
                if expDate > lastmodDate:
                    try:
                        # XXX: Tell woodchuck.
                        remove(file)
                    except OSError, exception:
                        logger.error('Could not remove %s: %s'
                                     % (file, str (exception)))
            logger.debug("updated %s: %fs in download, %fs in processing"
                         % (self.key, download_duration,
                            time.time () - process_start))
        except:
            logger.error("Updating %s: %s" % (self.key, traceback.format_exc()))
        finally:
            self.db.commit ()

            if have_serial_execution_lock:
                self.serial_execution_lock.release ()

            updateTime = 0
            try:
                rows = self.db.execute("SELECT MAX(date) FROM feed;")
                for row in rows:
                    updateTime=row[0]
            except Exception, e:
                logger.error("Fetching update time: %s: %s"
                             % (str(e), traceback.format_exc()))
            finally:
                if not success:
                    # Don't advance the conditional-GET state on failure.
                    etag = None
                    modified = None
                title = None
                try:
                    title = tmp.feed.title
                except (AttributeError, UnboundLocalError), exception:
                    pass
                if postFeedUpdateFunc is not None:
                    postFeedUpdateFunc (self.key, updateTime, etag, modified,
                                        title, *postFeedUpdateFuncArgs)
503
504     def setEntryRead(self, id):
505         self.db.execute("UPDATE feed SET read=1 WHERE id=?;", (id,) )
506         self.db.commit()
507
508         def e():
509             if wc().available():
510                 try:
511                     wc()[self.key][id].used()
512                 except KeyError:
513                     pass
514
515     def setEntryUnread(self, id):
516         self.db.execute("UPDATE feed SET read=0 WHERE id=?;", (id,) )
517         self.db.commit()     
518         
519     def markAllAsRead(self):
520         self.db.execute("UPDATE feed SET read=1 WHERE read=0;")
521         self.db.commit()
522
523     def isEntryRead(self, id):
524         read_status = self.db.execute("SELECT read FROM feed WHERE id=?;", (id,) ).fetchone()[0]
525         return read_status==1  # Returns True if read==1, and False if read==0
526     
527     def getTitle(self, id):
528         return self.db.execute("SELECT title FROM feed WHERE id=?;", (id,) ).fetchone()[0]
529     
530     def getContentLink(self, id):
531         return self.db.execute("SELECT contentLink FROM feed WHERE id=?;", (id,) ).fetchone()[0]
532     
533     def getExternalLink(self, id):
534         return self.db.execute("SELECT link FROM feed WHERE id=?;", (id,) ).fetchone()[0]
535     
536     def getDate(self, id):
537         dateStamp = self.db.execute("SELECT date FROM feed WHERE id=?;", (id,) ).fetchone()[0]
538         return time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime(dateStamp))
539
540     def getDateTuple(self, id):
541         dateStamp = self.db.execute("SELECT date FROM feed WHERE id=?;", (id,) ).fetchone()[0]
542         return time.localtime(dateStamp)
543     
544     def getDateStamp(self, id):
545         return self.db.execute("SELECT date FROM feed WHERE id=?;", (id,) ).fetchone()[0]
546     
547     def generateUniqueId(self, entry):
548         """
549         Generate a stable identifier for the article.  For the same
550         entry, this should result in the same identifier.  If
551         possible, the identifier should remain the same even if the
552         article is updated.
553         """
554         # Prefer the entry's id, which is supposed to be globally
555         # unique.
556         key = entry.get('id', None)
557         if not key:
558             # Next, try the link to the content.
559             key = entry.get('link', None)
560         if not key:
561             # Ok, the title and the date concatenated are likely to be
562             # relatively stable.
563             key = entry.get('title', None) + entry.get('date', None)
564         if not key:
565             # Hmm, the article's content will at least guarantee no
566             # false negatives (i.e., missing articles)
567             key = entry.get('content', None)
568         if not key:
569             # If all else fails, just use a random number.
570             key = str (random.random ())
571         return getId (key)
572     
573     def getIds(self, onlyUnread=False):
574         if onlyUnread:
575             rows = self.db.execute("SELECT id FROM feed where read=0 ORDER BY date DESC;").fetchall()
576         else:
577             rows = self.db.execute("SELECT id FROM feed ORDER BY date DESC;").fetchall()
578         ids = []
579         for row in rows:
580             ids.append(row[0])
581         #ids.reverse()
582         return ids
583     
584     def getNextId(self, id):
585         ids = self.getIds()
586         index = ids.index(id)
587         return ids[(index+1)%len(ids)]
588         
589     def getPreviousId(self, id):
590         ids = self.getIds()
591         index = ids.index(id)
592         return ids[(index-1)%len(ids)]
593     
594     def getNumberOfUnreadItems(self):
595         return self.db.execute("SELECT count(*) FROM feed WHERE read=0;").fetchone()[0]
596     
597     def getNumberOfEntries(self):
598         return self.db.execute("SELECT count(*) FROM feed;").fetchone()[0]
599
600     def getArticle(self, entry):
601         #self.setEntryRead(id)
602         #entry = self.entries[id]
603         title = entry['title']
604         #content = entry.get('content', entry.get('summary_detail', {}))
605         content = entry["content"]
606
607         link = entry['link']
608         author = entry['author']
609         date = time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime(entry["date"]) )
610
611         #text = '''<div style="color: black; background-color: white;">'''
612         text = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">'
613         text += "<html><head><title>" + title + "</title>"
614         text += '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>\n'
615         #text += '<style> body {-webkit-user-select: none;} </style>'
616         text += '</head><body bgcolor=\"#ffffff\"><div><a href=\"' + link + '\">' + title + "</a>"
617         if author != None:
618             text += "<BR /><small><i>Author: " + author + "</i></small>"
619         text += "<BR /><small><i>Date: " + date + "</i></small></div>"
620         text += "<BR /><BR />"
621         text += content
622         text += "</body></html>"
623         return text
624    
625     def getContent(self, id):
626         contentLink = self.db.execute("SELECT contentLink FROM feed WHERE id=?;", (id,)).fetchone()[0]
627         try:
628             file = open(self.entries[id]["contentLink"])
629             content = file.read()
630             file.close()
631         except:
632             content = "Content unavailable"
633         return content
634     
635     def extractDate(self, entry):
636         if entry.has_key("updated_parsed"):
637             return timegm(entry["updated_parsed"])
638         elif entry.has_key("published_parsed"):
639             return timegm(entry["published_parsed"])
640         else:
641             return time.time()
642         
643     def extractContent(self, entry):
644         content = ""
645         if entry.has_key('summary'):
646             content = entry.get('summary', '')
647         if entry.has_key('content'):
648             if len(entry.content[0].value) > len(content):
649                 content = entry.content[0].value
650         if content == "":
651             content = entry.get('description', '')
652         return content
653     
654     def removeEntry(self, id):
655         contentLink = self.db.execute("SELECT contentLink FROM feed WHERE id=?;", (id,)).fetchone()[0]
656         if contentLink:
657             try:
658                 remove(contentLink)
659             except OSError, exception:
660                 logger.error("Deleting %s: %s" % (contentLink, str (exception)))
661         self.db.execute("DELETE FROM feed WHERE id=?;", (id,) )
662         self.db.execute("DELETE FROM images WHERE id=?;", (id,) )
663         self.db.commit()
664
665         def e():
666             if wc().available():
667                 try:
668                     wc()[self.key][id].files_deleted (
669                         woodchuck.DeletionResponse.Deleted)
670                     del wc()[self.key][id]
671                 except KeyError:
672                     pass
673         mainthread.execute (e, async=True)
674  
class ArchivedArticles(Feed):
    """A pseudo-feed holding the articles the user explicitly archived."""

    def addArchivedArticle(self, title, link, date, configdir):
        """Insert a stub row for an archived article; its page content is
        fetched later by updateFeed (updated=0 marks it pending)."""
        id = self.generateUniqueId({"date":date, "title":title})
        row = (id, title, link, date, 0, link, 0)
        self.db.execute("INSERT INTO feed (id, title, contentLink, date, updated, link, read) VALUES (?, ?, ?, ?, ?, ?, ?);", row)
        self.db.commit()
681
    def updateFeed(self, configdir, url, etag, modified, expiryTime=24, proxy=None, imageCache=False):
        """Fetch the full page for every not-yet-downloaded archived article.

        Rows with updated=0 are pending: their page is downloaded, images
        are cached locally and the article is marked unread.  Returns
        (lastUpdateTime, None, None); the Nones stand in for the
        etag/modified values Feed.updateFeed would produce.
        """
        currentTime = 0
        rows = self.db.execute("SELECT id, link FROM feed WHERE updated=0;")
        for row in rows:
            currentTime = time.time()
            id = row[0]
            link = row[1]
            f = urllib2.urlopen(link)
            html = f.read()
            f.close()
            soup = BeautifulSoup(html)
            images = soup('img')
            baseurl = link
            # Cache each referenced image and rewrite the reference to the
            # local path.
            for img in images:
                filename = self.addImage(configdir, self.key, baseurl, img['src'], proxy=proxy)
                img['src']=filename
                self.db.execute("INSERT INTO images (id, imagePath) VALUES (?, ?);", (id, filename) )
                self.db.commit()
            # Write the rewritten page into the cache directory.
            contentLink = configdir+self.key+".d/"+id+".html"
            file = open(contentLink, "w")
            file.write(soup.prettify())
            file.close()

            self.db.execute("UPDATE feed SET read=0, contentLink=?, updated=? WHERE id=?;", (contentLink, time.time(), id) )
            self.db.commit()
        return (currentTime, None, None)
709     
710     def purgeReadArticles(self):
711         rows = self.db.execute("SELECT id FROM feed WHERE read=1;")
712         #ids = self.getIds()
713         for row in rows:
714             self.removeArticle(row[0])
715
716     def removeArticle(self, id):
717         rows = self.db.execute("SELECT imagePath FROM images WHERE id=?;", (id,) )
718         for row in rows:
719             try:
720                 count = self.db.execute("SELECT count(*) FROM images WHERE id!=? and imagePath=?;", (id,row[0]) ).fetchone()[0]
721                 if count == 0:
722                     os.remove(row[0])
723             except:
724                 pass
725         self.removeEntry(id)
726
727 class Listing:
    def _getdb(self):
        """Return this thread's sqlite connection, creating it on first use.

        sqlite3 connections must not be shared between threads, so one
        connection per thread is cached in thread-local storage.
        """
        try:
            db = self.tls.db
        except AttributeError:
            # First access from this thread: open and cache a connection.
            db = sqlite3.connect("%s/feeds.db" % self.configdir, timeout=120)
            self.tls.db = db
        return db
    # Expose the per-thread connection as a read-only attribute.
    db = property(_getdb)
736
    # Lists all the feeds in a dictionary, and exposes the data
738     def __init__(self, config, configdir):
739         self.config = config
740         self.configdir = configdir
741
742         self.tls = threading.local ()
743         
744         try:
745             table = self.db.execute("SELECT sql FROM sqlite_master").fetchone()
746             if table == None:
747                 self.db.execute("CREATE TABLE feeds(id text, url text, title text, unread int, updateTime float, rank int, etag text, modified text, widget int, category int);")
748                 self.db.execute("CREATE TABLE categories(id text, title text, unread int, rank int);")
749                 self.addCategory("Default Category")
750                 if isfile(self.configdir+"feeds.pickle"):
751                     self.importOldFormatFeeds()
752                 else:
753                     self.addFeed("Maemo News", "http://maemo.org/news/items.xml")    
754             else:
755                 from string import find, upper
756                 if find(upper(table[0]), "WIDGET")<0:
757                     self.db.execute("ALTER TABLE feeds ADD COLUMN widget int;")
758                     self.db.execute("UPDATE feeds SET widget=1;")
759                     self.db.commit()
760                 if find(upper(table[0]), "CATEGORY")<0:
761                     self.db.execute("CREATE TABLE categories(id text, title text, unread int, rank int);")
762                     self.addCategory("Default Category")
763                     self.db.execute("ALTER TABLE feeds ADD COLUMN category int;")
764                     self.db.execute("UPDATE feeds SET category=1;")
765             self.db.commit()
766         except:
767             pass
768
769         # Check that Woodchuck's state is up to date with respect our
770         # state.
771         updater = os.path.basename(sys.argv[0]) == 'update_feeds.py'
772         wc_init (self, True if updater else False)
773         if wc().available() and updater:
774             # The list of known streams.
775             streams = wc().streams_list ()
776             stream_ids = [s.identifier for s in streams]
777
778             # Register any unknown streams.  Remove known streams from
779             # STREAMS_IDS.
780             for key in self.getListOfFeeds():
781                 title = self.getFeedTitle(key)
782                 # XXX: We should also check whether the list of
783                 # articles/objects in each feed/stream is up to date.
784                 if key not in stream_ids:
785                     logger.debug(
786                         "Registering previously unknown channel: %s (%s)"
787                         % (key, title,))
788                     # Use a default refresh interval of 6 hours.
789                     wc().stream_register (key, title, 6 * 60 * 60)
790                 else:
791                     # Make sure the human readable name is up to date.
792                     if wc()[key].human_readable_name != title:
793                         wc()[key].human_readable_name = title
794                     stream_ids.remove (key)
795                     
796
797             # Unregister any streams that are no longer subscribed to.
798             for id in stream_ids:
799                 logger.debug("Unregistering %s" % (id,))
800                 w.stream_unregister (id)
801
802     def importOldFormatFeeds(self):
803         """This function loads feeds that are saved in an outdated format, and converts them to sqlite"""
804         import rss
805         listing = rss.Listing(self.configdir)
806         rank = 0
807         for id in listing.getListOfFeeds():
808             try:
809                 rank += 1
810                 values = (id, listing.getFeedTitle(id) , listing.getFeedUrl(id), 0, time.time(), rank, None, "None", 1)
811                 self.db.execute("INSERT INTO feeds (id, title, url, unread, updateTime, rank, etag, modified, widget, category) VALUES (?, ?, ? ,? ,? ,?, ?, ?, ?, 1);", values)
812                 self.db.commit()
813                 
814                 feed = listing.getFeed(id)
815                 new_feed = self.getFeed(id)
816                 
817                 items = feed.getIds()[:]
818                 items.reverse()
819                 for item in items:
820                         if feed.isEntryRead(item):
821                             read_status = 1
822                         else:
823                             read_status = 0 
824                         date = timegm(feed.getDateTuple(item))
825                         title = feed.getTitle(item)
826                         newId = new_feed.generateUniqueId({"date":date, "title":title})
827                         values = (newId, title , feed.getContentLink(item), date, tuple(time.time()), feed.getExternalLink(item), read_status)
828                         new_feed.db.execute("INSERT INTO feed (id, title, contentLink, date, updated, link, read) VALUES (?, ?, ?, ?, ?, ?, ?);", values)
829                         new_feed.db.commit()
830                         try:
831                             images = feed.getImages(item)
832                             for image in images:
833                                 new_feed.db.execute("INSERT INTO images (id, imagePath) VALUES (?, ?);", (item, image) )
834                                 new_feed.db.commit()
835                         except:
836                             pass
837                 self.updateUnread(id)
838             except:
839                 logger.error("importOldFormatFeeds: %s"
840                              % (traceback.format_exc(),))
841         remove(self.configdir+"feeds.pickle")
842                 
843         
844     def addArchivedArticle(self, key, index):
845         feed = self.getFeed(key)
846         title = feed.getTitle(index)
847         link = feed.getExternalLink(index)
848         date = feed.getDate(index)
849         count = self.db.execute("SELECT count(*) FROM feeds where id=?;", ("ArchivedArticles",) ).fetchone()[0]
850         if count == 0:
851             self.addFeed("Archived Articles", "", id="ArchivedArticles")
852
853         archFeed = self.getFeed("ArchivedArticles")
854         archFeed.addArchivedArticle(title, link, date, self.configdir)
855         self.updateUnread("ArchivedArticles")
856         
857     def updateFeed(self, key, expiryTime=None, proxy=None, imageCache=None,
858                    priority=0):
859         if expiryTime is None:
860             expiryTime = self.config.getExpiry()
861         if not expiryTime:
862             # Default to 24 hours
863             expriyTime = 24
864         if proxy is None:
865             (use_proxy, proxy) = self.config.getProxy()
866             if not use_proxy:
867                 proxy = None
868         if imageCache is None:
869             imageCache = self.config.getImageCache()
870
871         feed = self.getFeed(key)
872         (url, etag, modified) = self.db.execute("SELECT url, etag, modified FROM feeds WHERE id=?;", (key,) ).fetchone()
873         try:
874             modified = time.struct_time(eval(modified))
875         except:
876             modified = None
877         feed.updateFeed(
878             self.configdir, url, etag, modified, expiryTime, proxy, imageCache,
879             priority, postFeedUpdateFunc=self._queuePostFeedUpdate)
880
    def _queuePostFeedUpdate(self, *args, **kwargs):
        # Forward the feed-update results to _postFeedUpdate on the main
        # thread; the database and D-Bus updates must not run on the
        # download worker thread.
        mainthread.execute (self._postFeedUpdate, async=True, *args, **kwargs)
883
884     def _postFeedUpdate(self, key, updateTime, etag, modified, title):
885         if modified==None:
886             modified="None"
887         else:
888             modified=str(tuple(modified))
889         if updateTime > 0:
890             self.db.execute("UPDATE feeds SET updateTime=?, etag=?, modified=? WHERE id=?;", (updateTime, etag, modified, key) )
891         else:
892             self.db.execute("UPDATE feeds SET etag=?, modified=? WHERE id=?;", (etag, modified, key) )
893
894         if title is not None:
895             self.db.execute("UPDATE feeds SET title=(case WHEN title=='' THEN ? ELSE title END) where id=?;",
896                             (title, key))
897         self.db.commit()
898         self.updateUnread(key)
899
900         update_server_object().ArticleCountUpdated()
901
902         stats = JobManager().stats()
903         global jobs_at_start
904         completed = stats['jobs-completed'] - jobs_at_start
905         in_progress = stats['jobs-in-progress']
906         queued = stats['jobs-queued']
907
908         try:
909             percent = (100 * ((completed + in_progress / 2.))
910                        / (completed + in_progress + queued))
911         except ZeroDivisionError:
912             percent = 100
913
914         update_server_object().UpdateProgress(
915             percent, completed, in_progress, queued, 0, 0, 0, key)
916
917         if in_progress == 0 and queued == 0:
918             jobs_at_start = stats['jobs-completed']
919         
920     def getFeed(self, key):
921         if key == "ArchivedArticles":
922             return ArchivedArticles(self.configdir, key)
923         return Feed(self.configdir, key)
924         
925     def editFeed(self, key, title, url, category=None):
926         if category:
927             self.db.execute("UPDATE feeds SET title=?, url=?, category=? WHERE id=?;", (title, url, category, key))
928         else:
929             self.db.execute("UPDATE feeds SET title=?, url=? WHERE id=?;", (title, url, key))
930         self.db.commit()
931
932         if wc().available():
933             try:
934                 wc()[key].human_readable_name = title
935             except KeyError:
936                 logger.debug("Feed %s (%s) unknown." % (key, title))
937         
938     def getFeedUpdateTime(self, key):
939         return time.ctime(self.db.execute("SELECT updateTime FROM feeds WHERE id=?;", (key,)).fetchone()[0])
940         
941     def getFeedNumberOfUnreadItems(self, key):
942         return self.db.execute("SELECT unread FROM feeds WHERE id=?;", (key,)).fetchone()[0]
943         
944     def getFeedTitle(self, key):
945         (title, url) = self.db.execute("SELECT title, url FROM feeds WHERE id=?;", (key,)).fetchone()
946         if title:
947             return title
948         return url
949         
950     def getFeedUrl(self, key):
951         return self.db.execute("SELECT url FROM feeds WHERE id=?;", (key,)).fetchone()[0]
952     
953     def getFeedCategory(self, key):
954         return self.db.execute("SELECT category FROM feeds WHERE id=?;", (key,)).fetchone()[0]
955         
956     def getListOfFeeds(self, category=None):
957         if category:
958             rows = self.db.execute("SELECT id FROM feeds WHERE category=? ORDER BY rank;", (category, ) )
959         else:
960             rows = self.db.execute("SELECT id FROM feeds ORDER BY rank;" )
961         keys = []
962         for row in rows:
963             if row[0]:
964                 keys.append(row[0])
965         return keys
966     
967     def getListOfCategories(self):
968         rows = self.db.execute("SELECT id FROM categories ORDER BY rank;" )
969         keys = []
970         for row in rows:
971             if row[0]:
972                 keys.append(row[0])
973         return keys
974     
975     def getCategoryTitle(self, id):
976         row = self.db.execute("SELECT title FROM categories WHERE id=?;", (id, )).fetchone()
977         return row[0]
978     
979     def getSortedListOfKeys(self, order, onlyUnread=False, category=1):
980         if   order == "Most unread":
981             tmp = "ORDER BY unread DESC"
982             #keyorder = sorted(feedInfo, key = lambda k: feedInfo[k][1], reverse=True)
983         elif order == "Least unread":
984             tmp = "ORDER BY unread"
985             #keyorder = sorted(feedInfo, key = lambda k: feedInfo[k][1])
986         elif order == "Most recent":
987             tmp = "ORDER BY updateTime DESC"
988             #keyorder = sorted(feedInfo, key = lambda k: feedInfo[k][2], reverse=True)
989         elif order == "Least recent":
990             tmp = "ORDER BY updateTime"
991             #keyorder = sorted(feedInfo, key = lambda k: feedInfo[k][2])
992         else: # order == "Manual" or invalid value...
993             tmp = "ORDER BY rank"
994             #keyorder = sorted(feedInfo, key = lambda k: feedInfo[k][0])
995         if onlyUnread:
996             sql = "SELECT id FROM feeds WHERE unread>0 AND category=%s " %category + tmp 
997         else:
998             sql = "SELECT id FROM feeds WHERE category=%s " %category + tmp
999         rows = self.db.execute(sql)
1000         keys = []
1001         for row in rows:
1002             if row[0]:
1003                 keys.append(row[0])
1004         return keys
1005     
1006     def getFavicon(self, key):
1007         filename = "%s%s.d/favicon.ico" % (self.configdir, key)
1008         if isfile(filename):
1009             return filename
1010         else:
1011             return False
1012         
1013     def updateUnread(self, key):
1014         feed = self.getFeed(key)
1015         self.db.execute("UPDATE feeds SET unread=? WHERE id=?;", (feed.getNumberOfUnreadItems(), key))
1016         self.db.commit()
1017
1018     def addFeed(self, title, url, id=None, category=1):
1019         if not id:
1020             id = getId(url)
1021         count = self.db.execute("SELECT count(*) FROM feeds WHERE id=?;", (id,) ).fetchone()[0]
1022         if count == 0:
1023             max_rank = self.db.execute("SELECT MAX(rank) FROM feeds;").fetchone()[0]
1024             if max_rank == None:
1025                 max_rank = 0
1026             values = (id, title, url, 0, 0, max_rank+1, None, "None", 1, category)
1027             self.db.execute("INSERT INTO feeds (id, title, url, unread, updateTime, rank, etag, modified, widget, category) VALUES (?, ?, ? ,? ,? ,?, ?, ?, ?,?);", values)
1028             self.db.commit()
1029             # Ask for the feed object, it will create the necessary tables
1030             self.getFeed(id)
1031
1032             if wc().available():
1033                 # Register the stream with Woodchuck.  Update approximately
1034                 # every 6 hours.
1035                 wc().stream_register(stream_identifier=id,
1036                                      human_readable_name=title,
1037                                      freshness=6*60*60)
1038
1039             return True
1040         else:
1041             return False
1042         
1043     def addCategory(self, title):
1044         rank = self.db.execute("SELECT MAX(rank)+1 FROM categories;").fetchone()[0]
1045         if rank==None:
1046             rank=1
1047         id = self.db.execute("SELECT MAX(id)+1 FROM categories;").fetchone()[0]
1048         if id==None:
1049             id=1
1050         self.db.execute("INSERT INTO categories (id, title, unread, rank) VALUES (?, ?, 0, ?)", (id, title, rank))
1051         self.db.commit()
1052     
1053     def removeFeed(self, key):
1054         if wc().available ():
1055             try:
1056                 del wc()[key]
1057             except KeyError:
1058                 logger.debug("Removing unregistered feed %s failed" % (key,))
1059
1060         rank = self.db.execute("SELECT rank FROM feeds WHERE id=?;", (key,) ).fetchone()[0]
1061         self.db.execute("DELETE FROM feeds WHERE id=?;", (key, ))
1062         self.db.execute("UPDATE feeds SET rank=rank-1 WHERE rank>?;", (rank,) )
1063         self.db.commit()
1064
1065         if isdir(self.configdir+key+".d/"):
1066            rmtree(self.configdir+key+".d/")
1067            
1068     def removeCategory(self, key):
1069         if self.db.execute("SELECT count(*) FROM categories;").fetchone()[0] > 1:
1070             rank = self.db.execute("SELECT rank FROM categories WHERE id=?;", (key,) ).fetchone()[0]
1071             self.db.execute("DELETE FROM categories WHERE id=?;", (key, ))
1072             self.db.execute("UPDATE categories SET rank=rank-1 WHERE rank>?;", (rank,) )
1073             self.db.execute("UPDATE feeds SET category=1 WHERE category=?;", (key,) )
1074             self.db.commit()
1075         
1076     #def saveConfig(self):
1077     #    self.listOfFeeds["feedingit-order"] = self.sortedKeys
1078     #    file = open(self.configdir+"feeds.pickle", "w")
1079     #    pickle.dump(self.listOfFeeds, file)
1080     #    file.close()
1081         
1082     def moveUp(self, key):
1083         rank = self.db.execute("SELECT rank FROM feeds WHERE id=?;", (key,)).fetchone()[0]
1084         if rank>0:
1085             self.db.execute("UPDATE feeds SET rank=? WHERE rank=?;", (rank, rank-1) )
1086             self.db.execute("UPDATE feeds SET rank=? WHERE id=?;", (rank-1, key) )
1087             self.db.commit()
1088             
1089     def moveCategoryUp(self, key):
1090         rank = self.db.execute("SELECT rank FROM categories WHERE id=?;", (key,)).fetchone()[0]
1091         if rank>0:
1092             self.db.execute("UPDATE categories SET rank=? WHERE rank=?;", (rank, rank-1) )
1093             self.db.execute("UPDATE categories SET rank=? WHERE id=?;", (rank-1, key) )
1094             self.db.commit()
1095         
1096     def moveDown(self, key):
1097         rank = self.db.execute("SELECT rank FROM feeds WHERE id=?;", (key,)).fetchone()[0]
1098         max_rank = self.db.execute("SELECT MAX(rank) FROM feeds;").fetchone()[0]
1099         if rank<max_rank:
1100             self.db.execute("UPDATE feeds SET rank=? WHERE rank=?;", (rank, rank+1) )
1101             self.db.execute("UPDATE feeds SET rank=? WHERE id=?;", (rank+1, key) )
1102             self.db.commit()
1103             
1104     def moveCategoryDown(self, key):
1105         rank = self.db.execute("SELECT rank FROM categories WHERE id=?;", (key,)).fetchone()[0]
1106         max_rank = self.db.execute("SELECT MAX(rank) FROM categories;").fetchone()[0]
1107         if rank<max_rank:
1108             self.db.execute("UPDATE categories SET rank=? WHERE rank=?;", (rank, rank+1) )
1109             self.db.execute("UPDATE categories SET rank=? WHERE id=?;", (rank+1, key) )
1110             self.db.commit()
1111             
1112