--- /dev/null
+include *.desktop
+include qml/*.qml
+include *.png
+include feedingit.longdesc
--- /dev/null
+=========================================
+README - pyside-assistant generated files
+=========================================
+
+All files generated by psa init are described below. The project
+slug used for them was 'sampleproject', with the following psa call:
+
+ $ psa init sampleproject <fremantle|harmattan|...>
+
+For a complete list of templates available, use:
+
+ $ psa list
+
+* MANIFEST.in: Distutils considers a list of files by default for
+installation, listed at [1]. Additional files to be distributed
+must be supplied in the MANIFEST.in file. The generated file includes
+the .desktop file and any QML files found.
+
+[1] http://docs.python.org/distutils/sourcedist.html#specifying-the-files-to-distribute.
+
+* setup.py: The main file for using Distutils, contains most of the
+information needed to build the package. Information about setup.py
+can be found at http://docs.python.org/distutils/introduction.html.
+Two fields of this file can be modified by psa update parameters:
+ - description: -d or --description
+ - long_description: the contents of sampleproject.longdesc are used
+
+* stdeb.cfg: Configuration file for stdeb, specifying additional
+parameters for the binary package. The specification for this
+file can be found at https://github.com/astraw/stdeb, in the section
+stdeb.cfg configuration file. One field of this file can be modified by
+psa update parameters:
+ - Section: -s or --section
+
+Note that the section should be a valid one,
+as specified in http://wiki.maemo.org/Task:Package_categories#New_list_for_Diablo.
+
+* sampleproject.aegis: Sample credentials file, which initially is empty. Information
+on how to populate it can be found at [2].
+NOTE: this file is used only in harmattan projects.
+
+[2] http://library.developer.nokia.com/topic/MeeGo_1.2_Harmattan_API/html/guide/html/Developer_Library_Developing_for_Harmattan_Harmattan_security_6cbe.html
+
+* sampleproject.desktop: This file specifies how to run the application
+from the application grid, with various fields. Information about
+.desktop files can be found at http://wiki.maemo.org/Desktop_file_format.
+Two fields of this file can be modified by psa update parameters:
+ - Name: -a or --app-name
+ - Category: -c or --category
+
+Note that the category should be a valid one, as specified in
+http://standards.freedesktop.org/menu-spec/latest/apa.html.
+
+* sampleproject.png: The application icon, which is displayed in the
+application grid.
+
+* qml/*.qml: The QML files for the application. Their contents depend
+on the platform.
+
+* sampleproject: Main program. Initializes the application and provides
+support for displaying the QML files' contents.
+
+* sampleproject.longdesc: Holds the contents of the long_description field of
+setup.py, which as the name implies is a more detailed description of what the project is.
+
--- /dev/null
+running bdist_deb
+running sdist_dsc
+CALLING dpkg-source -b feedingit-0.1.0 (in dir deb_dist)
+dpkg-source: info: using source format `3.0 (quilt)'
+dpkg-source: info: building feedingit using existing ./feedingit_0.1.0.orig.tar.gz
+dpkg-source: info: building feedingit in feedingit_0.1.0-1.debian.tar.gz
+dpkg-source: info: building feedingit in feedingit_0.1.0-1.dsc
+dpkg-source: warning: extracting unsigned source package (feedingit_0.1.0-1.dsc)
+dpkg-source: info: extracting feedingit in feedingit-0.1.0
+dpkg-source: info: unpacking feedingit_0.1.0.orig.tar.gz
+dpkg-source: info: unpacking feedingit_0.1.0-1.debian.tar.gz
+dpkg-buildpackage: export CFLAGS from dpkg-buildflags (origin: vendor): -g -O2
+dpkg-buildpackage: export CPPFLAGS from dpkg-buildflags (origin: vendor):
+dpkg-buildpackage: export CXXFLAGS from dpkg-buildflags (origin: vendor): -g -O2
+dpkg-buildpackage: export FFLAGS from dpkg-buildflags (origin: vendor): -g -O2
+dpkg-buildpackage: export LDFLAGS from dpkg-buildflags (origin: vendor): -Wl,-Bsymbolic-functions
+dpkg-buildpackage: source package feedingit
+dpkg-buildpackage: source version 0.1.0-1
+dpkg-buildpackage: source changed by Yves <yves@marcoz.org>
+ dpkg-source --before-build feedingit-0.1.0
+dpkg-buildpackage: host architecture i386
+ fakeroot debian/rules clean
+dh clean --with python2 --buildsystem=python_distutils
+ dh_testdir -O--buildsystem=python_distutils
+ dh_auto_clean -O--buildsystem=python_distutils
+running clean
+'build/lib.linux-i686-2.6' does not exist -- can't clean it
+'build/bdist.linux-i686' does not exist -- can't clean it
+'build/scripts-2.6' does not exist -- can't clean it
+ dh_clean -O--buildsystem=python_distutils
+ debian/rules build
+dh build --with python2 --buildsystem=python_distutils
+ dh_testdir -O--buildsystem=python_distutils
+ dh_auto_configure -O--buildsystem=python_distutils
+ dh_auto_build -O--buildsystem=python_distutils
+running build
+running build_scripts
+creating build
+creating build/scripts-2.6
+copying feedingit -> build/scripts-2.6
+changing mode of build/scripts-2.6/feedingit from 644 to 755
+ dh_auto_test -O--buildsystem=python_distutils
+ fakeroot debian/rules binary
+dh binary --with python2 --buildsystem=python_distutils
+ dh_testroot -O--buildsystem=python_distutils
+ dh_prep -O--buildsystem=python_distutils
+ dh_installdirs -O--buildsystem=python_distutils
+ dh_auto_install -O--buildsystem=python_distutils
+running install
+running build
+running build_scripts
+running install_scripts
+creating /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr
+creating /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/bin
+copying build/scripts-2.6/feedingit -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/bin
+changing mode of /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/bin/feedingit to 755
+running install_data
+creating /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share
+creating /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/applications
+copying feedingit.desktop -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/applications
+creating /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/icons
+creating /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/icons/hicolor
+creating /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/icons/hicolor/64x64
+creating /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/icons/hicolor/64x64/apps
+copying feedingit.png -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/icons/hicolor/64x64/apps
+creating /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit
+creating /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit/qml
+copying qml/Feeds.qml -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit/qml
+copying qml/Articles.qml -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit/qml
+copying qml/TestWebview.qml -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit/qml
+copying qml/FeedingIt.qml -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit/qml
+copying qml/Categories.qml -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit/qml
+copying qml/ArticleViewer.qml -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit/qml
+copying qml/MainPage.qml -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit/qml
+copying qml/main.qml -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit/qml
+copying qml/ArticleDisplay.qml -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit/qml
+creating /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit/qml/common
+copying qml/common/AddCat.qml -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit/qml/common
+copying qml/common/ConfirmationMessage.qml -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit/qml/common
+copying qml/common/Switch.qml -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit/qml/common
+copying qml/common/Button.qml -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit/qml/common
+copying qml/common/Menu.qml -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit/qml/common
+copying qml/common/ManageSubs.qml -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit/qml/common
+copying qml/common/LineInput.qml -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit/qml/common
+copying qml/common/ToolBar.qml -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit/qml/common
+copying qml/common/AddFeed.qml -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit/qml/common
+copying qml/common/Slider.qml -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit/qml/common
+creating /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit/qml/common/images
+copying qml/common/images/AppletCloseButton.png -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit/qml/common/images
+copying qml/common/images/toolbutton.sci -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit/qml/common/images
+copying qml/common/images/delete.png -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit/qml/common/images
+copying qml/common/images/loading2.png -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit/qml/common/images
+copying qml/common/images/wmBackIcon.png -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit/qml/common/images
+copying qml/common/images/toolbutton.png -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit/qml/common/images
+copying qml/common/images/plus.png -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit/qml/common/images
+copying qml/common/images/loading.png -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit/qml/common/images
+copying qml/common/images/wmTaskLauncherIcon.png -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit/qml/common/images
+copying qml/common/images/feedingit.png -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit/qml/common/images
+copying qml/common/images/checkmark.png -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit/qml/common/images
+copying qml/common/images/wmCloseIcon.png -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit/qml/common/images
+copying qml/common/images/lineedit.png -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit/qml/common/images
+copying qml/common/images/Zoom-In-icon.png -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit/qml/common/images
+copying qml/common/images/wmEditIcon.png -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit/qml/common/images
+copying qml/common/images/rotate.png -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit/qml/common/images
+copying qml/common/images/lineedit.sci -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit/qml/common/images
+copying qml/common/images/InputMethodShiftButtonNormal.png -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit/qml/common/images
+creating /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit/qml/i18n
+copying qml/i18n/qml_en.ts -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit/qml/i18n
+copying qml/i18n/FeedingIt.ts -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit/qml/i18n
+copying qml/i18n/qml_en.qm -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit/qml/i18n
+creating /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/dbus-1
+creating /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/dbus-1/services
+copying feedingit_status.service -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/dbus-1/services
+copying pysrc/config.py -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit
+copying pysrc/update_feeds.py -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit
+copying pysrc/jobmanager.py -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit
+copying pysrc/__init__.py -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit
+copying pysrc/debugging.py -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit
+copying pysrc/rss_sqlite.py -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit
+copying pysrc/wc.py -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit
+copying pysrc/feedingit.py -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit
+copying pysrc/feedparser.py -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit
+copying pysrc/opml.py -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit
+copying pysrc/XmlHandler.py -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit
+copying pysrc/download.py -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit
+copying pysrc/mainthread.py -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit
+copying pysrc/updatedbus.py -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit
+copying pysrc/httpprogresshandler.py -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit
+copying pysrc/BeautifulSoup.py -> /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/share/feedingit
+running install_egg_info
+Creating /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/lib/python2.6/dist-packages/
+Writing /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit-0.1.0/debian/feedingit/usr/lib/python2.6/dist-packages/feedingit-0.1.0.egg-info
+ dh_install -O--buildsystem=python_distutils
+ dh_installdocs -O--buildsystem=python_distutils
+ dh_installchangelogs -O--buildsystem=python_distutils
+ dh_installexamples -O--buildsystem=python_distutils
+ dh_installman -O--buildsystem=python_distutils
+ dh_installcatalogs -O--buildsystem=python_distutils
+ dh_installcron -O--buildsystem=python_distutils
+ dh_installdebconf -O--buildsystem=python_distutils
+ dh_installemacsen -O--buildsystem=python_distutils
+ dh_installifupdown -O--buildsystem=python_distutils
+ dh_installinfo -O--buildsystem=python_distutils
+ dh_installinit -O--buildsystem=python_distutils
+Duplicate specification "O=s" for option "O"
+ dh_installmenu -O--buildsystem=python_distutils
+ dh_installmime -O--buildsystem=python_distutils
+ dh_installmodules -O--buildsystem=python_distutils
+ dh_installlogcheck -O--buildsystem=python_distutils
+ dh_installlogrotate -O--buildsystem=python_distutils
+ dh_installpam -O--buildsystem=python_distutils
+ dh_installppp -O--buildsystem=python_distutils
+ dh_installudev -O--buildsystem=python_distutils
+ dh_installwm -O--buildsystem=python_distutils
+ dh_installxfonts -O--buildsystem=python_distutils
+ dh_bugfiles -O--buildsystem=python_distutils
+ dh_lintian -O--buildsystem=python_distutils
+ dh_gconf -O--buildsystem=python_distutils
+ dh_icons -O--buildsystem=python_distutils
+ dh_perl -O--buildsystem=python_distutils
+ dh_python2 -O--buildsystem=python_distutils
+ dh_usrlocal -O--buildsystem=python_distutils
+ dh_link -O--buildsystem=python_distutils
+ dh_compress -O--buildsystem=python_distutils
+ dh_fixperms -O--buildsystem=python_distutils
+ dh_strip -O--buildsystem=python_distutils
+ dh_makeshlibs -O--buildsystem=python_distutils
+ dh_shlibdeps -O--buildsystem=python_distutils
+ dh_installdeb -O--buildsystem=python_distutils
+ dh_gencontrol -O--buildsystem=python_distutils
+dpkg-gencontrol: warning: package feedingit: unused substitution variable ${python:Versions}
+ dh_md5sums -O--buildsystem=python_distutils
+ dh_builddeb -O--buildsystem=python_distutils
+dpkg-deb: building package `feedingit' in `../feedingit_0.1.0-1_all.deb'.
+ dpkg-genchanges -b >../feedingit_0.1.0-1_i386.changes
+dpkg-genchanges: binary-only upload - not including any source code
+ dpkg-source --after-build feedingit-0.1.0
+dpkg-buildpackage: binary only upload (no source included)
--- /dev/null
+debian/patches
--- /dev/null
+include *.desktop
+include qml/*.qml
+include *.png
+include feedingit.longdesc
--- /dev/null
+=========================================
+README - pyside-assistant generated files
+=========================================
+
+All files generated by psa init are described below. The project
+slug used for them was 'sampleproject', with the following psa call:
+
+ $ psa init sampleproject <fremantle|harmattan|...>
+
+For a complete list of templates available, use:
+
+ $ psa list
+
+* MANIFEST.in: Distutils considers a list of files by default for
+installation, listed at [1]. Additional files to be distributed
+must be supplied in the MANIFEST.in file. The generated file includes
+the .desktop file and any QML files found.
+
+[1] http://docs.python.org/distutils/sourcedist.html#specifying-the-files-to-distribute.
+
+* setup.py: The main file for using Distutils, contains most of the
+information needed to build the package. Information about setup.py
+can be found at http://docs.python.org/distutils/introduction.html.
+Two fields of this file can be modified by psa update parameters:
+ - description: -d or --description
+ - long_description: the contents of sampleproject.longdesc are used
+
+* stdeb.cfg: Configuration file for stdeb, specifying additional
+parameters for the binary package. The specification for this
+file can be found at https://github.com/astraw/stdeb, in the section
+stdeb.cfg configuration file. One field of this file can be modified by
+psa update parameters:
+ - Section: -s or --section
+
+Note that the section should be a valid one,
+as specified in http://wiki.maemo.org/Task:Package_categories#New_list_for_Diablo.
+
+* sampleproject.aegis: Sample credentials file, which initially is empty. Information
+on how to populate it can be found at [2].
+NOTE: this file is used only in harmattan projects.
+
+[2] http://library.developer.nokia.com/topic/MeeGo_1.2_Harmattan_API/html/guide/html/Developer_Library_Developing_for_Harmattan_Harmattan_security_6cbe.html
+
+* sampleproject.desktop: This file specifies how to run the application
+from the application grid, with various fields. Information about
+.desktop files can be found at http://wiki.maemo.org/Desktop_file_format.
+Two fields of this file can be modified by psa update parameters:
+ - Name: -a or --app-name
+ - Category: -c or --category
+
+Note that the category should be a valid one, as specified in
+http://standards.freedesktop.org/menu-spec/latest/apa.html.
+
+* sampleproject.png: The application icon, which is displayed in the
+application grid.
+
+* qml/*.qml: The QML files for the application. Their contents depend
+on the platform.
+
+* sampleproject: Main program. Initializes the application and provides
+support for displaying the QML files' contents.
+
+* sampleproject.longdesc: Holds the contents of the long_description field of
+setup.py, which as the name implies is a more detailed description of what the project is.
+
--- /dev/null
+running bdist_deb
+running sdist_dsc
--- /dev/null
+#!/bin/sh
+
+case "$1" in
+dbus)
+ nice python /usr/share/feedingit/update_feeds.py
+ ;;
+*)
+ cd /usr/share/feedingit
+ python feedingit.py 2>&1 >/dev/null
+ ;;
+
+esac
+
--- /dev/null
+feedingit (0.1.0-1) unstable; urgency=low
+
+ * source package automatically created by stdeb 0.6.0+git
+
+ -- Yves <yves@marcoz.org> Fri, 07 Oct 2011 20:59:08 -0700
--- /dev/null
+Source: feedingit
+Maintainer: Yves <yves@marcoz.org>
+Section: user/development
+Priority: optional
+Build-Depends: python-all (>= 2.6.6-3), debhelper (>= 7.4.3)
+Standards-Version: 3.9.1
+X-Python-Version: 2.6
+
+Package: feedingit
+Architecture: all
+Depends: ${misc:Depends}, python-pyside.qtgui, python-pyside.qtopengl, python-pyside.qtdeclarative, python-dbus, python-gconf
+Breaks: ${python:Breaks}
+Description: FeedingIt - RSS Reader
+ This file should contain a writeup describing what your application does,
+ and how to use it. The content of this file goes into the long_description
+ field of setup.py, which in turn becomes the long version of the Description
+ field in the debian/control file of the project.
+ .
--- /dev/null
+dh_auto_configure
+dh_auto_build
+dh_auto_test
+dh_prep
+dh_installdirs
+dh_auto_install
+dh_install
+dh_installdocs
+dh_installchangelogs
+dh_installexamples
+dh_installman
+dh_installcatalogs
+dh_installcron
+dh_installdebconf
+dh_installemacsen
+dh_installifupdown
+dh_installinfo
+dh_installinit
+dh_installmenu
+dh_installmime
+dh_installmodules
+dh_installlogcheck
+dh_installlogrotate
+dh_installpam
+dh_installppp
+dh_installudev
+dh_installwm
+dh_installxfonts
+dh_bugfiles
+dh_lintian
+dh_gconf
+dh_icons
+dh_perl
+dh_usrlocal
+dh_link
+dh_compress
+dh_fixperms
+dh_strip
+dh_makeshlibs
+dh_shlibdeps
+dh_installdeb
+dh_gencontrol
+dh_md5sums
+dh_builddeb
--- /dev/null
+
+# Automatically added by dh_python2:
+if which pycompile >/dev/null 2>&1; then
+ pycompile -p feedingit /usr/share/feedingit -V 2.6
+fi
+
+# End automatically added section
--- /dev/null
+
+# Automatically added by dh_python2:
+if which pyclean >/dev/null 2>&1; then
+ pyclean -p feedingit
+else
+ dpkg -L feedingit | grep \.py$ | while read file
+ do
+ rm -f "${file}"[co] >/dev/null
+ done
+fi
+
+# End automatically added section
--- /dev/null
+python:Versions=2.6
+python:Breaks=python (<< 2.6)
+python:Depends=python2.6, python (>= 2.7.1-0ubuntu2), python (>= 2.6), python (<< 2.6)
+misc:Depends=
--- /dev/null
+Package: feedingit
+Version: 0.1.0-1
+Architecture: all
+Maintainer: Yves <yves@marcoz.org>
+Installed-Size: 736
+Depends: python-pyside.qtgui, python-pyside.qtopengl, python-pyside.qtdeclarative, python-dbus, python-gconf
+Breaks: python (<< 2.6)
+Section: user/development
+Priority: optional
+Description: FeedingIt - RSS Reader
+ This file should contain a writeup describing what your application does,
+ and how to use it. The content of this file goes into the long_description
+ field of setup.py, which in turn becomes the long version of the Description
+ field in the debian/control file of the project.
--- /dev/null
+4420c31f88de68fe6e1b7637abb06196 usr/bin/feedingit
+6a42e9aebedfd157062bd5a9616dc935 usr/share/applications/feedingit.desktop
+eda8cc6ffe8d842d6dfe0244b01b3042 usr/share/dbus-1/services/feedingit_status.service
+0cd1fd151889876833e9092d589bbf5d usr/share/doc/feedingit/changelog.Debian.gz
+bac2be6ae9673ee5096e20e8b714c9cd usr/share/feedingit/BeautifulSoup.py
+fa5490f2022424a091b36e75f21e6596 usr/share/feedingit/XmlHandler.py
+d41d8cd98f00b204e9800998ecf8427e usr/share/feedingit/__init__.py
+6b5296119ef6bc859c3e3a8706fa7f0d usr/share/feedingit/config.py
+9cf859c8297e4b0e8466cb5861eb75e7 usr/share/feedingit/debugging.py
+fae02e730b76761d43a626fe19828d5e usr/share/feedingit/download.py
+afd1f073710b2306cadf2bf6791c663f usr/share/feedingit/feedingit.py
+afa4f462892136f59beaf96b6bf1cf96 usr/share/feedingit/feedparser.py
+c1a0c0a9ccefd64d1e27bddb817c72a3 usr/share/feedingit/httpprogresshandler.py
+f1e9ba0f44786f513659a7fa3111fc8a usr/share/feedingit/jobmanager.py
+0201faa30d34c58d71f36ab42a7a8233 usr/share/feedingit/mainthread.py
+d9c0665dfdd5cf19f1529ce88af95134 usr/share/feedingit/opml.py
+af27062fdba0bc7a3df92116e8340d19 usr/share/feedingit/qml/ArticleDisplay.qml
+4bf706b4031938cc244172fb2862703a usr/share/feedingit/qml/ArticleViewer.qml
+15083e9a1fac05c8efaaa085dfabcbcb usr/share/feedingit/qml/Articles.qml
+baf2f683f838f1c2abe723dcc428a8c0 usr/share/feedingit/qml/Categories.qml
+cd30f5eaec0885358261d7a96bfaf8cd usr/share/feedingit/qml/FeedingIt.qml
+aec9982f0b680ec18188df8b8ab42a9e usr/share/feedingit/qml/Feeds.qml
+bec5fe4599a3ad5799ed96d7ed81fb5f usr/share/feedingit/qml/MainPage.qml
+aa3fc0a4edbd17d93a9dc5c39c433c3d usr/share/feedingit/qml/TestWebview.qml
+cef5ae4af926a759f4a233336c00f017 usr/share/feedingit/qml/common/AddCat.qml
+c39cde168ef8d608670c81be7c808701 usr/share/feedingit/qml/common/AddFeed.qml
+ad091804747113acd2a3f0499f3beef2 usr/share/feedingit/qml/common/Button.qml
+0ed4b2901b16f13aca44b0dbea752e51 usr/share/feedingit/qml/common/ConfirmationMessage.qml
+4198bd2cb38fd9bc990070c1ca093bf4 usr/share/feedingit/qml/common/LineInput.qml
+f3c0475ec0d8235072c2a66b89d5d054 usr/share/feedingit/qml/common/ManageSubs.qml
+c3657c5c764fdff5fc6871a2bfa01083 usr/share/feedingit/qml/common/Menu.qml
+f0354312693b50e178179421387b79ff usr/share/feedingit/qml/common/Slider.qml
+4026a88316b81b1c7dfcae5cf19fa123 usr/share/feedingit/qml/common/Switch.qml
+0fb6202135e00bf1fc0ab8de26195900 usr/share/feedingit/qml/common/ToolBar.qml
+684ef0458b5e8647464ec18878fa65c2 usr/share/feedingit/qml/common/images/AppletCloseButton.png
+8c0b90418796b503a635a876e14a72a2 usr/share/feedingit/qml/common/images/InputMethodShiftButtonNormal.png
+b63e3f7951e8f5d700a756773f3cea42 usr/share/feedingit/qml/common/images/Zoom-In-icon.png
+21eeae41ca0c2b6eb2def9dea90941ef usr/share/feedingit/qml/common/images/checkmark.png
+8662e1f132352d5a9673bbb44d6c3eab usr/share/feedingit/qml/common/images/delete.png
+6ea16f563364a8a0830a8fe0f58bf28d usr/share/feedingit/qml/common/images/feedingit.png
+cbc095a502063bab9fa9e1a61bc74c2f usr/share/feedingit/qml/common/images/lineedit.png
+9dd59c472de35acd3ef45822ba6e54c0 usr/share/feedingit/qml/common/images/lineedit.sci
+687b604b477e1283cb95c93ac08b6604 usr/share/feedingit/qml/common/images/loading.png
+3f9acdfc4a501e501cf293a66b66989e usr/share/feedingit/qml/common/images/loading2.png
+c9ea3bf2d46c8fce8b11e50de2e7fe35 usr/share/feedingit/qml/common/images/plus.png
+0a08c36abf9b98db17dcd0766c31ffe3 usr/share/feedingit/qml/common/images/rotate.png
+cba2abe78f3dd3868813c5e5f2afe9ca usr/share/feedingit/qml/common/images/toolbutton.png
+72b930bb01435b3b074e276b8dd6d909 usr/share/feedingit/qml/common/images/toolbutton.sci
+af11b73b195513d08c17723b41db0b04 usr/share/feedingit/qml/common/images/wmBackIcon.png
+2d4f294f9f757f66fd219488c21530aa usr/share/feedingit/qml/common/images/wmCloseIcon.png
+664477d4107d381ab078f84d198a66d0 usr/share/feedingit/qml/common/images/wmEditIcon.png
+74cce780312f54e8e1eeb44ee8206022 usr/share/feedingit/qml/common/images/wmTaskLauncherIcon.png
+1c7751b124aa1bdf4b89ec76cdf815a2 usr/share/feedingit/qml/i18n/FeedingIt.ts
+7790a99425dd7c1046e6ae3b1ee72a03 usr/share/feedingit/qml/i18n/qml_en.qm
+1674fcce45bcf3319e61d19a9adf4fdd usr/share/feedingit/qml/i18n/qml_en.ts
+149f0a44d1807ee6cba9a63b11bea700 usr/share/feedingit/qml/main.qml
+61906dedfd0f86cdb8c7e38972ee4d82 usr/share/feedingit/rss_sqlite.py
+721777a26cd2a5b8466ce2aa2b99fad7 usr/share/feedingit/update_feeds.py
+6ccf12dc4379e91800ae8505b2e86082 usr/share/feedingit/updatedbus.py
+4397d8abcdf0da4eebbd49b1d182388a usr/share/feedingit/wc.py
+8d49c002ad8bb98837e2a642eec86fc5 usr/share/icons/hicolor/64x64/apps/feedingit.png
+035a8a90300ae10602a25bd24a8121c7 usr/share/pyshared/feedingit-0.1.0.egg-info
+1ce7b7194658769bb4173134a725d1ce usr/share/python/runtime.d/feedingit.rtupdate
--- /dev/null
+#!/bin/sh
+# Debian maintainer script fragment (presumably the postinst for the
+# "feedingit" package -- TODO confirm against the package layout):
+# byte-compiles the installed Python modules after installation.
+set -e
+
+# Automatically added by dh_python2:
+# Byte-compile everything under /usr/share/feedingit for Python 2.6
+# (package "feedingit"), but only if the pycompile helper exists.
+if which pycompile >/dev/null 2>&1; then
+    pycompile -p feedingit /usr/share/feedingit -V 2.6
+fi
+
+# End automatically added section
--- /dev/null
+#!/bin/sh
+# Debian maintainer script fragment (presumably the prerm for the
+# "feedingit" package -- TODO confirm): removes byte-compiled Python
+# files before the package's .py modules are deleted.
+set -e
+
+# Automatically added by dh_python2:
+# Prefer pyclean, which knows every compiled file registered for the
+# package.
+if which pyclean >/dev/null 2>&1; then
+    pyclean -p feedingit
+else
+    # Fallback: for every .py file the package installed, delete its
+    # .pyc/.pyo sibling.  The unquoted [co] is a shell glob appended
+    # to the quoted filename; rm -f silences non-matches.
+    dpkg -L feedingit | grep \.py$ | while read file
+    do
+        rm -f "${file}"[co] >/dev/null
+    done
+fi
+
+# End automatically added section
--- /dev/null
+#!/bin/sh
+
+case "$1" in
+dbus)
+ nice python /usr/share/feedingit/update_feeds.py
+ ;;
+*)
+ cd /usr/share/feedingit
+ python feedingit.py 2>&1 >/dev/null
+ ;;
+
+esac
+
--- /dev/null
+../../../share/pyshared/feedingit-0.1.0.egg-info
\ No newline at end of file
--- /dev/null
+[Desktop Entry]
+Encoding=UTF-8
+Version=1.0
+Type=Application
+Name=FeedingIt RSS Reader
+Exec=invoker --single-instance --type=e /usr/bin/feedingit
+Icon=/usr/share/icons/hicolor/64x64/apps/feedingit.png
+Categories=Development;
--- /dev/null
+[D-BUS Service]
+Name=org.marcoz.feedingit
+Exec=/usr/bin/feedingit dbus
--- /dev/null
+"""Beautiful Soup
+Elixir and Tonic
+"The Screen-Scraper's Friend"
+http://www.crummy.com/software/BeautifulSoup/
+
+Beautiful Soup parses a (possibly invalid) XML or HTML document into a
+tree representation. It provides methods and Pythonic idioms that make
+it easy to navigate, search, and modify the tree.
+
+A well-formed XML/HTML document yields a well-formed data
+structure. An ill-formed XML/HTML document yields a correspondingly
+ill-formed data structure. If your document is only locally
+well-formed, you can use this library to find and process the
+well-formed part of it.
+
+Beautiful Soup works with Python 2.2 and up. It has no external
+dependencies, but you'll have more success at converting data to UTF-8
+if you also install these three packages:
+
+* chardet, for auto-detecting character encodings
+ http://chardet.feedparser.org/
+* cjkcodecs and iconv_codec, which add more encodings to the ones supported
+ by stock Python.
+ http://cjkpython.i18n.org/
+
+Beautiful Soup defines classes for two main parsing strategies:
+
+ * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
+ language that kind of looks like XML.
+
+ * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
+ or invalid. This class has web browser-like heuristics for
+ obtaining a sensible parse tree in the face of common HTML errors.
+
+Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
+the encoding of an HTML or XML document, and converting it to
+Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
+
+For more than you ever wanted to know about Beautiful Soup, see the
+documentation:
+http://www.crummy.com/software/BeautifulSoup/documentation.html
+
+Here, have some legalese:
+
+Copyright (c) 2004-2010, Leonard Richardson
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following
+ disclaimer in the documentation and/or other materials provided
+ with the distribution.
+
+ * Neither the name of the the Beautiful Soup Consortium and All
+ Night Kosher Bakery nor the names of its contributors may be
+ used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
+
+"""
+from __future__ import generators
+
+__author__ = "Leonard Richardson (leonardr@segfault.org)"
+__version__ = "3.2.0"
+__copyright__ = "Copyright (c) 2004-2010 Leonard Richardson"
+__license__ = "New-style BSD"
+
+from sgmllib import SGMLParser, SGMLParseError
+import codecs
+import markupbase
+import types
+import re
+import sgmllib
+try:
+ from htmlentitydefs import name2codepoint
+except ImportError:
+ name2codepoint = {}
+try:
+ set
+except NameError:
+ from sets import Set as set
+
+#These hacks make Beautiful Soup able to parse XML with namespaces
+sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
+markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match
+
+DEFAULT_OUTPUT_ENCODING = "utf-8"
+
+def _match_css_class(str):
+ """Build a RE to match the given CSS class."""
+ return re.compile(r"(^|.*\s)%s($|\s)" % str)
+
+# First, the classes that represent markup elements.
+
+class PageElement(object):
+    """Contains the navigational information for some part of the page
+    (either a tag or a piece of text)"""
+
+    def setup(self, parent=None, previous=None):
+        """Sets up the initial relations between this element and
+        other elements."""
+        self.parent = parent
+        self.previous = previous
+        self.next = None
+        self.previousSibling = None
+        self.nextSibling = None
+        if self.parent and self.parent.contents:
+            # We are being appended: the parent's current last child
+            # becomes our previous sibling.
+            self.previousSibling = self.parent.contents[-1]
+            self.previousSibling.nextSibling = self
+
+    def replaceWith(self, replaceWith):
+        # Swap this element out of the tree and put `replaceWith` in
+        # its place under the same parent.
+        oldParent = self.parent
+        myIndex = self.parent.index(self)
+        if hasattr(replaceWith, "parent")\
+           and replaceWith.parent is self.parent:
+            # We're replacing this element with one of its siblings.
+            index = replaceWith.parent.index(replaceWith)
+            if index and index < myIndex:
+                # Furthermore, it comes before this element. That
+                # means that when we extract it, the index of this
+                # element will change.
+                myIndex = myIndex - 1
+        self.extract()
+        oldParent.insert(myIndex, replaceWith)
+
+    def replaceWithChildren(self):
+        # Replace this element with its own children (unwrap it).
+        myParent = self.parent
+        myIndex = self.parent.index(self)
+        self.extract()
+        reversedChildren = list(self.contents)
+        reversedChildren.reverse()
+        # Inserting at a fixed index in reverse order restores the
+        # children's original document order.
+        for child in reversedChildren:
+            myParent.insert(myIndex, child)
+
+    def extract(self):
+        """Destructively rips this element out of the tree."""
+        if self.parent:
+            try:
+                del self.parent.contents[self.parent.index(self)]
+            except ValueError:
+                pass
+
+        #Find the two elements that would be next to each other if
+        #this element (and any children) hadn't been parsed. Connect
+        #the two.
+        lastChild = self._lastRecursiveChild()
+        nextElement = lastChild.next
+
+        if self.previous:
+            self.previous.next = nextElement
+        if nextElement:
+            nextElement.previous = self.previous
+        self.previous = None
+        lastChild.next = None
+
+        self.parent = None
+        if self.previousSibling:
+            self.previousSibling.nextSibling = self.nextSibling
+        if self.nextSibling:
+            self.nextSibling.previousSibling = self.previousSibling
+        self.previousSibling = self.nextSibling = None
+        return self
+
+    def _lastRecursiveChild(self):
+        "Finds the last element beneath this object to be parsed."
+        lastChild = self
+        while hasattr(lastChild, 'contents') and lastChild.contents:
+            lastChild = lastChild.contents[-1]
+        return lastChild
+
+    def insert(self, position, newChild):
+        # Insert `newChild` at `position` in self.contents, rewiring the
+        # parent/sibling links and the document-order next/previous
+        # chain.  Plain strings are promoted to NavigableString first.
+        if isinstance(newChild, basestring) \
+           and not isinstance(newChild, NavigableString):
+            newChild = NavigableString(newChild)
+
+        position = min(position, len(self.contents))
+        if hasattr(newChild, 'parent') and newChild.parent is not None:
+            # We're 'inserting' an element that's already one
+            # of this object's children.
+            if newChild.parent is self:
+                index = self.index(newChild)
+                if index > position:
+                    # Furthermore we're moving it further down the
+                    # list of this object's children. That means that
+                    # when we extract this element, our target index
+                    # will jump down one.
+                    position = position - 1
+            newChild.extract()
+
+        newChild.parent = self
+        previousChild = None
+        if position == 0:
+            newChild.previousSibling = None
+            newChild.previous = self
+        else:
+            previousChild = self.contents[position-1]
+            newChild.previousSibling = previousChild
+            newChild.previousSibling.nextSibling = newChild
+            newChild.previous = previousChild._lastRecursiveChild()
+        if newChild.previous:
+            newChild.previous.next = newChild
+
+        newChildsLastElement = newChild._lastRecursiveChild()
+
+        if position >= len(self.contents):
+            # Appending at the end: the next element in document order
+            # is the nextSibling of the closest ancestor that has one.
+            newChild.nextSibling = None
+
+            parent = self
+            parentsNextSibling = None
+            while not parentsNextSibling:
+                parentsNextSibling = parent.nextSibling
+                parent = parent.parent
+                if not parent: # This is the last element in the document.
+                    break
+            if parentsNextSibling:
+                newChildsLastElement.next = parentsNextSibling
+            else:
+                newChildsLastElement.next = None
+        else:
+            nextChild = self.contents[position]
+            newChild.nextSibling = nextChild
+            if newChild.nextSibling:
+                newChild.nextSibling.previousSibling = newChild
+            newChildsLastElement.next = nextChild
+
+        if newChildsLastElement.next:
+            newChildsLastElement.next.previous = newChildsLastElement
+        self.contents.insert(position, newChild)
+
+    def append(self, tag):
+        """Appends the given tag to the contents of this tag."""
+        self.insert(len(self.contents), tag)
+
+    def findNext(self, name=None, attrs={}, text=None, **kwargs):
+        """Returns the first item that matches the given criteria and
+        appears after this Tag in the document."""
+        return self._findOne(self.findAllNext, name, attrs, text, **kwargs)
+
+    def findAllNext(self, name=None, attrs={}, text=None, limit=None,
+                    **kwargs):
+        """Returns all items that match the given criteria and appear
+        after this Tag in the document."""
+        return self._findAll(name, attrs, text, limit, self.nextGenerator,
+                             **kwargs)
+
+    def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
+        """Returns the closest sibling to this Tag that matches the
+        given criteria and appears after this Tag in the document."""
+        return self._findOne(self.findNextSiblings, name, attrs, text,
+                             **kwargs)
+
+    def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
+                         **kwargs):
+        """Returns the siblings of this Tag that match the given
+        criteria and appear after this Tag in the document."""
+        return self._findAll(name, attrs, text, limit,
+                             self.nextSiblingGenerator, **kwargs)
+    fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x
+
+    def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
+        """Returns the first item that matches the given criteria and
+        appears before this Tag in the document."""
+        return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)
+
+    def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
+                        **kwargs):
+        """Returns all items that match the given criteria and appear
+        before this Tag in the document."""
+        return self._findAll(name, attrs, text, limit, self.previousGenerator,
+                             **kwargs)
+    fetchPrevious = findAllPrevious # Compatibility with pre-3.x
+
+    def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
+        """Returns the closest sibling to this Tag that matches the
+        given criteria and appears before this Tag in the document."""
+        return self._findOne(self.findPreviousSiblings, name, attrs, text,
+                             **kwargs)
+
+    def findPreviousSiblings(self, name=None, attrs={}, text=None,
+                             limit=None, **kwargs):
+        """Returns the siblings of this Tag that match the given
+        criteria and appear before this Tag in the document."""
+        return self._findAll(name, attrs, text, limit,
+                             self.previousSiblingGenerator, **kwargs)
+    fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x
+
+    def findParent(self, name=None, attrs={}, **kwargs):
+        """Returns the closest parent of this Tag that matches the given
+        criteria."""
+        # NOTE: We can't use _findOne because findParents takes a different
+        # set of arguments.
+        r = None
+        l = self.findParents(name, attrs, 1)
+        if l:
+            r = l[0]
+        return r
+
+    def findParents(self, name=None, attrs={}, limit=None, **kwargs):
+        """Returns the parents of this Tag that match the given
+        criteria."""
+
+        return self._findAll(name, attrs, None, limit, self.parentGenerator,
+                             **kwargs)
+    fetchParents = findParents # Compatibility with pre-3.x
+
+    #These methods do the real heavy lifting.
+
+    def _findOne(self, method, name, attrs, text, **kwargs):
+        # Run a findAll-style `method` with limit=1 and unwrap the
+        # single-element result list (or return None).
+        r = None
+        l = method(name, attrs, text, 1, **kwargs)
+        if l:
+            r = l[0]
+        return r
+
+    def _findAll(self, name, attrs, text, limit, generator, **kwargs):
+        "Iterates over a generator looking for things that match."
+
+        if isinstance(name, SoupStrainer):
+            strainer = name
+        # (Possibly) special case some findAll*(...) searches
+        elif text is None and not limit and not attrs and not kwargs:
+            # findAll*(True)
+            if name is True:
+                return [element for element in generator()
+                        if isinstance(element, Tag)]
+            # findAll*('tag-name')
+            elif isinstance(name, basestring):
+                return [element for element in generator()
+                        if isinstance(element, Tag) and
+                        element.name == name]
+            else:
+                strainer = SoupStrainer(name, attrs, text, **kwargs)
+        # Build a SoupStrainer
+        else:
+            strainer = SoupStrainer(name, attrs, text, **kwargs)
+        results = ResultSet(strainer)
+        g = generator()
+        while True:
+            try:
+                i = g.next()
+            except StopIteration:
+                break
+            if i:
+                found = strainer.search(i)
+                if found:
+                    results.append(found)
+                    if limit and len(results) >= limit:
+                        break
+        return results
+
+    #These Generators can be used to navigate starting from both
+    #NavigableStrings and Tags.
+    def nextGenerator(self):
+        # Walks forward in document order.  Each generator below
+        # advances before yielding, so the final value yielded is None
+        # (filtered out by the `if i:` test in _findAll).
+        i = self
+        while i is not None:
+            i = i.next
+            yield i
+
+    def nextSiblingGenerator(self):
+        i = self
+        while i is not None:
+            i = i.nextSibling
+            yield i
+
+    def previousGenerator(self):
+        i = self
+        while i is not None:
+            i = i.previous
+            yield i
+
+    def previousSiblingGenerator(self):
+        i = self
+        while i is not None:
+            i = i.previousSibling
+            yield i
+
+    def parentGenerator(self):
+        i = self
+        while i is not None:
+            i = i.parent
+            yield i
+
+    # Utility methods
+    def substituteEncoding(self, str, encoding=None):
+        # Replace the %SOUP-ENCODING% placeholder with a concrete
+        # encoding name.  (The parameter named `str` shadows the
+        # builtin; kept for compatibility.)
+        encoding = encoding or "utf-8"
+        return str.replace("%SOUP-ENCODING%", encoding)
+
+    def toEncoding(self, s, encoding=None):
+        """Encodes an object to a string in some encoding, or to Unicode.
+        ."""
+        if isinstance(s, unicode):
+            if encoding:
+                s = s.encode(encoding)
+        elif isinstance(s, str):
+            if encoding:
+                s = s.encode(encoding)
+            else:
+                s = unicode(s)
+        else:
+            # Not a string at all: stringify, then recurse.
+            if encoding:
+                s = self.toEncoding(str(s), encoding)
+            else:
+                s = unicode(s)
+        return s
+
+class NavigableString(unicode, PageElement):
+    # A piece of text in the parse tree: a unicode string that also
+    # carries PageElement's navigation links.
+
+    def __new__(cls, value):
+        """Create a new NavigableString.
+
+        When unpickling a NavigableString, this method is called with
+        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
+        passed in to the superclass's __new__ or the superclass won't know
+        how to handle non-ASCII characters.
+        """
+        if isinstance(value, unicode):
+            return unicode.__new__(cls, value)
+        return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
+
+    def __getnewargs__(self):
+        # Pickle support: reconstruct from the encoded byte-string form.
+        return (NavigableString.__str__(self),)
+
+    def __getattr__(self, attr):
+        """text.string gives you text. This is for backwards
+        compatibility for Navigable*String, but for CData* it lets you
+        get the string without the CData wrapper."""
+        if attr == 'string':
+            return self
+        else:
+            raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
+
+    def __unicode__(self):
+        return str(self).decode(DEFAULT_OUTPUT_ENCODING)
+
+    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
+        # Returns a byte string in `encoding`; with encoding=None,
+        # returns self (a unicode object).
+        if encoding:
+            return self.encode(encoding)
+        else:
+            return self
+
+class CData(NavigableString):
+    """A CDATA section: renders its text inside <![CDATA[ ... ]]>."""
+
+    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
+        return "<![CDATA[%s]]>" % NavigableString.__str__(self, encoding)
+
+class ProcessingInstruction(NavigableString):
+    """A processing instruction: renders as <?...?>, substituting the
+    output encoding for any %SOUP-ENCODING% placeholder first."""
+    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
+        output = self
+        if "%SOUP-ENCODING%" in output:
+            output = self.substituteEncoding(output, encoding)
+        return "<?%s?>" % self.toEncoding(output, encoding)
+
+class Comment(NavigableString):
+    """An HTML/XML comment: renders as <!--...-->."""
+    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
+        return "<!--%s-->" % NavigableString.__str__(self, encoding)
+
+class Declaration(NavigableString):
+    """A markup declaration (e.g. a doctype): renders as <!...>."""
+    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
+        return "<!%s>" % NavigableString.__str__(self, encoding)
+
+class Tag(PageElement):
+
+    """Represents a found HTML tag with its attributes and contents."""
+
+    def _invert(h):
+        "Cheap function to invert a hash."
+        # Called only at class-definition time (hence no `self`).
+        i = {}
+        for k,v in h.items():
+            i[v] = k
+        return i
+
+    XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
+                                      "quot" : '"',
+                                      "amp" : "&",
+                                      "lt" : "<",
+                                      "gt" : ">" }
+
+    XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)
+
+    def _convertEntities(self, match):
+        """Used in a call to re.sub to replace HTML, XML, and numeric
+        entities with the appropriate Unicode characters. If HTML
+        entities are being converted, any unrecognized entities are
+        escaped."""
+        x = match.group(1)
+        if self.convertHTMLEntities and x in name2codepoint:
+            return unichr(name2codepoint[x])
+        elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
+            if self.convertXMLEntities:
+                return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
+            else:
+                return u'&%s;' % x
+        elif len(x) > 0 and x[0] == '#':
+            # Handle numeric entities
+            if len(x) > 1 and x[1] == 'x':
+                return unichr(int(x[2:], 16))
+            else:
+                return unichr(int(x[1:]))
+
+        elif self.escapeUnrecognizedEntities:
+            return u'&%s;' % x
+        else:
+            # NOTE(review): this branch emits the same text as the
+            # escapeUnrecognizedEntities branch above -- upstream
+            # releases differ here; confirm against the intended
+            # BeautifulSoup version before changing.
+            return u'&%s;' % x
+
+    def __init__(self, parser, name, attrs=None, parent=None,
+                 previous=None):
+        "Basic constructor."
+
+        # We don't actually store the parser object: that lets extracted
+        # chunks be garbage-collected
+        self.parserClass = parser.__class__
+        self.isSelfClosing = parser.isSelfClosingTag(name)
+        self.name = name
+        if attrs is None:
+            attrs = []
+        elif isinstance(attrs, dict):
+            attrs = attrs.items()
+        self.attrs = attrs
+        self.contents = []
+        self.setup(parent, previous)
+        self.hidden = False
+        self.containsSubstitutions = False
+        self.convertHTMLEntities = parser.convertHTMLEntities
+        self.convertXMLEntities = parser.convertXMLEntities
+        self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities
+
+        # Convert any HTML, XML, or numeric entities in the attribute values.
+        # (Python 2 tuple-parameter lambda syntax.)
+        convert = lambda(k, val): (k,
+                                   re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
+                                          self._convertEntities,
+                                          val))
+        self.attrs = map(convert, self.attrs)
+
+    def getString(self):
+        # The tag's sole NavigableString child; implicitly None when
+        # the tag has zero or multiple children, or a non-string child.
+        if (len(self.contents) == 1
+            and isinstance(self.contents[0], NavigableString)):
+            return self.contents[0]
+
+    def setString(self, string):
+        """Replace the contents of the tag with a string"""
+        self.clear()
+        self.append(string)
+
+    string = property(getString, setString)
+
+    def getText(self, separator=u""):
+        # Concatenates every NavigableString beneath this tag
+        # (stripped), joined with `separator`.
+        if not len(self.contents):
+            return u""
+        stopNode = self._lastRecursiveChild().next
+        strings = []
+        current = self.contents[0]
+        while current is not stopNode:
+            if isinstance(current, NavigableString):
+                strings.append(current.strip())
+            current = current.next
+        return separator.join(strings)
+
+    text = property(getText)
+
+    def get(self, key, default=None):
+        """Returns the value of the 'key' attribute for the tag, or
+        the value given for 'default' if it doesn't have that
+        attribute."""
+        return self._getAttrMap().get(key, default)
+
+    def clear(self):
+        """Extract all children."""
+        # Iterate over a copy: extract() mutates self.contents.
+        for child in self.contents[:]:
+            child.extract()
+
+    def index(self, element):
+        # Identity-based lookup; list.index() would use ==, which
+        # matches equal-but-distinct tags (see __eq__).
+        for i, child in enumerate(self.contents):
+            if child is element:
+                return i
+        raise ValueError("Tag.index: element not in tag")
+
+    def has_key(self, key):
+        return self._getAttrMap().has_key(key)
+
+    def __getitem__(self, key):
+        """tag[key] returns the value of the 'key' attribute for the tag,
+        and throws an exception if it's not there."""
+        return self._getAttrMap()[key]
+
+    def __iter__(self):
+        "Iterating over a tag iterates over its contents."
+        return iter(self.contents)
+
+    def __len__(self):
+        "The length of a tag is the length of its list of contents."
+        return len(self.contents)
+
+    def __contains__(self, x):
+        return x in self.contents
+
+    def __nonzero__(self):
+        "A tag is non-None even if it has no contents."
+        return True
+
+    def __setitem__(self, key, value):
+        """Setting tag[key] sets the value of the 'key' attribute for the
+        tag."""
+        self._getAttrMap()
+        self.attrMap[key] = value
+        found = False
+        for i in range(0, len(self.attrs)):
+            if self.attrs[i][0] == key:
+                self.attrs[i] = (key, value)
+                found = True
+        if not found:
+            self.attrs.append((key, value))
+            self._getAttrMap()[key] = value
+
+    def __delitem__(self, key):
+        "Deleting tag[key] deletes all 'key' attributes for the tag."
+        # NOTE(review): removing items from a list while iterating it
+        # can skip adjacent duplicates of the same attribute.
+        for item in self.attrs:
+            if item[0] == key:
+                self.attrs.remove(item)
+                #We don't break because bad HTML can define the same
+                #attribute multiple times.
+        self._getAttrMap()
+        if self.attrMap.has_key(key):
+            del self.attrMap[key]
+
+    def __call__(self, *args, **kwargs):
+        """Calling a tag like a function is the same as calling its
+        findAll() method. Eg. tag('a') returns a list of all the A tags
+        found within this tag."""
+        # `apply` is the legacy Python 2 spelling of
+        # self.findAll(*args, **kwargs).
+        return apply(self.findAll, args, kwargs)
+
+    def __getattr__(self, tag):
+        #print "Getattr %s.%s" % (self.__class__, tag)
+        # soup.fooTag -> soup.find('foo'); soup.foo -> soup.find('foo').
+        # Names starting with '__' fall through to AttributeError so
+        # internal protocol lookups behave normally.
+        if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
+            return self.find(tag[:-3])
+        elif tag.find('__') != 0:
+            return self.find(tag)
+        raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag)
+
+    def __eq__(self, other):
+        """Returns true iff this tag has the same name, the same attributes,
+        and the same contents (recursively) as the given tag.
+
+        NOTE: right now this will return false if two tags have the
+        same attributes in a different order. Should this be fixed?"""
+        if other is self:
+            return True
+        if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
+            return False
+        for i in range(0, len(self.contents)):
+            if self.contents[i] != other.contents[i]:
+                return False
+        return True
+
+    def __ne__(self, other):
+        """Returns true iff this tag is not identical to the other tag,
+        as defined in __eq__."""
+        return not self == other
+
+    def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
+        """Renders this tag as a string."""
+        return self.__str__(encoding)
+
+    def __unicode__(self):
+        return self.__str__(None)
+
+    # Matches angle brackets, plus ampersands that are not already the
+    # start of a character or numeric entity reference.
+    BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
+                                           + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
+                                           + ")")
+
+    def _sub_entity(self, x):
+        """Used with a regular expression to substitute the
+        appropriate XML entity for an XML special character."""
+        return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
+
+    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
+                prettyPrint=False, indentLevel=0):
+        """Returns a string or Unicode representation of this tag and
+        its contents. To get Unicode, pass None for encoding.
+
+        NOTE: since Python's HTML parser consumes whitespace, this
+        method is not certain to reproduce the whitespace present in
+        the original string."""
+
+        encodedName = self.toEncoding(self.name, encoding)
+
+        attrs = []
+        if self.attrs:
+            for key, val in self.attrs:
+                fmt = '%s="%s"'
+                if isinstance(val, basestring):
+                    if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
+                        val = self.substituteEncoding(val, encoding)
+
+                    # The attribute value either:
+                    #
+                    # * Contains no embedded double quotes or single quotes.
+                    #   No problem: we enclose it in double quotes.
+                    # * Contains embedded single quotes. No problem:
+                    #   double quotes work here too.
+                    # * Contains embedded double quotes. No problem:
+                    #   we enclose it in single quotes.
+                    # * Embeds both single _and_ double quotes. This
+                    #   can't happen naturally, but it can happen if
+                    #   you modify an attribute value after parsing
+                    #   the document. Now we have a bit of a
+                    #   problem. We solve it by enclosing the
+                    #   attribute in single quotes, and escaping any
+                    #   embedded single quotes to XML entities.
+                    if '"' in val:
+                        fmt = "%s='%s'"
+                        if "'" in val:
+                            # TODO: replace with apos when
+                            # appropriate.
+                            val = val.replace("'", "&squot;")
+
+                    # Now we're okay w/r/t quotes. But the attribute
+                    # value might also contain angle brackets, or
+                    # ampersands that aren't part of entities. We need
+                    # to escape those to XML entities too.
+                    val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)
+
+                attrs.append(fmt % (self.toEncoding(key, encoding),
+                                    self.toEncoding(val, encoding)))
+        close = ''
+        closeTag = ''
+        if self.isSelfClosing:
+            close = ' /'
+        else:
+            closeTag = '</%s>' % encodedName
+
+        indentTag, indentContents = 0, 0
+        if prettyPrint:
+            indentTag = indentLevel
+            space = (' ' * (indentTag-1))
+            indentContents = indentTag + 1
+        contents = self.renderContents(encoding, prettyPrint, indentContents)
+        if self.hidden:
+            # A hidden tag renders only its contents (no markup of its
+            # own).
+            s = contents
+        else:
+            s = []
+            attributeString = ''
+            if attrs:
+                attributeString = ' ' + ' '.join(attrs)
+            if prettyPrint:
+                s.append(space)
+            s.append('<%s%s%s>' % (encodedName, attributeString, close))
+            if prettyPrint:
+                s.append("\n")
+            s.append(contents)
+            if prettyPrint and contents and contents[-1] != "\n":
+                s.append("\n")
+            if prettyPrint and closeTag:
+                s.append(space)
+            s.append(closeTag)
+            if prettyPrint and closeTag and self.nextSibling:
+                s.append("\n")
+            s = ''.join(s)
+        return s
+
+    def decompose(self):
+        """Recursively destroys the contents of this tree."""
+        self.extract()
+        if len(self.contents) == 0:
+            return
+        current = self.contents[0]
+        while current is not None:
+            # Save the forward link before severing every reference, so
+            # the walk can continue and cycles are broken for the GC.
+            next = current.next
+            if isinstance(current, Tag):
+                del current.contents[:]
+            current.parent = None
+            current.previous = None
+            current.previousSibling = None
+            current.next = None
+            current.nextSibling = None
+            current = next
+
+    def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
+        return self.__str__(encoding, True)
+
+    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
+                       prettyPrint=False, indentLevel=0):
+        """Renders the contents of this tag as a string in the given
+        encoding. If encoding is None, returns a Unicode string.."""
+        s=[]
+        for c in self:
+            text = None
+            if isinstance(c, NavigableString):
+                text = c.__str__(encoding)
+            elif isinstance(c, Tag):
+                s.append(c.__str__(encoding, prettyPrint, indentLevel))
+            if text and prettyPrint:
+                text = text.strip()
+            if text:
+                if prettyPrint:
+                    s.append(" " * (indentLevel-1))
+                s.append(text)
+                if prettyPrint:
+                    s.append("\n")
+        return ''.join(s)
+
+    #Soup methods
+
+    def find(self, name=None, attrs={}, recursive=True, text=None,
+             **kwargs):
+        """Return only the first child of this Tag matching the given
+        criteria."""
+        r = None
+        l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
+        if l:
+            r = l[0]
+        return r
+    findChild = find
+
+    def findAll(self, name=None, attrs={}, recursive=True, text=None,
+                limit=None, **kwargs):
+        """Extracts a list of Tag objects that match the given
+        criteria. You can specify the name of the Tag and any
+        attributes you want the Tag to have.
+
+        The value of a key-value pair in the 'attrs' map can be a
+        string, a list of strings, a regular expression object, or a
+        callable that takes a string and returns whether or not the
+        string matches for some custom definition of 'matches'. The
+        same is true of the tag name."""
+        generator = self.recursiveChildGenerator
+        if not recursive:
+            generator = self.childGenerator
+        return self._findAll(name, attrs, text, limit, generator, **kwargs)
+    findChildren = findAll
+
+    # Pre-3.x compatibility methods
+    first = find
+    fetch = findAll
+
+    def fetchText(self, text=None, recursive=True, limit=None):
+        return self.findAll(text=text, recursive=recursive, limit=limit)
+
+    def firstText(self, text=None, recursive=True):
+        return self.find(text=text, recursive=recursive)
+
+    #Private methods
+
+    def _getAttrMap(self):
+        """Initializes a map representation of this tag's attributes,
+        if not already initialized."""
+        # NOTE(review): getattr() without a default does not raise here
+        # only because __getattr__ above intercepts the miss and
+        # returns self.find('attrMap'), which is normally None (falsy)
+        # -- confirm no document actually contains an <attrMap> tag.
+        if not getattr(self, 'attrMap'):
+            self.attrMap = {}
+            for (key, value) in self.attrs:
+                self.attrMap[key] = value
+        return self.attrMap
+
+    #Generator methods
+    def childGenerator(self):
+        # Just use the iterator from the contents
+        return iter(self.contents)
+
+    def recursiveChildGenerator(self):
+        # NOTE(review): `raise StopIteration` inside a generator is the
+        # pre-PEP-479 (Python 2) idiom for "empty iterator".
+        if not len(self.contents):
+            raise StopIteration
+        stopNode = self._lastRecursiveChild().next
+        current = self.contents[0]
+        while current is not stopNode:
+            yield current
+            current = current.next
+
+
+# Next, a couple classes to represent queries and their results.
+class SoupStrainer:
+ """Encapsulates a number of ways of matching a markup element (tag or
+ text)."""
+
+ def __init__(self, name=None, attrs={}, text=None, **kwargs):
+ self.name = name
+ if isinstance(attrs, basestring):
+ kwargs['class'] = _match_css_class(attrs)
+ attrs = None
+ if kwargs:
+ if attrs:
+ attrs = attrs.copy()
+ attrs.update(kwargs)
+ else:
+ attrs = kwargs
+ self.attrs = attrs
+ self.text = text
+
+ def __str__(self):
+ if self.text:
+ return self.text
+ else:
+ return "%s|%s" % (self.name, self.attrs)
+
+ def searchTag(self, markupName=None, markupAttrs={}):
+ found = None
+ markup = None
+ if isinstance(markupName, Tag):
+ markup = markupName
+ markupAttrs = markup
+ callFunctionWithTagData = callable(self.name) \
+ and not isinstance(markupName, Tag)
+
+ if (not self.name) \
+ or callFunctionWithTagData \
+ or (markup and self._matches(markup, self.name)) \
+ or (not markup and self._matches(markupName, self.name)):
+ if callFunctionWithTagData:
+ match = self.name(markupName, markupAttrs)
+ else:
+ match = True
+ markupAttrMap = None
+ for attr, matchAgainst in self.attrs.items():
+ if not markupAttrMap:
+ if hasattr(markupAttrs, 'get'):
+ markupAttrMap = markupAttrs
+ else:
+ markupAttrMap = {}
+ for k,v in markupAttrs:
+ markupAttrMap[k] = v
+ attrValue = markupAttrMap.get(attr)
+ if not self._matches(attrValue, matchAgainst):
+ match = False
+ break
+ if match:
+ if markup:
+ found = markup
+ else:
+ found = markupName
+ return found
+
+ def search(self, markup):
+ #print 'looking for %s in %s' % (self, markup)
+ found = None
+ # If given a list of items, scan it for a text element that
+ # matches.
+ if hasattr(markup, "__iter__") \
+ and not isinstance(markup, Tag):
+ for element in markup:
+ if isinstance(element, NavigableString) \
+ and self.search(element):
+ found = element
+ break
+ # If it's a Tag, make sure its name or attributes match.
+ # Don't bother with Tags if we're searching for text.
+ elif isinstance(markup, Tag):
+ if not self.text:
+ found = self.searchTag(markup)
+ # If it's text, make sure the text matches.
+ elif isinstance(markup, NavigableString) or \
+ isinstance(markup, basestring):
+ if self._matches(markup, self.text):
+ found = markup
+ else:
+ raise Exception, "I don't know how to match against a %s" \
+ % markup.__class__
+ return found
+
    def _matches(self, markup, matchAgainst):
        """Core matching primitive: test *markup* against one
        constraint, which may be True (anything non-None), a callable,
        a compiled regexp, a list, a dict, or a string."""
        #print "Matching %s against %s" % (markup, matchAgainst)
        result = False
        if matchAgainst is True:
            # True matches any markup that exists at all.
            result = markup is not None
        elif callable(matchAgainst):
            result = matchAgainst(markup)
        else:
            #Custom match methods take the tag as an argument, but all
            #other ways of matching match the tag name as a string.
            if isinstance(markup, Tag):
                markup = markup.name
            if markup and not isinstance(markup, basestring):
                markup = unicode(markup)
            #Now we know that chunk is either a string, or None.
            if hasattr(matchAgainst, 'match'):
                # It's a regexp object.
                result = markup and matchAgainst.search(markup)
            elif hasattr(matchAgainst, '__iter__'): # list-like
                result = markup in matchAgainst
            elif hasattr(matchAgainst, 'items'):
                # NOTE(review): this asks whether the *markup* (a string
                # or None by now) has the dict as a key, which looks
                # inverted -- presumably matchAgainst.has_key(markup)
                # was intended. Preserved as-is; confirm before changing.
                result = markup.has_key(matchAgainst)
            elif matchAgainst and isinstance(markup, basestring):
                # Coerce the constraint to the markup's string type so
                # the equality test below compares like with like.
                if isinstance(markup, unicode):
                    matchAgainst = unicode(matchAgainst)
                else:
                    matchAgainst = str(matchAgainst)

            if not result:
                result = matchAgainst == markup
        return result
+
class ResultSet(list):
    """A ResultSet is just a list that keeps track of the SoupStrainer
    that created it."""
    def __init__(self, source):
        # FIX: the original called list.__init__([]), which initializes
        # a throwaway temporary list rather than this instance. It was
        # harmless only because a fresh list is already empty; pass the
        # correct receiver.
        list.__init__(self)
        # The SoupStrainer that produced this result set.
        self.source = source
+
+# Now, some helper functions.
+
def buildTagMap(default, *args):
    """Turn any mixture of maps, lists and scalars into a single map.

    Mapping arguments are merged in unchanged; the items of list-like
    arguments, and bare scalar arguments, are each mapped to *default*.
    Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
    NESTING_RESET_TAGS maps out of lists and partial maps."""
    built = {}
    for portion in args:
        if hasattr(portion, 'items'):
            # A map: merge its entries as-is.
            built.update(portion.items())
        elif hasattr(portion, '__iter__'):
            # A list: every element maps to the default value.
            for key in portion:
                built[key] = default
        else:
            # A scalar: it maps to the default value itself.
            built[portion] = default
    return built
+
+# Now, the parser classes.
+
class BeautifulStoneSoup(Tag, SGMLParser):

    """This class contains the basic parser and search code. It defines
    a parser that knows nothing about tag behavior except for the
    following:

      You can't close a tag without closing all the tags it encloses.
      That is, "<foo><bar></foo>" actually means
      "<foo><bar></bar></foo>".

    [Another possible explanation is "<foo><bar /></foo>", but since
    this class defines no SELF_CLOSING_TAGS, it will never use that
    explanation.]

    This class is useful for parsing XML or made-up markup languages,
    or when BeautifulSoup makes an assumption counter to what you were
    expecting."""

    # Subclasses override these maps to teach the parser about
    # tag-specific behavior; the base class assumes nothing.
    SELF_CLOSING_TAGS = {}
    NESTABLE_TAGS = {}
    RESET_NESTING_TAGS = {}
    QUOTE_TAGS = {}
    PRESERVE_WHITESPACE_TAGS = []

    # (regexp, replacement-callable) pairs applied to the raw markup
    # before parsing, to paper over constructs that break sgmllib.
    MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
                       lambda x: x.group(1) + ' />'),
                      (re.compile('<!\s+([^<>]*)>'),
                       lambda x: '<!' + x.group(1) + '>')
                      ]

    # Name of the synthetic root tag wrapping the whole document.
    ROOT_TAG_NAME = u'[document]'

    HTML_ENTITIES = "html"
    XML_ENTITIES = "xml"
    XHTML_ENTITIES = "xhtml"
    # TODO: This only exists for backwards-compatibility
    ALL_ENTITIES = XHTML_ENTITIES

    # Used when determining whether a text node is all whitespace and
    # can be replaced with a single space. A text node that contains
    # fancy Unicode spaces (usually non-breaking) should be left
    # alone.
    STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }

    def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
                 markupMassage=True, smartQuotesTo=XML_ENTITIES,
                 convertEntities=None, selfClosingTags=None, isHTML=False):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser.

        sgmllib will process most bad HTML, and the BeautifulSoup
        class has some tricks for dealing with some HTML that kills
        sgmllib, but Beautiful Soup can nonetheless choke or lose data
        if your data uses self-closing tags or declarations
        incorrectly.

        By default, Beautiful Soup uses regexes to sanitize input,
        avoiding the vast majority of these problems. If the problems
        don't apply to you, pass in False for markupMassage, and
        you'll get better performance.

        The default parser massage techniques fix the two most common
        instances of invalid HTML that choke sgmllib:

         <br/> (No space between name of closing tag and tag close)
         <! --Comment--> (Extraneous whitespace in declaration)

        You can pass in a custom list of (RE object, replace method)
        tuples to get Beautiful Soup to scrub your input the way you
        want."""

        self.parseOnlyThese = parseOnlyThese
        self.fromEncoding = fromEncoding
        self.smartQuotesTo = smartQuotesTo
        self.convertEntities = convertEntities
        # Set the rules for how we'll deal with the entities we
        # encounter
        if self.convertEntities:
            # It doesn't make sense to convert encoded characters to
            # entities even while you're converting entities to Unicode.
            # Just convert it all to Unicode.
            self.smartQuotesTo = None
            if convertEntities == self.HTML_ENTITIES:
                self.convertXMLEntities = False
                self.convertHTMLEntities = True
                self.escapeUnrecognizedEntities = True
            elif convertEntities == self.XHTML_ENTITIES:
                self.convertXMLEntities = True
                self.convertHTMLEntities = True
                self.escapeUnrecognizedEntities = False
            elif convertEntities == self.XML_ENTITIES:
                self.convertXMLEntities = True
                self.convertHTMLEntities = False
                self.escapeUnrecognizedEntities = False
        else:
            self.convertXMLEntities = False
            self.convertHTMLEntities = False
            self.escapeUnrecognizedEntities = False

        # Per-instance self-closing tags, merged with the class-level map
        # by isSelfClosingTag().
        self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
        SGMLParser.__init__(self)

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        self.markup = markup
        self.markupMassage = markupMassage
        try:
            self._feed(isHTML=isHTML)
        except StopParsing:
            # Raised (e.g. by start_meta) after a re-parse with the
            # correct encoding has already completed.
            pass
        self.markup = None                 # The markup can now be GCed

    def convert_charref(self, name):
        """This method fixes a bug in Python's SGMLParser."""
        try:
            n = int(name)
        except ValueError:
            return
        if not 0 <= n <= 127 : # ASCII ends at 127, not 255
            return
        return self.convert_codepoint(n)

    def _feed(self, inDocumentEncoding=None, isHTML=False):
        """Decode the stored markup to Unicode, massage it, and run it
        through the SGML parser, building the tree."""
        # Convert the document to Unicode.
        markup = self.markup
        if isinstance(markup, unicode):
            if not hasattr(self, 'originalEncoding'):
                self.originalEncoding = None
        else:
            dammit = UnicodeDammit\
                     (markup, [self.fromEncoding, inDocumentEncoding],
                      smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
            markup = dammit.unicode
            self.originalEncoding = dammit.originalEncoding
            self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
        if markup:
            if self.markupMassage:
                # markupMassage=True means "use the default fixes";
                # a list means "use these (regexp, repl) pairs".
                if not hasattr(self.markupMassage, "__iter__"):
                    self.markupMassage = self.MARKUP_MASSAGE
                for fix, m in self.markupMassage:
                    markup = fix.sub(m, markup)
                # TODO: We get rid of markupMassage so that the
                # soup object can be deepcopied later on. Some
                # Python installations can't copy regexes. If anyone
                # was relying on the existence of markupMassage, this
                # might cause problems.
                del(self.markupMassage)
        self.reset()

        SGMLParser.feed(self, markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def __getattr__(self, methodName):
        """This method routes method call requests to either the SGMLParser
        superclass or the Tag superclass, depending on the method name."""
        #print "__getattr__ called on %s.%s" % (self.__class__, methodName)

        if methodName.startswith('start_') or methodName.startswith('end_') \
               or methodName.startswith('do_'):
            return SGMLParser.__getattr__(self, methodName)
        elif not methodName.startswith('__'):
            return Tag.__getattr__(self, methodName)
        else:
            raise AttributeError

    def isSelfClosingTag(self, name):
        """Returns true iff the given string is the name of a
        self-closing tag according to this parser."""
        return self.SELF_CLOSING_TAGS.has_key(name) \
               or self.instanceSelfClosingTags.has_key(name)

    def reset(self):
        """Reset the parser and tree state, making this object the
        (hidden) root tag of a fresh document."""
        Tag.__init__(self, self, self.ROOT_TAG_NAME)
        self.hidden = 1
        SGMLParser.reset(self)
        self.currentData = []
        self.currentTag = None
        self.tagStack = []
        self.quoteStack = []
        self.pushTag(self)

    def popTag(self):
        """Pop the innermost open tag; returns the new current tag."""
        tag = self.tagStack.pop()

        #print "Pop", tag.name
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        """Open *tag* as a child of the current tag and make it current."""
        #print "Push", tag.name
        if self.currentTag:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]

    def endData(self, containerClass=NavigableString):
        """Flush accumulated character data into the tree as a
        containerClass node (NavigableString by default)."""
        if self.currentData:
            currentData = u''.join(self.currentData)
            # Collapse all-ASCII-whitespace runs to a single '\n' or ' ',
            # unless we're inside a whitespace-preserving tag.
            if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
                not set([tag.name for tag in self.tagStack]).intersection(
                    self.PRESERVE_WHITESPACE_TAGS)):
                if '\n' in currentData:
                    currentData = '\n'
                else:
                    currentData = ' '
            self.currentData = []
            # With a SoupStrainer active, drop top-level text that the
            # strainer doesn't want.
            if self.parseOnlyThese and len(self.tagStack) <= 1 and \
                   (not self.parseOnlyThese.text or \
                    not self.parseOnlyThese.search(currentData)):
                return
            o = containerClass(currentData)
            o.setup(self.currentTag, self.previous)
            if self.previous:
                self.previous.next = o
            self.previous = o
            self.currentTag.contents.append(o)


    def _popToTag(self, name, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but *not* including the most recent instance of
        the given tag."""
        #print "Popping to %s" % name
        if name == self.ROOT_TAG_NAME:
            # The root is never popped.
            return

        numPops = 0
        mostRecentTag = None
        for i in range(len(self.tagStack)-1, 0, -1):
            if name == self.tagStack[i].name:
                numPops = len(self.tagStack)-i
                break
        if not inclusivePop:
            numPops = numPops - 1

        for i in range(0, numPops):
            mostRecentTag = self.popTag()
        return mostRecentTag

    def _smartPop(self, name):

        """We need to pop up to the previous tag of this type, unless
        one of this tag's nesting reset triggers comes between this
        tag and the previous tag of this type, OR unless this tag is a
        generic nesting trigger and another generic nesting trigger
        comes between this tag and the previous tag of this type.

        Examples:
         <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
         <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
         <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.

         <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
         <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
         <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
        """

        nestingResetTriggers = self.NESTABLE_TAGS.get(name)
        isNestable = nestingResetTriggers != None
        isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
        popTo = None
        inclusive = True
        # Walk the open-tag stack from the innermost tag outward.
        for i in range(len(self.tagStack)-1, 0, -1):
            p = self.tagStack[i]
            if (not p or p.name == name) and not isNestable:
                #Non-nestable tags get popped to the top or to their
                #last occurance.
                popTo = name
                break
            if (nestingResetTriggers is not None
                and p.name in nestingResetTriggers) \
                or (nestingResetTriggers is None and isResetNesting
                    and self.RESET_NESTING_TAGS.has_key(p.name)):

                #If we encounter one of the nesting reset triggers
                #peculiar to this tag, or we encounter another tag
                #that causes nesting to reset, pop up to but not
                #including that tag.
                popTo = p.name
                inclusive = False
                break
            p = p.parent
        if popTo:
            self._popToTag(popTo, inclusive)

    def unknown_starttag(self, name, attrs, selfClosing=0):
        """Handle an opening tag: pop implicitly-closed tags, then
        create and push a new Tag node. Returns the Tag (or None when a
        SoupStrainer filtered it out)."""
        #print "Start tag %s: %s" % (name, attrs)
        if self.quoteStack:
            #This is not a real tag.
            #print "<%s> is not real!" % name
            attrs = ''.join([' %s="%s"' % (x, y) for x, y in attrs])
            self.handle_data('<%s%s>' % (name, attrs))
            return
        self.endData()

        if not self.isSelfClosingTag(name) and not selfClosing:
            self._smartPop(name)

        if self.parseOnlyThese and len(self.tagStack) <= 1 \
               and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
            return

        tag = Tag(self, name, attrs, self.currentTag, self.previous)
        if self.previous:
            self.previous.next = tag
        self.previous = tag
        self.pushTag(tag)
        if selfClosing or self.isSelfClosingTag(name):
            self.popTag()
        if name in self.QUOTE_TAGS:
            #print "Beginning quote (%s)" % name
            # Inside e.g. <script>, markup is treated as literal text.
            self.quoteStack.append(name)
            self.literal = 1
        return tag

    def unknown_endtag(self, name):
        """Handle a closing tag: pop the stack back to the matching
        opening tag, honoring quote (literal-text) sections."""
        #print "End tag %s" % name
        if self.quoteStack and self.quoteStack[-1] != name:
            #This is not a real end tag.
            #print "</%s> is not real!" % name
            self.handle_data('</%s>' % name)
            return
        self.endData()
        self._popToTag(name)
        if self.quoteStack and self.quoteStack[-1] == name:
            self.quoteStack.pop()
            self.literal = (len(self.quoteStack) > 0)

    def handle_data(self, data):
        # Character data is buffered until endData() flushes it.
        self.currentData.append(data)

    def _toStringSubclass(self, text, subclass):
        """Adds a certain piece of text to the tree as a NavigableString
        subclass."""
        self.endData()
        self.handle_data(text)
        self.endData(subclass)

    def handle_pi(self, text):
        """Handle a processing instruction as a ProcessingInstruction
        object, possibly one with a %SOUP-ENCODING% slot into which an
        encoding will be plugged later."""
        if text[:3] == "xml":
            text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
        self._toStringSubclass(text, ProcessingInstruction)

    def handle_comment(self, text):
        "Handle comments as Comment objects."
        self._toStringSubclass(text, Comment)

    def handle_charref(self, ref):
        "Handle character references as data."
        if self.convertEntities:
            data = unichr(int(ref))
        else:
            data = '&#%s;' % ref
        self.handle_data(data)

    def handle_entityref(self, ref):
        """Handle entity references as data, possibly converting known
        HTML and/or XML entity references to the corresponding Unicode
        characters."""
        data = None
        if self.convertHTMLEntities:
            try:
                data = unichr(name2codepoint[ref])
            except KeyError:
                pass

        if not data and self.convertXMLEntities:
                data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)

        if not data and self.convertHTMLEntities and \
            not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
                # TODO: We've got a problem here. We're told this is
                # an entity reference, but it's not an XML entity
                # reference or an HTML entity reference. Nonetheless,
                # the logical thing to do is to pass it through as an
                # unrecognized entity reference.
                #
                # Except: when the input is "&carol;" this function
                # will be called with input "carol". When the input is
                # "AT&T", this function will be called with input
                # "T". We have no way of knowing whether a semicolon
                # was present originally, so we don't know whether
                # this is an unknown entity or just a misplaced
                # ampersand.
                #
                # The more common case is a misplaced ampersand, so I
                # escape the ampersand and omit the trailing semicolon.
                data = "&%s" % ref
        if not data:
            # This case is different from the one above, because we
            # haven't already gone through a supposedly comprehensive
            # mapping of entities to Unicode characters. We might not
            # have gone through any mapping at all. So the chances are
            # very high that this is a real entity, and not a
            # misplaced ampersand.
            data = "&%s;" % ref
        self.handle_data(data)

    def handle_decl(self, data):
        "Handle DOCTYPEs and the like as Declaration objects."
        self._toStringSubclass(data, Declaration)

    def parse_declaration(self, i):
        """Treat a bogus SGML declaration as raw data. Treat a CDATA
        declaration as a CData object."""
        j = None
        if self.rawdata[i:i+9] == '<![CDATA[':
            k = self.rawdata.find(']]>', i)
            if k == -1:
                # Unterminated CDATA: take everything to end of input.
                k = len(self.rawdata)
            data = self.rawdata[i+9:k]
            j = k+3
            self._toStringSubclass(data, CData)
        else:
            try:
                j = SGMLParser.parse_declaration(self, i)
            except SGMLParseError:
                # Malformed declaration: keep it as literal text.
                toHandle = self.rawdata[i:]
                self.handle_data(toHandle)
                j = i + len(toHandle)
        return j
+
class BeautifulSoup(BeautifulStoneSoup):

    """This parser knows the following facts about HTML:

    * Some tags have no closing tag and should be interpreted as being
      closed as soon as they are encountered.

    * The text inside some tags (ie. 'script') may contain tags which
      are not really part of the document and which should be parsed
      as text, not tags. If you want to parse the text as tags, you can
      always fetch it and parse it explicitly.

    * Tag nesting rules:

      Most tags can't be nested at all. For instance, the occurance of
      a <p> tag should implicitly close the previous <p> tag.

       <p>Para1<p>Para2
        should be transformed into:
       <p>Para1</p><p>Para2

      Some tags can be nested arbitrarily. For instance, the occurance
      of a <blockquote> tag should _not_ implicitly close the previous
      <blockquote> tag.

       Alice said: <blockquote>Bob said: <blockquote>Blah
        should NOT be transformed into:
       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah

      Some tags can be nested, but the nesting is reset by the
      interposition of other tags. For instance, a <tr> tag should
      implicitly close the previous <tr> tag within the same <table>,
      but not close a <tr> tag in another table.

       <table><tr>Blah<tr>Blah
        should be transformed into:
       <table><tr>Blah</tr><tr>Blah
       but,
       <tr>Blah<table><tr>Blah
        should NOT be transformed into
       <tr>Blah<table></tr><tr>Blah

    Differing assumptions about tag nesting rules are a major source
    of problems with the BeautifulSoup class. If BeautifulSoup is not
    treating as nestable a tag your page author treats as nestable,
    try ICantBelieveItsBeautifulSoup, MinimalSoup, or
    BeautifulStoneSoup before writing your own subclass."""

    def __init__(self, *args, **kwargs):
        # Default smart-quote handling to HTML entities, and mark the
        # input as HTML so meta-tag charset sniffing is enabled.
        if not kwargs.has_key('smartQuotesTo'):
            kwargs['smartQuotesTo'] = self.HTML_ENTITIES
        kwargs['isHTML'] = True
        BeautifulStoneSoup.__init__(self, *args, **kwargs)

    # HTML tags that never take a closing tag.
    SELF_CLOSING_TAGS = buildTagMap(None,
                                    ('br' , 'hr', 'input', 'img', 'meta',
                                    'spacer', 'link', 'frame', 'base', 'col'))

    # Whitespace inside these tags is significant and never collapsed.
    PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])

    # The contents of these tags are treated as literal text, not markup.
    QUOTE_TAGS = {'script' : None, 'textarea' : None}

    #According to the HTML standard, each of these inline tags can
    #contain another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_INLINE_TAGS = ('span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
                            'center')

    #According to the HTML standard, these block tags can contain
    #another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_BLOCK_TAGS = ('blockquote', 'div', 'fieldset', 'ins', 'del')

    #Lists can contain other lists, but there are restrictions.
    NESTABLE_LIST_TAGS = { 'ol' : [],
                           'ul' : [],
                           'li' : ['ul', 'ol'],
                           'dl' : [],
                           'dd' : ['dl'],
                           'dt' : ['dl'] }

    #Tables can contain other tables, but there are restrictions.
    NESTABLE_TABLE_TAGS = {'table' : [],
                           'tr' : ['table', 'tbody', 'tfoot', 'thead'],
                           'td' : ['tr'],
                           'th' : ['tr'],
                           'thead' : ['table'],
                           'tbody' : ['table'],
                           'tfoot' : ['table'],
                           }

    NON_NESTABLE_BLOCK_TAGS = ('address', 'form', 'p', 'pre')

    #If one of these tags is encountered, all tags up to the next tag of
    #this type are popped.
    RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
                                     NON_NESTABLE_BLOCK_TAGS,
                                     NESTABLE_LIST_TAGS,
                                     NESTABLE_TABLE_TAGS)

    NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
                                NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)

    # Used to detect the charset in a META tag; see start_meta
    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)

    def start_meta(self, attrs):
        """Beautiful Soup can detect a charset included in a META tag,
        try to convert the document to that charset, and re-parse the
        document from the beginning."""
        httpEquiv = None
        contentType = None
        contentTypeIndex = None
        tagNeedsEncodingSubstitution = False

        # Scan the attributes for http-equiv/content, remembering where
        # the content attribute lives so it can be rewritten in place.
        for i in range(0, len(attrs)):
            key, value = attrs[i]
            key = key.lower()
            if key == 'http-equiv':
                httpEquiv = value
            elif key == 'content':
                contentType = value
                contentTypeIndex = i

        if httpEquiv and contentType: # It's an interesting meta tag.
            match = self.CHARSET_RE.search(contentType)
            if match:
                if (self.declaredHTMLEncoding is not None or
                    self.originalEncoding == self.fromEncoding):
                    # An HTML encoding was sniffed while converting
                    # the document to Unicode, or an HTML encoding was
                    # sniffed during a previous pass through the
                    # document, or an encoding was specified
                    # explicitly and it worked. Rewrite the meta tag.
                    def rewrite(match):
                        return match.group(1) + "%SOUP-ENCODING%"
                    newAttr = self.CHARSET_RE.sub(rewrite, contentType)
                    attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
                                               newAttr)
                    tagNeedsEncodingSubstitution = True
                else:
                    # This is our first pass through the document.
                    # Go through it again with the encoding information.
                    newCharset = match.group(3)
                    if newCharset and newCharset != self.originalEncoding:
                        self.declaredHTMLEncoding = newCharset
                        self._feed(self.declaredHTMLEncoding)
                        # Abort this pass; _feed() already re-parsed.
                        raise StopParsing
                    pass
        tag = self.unknown_starttag("meta", attrs)
        if tag and tagNeedsEncodingSubstitution:
            tag.containsSubstitutions = True
+
class StopParsing(Exception):
    """Raised internally to abort the current parse pass, e.g. when a
    META tag reveals the real encoding and the document is re-fed."""
+
class ICantBelieveItsBeautifulSoup(BeautifulSoup):

    """The BeautifulSoup class is oriented towards skipping over
    common HTML errors like unclosed tags. However, sometimes it makes
    errors of its own. For instance, consider this fragment:

     <b>Foo<b>Bar</b></b>

    This is perfectly valid (if bizarre) HTML. However, the
    BeautifulSoup class will implicitly close the first b tag when it
    encounters the second 'b'. It will think the author wrote
    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
    there's no real-world reason to bold something that's already
    bold. When it encounters '</b></b>' it will close two more 'b'
    tags, for a grand total of three tags closed instead of two. This
    can throw off the rest of your document structure. The same is
    true of a number of other tags, listed below.

    It's much more common for someone to forget to close a 'b' tag
    than to actually use nested 'b' tags, and the BeautifulSoup class
    handles the common case. This class handles the not-so-common
    case: where you can't believe someone wrote what they did, but
    it's valid HTML and BeautifulSoup screwed up by assuming it
    wouldn't be."""

    # Inline tags additionally treated as nestable by this parser.
    # (NOTE: 'strong' and 'big' appear twice; harmless duplicates.)
    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
     ('em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
      'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
      'big')

    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ('noscript',)

    NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
+
class MinimalSoup(BeautifulSoup):
    """The MinimalSoup class is for parsing HTML that contains
    pathologically bad markup. It makes no assumptions about tag
    nesting, but it does know which tags are self-closing, that
    <script> tags contain Javascript and should not be parsed, that
    META tags may contain encoding information, and so on.

    This also makes it better for subclassing than BeautifulStoneSoup
    or BeautifulSoup."""

    # NOTE(review): buildTagMap('noscript') passes 'noscript' as the
    # *default* with no tag arguments, so this evaluates to {} --
    # presumably buildTagMap(None, 'noscript') was intended. Preserved
    # as-is; confirm before changing.
    RESET_NESTING_TAGS = buildTagMap('noscript')
    NESTABLE_TAGS = {}
+
class BeautifulSOAP(BeautifulStoneSoup):
    """This class will push a tag with only a single string child into
    the tag's parent as an attribute. The attribute's name is the tag
    name, and the value is the string child. An example should give
    the flavor of the change:

    <foo><bar>baz</bar></foo>
     =>
    <foo bar="baz"><bar>baz</bar></foo>

    You can then access fooTag['bar'] instead of fooTag.barTag.string.

    This is, of course, useful for scraping structures that tend to
    use subelements instead of attributes, such as SOAP messages. Note
    that it modifies its input, so don't print the modified version
    out.

    I'm not sure how many people really want to use this class; let me
    know if you do. Mainly I like the name."""

    def popTag(self):
        """On tag close, promote a single-string child tag to an
        attribute on its parent before the normal pop."""
        if len(self.tagStack) > 1:
            tag = self.tagStack[-1]
            parent = self.tagStack[-2]
            # Ensure parent.attrMap exists for the has_key check below.
            parent._getAttrMap()
            if (isinstance(tag, Tag) and len(tag.contents) == 1 and
                isinstance(tag.contents[0], NavigableString) and
                not parent.attrMap.has_key(tag.name)):
                parent[tag.name] = tag.contents[0]
        BeautifulStoneSoup.popTag(self)
+
#Enterprise class names! It has come to our attention that some people
#think the names of the Beautiful Soup parser classes are too silly
#and "unprofessional" for use in enterprise screen-scraping. We feel
#your pain! For such-minded folk, the Beautiful Soup Consortium And
#All-Night Kosher Bakery recommends renaming this file to
#"RobustParser.py" (or, in cases of extreme enterprisiness,
#"RobustParserBeanInterface.class") and using the following
#enterprise-friendly class aliases:
class RobustXMLParser(BeautifulStoneSoup):
    """Enterprise-friendly alias for BeautifulStoneSoup."""
    pass
class RobustHTMLParser(BeautifulSoup):
    """Enterprise-friendly alias for BeautifulSoup."""
    pass
class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
    """Enterprise-friendly alias for ICantBelieveItsBeautifulSoup."""
    pass
class RobustInsanelyWackAssHTMLParser(MinimalSoup):
    """Enterprise-friendly alias for MinimalSoup."""
    pass
class SimplifyingSOAPParser(BeautifulSOAP):
    """Enterprise-friendly alias for BeautifulSOAP."""
    pass
+
+######################################################
+#
+# Bonus library: Unicode, Dammit
+#
+# This class forces XML data into a standard format (usually to UTF-8
+# or Unicode). It is heavily based on code from Mark Pilgrim's
+# Universal Feed Parser. It does not rewrite the XML or HTML to
+# reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
+# (XML) and BeautifulSoup.start_meta (HTML).
+
+# Autodetects character encodings.
+# Download from http://chardet.feedparser.org/
+try:
+ import chardet
+# import chardet.constants
+# chardet.constants._debug = 1
+except ImportError:
+ chardet = None
+
+# cjkcodecs and iconv_codec make Python know about more character encodings.
+# Both are available from http://cjkpython.i18n.org/
+# They're built in if you use Python 2.4.
+try:
+ import cjkcodecs.aliases
+except ImportError:
+ pass
+try:
+ import iconv_codec
+except ImportError:
+ pass
+
+class UnicodeDammit:
+ """A class for detecting the encoding of a *ML document and
+ converting it to a Unicode string. If the source encoding is
+ windows-1252, can replace MS smart quotes with their HTML or XML
+ equivalents."""
+
+ # This dictionary maps commonly seen values for "charset" in HTML
+ # meta tags to the corresponding Python codec names. It only covers
+ # values that aren't in Python's aliases and can't be determined
+ # by the heuristics in find_codec.
+ CHARSET_ALIASES = { "macintosh" : "mac-roman",
+ "x-sjis" : "shift-jis" }
+
+ def __init__(self, markup, overrideEncodings=[],
+ smartQuotesTo='xml', isHTML=False):
+ self.declaredHTMLEncoding = None
+ self.markup, documentEncoding, sniffedEncoding = \
+ self._detectEncoding(markup, isHTML)
+ self.smartQuotesTo = smartQuotesTo
+ self.triedEncodings = []
+ if markup == '' or isinstance(markup, unicode):
+ self.originalEncoding = None
+ self.unicode = unicode(markup)
+ return
+
+ u = None
+ for proposedEncoding in overrideEncodings:
+ u = self._convertFrom(proposedEncoding)
+ if u: break
+ if not u:
+ for proposedEncoding in (documentEncoding, sniffedEncoding):
+ u = self._convertFrom(proposedEncoding)
+ if u: break
+
+ # If no luck and we have auto-detection library, try that:
+ if not u and chardet and not isinstance(self.markup, unicode):
+ u = self._convertFrom(chardet.detect(self.markup)['encoding'])
+
+ # As a last resort, try utf-8 and windows-1252:
+ if not u:
+ for proposed_encoding in ("utf-8", "windows-1252"):
+ u = self._convertFrom(proposed_encoding)
+ if u: break
+
+ self.unicode = u
+ if not u: self.originalEncoding = None
+
+ def _subMSChar(self, orig):
+ """Changes a MS smart quote character to an XML or HTML
+ entity."""
+ sub = self.MS_CHARS.get(orig)
+ if isinstance(sub, tuple):
+ if self.smartQuotesTo == 'xml':
+ sub = '&#x%s;' % sub[1]
+ else:
+ sub = '&%s;' % sub[0]
+ return sub
+
+ def _convertFrom(self, proposed):
+ proposed = self.find_codec(proposed)
+ if not proposed or proposed in self.triedEncodings:
+ return None
+ self.triedEncodings.append(proposed)
+ markup = self.markup
+
+ # Convert smart quotes to HTML if coming from an encoding
+ # that might have them.
+ if self.smartQuotesTo and proposed.lower() in("windows-1252",
+ "iso-8859-1",
+ "iso-8859-2"):
+ markup = re.compile("([\x80-\x9f])").sub \
+ (lambda(x): self._subMSChar(x.group(1)),
+ markup)
+
+ try:
+ # print "Trying to convert document to %s" % proposed
+ u = self._toUnicode(markup, proposed)
+ self.markup = u
+ self.originalEncoding = proposed
+ except Exception, e:
+ # print "That didn't work!"
+ # print e
+ return None
+ #print "Correct encoding: %s" % proposed
+ return self.markup
+
    def _toUnicode(self, data, encoding):
        '''Given a string and its encoding, decodes the string into Unicode.
        %encoding is a string recognized by encodings.aliases'''

        # strip Byte Order Mark (if present); a recognized BOM also
        # overrides the proposed encoding.
        # NOTE(review): the first two branches require len(data) >= 4
        # even though they only inspect data[:2]/data[2:4] -- presumably
        # to avoid misreading a UTF-32 BOM as UTF-16; confirm.
        if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
               and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
                 and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == '\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == '\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == '\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        newdata = unicode(data, encoding)
        return newdata
+
    def _detectEncoding(self, xml_data, isHTML=False):
        """Given a document, tries to detect its XML encoding.

        Returns (possibly transcoded data, declared encoding or None,
        byte-sniffed encoding or None). BOM/byte-pattern sniffing
        follows the detection table from the XML specification
        (Appendix F); sniffed multi-byte data is re-encoded to UTF-8
        before the declaration regexps run."""
        xml_encoding = sniffed_xml_encoding = None
        try:
            if xml_data[:4] == '\x4c\x6f\xa7\x94':
                # EBCDIC
                xml_data = self._ebcdic_to_ascii(xml_data)
            elif xml_data[:4] == '\x00\x3c\x00\x3f':
                # UTF-16BE
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
                     and (xml_data[2:4] != '\x00\x00'):
                # UTF-16BE with BOM
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x3f\x00':
                # UTF-16LE
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
                     (xml_data[2:4] != '\x00\x00'):
                # UTF-16LE with BOM
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\x00\x3c':
                # UTF-32BE
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x00\x00':
                # UTF-32LE
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\xfe\xff':
                # UTF-32BE with BOM
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\xff\xfe\x00\x00':
                # UTF-32LE with BOM
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
            elif xml_data[:3] == '\xef\xbb\xbf':
                # UTF-8 with BOM
                sniffed_xml_encoding = 'utf-8'
                xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
            else:
                sniffed_xml_encoding = 'ascii'
                pass
        except:
            # Sniffing/transcoding failed; fall through and try the
            # declaration regexps on the raw data.
            xml_encoding_match = None
        # Look for an explicit declaration: <?xml ... encoding="..."?>,
        # or (for HTML) a META charset attribute.
        xml_encoding_match = re.compile(
            '^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
        if not xml_encoding_match and isHTML:
            regexp = re.compile('<\s*meta[^>]+charset=([^>]*?)[;\'">]', re.I)
            xml_encoding_match = regexp.search(xml_data)
        if xml_encoding_match is not None:
            xml_encoding = xml_encoding_match.groups()[0].lower()
            if isHTML:
                self.declaredHTMLEncoding = xml_encoding
            # A byte-level sniff trumps a declared encoding that only
            # names the generic multi-byte family.
            if sniffed_xml_encoding and \
               (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
                                 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
                                 'utf-16', 'utf-32', 'utf_16', 'utf_32',
                                 'utf16', 'u16')):
                xml_encoding = sniffed_xml_encoding
        return xml_data, xml_encoding, sniffed_xml_encoding
+
+
+ def find_codec(self, charset):
+ return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
+ or (charset and self._codec(charset.replace("-", ""))) \
+ or (charset and self._codec(charset.replace("-", "_"))) \
+ or charset
+
+ def _codec(self, charset):
+ if not charset: return charset
+ codec = None
+ try:
+ codecs.lookup(charset)
+ codec = charset
+ except (LookupError, ValueError):
+ pass
+ return codec
+
    # Lazily-built EBCDIC->ASCII translation table; shared by all instances
    # (None until the first call to _ebcdic_to_ascii builds it).
    EBCDIC_TO_ASCII_MAP = None
    def _ebcdic_to_ascii(self, s):
        """Translate the EBCDIC-encoded byte string *s* to ASCII.

        Builds the 256-entry translation table once and caches it on the
        class.  Python-2-only: relies on string.maketrans/str.translate
        operating on byte strings.
        """
        c = self.__class__
        if not c.EBCDIC_TO_ASCII_MAP:
            # ASCII code point for each EBCDIC byte value 0..255, in order.
            emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
                    16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
                    128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
                    144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
                    32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
                    38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
                    45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
                    186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
                    195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
                    201,202,106,107,108,109,110,111,112,113,114,203,204,205,
                    206,207,208,209,126,115,116,117,118,119,120,121,122,210,
                    211,212,213,214,215,216,217,218,219,220,221,222,223,224,
                    225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
                    73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
                    82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
                    90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
                    250,251,252,253,254,255)
            import string
            c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
                ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
        return s.translate(c.EBCDIC_TO_ASCII_MAP)
+
    # Windows-1252 "smart" bytes 0x80-0x9F mapped to replacement data:
    # either an (entity name, hex code point) pair, or a literal fallback
    # character for bytes with no useful mapping.
    MS_CHARS = { '\x80' : ('euro', '20AC'),
                 '\x81' : ' ',
                 '\x82' : ('sbquo', '201A'),
                 '\x83' : ('fnof', '192'),
                 '\x84' : ('bdquo', '201E'),
                 '\x85' : ('hellip', '2026'),
                 '\x86' : ('dagger', '2020'),
                 '\x87' : ('Dagger', '2021'),
                 '\x88' : ('circ', '2C6'),
                 '\x89' : ('permil', '2030'),
                 '\x8A' : ('Scaron', '160'),
                 '\x8B' : ('lsaquo', '2039'),
                 '\x8C' : ('OElig', '152'),
                 '\x8D' : '?',
                 '\x8E' : ('#x17D', '17D'),
                 '\x8F' : '?',
                 '\x90' : '?',
                 '\x91' : ('lsquo', '2018'),
                 '\x92' : ('rsquo', '2019'),
                 '\x93' : ('ldquo', '201C'),
                 '\x94' : ('rdquo', '201D'),
                 '\x95' : ('bull', '2022'),
                 '\x96' : ('ndash', '2013'),
                 '\x97' : ('mdash', '2014'),
                 '\x98' : ('tilde', '2DC'),
                 '\x99' : ('trade', '2122'),
                 '\x9a' : ('scaron', '161'),
                 '\x9b' : ('rsaquo', '203A'),
                 '\x9c' : ('oelig', '153'),
                 '\x9d' : '?',
                 '\x9e' : ('#x17E', '17E'),
                 '\x9f' : ('Yuml', ''),}
+
+#######################################################################
+
+
+#By default, act as an HTML pretty-printer.
+if __name__ == '__main__':
+ import sys
+ soup = BeautifulSoup(sys.stdin)
+ print soup.prettify()
--- /dev/null
+import sys
+from rss_sqlite import Listing
+from xml import sax
+from cgi import escape
+from re import sub
+from htmlentitydefs import name2codepoint
+from gconf import client_get_default
+
+import logging
+logger = logging.getLogger(__name__)
+
def unescape(text):
    """Replace HTML character/entity references in *text* with the
    characters they name.

    Handles decimal ("&#65;") and hexadecimal ("&#x41;") character
    references as well as named entities ("&amp;"); anything it cannot
    resolve is left untouched.
    """
    def fixup(m):
        # Renamed from `text` to avoid shadowing the outer parameter.
        ref = m.group(0)
        if ref[:2] == "&#":
            # Numeric character reference.
            try:
                if ref[:3] == "&#x":
                    return unichr(int(ref[3:-1], 16))
                else:
                    return unichr(int(ref[2:-1]))
            except ValueError:
                pass
        else:
            # Named entity.
            try:
                ref = unichr(name2codepoint[ref[1:-1]])
            except KeyError:
                pass
        return ref  # leave as is
    # Raw string: "\w" in a plain string relies on Python not treating it
    # as an escape and triggers DeprecationWarning on Python 3.6+.
    return sub(r"&#?\w+;", fixup, text)
+
def sanitize(text):
    """XML-escape *text* and encode it to ASCII, turning any non-ASCII
    characters into numeric character references.

    Uses the module-level ``from cgi import escape``; the redundant
    function-local re-import was removed.
    """
    return escape(text).encode('ascii', 'xmlcharrefreplace')
+
class XmlHandler():
    """Renders the Listing (rss_sqlite) data as small ad-hoc XML documents.

    NOTE(review): do_GET() uses the attributes and methods of a
    BaseHTTPRequestHandler subclass (self.path, send_response, wfile, ...)
    plus a few names (updatingFeeds, getCommands, openTaskSwitch,
    updateAll) that are not defined in this module; it only works when
    this class is mixed into such a handler -- confirm before using it.
    """

    def __init__(self, listing):
        self.listing=listing

    def getConfigXml(self):
        """Return the (currently hard-coded) UI configuration as XML."""
        xml = "<?xml version=\"1.0\" encoding=\"utf-8\"?><xml>"
        xml += "<hideReadFeed>True</hideReadFeed>"
        xml += "<hideReadArticles>True</hideReadArticles>"
        xml += "</xml>"
        return xml

    def generateCategoryXml(self):
        """Return every category (name + id) as an XML document."""
        xml = "<?xml version=\"1.0\" encoding=\"utf-8\"?><xml>"
        for cat in self.listing.getListOfCategories():
            xml += "<category>"
            xml += "<catname>%s</catname>" %sanitize(self.listing.getCategoryTitle(cat))
            xml += "<catid>%s</catid>" % cat
            xml += "</category>"
        xml += "</xml>"
        return xml

    def fix_title(self, title):
        """Unescape entities, strip layout tags and XML-escape a title."""
        return escape(unescape(title).replace("<em>","").replace("</em>","").replace("<nobr>","").replace("</nobr>","").replace("<wbr>","").replace("—","-"))

    def generateFeedsXml(self, catid):
        """Return the feeds of category *catid* as an XML document."""
        xml = "<?xml version=\"1.0\" encoding=\"utf-8\"?><xml>"
        for key in self.listing.getSortedListOfKeys("Manual", category=catid):
            xml += "<feed>"
            xml += "<feedname>%s</feedname>" %sanitize(self.listing.getFeedTitle(key))
            xml += "<feedid>%s</feedid>" %key
            xml += "<unread>%s</unread>" %self.listing.getFeedNumberOfUnreadItems(key)
            xml += "<updatedDate>%s</updatedDate>" %self.listing.getFeedUpdateTime(key)
            xml += "<icon>%s</icon>" %self.listing.getFavicon(key)
            # xml += "<updating>True</updating>"
            xml += "<updating>False</updating>"
            xml += "</feed>"
        xml += "</xml>"
        return xml

    def generateArticlesXml(self, key, onlyUnread, markAllAsRead="False"):
        """Return the articles of feed *key* as an XML document.

        onlyUnread and markAllAsRead arrive as the strings "True"/"False"
        (they originate from URL query arguments).

        BUG FIX: do_GET() always passed markAllAsRead, but this method did
        not accept it, raising TypeError.  It is now an optional parameter
        (the default keeps existing two-argument callers working).
        """
        feed = self.listing.getFeed(key)
        xml = "<?xml version=\"1.0\" encoding=\"utf-8\"?><xml>"
        if onlyUnread == "False":
            onlyUnread = False
        for id in feed.getIds(onlyUnread):
            xml += "<article>"
            xml += "<title>%s</title>" %self.fix_title(feed.getTitle(id))
            xml += "<articleid>%s</articleid>" %id
            xml += "<unread>%s</unread>" %str(feed.isEntryRead(id))
            xml += "<updatedDate>%s</updatedDate>" %feed.getDateStamp(id)
            xml += "<path>%s</path>" %feed.getContentLink(id)
            xml += "</article>"
        xml += "</xml>"
        if markAllAsRead == "True":
            # Mark after rendering so this response still lists the items
            # that were unread (mirrors Controller.markAllAsRead).
            feed.markAllAsRead()
        return xml

    def do_GET(self):
        """HTTP GET dispatcher (expects BaseHTTPRequestHandler context)."""
        (req, sep, arg) = self.path.partition("?")
        request = req.split("/")
        arguments = {}
        if arg != "":
            args = arg.split("&")
            for arg in args:
                ele = arg.split("=")
                arguments[ele[0]] = ele[1]
        if request[1] == "categories":
            xml = self.generateCategoryXml()
        elif request[1] == "feeds":
            catid = request[2]
            xml = self.generateFeedsXml(catid)
        elif request[1] == "articles":
            key = request[2]
            onlyUnread = arguments.get("onlyUnread","False")
            markAllAsRead = arguments.get("markAllAsRead", "False")
            xml = self.generateArticlesXml(key, onlyUnread, markAllAsRead)
        elif request[1] == "html":
            key = request[2]
            article = request[3]
            # BUG FIX: used the undefined module global `listing`; this
            # handler wraps its own Listing instance.
            feed = self.listing.getFeed(key)
            try:
                file = open(feed.getContentLink(article))
                html = file.read().replace("body", "body bgcolor='#ffffff'", 1)
                file.close()
            except Exception:
                # Best effort (was a bare except): serve a placeholder page.
                html = "<html><body>Error retrieving article</body></html>"
            self.send_response(200)
            self.send_header("Content-type", "text/html")
            self.end_headers()
            self.wfile.write(html)
            #listing.updateUnread(key)
            return
        elif request[1] == "isUpdating":
            xml = "<xml>"
            key = request[2]
            # NOTE(review): updatingFeeds and getCommands() are not defined
            # in this module -- presumably supplied by the serving subclass.
            if (key in updatingFeeds) or ((key=="") and (len(updatingFeeds)>0)):
                xml += "<updating>True</updating>"
            else:
                xml += "<updating>False</updating>"
            xml += self.getCommands()
            xml += "</xml>"
        elif request[1] == "read":
            key = request[2]
            article = request[3]
            feed = self.listing.getFeed(key)
            feed.setEntryRead(article)
            self.listing.updateUnread(key)
            self.send_response(200)
            self.send_header("Content-type", "text/html")
            self.end_headers()
            self.wfile.write("OK")
            return
        elif request[1] == "config":
            xml = self.getConfigXml()
        elif request[1] == "home":
            # NOTE(review): opens the raw request path as a local file; it
            # may still contain the query string -- confirm intent.
            file = open(self.path)
            self.send_response(200)
            self.send_header("Content-type", "text/html")
            self.end_headers()
            self.wfile.write(file.read())
            file.close()
            return
        elif request[1] == "task":
            self.openTaskSwitch()
            xml = "<xml>OK</xml>"
        elif request[1] == "deleteCat":
            key = request[2]
            self.listing.removeCategory(key)
            xml = "<xml>OK</xml>"
        elif request[1] == "deleteFeed":
            # NOTE(review): uses request[3] where sibling branches use
            # request[2]; presumably /deleteFeed/<cat>/<key> -- confirm.
            key = request[3]
            self.listing.removeFeed(key)
            xml = "<xml>OK</xml>"
        elif request[1] == "addFeed":
            cat = request[2]
            name = request[3]
            url = arguments.get("url","")
            self.listing.addFeed(name, url, category=cat)
            xml = "<xml>OK</xml>"
        elif request[1] == "updateFeed":
            key = request[2]
            self.listing.updateFeed (key, priority=-1)
            #download = Download(listing, [key,])
            #download.start()
            xml = "<xml>OK</xml>"
        elif request[1]=="updateAll":
            #app.automaticUpdate()
            self.updateAll()
            xml = "<xml>OK</xml>"
        elif request[1] == "addCat":
            catName = request[2]
            self.listing.addCategory(catName)
            xml = "<xml>OK</xml>"
        else:
            self.send_error(404, "File not found")
            return
        self.send_response(200)
        self.send_header("Content-type", "text/xml")
        self.end_headers()
        self.wfile.write(xml.encode("utf-8"))
--- /dev/null
+#!/usr/bin/env python2.5
+
+#
+# Copyright (c) 2007-2008 INdT.
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+# ============================================================================
+# Name : FeedingIt.py
+# Author : Yves Marcoz
+# Version : 0.6.1
+# Description : Simple RSS Reader
+# ============================================================================
+#try:
+# import gtk
+# import hildon
+# from gobject import idle_add
+#except:
+# pass
+
+from ConfigParser import RawConfigParser
+from gconf import client_get_default
+from urllib2 import ProxyHandler
+from mainthread import mainthread
+import logging
+logger = logging.getLogger(__name__)
+
# Application build number; also keyed into the tips URL opened from the
# settings dialog (button_tips_clicked).
VERSION = "52"

# Single section name used in config.ini.
section = "FeedingIt"
# Allowed values for each multi-choice setting (order defines picker order).
ranges = { "updateInterval":[0.5, 1, 2, 4, 12, 24], "expiry":[24, 48, 72, 144, 288], "fontSize":range(12,24), "orientation":["Automatic", "Landscape", "Portrait"], "artFontSize":[10, 12, 14, 16, 18, 20], "feedsort":["Manual", "Most unread", "Least unread", "Most recent", "Least recent"] }
# Human-readable picker titles for each setting.
titles = {"updateInterval":"Auto-update interval", "expiry":"Delete articles", "fontSize":"List font size", "orientation":"Display orientation", "artFontSize":"Article font size","feedsort":"Feed sort order"}
# %-format strings used for the pickers' subtitle text.
subtitles = {"updateInterval":"Every %s hours", "expiry":"After %s hours", "fontSize":"%s pixels", "orientation":"%s", "artFontSize":"%s pixels", "feedsort":"%s"}
+
class Config():
    """Persistent application settings plus the Maemo/Hildon settings dialog.

    Settings live in self.config (a plain dict keyed by option name) and
    are persisted to an INI file via RawConfigParser.
    """

    def __init__(self, parent, configFilename):
        self.configFilename = configFilename
        self.parent = parent
        # Load config
        self.loadConfig()

        # Backup current settings for later restore (shallow copy is enough:
        # all values are immutable scalars/strings).
        self.config_backup = dict(self.config)
        self.do_restore_backup = True

    def on_save_button_clicked(self, button):
        # Saving means: do not roll back to the backup when the dialog closes.
        self.do_restore_backup = False
        self.window.destroy()

    def createDialog(self):
        """Build and show the Hildon settings dialog; returns the dialog."""
        import gtk
        import hildon
        from gobject import idle_add
        self.window = gtk.Dialog("Settings", self.parent)
        self.window.set_geometry_hints(min_height=600)

        save_button = self.window.add_button(gtk.STOCK_SAVE, gtk.RESPONSE_OK)
        save_button.connect('clicked', self.on_save_button_clicked)
        #self.window.set_default_size(-1, 600)
        panArea = hildon.PannableArea()

        vbox = gtk.VBox(False, 2)
        self.buttons = {}

        def heading(text):
            # Small spacer followed by a framed section heading.
            l = gtk.Label()
            l.set_size_request(-1, 6)
            vbox.pack_start(l, expand=False)
            vbox.pack_start(gtk.Frame(text), expand=False)

        def add_setting(setting):
            # Finger-height picker bound to one multi-choice config entry.
            picker = hildon.PickerButton(gtk.HILDON_SIZE_FINGER_HEIGHT, hildon.BUTTON_ARRANGEMENT_VERTICAL)
            selector = self.create_selector(ranges[setting], setting)
            picker.set_selector(selector)
            picker.set_title(titles[setting])
            picker.set_text(titles[setting], subtitles[setting] % self.config[setting])
            picker.set_name('HildonButton-finger')
            picker.set_alignment(0,0,1,1)
            self.buttons[setting] = picker
            vbox.pack_start(picker, expand=False)

        button = hildon.Button(gtk.HILDON_SIZE_FINGER_HEIGHT, hildon.BUTTON_ARRANGEMENT_VERTICAL)
        button.set_label("View Known Issues and Tips")
        button.connect("clicked", self.button_tips_clicked)
        button.set_alignment(0,0,1,1)
        vbox.pack_start(button, expand=False)

        heading('Display')
        add_setting('fontSize')
        add_setting('artFontSize')
        add_setting('orientation')
        add_setting('feedsort')
        button = hildon.CheckButton(gtk.HILDON_SIZE_FINGER_HEIGHT)
        button.set_label("Hide read feeds")
        button.set_active(self.config["hidereadfeeds"])
        button.connect("toggled", self.button_toggled, "hidereadfeeds")
        vbox.pack_start(button, expand=False)

        button = hildon.CheckButton(gtk.HILDON_SIZE_FINGER_HEIGHT)
        button.set_label("Hide read articles")
        button.set_active(self.config["hidereadarticles"])
        button.connect("toggled", self.button_toggled, "hidereadarticles")
        vbox.pack_start(button, expand=False)


        heading('Updating')
        button = hildon.CheckButton(gtk.HILDON_SIZE_FINGER_HEIGHT)
        button.set_label("Automatically update feeds")
        button.set_active(self.config["autoupdate"])
        button.connect("toggled", self.button_toggled, "autoupdate")
        vbox.pack_start(button, expand=False)
        add_setting('updateInterval')
        add_setting('expiry')

        heading('Network')
        button = hildon.CheckButton(gtk.HILDON_SIZE_FINGER_HEIGHT)
        button.set_label('Cache images')
        button.set_active(self.config["imageCache"])
        button.connect("toggled", self.button_toggled, "imageCache")
        vbox.pack_start(button, expand=False)

        button = hildon.CheckButton(gtk.HILDON_SIZE_FINGER_HEIGHT)
        button.set_label("Use HTTP proxy")
        button.set_active(self.config["proxy"])
        button.connect("toggled", self.button_toggled, "proxy")
        vbox.pack_start(button, expand=False)

        button = hildon.CheckButton(gtk.HILDON_SIZE_FINGER_HEIGHT)
        button.set_label('Open links in external browser')
        button.set_active(self.config["extBrowser"])
        button.connect("toggled", self.button_toggled, "extBrowser")
        vbox.pack_start(button, expand=False)

        panArea.add_with_viewport(vbox)

        self.window.vbox.add(panArea)
        self.window.connect("destroy", self.onExit)
        #self.window.add(self.vbox)
        self.window.set_default_size(-1, 600)
        self.window.show_all()
        return self.window

    def button_tips_clicked(self, *widget):
        # Open the tips page for this build's VERSION in the system browser.
        import dbus
        bus = dbus.SessionBus()
        proxy = bus.get_object("com.nokia.osso_browser", "/com/nokia/osso_browser/request")
        iface = dbus.Interface(proxy, 'com.nokia.osso_browser')
        iface.open_new_window("http://feedingit.marcoz.org/news/?page_id=%s" % VERSION)

    def onExit(self, *widget):
        # When the dialog is closed without hitting
        # the "Save" button, restore the configuration
        if self.do_restore_backup:
            logger.debug('Restoring configuration')
            self.config = self.config_backup

        self.saveConfig()
        self.window.destroy()

    def button_toggled(self, widget, configName):
        """Persist a checkbox change immediately."""
        if (widget.get_active()):
            self.config[configName] = True
        else:
            self.config[configName] = False
        self.saveConfig()

    def selection_changed(self, selector, button, setting):
        """Persist a picker change and refresh its button text."""
        from gobject import idle_add
        current_selection = selector.get_current_text()
        if current_selection:
            self.config[setting] = current_selection
            idle_add(self.updateButton, setting)
            self.saveConfig()

    def updateButton(self, setting):
        self.buttons[setting].set_text(titles[setting], subtitles[setting] % self.config[setting])

    def loadConfig(self):
        """Read settings from the INI file, falling back to defaults.

        Each option group is wrapped in its own try so that older config
        files missing newer options only reset that group.  The excepts
        were bare; narrowed to Exception so KeyboardInterrupt/SystemExit
        still propagate.
        """
        self.config = {}
        try:
            configParser = RawConfigParser()
            configParser.read(self.configFilename)
            self.config["fontSize"] = configParser.getint(section, "fontSize")
            self.config["artFontSize"] = configParser.getint(section, "artFontSize")
            self.config["expiry"] = configParser.getint(section, "expiry")
            self.config["autoupdate"] = configParser.getboolean(section, "autoupdate")
            self.config["updateInterval"] = configParser.getfloat(section, "updateInterval")
            self.config["orientation"] = configParser.get(section, "orientation")
            self.config["imageCache"] = configParser.getboolean(section, "imageCache")
        except Exception:
            self.config["fontSize"] = 17
            self.config["artFontSize"] = 14
            self.config["expiry"] = 24
            self.config["autoupdate"] = False
            self.config["updateInterval"] = 4
            self.config["orientation"] = "Automatic"
            self.config["imageCache"] = False
        try:
            self.config["proxy"] = configParser.getboolean(section, "proxy")
        except Exception:
            self.config["proxy"] = True
        try:
            self.config["hidereadfeeds"] = configParser.getboolean(section, "hidereadfeeds")
            self.config["hidereadarticles"] = configParser.getboolean(section, "hidereadarticles")
        except Exception:
            self.config["hidereadfeeds"] = False
            self.config["hidereadarticles"] = False
        try:
            self.config["extBrowser"] = configParser.getboolean(section, "extBrowser")
        except Exception:
            self.config["extBrowser"] = False
        try:
            self.config["feedsort"] = configParser.get(section, "feedsort")
        except Exception:
            self.config["feedsort"] = "Manual"

    def saveConfig(self):
        """Write all settings to the INI file."""
        configParser = RawConfigParser()
        configParser.add_section(section)
        configParser.set(section, 'fontSize', str(self.config["fontSize"]))
        configParser.set(section, 'artFontSize', str(self.config["artFontSize"]))
        configParser.set(section, 'expiry', str(self.config["expiry"]))
        configParser.set(section, 'autoupdate', str(self.config["autoupdate"]))
        configParser.set(section, 'updateInterval', str(self.config["updateInterval"]))
        configParser.set(section, 'orientation', str(self.config["orientation"]))
        configParser.set(section, 'imageCache', str(self.config["imageCache"]))
        configParser.set(section, 'proxy', str(self.config["proxy"]))
        configParser.set(section, 'hidereadfeeds', str(self.config["hidereadfeeds"]))
        configParser.set(section, 'hidereadarticles', str(self.config["hidereadarticles"]))
        configParser.set(section, 'extBrowser', str(self.config["extBrowser"]))
        configParser.set(section, 'feedsort', str(self.config["feedsort"]))

        # Writing our configuration file; the with-statement guarantees the
        # handle is closed even if write() raises.
        with open(self.configFilename, 'wb') as f:
            configParser.write(f)

    def create_selector(self, choices, setting):
        """Build a TouchSelector for *choices*, pre-selecting the current value."""
        import gtk
        import hildon
        from gobject import idle_add
        selector = hildon.TouchSelector(text=True)
        index = 0
        for item in choices:
            iter = selector.append_text(str(item))
            if str(self.config[setting]) == str(item):
                selector.set_active(0, index)
            index += 1
        selector.connect("changed", self.selection_changed, setting)
        return selector

    # --- simple accessors -------------------------------------------------
    def getFontSize(self):
        return self.config["fontSize"]
    def getArtFontSize(self):
        return self.config["artFontSize"]
    def getExpiry(self):
        return self.config["expiry"]
    def isAutoUpdateEnabled(self):
        return self.config["autoupdate"]
    def getUpdateInterval(self):
        return float(self.config["updateInterval"])
    def getReadFont(self):
        return "sans italic %s" % self.config["fontSize"]
    def getUnreadFont(self):
        return "sans %s" % self.config["fontSize"]
    def getOrientation(self):
        # Index into ranges["orientation"], not the string itself.
        return ranges["orientation"].index(self.config["orientation"])
    def getImageCache(self):
        return self.config["imageCache"]
    @mainthread
    def getProxy(self):
        """Return (use_proxy, urllib2 ProxyHandler or None).

        Reads the system-wide proxy from GConf.  Decorated @mainthread --
        presumably because GConf access is not thread-safe; confirm.
        """
        if self.config["proxy"] == False:
            return (False, None)
        if client_get_default().get_bool('/system/http_proxy/use_http_proxy'):
            port = client_get_default().get_int('/system/http_proxy/port')
            http = client_get_default().get_string('/system/http_proxy/host')
            proxy = ProxyHandler( {"http":"http://%s:%s/"% (http,port)} )
            return (True, proxy)
        return (False, None)
    def getHideReadFeeds(self):
        return self.config["hidereadfeeds"]
    def getHideReadArticles(self):
        return self.config["hidereadarticles"]
    def getOpenInExternalBrowser(self):
        return self.config["extBrowser"]
    def getFeedSortOrder(self):
        return self.config["feedsort"]
--- /dev/null
+# Copyright (c) 2011 Neal H. Walfield
+#
+# This software is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This software is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+from __future__ import with_statement
+import os
+import logging
+import itertools
+import sys
+import string
+import traceback
+import time
+import errno
+import glob
+
# Module logger; None until init() configures logging and assigns it.
logger = None
# Previous sys.excepthook, saved by init() so my_excepthook can chain to it.
original_excepthook = None
+
def my_excepthook(exctype, value, tb):
    """Log uncaught exceptions, then defer to the saved hook.

    Installed as sys.excepthook by init(); relies on the module globals
    `logger` and `original_excepthook` that init() sets up.
    """
    # Lazy %-style logging args: formatting is skipped if the level is off.
    logger.error(
        "Uncaught exception: %s",
        ''.join(traceback.format_exception(exctype, value, tb)))
    original_excepthook(exctype, value, tb)
+
+def init(dot_directory, debug=False, max_logfiles=1, program_name=None):
+ if not os.path.isabs(dot_directory):
+ dot_directory = os.path.join(os.path.expanduser("~"), dot_directory)
+
+ logging_directory = os.path.join(dot_directory, "logging")
+ try:
+ os.makedirs(logging_directory)
+ except OSError, e:
+ if e.errno != errno.EEXIST:
+ raise
+
+ if program_name is None:
+ program_name = os.path.basename(sys.argv[0])
+ string.translate(program_name, string.maketrans(' .', '__'))
+
+ timestamp = time.strftime("%Y%m%d")
+
+ logfiles = glob.glob(os.path.join(logging_directory,
+ program_name + '-*.log'))
+ if len(logfiles) >= max_logfiles:
+ logfiles.sort()
+ for f in logfiles[:-(max_logfiles+1)]:
+ print "Purging old log file %s" % (f,)
+ try:
+ os.remove(f)
+ except OSError, e:
+ print "Removing %s: %s" % (f, str(e))
+
+ logfile = os.path.join(logging_directory,
+ program_name + '-' + timestamp + '.log')
+
+ print "Sending output to %s" % logfile
+
+ global logger
+ logger = logging.getLogger(__name__)
+
+ if debug:
+ level = logging.DEBUG
+ else:
+ level = logging.INFO
+
+ logging.basicConfig(
+ level=level,
+ format=('%(asctime)s (pid: ' + str(os.getpid()) + ') '
+ + '%(levelname)-8s %(message)s'),
+ filename=logfile,
+ filemode='a')
+
+ # Log uncaught exceptions.
+ global original_excepthook
+ original_excepthook = sys.excepthook
+ sys.excepthook = my_excepthook
+
+ def redirect(thing):
+ filename = os.path.join(logging_directory, program_name + '.' + thing)
+ try:
+ with open(filename, "r") as fhandle:
+ contents = fhandle.read()
+ except IOError, e:
+ if e.errno in (errno.ENOENT,):
+ fhandle = None
+ contents = ""
+ else:
+ logging.error("Reading %s: %s" % (filename, str(e)))
+ raise
+
+ logging.error("std%s of last run: %s" % (thing, contents))
+
+ if fhandle is not None:
+ os.remove(filename)
+
+ print "Redirecting std%s to %s" % (thing, filename)
+ return open(filename, "w", 0)
+
+ sys.stderr = redirect('err')
+ sys.stdout = redirect('out')
+
--- /dev/null
# BUG FIX: `import Thread` is not a valid module; Thread lives in threading.
from threading import Thread

class Download(Thread):
    """Background worker that updates a single feed.

    Runs Listing.updateFeed for `key`, honouring the proxy and image-cache
    settings from `config`.
    """

    def __init__(self, listing, key, config):
        Thread.__init__(self)
        self.listing = listing
        self.key = key
        self.config = config

    def run(self):
        (use_proxy, proxy) = self.config.getProxy()
        # NOTE(review): get_lock() is not defined or imported in this
        # module; presumably supplied by the project elsewhere -- confirm.
        key_lock = get_lock(self.key)
        if key_lock is not None:
            if use_proxy:
                self.listing.updateFeed(self.key, self.config.getExpiry(), proxy=proxy, imageCache=self.config.getImageCache() )
            else:
                self.listing.updateFeed(self.key, self.config.getExpiry(), imageCache=self.config.getImageCache() )
        del key_lock
\ No newline at end of file
--- /dev/null
+#!/usr/bin/python
+
+import sys
+
+from PySide import QtGui
+from PySide import QtDeclarative
+import os
+from os import mkdir, remove, stat, environ
+from os.path import isfile, isdir, exists
+
+# Comment the line below if you don't want to use OpenGL for QML rendering or if it is not supported
+from PySide import QtOpenGL, QtCore
+
+from rss_sqlite import Listing
# Per-user configuration/state directory.  Falls back to the device default
# home when $HOME is unset.  Note: no trailing separator.
CONFIGDIR = environ.get("HOME", "/home/user") + "/.feedingit"
#CONFIGDIR = "/home/user/.feedingit"
+
+import logging
+#logger = logging.getLogger(__name__)
+
+import debugging
+debugging.init(dot_directory=".feedingit", program_name="feedingit-pyside")
+
+from cgi import escape
+from re import sub
+
class FeedWrapper(QtCore.QObject):
    """QObject wrapper exposing one feed's metadata as QML properties.

    Everything is read lazily from the module-global `listing`.
    """
    def __init__(self, key):
        QtCore.QObject.__init__(self)
        self._key = key
    def _name(self):
        return listing.getFeedTitle(self._key)
    def _unread(self):
        # NOTE(review): presumably returns an int while the property below
        # is declared unicode -- confirm PySide's coercion is acceptable.
        return listing.getFeedNumberOfUnreadItems(self._key)
    def _updatedDate(self):
        return listing.getFeedUpdateTime(self._key)
    def _icon(self):
        return listing.getFavicon(self._key)
    def _feedid(self):
        return self._key
    def _updating(self):
        # BUG FIX: was `return false` (NameError); wrapped feeds are never
        # mid-update here, so report False.
        return False

    changed = QtCore.Signal()

    title = QtCore.Property(unicode, _name, notify=changed)
    feedid = QtCore.Property(unicode, _feedid, notify=changed)
    unread = QtCore.Property(unicode, _unread, notify=changed)
    updatedDate= QtCore.Property(unicode, _updatedDate, notify=changed)
    icon = QtCore.Property(unicode, _icon, notify=changed)
    # BUG FIX: `updating` was wired to _icon; bind it to _updating instead.
    updating = QtCore.Property(unicode, _updating, notify=changed)
+
class FeedsModel(QtCore.QAbstractListModel):
    """List model serving FeedWrapper objects for one category.

    `_category` stays None, so listing.getListOfFeeds() returns the
    default feed list.
    """
    COLUMNS = ('feed', )
    _category = None

    def __init__(self):
        QtCore.QAbstractListModel.__init__(self)
        self._feeds = listing.getListOfFeeds(self._category)
        self.setRoleNames(dict(enumerate(FeedsModel.COLUMNS)))

    def rowCount(self, parent=QtCore.QModelIndex()):
        return len(self._feeds)

    def data(self, index, role):
        # (debug print removed -- it ran for every delegate access)
        if index.isValid() and role == FeedsModel.COLUMNS.index('feed'):
            return FeedWrapper(self._feeds[index.row()])
        return None
+
class ArticleWrapper(QtCore.QObject):
    """QObject wrapper exposing one article of a feed as QML properties."""
    def __init__(self, feed, articleid):
        QtCore.QObject.__init__(self)
        self._feed = feed
        self._articleid = articleid
    def fix_title(self, title):
        # BUG FIX: _name() called self.fix_title(), which did not exist on
        # this class (it lives on XmlHandler in another module) and raised
        # AttributeError.  This local equivalent strips layout tags and
        # XML-escapes the rest.
        # NOTE(review): XmlHandler.fix_title also unescapes entities first;
        # that helper is not imported in this module -- confirm whether raw
        # entities can appear in titles.
        for tag in ("<em>", "</em>", "<nobr>", "</nobr>", "<wbr>"):
            title = title.replace(tag, "")
        return escape(title.replace("—", "-"))
    def _name(self):
        return self.fix_title(self._feed.getTitle(self._articleid))
    def _unread(self):
        return str(self._feed.isEntryRead(self._articleid))
    def _getarticleid(self):
        return self._articleid
    def _updatedDate(self):
        return self._feed.getDateStamp(self._articleid)
    def _path(self):
        return self._feed.getContentLink(self._articleid)

    changed = QtCore.Signal()

    title = QtCore.Property(unicode, _name, notify=changed)
    articleid = QtCore.Property(unicode, _getarticleid, notify=changed)
    unread = QtCore.Property(unicode, _unread, notify=changed)
    updatedDate= QtCore.Property(unicode, _updatedDate, notify=changed)
    path = QtCore.Property(unicode, _path, notify=changed)
+
class ArticlesModel(QtCore.QAbstractListModel):
    """List model serving ArticleWrapper objects for the current feed.

    Call updateModel(key) to point the model at a feed before use.
    """
    COLUMNS = ('article', )
    _key = None
    _feed = None

    def __init__(self,):
        QtCore.QAbstractListModel.__init__(self)
        # Per-instance article list (was a shared class-level mutable).
        self._articles = []
        self.setRoleNames(dict(enumerate(ArticlesModel.COLUMNS)))

    def updateModel(self, key):
        self._key = key
        self._feed = listing.getFeed(self._key)
        self._articles = self._feed.getIds()

    def rowCount(self, parent=QtCore.QModelIndex()):
        # (debug print removed)
        return len(self._articles)

    def data(self, index, role):
        if index.isValid() and role == ArticlesModel.COLUMNS.index('article'):
            # BUG FIX: ArticleWrapper takes (feed, articleid); only the id
            # was being passed, which raised TypeError.
            return ArticleWrapper(self._feed, self._articles[index.row()])
        return None
+
class Controller(QtCore.QObject):
    """QML-facing facade: delegates to the module-global `listing` and
    renders XML documents through an XmlHandler."""

    def __init__(self, listing):
        QtCore.QObject.__init__(self)
        from XmlHandler import XmlHandler
        self._handler = XmlHandler(listing)

    @QtCore.Slot(str,str, result=str)
    def getArticle(self, key, article):
        """Return the cached HTML of one article, forcing a white background."""
        feed = listing.getFeed(key)
        try:
            # with-statement closes the file even when read() fails (the
            # original leaked the handle on error).
            with open(feed.getContentLink(article)) as f:
                html = f.read().replace("body", "body bgcolor='#ffffff'", 1)
        except Exception:
            # Best effort (was a bare except): show a placeholder page.
            html = "<html><body>Error retrieving article</body></html>"
        return html

    @QtCore.Slot(str, result=str)
    def getFeedsXml(self, catid):
        return self._handler.generateFeedsXml(catid)

    @QtCore.Slot(str,result=str)
    def getArticlesXml(self, key):
        # Always asks for all articles (onlyUnread="False").
        return self._handler.generateArticlesXml(key, "False")

    @QtCore.Slot(result=str)
    def getCategoryXml(self):
        return self._handler.generateCategoryXml()

    @QtCore.Slot(QtCore.QObject)
    def feedClicked(self, wrapper):
        # Kept as a no-op slot; QML still invokes it.
        pass

    @QtCore.Slot(str)
    def updateFeed(self, key):
        # (debug print removed)
        listing.updateFeed(key)

    @QtCore.Slot()
    def updateAll(self):
        """Queue an update of every feed in the manual sort order."""
        for feed in listing.getListOfFeeds("Manual"):
            listing.updateFeed(feed)

    @QtCore.Slot(str,str,str)
    def addFeed(self, title, url, catid):
        listing.addFeed(title,url, category=catid)

    @QtCore.Slot(str)
    def addCategory(self, name):
        listing.addCategory(name)

    @QtCore.Slot(str)
    def markAllAsRead(self, key):
        feed = listing.getFeed(key)
        feed.markAllAsRead()

    @QtCore.Slot(str, str)
    def setEntryRead(self, key, articleid):
        feed = listing.getFeed(key)
        feed.setEntryRead(articleid)
        listing.updateUnread(key)

    @QtCore.Slot(str, result=str)
    def getConfig(self, item):
        # TODO(review): hard-coded values; also disagrees with
        # XmlHandler.getConfigXml, which reports hideReadArticles=True.
        if (item == "hideReadFeed"):
            return "True"
        if (item == "hideReadArticles"):
            return "False"
        return ""
+
def main():
    """Initialise config and feed listing, then build and run the QML UI."""
    if not isdir(CONFIGDIR):
        try:
            mkdir(CONFIGDIR)
        except OSError:
            # BUG FIX: the module-level `logger` is commented out, so the
            # old `logger.error(...)` raised NameError; log via the root
            # logger instead.
            logging.error("Error: Can't create configuration directory")
            from sys import exit
            exit(1)

    from config import Config
    global config
    # BUG FIX: CONFIGDIR has no trailing separator, so CONFIGDIR+"config.ini"
    # pointed at "~/.feedingitconfig.ini"; join the path components instead.
    config = Config(None, os.path.join(CONFIGDIR, "config.ini"))

    global listing
    listing = Listing(config, CONFIGDIR)

    import mainthread
    mainthread.init()

    from jobmanager import JobManager
    JobManager(True)

    app = QtGui.QApplication(sys.argv)
    view = QtDeclarative.QDeclarativeView()

    global articlesModel
    feedsModel = FeedsModel()
    articlesModel = ArticlesModel()

    controller = Controller(listing)

    rc = view.rootContext()

    rc.setContextProperty('controller', controller)
    rc.setContextProperty('feedsModel', feedsModel)
    rc.setContextProperty('articlesModel', articlesModel)

    # Comment the two lines below if you don't want to use OpenGL for QML rendering or if it is not supported
    glw = QtOpenGL.QGLWidget()
    view.setViewport(glw)

    # Prefer the system-installed QML; fall back to the local checkout.
    if os.path.exists('/usr/share/feedingit/qml'):
        view.setSource('/usr/share/feedingit/qml/main.qml')
    else:
        #view.setSource(os.path.join('qml','main.qml'))
        view.setSource(os.path.join('qml','FeedingIt.qml'))

    #view.showFullScreen()
    view.show()
    sys.exit(app.exec_())
+
# Script entry point: build the UI and start the Qt event loop.
if __name__ == "__main__":

    main()
--- /dev/null
+#!/usr/bin/env python
+"""Universal feed parser
+
+Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds
+
+Visit http://feedparser.org/ for the latest version
+Visit http://feedparser.org/docs/ for the latest documentation
+
+Required: Python 2.4 or later
+Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
+"""
+
+__version__ = "5.0.1"
+__license__ = """Copyright (c) 2002-2008, Mark Pilgrim, All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE."""
+__author__ = "Mark Pilgrim <http://diveintomark.org/>"
+__contributors__ = ["Jason Diamond <http://injektilo.org/>",
+ "John Beimler <http://john.beimler.org/>",
+ "Fazal Majid <http://www.majid.info/mylos/weblog/>",
+ "Aaron Swartz <http://aaronsw.com/>",
+ "Kevin Marks <http://epeus.blogspot.com/>",
+ "Sam Ruby <http://intertwingly.net/>",
+ "Ade Oshineye <http://blog.oshineye.com/>",
+ "Martin Pool <http://sourcefrog.net/>",
+ "Kurt McKee <http://kurtmckee.org/>"]
+
+# HTTP "User-Agent" header to send to servers when downloading feeds.
+# If you are embedding feedparser in a larger application, you should
+# change this to your application name and URL.
+USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % __version__
+
+# HTTP "Accept" header to send to servers when downloading feeds. If you don't
+# want to send an Accept header, set this to None.
+ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"
+
+# List of preferred XML parsers, by SAX driver name. These will be tried first,
+# but if they're not installed, Python will keep searching through its own list
+# of pre-installed parsers until it finds one that supports everything we need.
+PREFERRED_XML_PARSERS = ["drv_libxml2"]
+
+# If you want feedparser to automatically run HTML markup through HTML Tidy, set
+# this to 1. Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
+# or utidylib <http://utidylib.berlios.de/>.
+TIDY_MARKUP = 0
+
+# List of Python interfaces for HTML Tidy, in order of preference. Only useful
+# if TIDY_MARKUP = 1
+PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"]
+
+# If you want feedparser to automatically resolve all relative URIs, set this
+# to 1.
+RESOLVE_RELATIVE_URIS = 1
+
+# If you want feedparser to automatically sanitize all potentially unsafe
+# HTML content, set this to 1.
+SANITIZE_HTML = 1
+
+# ---------- Python 3 modules (make it work if possible) ----------
+try:
+ import rfc822
+except ImportError:
+ from email import _parseaddr as rfc822
+
+try:
+ # Python 3.1 introduces bytes.maketrans and simultaneously
+ # deprecates string.maketrans; use bytes.maketrans if possible
+ _maketrans = bytes.maketrans
+except (NameError, AttributeError):
+ import string
+ _maketrans = string.maketrans
+
+# base64 support for Atom feeds that contain embedded binary data
+try:
+ import base64, binascii
+except ImportError:
+ base64 = binascii = None
+else:
+ # Python 3.1 deprecates decodestring in favor of decodebytes
+ _base64decode = getattr(base64, 'decodebytes', base64.decodestring)
+
+def _s2bytes(s):
+    # Convert a UTF-8 str to bytes if the interpreter is Python 3
+    # (on Python 2 the input str is returned unchanged).
+    try:
+        return bytes(s, 'utf8')
+    except (NameError, TypeError):
+        # In Python 2.5 and below, bytes doesn't exist (NameError)
+        # In Python 2.6 and above, bytes and str are the same (TypeError)
+        return s
+
+def _l2bytes(l):
+    # Convert a list of ints to bytes if the interpreter is Python 3
+    # (on Python 2 the result is the equivalent 8-bit string).
+    try:
+        if bytes is not str:
+            # In Python 2.6 and above, this call won't raise an exception
+            # but it will return bytes([65]) as '[65]' instead of 'A'
+            return bytes(l)
+        raise NameError
+    except NameError:
+        # Python 2: build the byte string one character at a time.
+        return ''.join(map(chr, l))
+
+# If you want feedparser to allow all URL schemes, set this to ()
+# List culled from Python's urlparse documentation at:
+# http://docs.python.org/library/urlparse.html
+# as well as from "URI scheme" at Wikipedia:
+# https://secure.wikimedia.org/wikipedia/en/wiki/URI_scheme
+# Many more will likely need to be added!
+ACCEPTABLE_URI_SCHEMES = (
+ 'file', 'ftp', 'gopher', 'h323', 'hdl', 'http', 'https', 'imap', 'mailto',
+ 'mms', 'news', 'nntp', 'prospero', 'rsync', 'rtsp', 'rtspu', 'sftp',
+ 'shttp', 'sip', 'sips', 'snews', 'svn', 'svn+ssh', 'telnet', 'wais',
+ # Additional common-but-unofficial schemes
+ 'aim', 'callto', 'cvs', 'facetime', 'feed', 'git', 'gtalk', 'irc', 'ircs',
+ 'irc6', 'itms', 'mms', 'msnim', 'skype', 'ssh', 'smb', 'svn', 'ymsg',
+)
+#ACCEPTABLE_URI_SCHEMES = ()
+
+# ---------- required modules (should come with any Python distribution) ----------
+import cgi
+import copy
+import datetime
+import re
+import struct
+import sys
+import time
+import types
+import urllib
+import urllib2
+import urlparse
+
+from htmlentitydefs import name2codepoint, codepoint2name, entitydefs
+
+try:
+ from io import BytesIO as _StringIO
+except ImportError:
+ try:
+ from cStringIO import StringIO as _StringIO
+ except ImportError:
+ from StringIO import StringIO as _StringIO
+
+# ---------- optional modules (feedparser will work without these, but with reduced functionality) ----------
+
+# gzip is included with most Python distributions, but may not be available if you compiled your own
+try:
+ import gzip
+except ImportError:
+ gzip = None
+try:
+ import zlib
+except ImportError:
+ zlib = None
+
+# If a real XML parser is available, feedparser will attempt to use it. feedparser has
+# been tested with the built-in SAX parser, PyXML, and libxml2. On platforms where the
+# Python distribution does not come with an XML parser (such as Mac OS X 10.2 and some
+# versions of FreeBSD), feedparser will quietly fall back on regex-based parsing.
+try:
+ import xml.sax
+ from xml.sax.saxutils import escape as _xmlescape
+except ImportError:
+ _XML_AVAILABLE = 0
+ def _xmlescape(data,entities={}):
+ data = data.replace('&', '&')
+ data = data.replace('>', '>')
+ data = data.replace('<', '<')
+ for char, entity in entities:
+ data = data.replace(char, entity)
+ return data
+else:
+ try:
+ xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers
+ except xml.sax.SAXReaderNotAvailable:
+ _XML_AVAILABLE = 0
+ else:
+ _XML_AVAILABLE = 1
+
+# sgmllib is not available by default in Python 3; if the end user doesn't have
+# it available then we'll lose illformed XML parsing, content sanitizing, and
+# microformat support (at least while feedparser depends on BeautifulSoup).
+try:
+    import sgmllib
+except ImportError:
+    # This is probably Python 3, which doesn't include sgmllib anymore
+    _SGML_AVAILABLE = 0
+
+    # Mock sgmllib enough to allow subclassing later on
+    class sgmllib(object):
+        class SGMLParser(object):
+            def goahead(self, i):
+                pass
+            def parse_starttag(self, i):
+                pass
+else:
+    _SGML_AVAILABLE = 1
+
+    # sgmllib defines a number of module-level regular expressions that are
+    # insufficient for the XML parsing feedparser needs. Rather than modify
+    # the variables directly in sgmllib, they're defined here using the same
+    # names, and the compiled code objects of several sgmllib.SGMLParser
+    # methods are copied into _BaseHTMLProcessor so that they execute in
+    # feedparser's scope instead of sgmllib's scope.
+    charref = re.compile('&#(\d+|[xX][0-9a-fA-F]+);')
+    tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
+    attrfind = re.compile(
+        r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)[$]?(\s*=\s*'
+        r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?'
+    )
+
+    # Unfortunately, these must be copied over to prevent NameError exceptions
+    entityref = sgmllib.entityref
+    incomplete = sgmllib.incomplete
+    interesting = sgmllib.interesting
+    shorttag = sgmllib.shorttag
+    shorttagopen = sgmllib.shorttagopen
+    starttagopen = sgmllib.starttagopen
+
+    class _EndBracketRegEx:
+        def __init__(self):
+            # Overriding the built-in sgmllib.endbracket regex allows the
+            # parser to find angle brackets embedded in element attributes.
+            self.endbracket = re.compile('''([^'"<>]|"[^"]*"(?=>|/|\s|\w+=)|'[^']*'(?=>|/|\s|\w+=))*(?=[<>])|.*?(?=[<>])''')
+        def search(self, target, index=0):
+            match = self.endbracket.match(target, index)
+            if match is not None:
+                # Returning a new object in the calling thread's context
+                # resolves a thread-safety issue.
+                return EndBracketMatch(match)
+            return None
+    class EndBracketMatch:
+        # Thin wrapper presenting the position interface sgmllib expects.
+        def __init__(self, match):
+            self.match = match
+        def start(self, n):
+            # NOTE(review): deliberately returns end(n); the end of this
+            # match is where the bracket sits -- TODO confirm against the
+            # sgmllib call site, which is outside this chunk.
+            return self.match.end(n)
+    endbracket = _EndBracketRegEx()
+
+
+# cjkcodecs and iconv_codec provide support for more character encodings.
+# Both are available from http://cjkpython.i18n.org/
+try:
+ import cjkcodecs.aliases
+except ImportError:
+ pass
+try:
+ import iconv_codec
+except ImportError:
+ pass
+
+# chardet library auto-detects character encodings
+# Download from http://chardet.feedparser.org/
+try:
+ import chardet
+except ImportError:
+ chardet = None
+
+# BeautifulSoup parser used for parsing microformats from embedded HTML content
+# http://www.crummy.com/software/BeautifulSoup/
+# feedparser is tested with BeautifulSoup 3.0.x, but it might work with the
+# older 2.x series. If it doesn't, and you can figure out why, I'll accept a
+# patch and modify the compatibility statement accordingly.
+try:
+ import BeautifulSoup
+except ImportError:
+ BeautifulSoup = None
+
+# ---------- don't touch these ----------
+# Internal exception hierarchy.  ThingsNobodyCaresAboutButMe groups
+# advisory conditions; they are raised/caught elsewhere in this file
+# (not visible in this chunk).  Names are self-describing:
+class ThingsNobodyCaresAboutButMe(Exception): pass
+class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass
+class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe): pass
+class NonXMLContentType(ThingsNobodyCaresAboutButMe): pass
+class UndeclaredNamespace(Exception): pass
+
+SUPPORTED_VERSIONS = {'': u'unknown',
+ 'rss090': u'RSS 0.90',
+ 'rss091n': u'RSS 0.91 (Netscape)',
+ 'rss091u': u'RSS 0.91 (Userland)',
+ 'rss092': u'RSS 0.92',
+ 'rss093': u'RSS 0.93',
+ 'rss094': u'RSS 0.94',
+ 'rss20': u'RSS 2.0',
+ 'rss10': u'RSS 1.0',
+ 'rss': u'RSS (unknown version)',
+ 'atom01': u'Atom 0.1',
+ 'atom02': u'Atom 0.2',
+ 'atom03': u'Atom 0.3',
+ 'atom10': u'Atom 1.0',
+ 'atom': u'Atom (unknown version)',
+ 'cdf': u'CDF',
+ }
+
+class FeedParserDict(dict):
+    """Dict subclass that supports attribute-style access and maps
+    legacy feedparser key names (e.g. 'channel', 'items', 'modified')
+    onto their modern equivalents ('feed', 'entries', 'updated')."""
+    # Legacy key -> canonical key.  A list value means "try these keys
+    # in order" on lookup; on assignment only the first entry is used.
+    keymap = {'channel': 'feed',
+              'items': 'entries',
+              'guid': 'id',
+              'date': 'updated',
+              'date_parsed': 'updated_parsed',
+              'description': ['summary', 'subtitle'],
+              'url': ['href'],
+              'modified': 'updated',
+              'modified_parsed': 'updated_parsed',
+              'issued': 'published',
+              'issued_parsed': 'published_parsed',
+              'copyright': 'rights',
+              'copyright_detail': 'rights_detail',
+              'tagline': 'subtitle',
+              'tagline_detail': 'subtitle_detail'}
+    def __getitem__(self, key):
+        # A few keys are synthesized from other stored values rather
+        # than looked up directly.
+        if key == 'category':
+            # first tag's term, if any
+            try:
+                return dict.__getitem__(self, 'tags')[0]['term']
+            except IndexError:
+                raise KeyError, "object doesn't have key 'category'"
+        elif key == 'enclosures':
+            # all links with rel='enclosure', minus the 'rel' key itself
+            norel = lambda link: FeedParserDict([(name,value) for (name,value) in link.items() if name!='rel'])
+            return [norel(link) for link in dict.__getitem__(self, 'links') if link['rel']==u'enclosure']
+        elif key == 'license':
+            # href of the first link with rel='license'
+            for link in dict.__getitem__(self, 'links'):
+                if link['rel']==u'license' and link.has_key('href'):
+                    return link['href']
+        elif key == 'categories':
+            return [(tag['scheme'], tag['term']) for tag in dict.__getitem__(self, 'tags')]
+        else:
+            # translate legacy names, honoring list-valued mappings
+            realkey = self.keymap.get(key, key)
+            if isinstance(realkey, list):
+                for k in realkey:
+                    if dict.__contains__(self, k):
+                        return dict.__getitem__(self, k)
+            elif dict.__contains__(self, realkey):
+                return dict.__getitem__(self, realkey)
+        # fall through: plain lookup (raises KeyError if truly missing)
+        return dict.__getitem__(self, key)
+
+    def __contains__(self, key):
+        # Membership respects the key mapping/synthesis above.
+        try:
+            self.__getitem__(key)
+        except KeyError:
+            return False
+        else:
+            return True
+
+    has_key = __contains__
+
+    def get(self, key, default=None):
+        try:
+            return self.__getitem__(key)
+        except KeyError:
+            return default
+
+    def __setitem__(self, key, value):
+        # Store under the canonical key name.
+        key = self.keymap.get(key, key)
+        if isinstance(key, list):
+            key = key[0]
+        return dict.__setitem__(self, key, value)
+
+    def setdefault(self, key, value):
+        if key not in self:
+            self[key] = value
+            return value
+        return self[key]
+
+    def __getattr__(self, key):
+        # __getattribute__() is called first; this will be called
+        # only if an attribute was not already found
+        try:
+            return self.__getitem__(key)
+        except KeyError:
+            raise AttributeError, "object has no attribute '%s'" % key
+
+
+_ebcdic_to_ascii_map = None # lazily-built 256-entry translation table
+def _ebcdic_to_ascii(s):
+    # Translate an EBCDIC byte string to its ASCII equivalent.  The
+    # translation table is built on first use and cached in the module
+    # global above.
+    global _ebcdic_to_ascii_map
+    if not _ebcdic_to_ascii_map:
+        emap = (
+            0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
+            16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
+            128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
+            144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
+            32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
+            38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
+            45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
+            186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
+            195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,201,
+            202,106,107,108,109,110,111,112,113,114,203,204,205,206,207,208,
+            209,126,115,116,117,118,119,120,121,122,210,211,212,213,214,215,
+            216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,
+            123,65,66,67,68,69,70,71,72,73,232,233,234,235,236,237,
+            125,74,75,76,77,78,79,80,81,82,238,239,240,241,242,243,
+            92,159,83,84,85,86,87,88,89,90,244,245,246,247,248,249,
+            48,49,50,51,52,53,54,55,56,57,250,251,252,253,254,255
+            )
+        _ebcdic_to_ascii_map = _maketrans( \
+            _l2bytes(range(256)), _l2bytes(emap))
+    return s.translate(_ebcdic_to_ascii_map)
+
+_cp1252 = {
+ unichr(128): unichr(8364), # euro sign
+ unichr(130): unichr(8218), # single low-9 quotation mark
+ unichr(131): unichr( 402), # latin small letter f with hook
+ unichr(132): unichr(8222), # double low-9 quotation mark
+ unichr(133): unichr(8230), # horizontal ellipsis
+ unichr(134): unichr(8224), # dagger
+ unichr(135): unichr(8225), # double dagger
+ unichr(136): unichr( 710), # modifier letter circumflex accent
+ unichr(137): unichr(8240), # per mille sign
+ unichr(138): unichr( 352), # latin capital letter s with caron
+ unichr(139): unichr(8249), # single left-pointing angle quotation mark
+ unichr(140): unichr( 338), # latin capital ligature oe
+ unichr(142): unichr( 381), # latin capital letter z with caron
+ unichr(145): unichr(8216), # left single quotation mark
+ unichr(146): unichr(8217), # right single quotation mark
+ unichr(147): unichr(8220), # left double quotation mark
+ unichr(148): unichr(8221), # right double quotation mark
+ unichr(149): unichr(8226), # bullet
+ unichr(150): unichr(8211), # en dash
+ unichr(151): unichr(8212), # em dash
+ unichr(152): unichr( 732), # small tilde
+ unichr(153): unichr(8482), # trade mark sign
+ unichr(154): unichr( 353), # latin small letter s with caron
+ unichr(155): unichr(8250), # single right-pointing angle quotation mark
+ unichr(156): unichr( 339), # latin small ligature oe
+ unichr(158): unichr( 382), # latin small letter z with caron
+ unichr(159): unichr( 376)} # latin capital letter y with diaeresis
+
+_urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')
+def _urljoin(base, uri):
+    # Join `uri` (possibly relative) against `base`, always returning
+    # unicode.  _urifixer first drops stray slashes immediately after
+    # the scheme's '://' (group 2 of the regex is discarded).
+    uri = _urifixer.sub(r'\1\3', uri)
+    #try:
+    uri = urlparse.urljoin(base, uri)
+    if not isinstance(uri, unicode):
+        return uri.decode('utf-8', 'ignore')
+    return uri
+    #except:
+    #    uri = urlparse.urlunparse([urllib.quote(part) for part in urlparse.urlparse(uri)])
+    #    return urlparse.urljoin(base, uri)
+
+class _FeedParserMixin:
+ namespaces = {'': '',
+ 'http://backend.userland.com/rss': '',
+ 'http://blogs.law.harvard.edu/tech/rss': '',
+ 'http://purl.org/rss/1.0/': '',
+ 'http://my.netscape.com/rdf/simple/0.9/': '',
+ 'http://example.com/newformat#': '',
+ 'http://example.com/necho': '',
+ 'http://purl.org/echo/': '',
+ 'uri/of/echo/namespace#': '',
+ 'http://purl.org/pie/': '',
+ 'http://purl.org/atom/ns#': '',
+ 'http://www.w3.org/2005/Atom': '',
+ 'http://purl.org/rss/1.0/modules/rss091#': '',
+
+ 'http://webns.net/mvcb/': 'admin',
+ 'http://purl.org/rss/1.0/modules/aggregation/': 'ag',
+ 'http://purl.org/rss/1.0/modules/annotate/': 'annotate',
+ 'http://media.tangent.org/rss/1.0/': 'audio',
+ 'http://backend.userland.com/blogChannelModule': 'blogChannel',
+ 'http://web.resource.org/cc/': 'cc',
+ 'http://backend.userland.com/creativeCommonsRssModule': 'creativeCommons',
+ 'http://purl.org/rss/1.0/modules/company': 'co',
+ 'http://purl.org/rss/1.0/modules/content/': 'content',
+ 'http://my.theinfo.org/changed/1.0/rss/': 'cp',
+ 'http://purl.org/dc/elements/1.1/': 'dc',
+ 'http://purl.org/dc/terms/': 'dcterms',
+ 'http://purl.org/rss/1.0/modules/email/': 'email',
+ 'http://purl.org/rss/1.0/modules/event/': 'ev',
+ 'http://rssnamespace.org/feedburner/ext/1.0': 'feedburner',
+ 'http://freshmeat.net/rss/fm/': 'fm',
+ 'http://xmlns.com/foaf/0.1/': 'foaf',
+ 'http://www.w3.org/2003/01/geo/wgs84_pos#': 'geo',
+ 'http://postneo.com/icbm/': 'icbm',
+ 'http://purl.org/rss/1.0/modules/image/': 'image',
+ 'http://www.itunes.com/DTDs/PodCast-1.0.dtd': 'itunes',
+ 'http://example.com/DTDs/PodCast-1.0.dtd': 'itunes',
+ 'http://purl.org/rss/1.0/modules/link/': 'l',
+ 'http://search.yahoo.com/mrss': 'media',
+ #Version 1.1.2 of the Media RSS spec added the trailing slash on the namespace
+ 'http://search.yahoo.com/mrss/': 'media',
+ 'http://madskills.com/public/xml/rss/module/pingback/': 'pingback',
+ 'http://prismstandard.org/namespaces/1.2/basic/': 'prism',
+ 'http://www.w3.org/1999/02/22-rdf-syntax-ns#': 'rdf',
+ 'http://www.w3.org/2000/01/rdf-schema#': 'rdfs',
+ 'http://purl.org/rss/1.0/modules/reference/': 'ref',
+ 'http://purl.org/rss/1.0/modules/richequiv/': 'reqv',
+ 'http://purl.org/rss/1.0/modules/search/': 'search',
+ 'http://purl.org/rss/1.0/modules/slash/': 'slash',
+ 'http://schemas.xmlsoap.org/soap/envelope/': 'soap',
+ 'http://purl.org/rss/1.0/modules/servicestatus/': 'ss',
+ 'http://hacks.benhammersley.com/rss/streaming/': 'str',
+ 'http://purl.org/rss/1.0/modules/subscription/': 'sub',
+ 'http://purl.org/rss/1.0/modules/syndication/': 'sy',
+ 'http://schemas.pocketsoap.com/rss/myDescModule/': 'szf',
+ 'http://purl.org/rss/1.0/modules/taxonomy/': 'taxo',
+ 'http://purl.org/rss/1.0/modules/threading/': 'thr',
+ 'http://purl.org/rss/1.0/modules/textinput/': 'ti',
+ 'http://madskills.com/public/xml/rss/module/trackback/':'trackback',
+ 'http://wellformedweb.org/commentAPI/': 'wfw',
+ 'http://purl.org/rss/1.0/modules/wiki/': 'wiki',
+ 'http://www.w3.org/1999/xhtml': 'xhtml',
+ 'http://www.w3.org/1999/xlink': 'xlink',
+ 'http://www.w3.org/XML/1998/namespace': 'xml'
+}
+ _matchnamespaces = {}
+
+ can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'icon', 'logo']
+ can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
+ can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
+ html_types = [u'text/html', u'application/xhtml+xml']
+
+    def __init__(self, baseuri=None, baselang=None, encoding=u'utf-8'):
+        # Initialize per-parse state.  baseuri/baselang seed xml:base and
+        # xml:lang tracking; encoding is the document character encoding.
+        # _matchnamespaces (class attribute) is filled once, lazily, with
+        # lowercased namespace URIs -> short prefixes.
+        if not self._matchnamespaces:
+            for k, v in self.namespaces.items():
+                self._matchnamespaces[k.lower()] = v
+        self.feeddata = FeedParserDict() # feed-level data
+        self.encoding = encoding # character encoding
+        self.entries = [] # list of entry-level data
+        self.version = u'' # feed type/version, see SUPPORTED_VERSIONS
+        self.namespacesInUse = {} # dictionary of namespaces defined by the feed
+
+        # the following are used internally to track state;
+        # this is really out of control and should be refactored
+        self.infeed = 0
+        self.inentry = 0
+        self.incontent = 0
+        self.intextinput = 0
+        self.inimage = 0
+        self.inauthor = 0
+        self.incontributor = 0
+        self.inpublisher = 0
+        self.insource = 0
+        self.sourcedata = FeedParserDict()
+        self.contentparams = FeedParserDict()
+        self._summaryKey = None
+        self.namespacemap = {}
+        self.elementstack = [] # stack of [element, expectingText, pieces] (see push())
+        self.basestack = [] # stack of xml:base values
+        self.langstack = [] # stack of xml:lang values
+        self.baseuri = baseuri or u''
+        self.lang = baselang or None
+        self.svgOK = 0
+        self.hasTitle = 0
+        if baselang:
+            self.feeddata['language'] = baselang.replace('_','-')
+
+ def _normalize_attributes(self, kv):
+ k = kv[0].lower()
+ v = k in ('rel', 'type') and kv[1].lower() or kv[1]
+ # the sgml parser doesn't handle entities in attributes, nor
+ # does it pass the attribute values through as unicode, while
+ # strict xml parsers do -- account for this difference
+ if isinstance(self, _LooseFeedParser):
+ v = v.replace('&', '&')
+ if not isinstance(v, unicode):
+ v = v.decode('utf-8')
+ return (k, v)
+
+    def unknown_starttag(self, tag, attrs):
+        # Generic start-tag handler: normalizes attributes, tracks
+        # xml:base / xml:lang / namespace declarations, buffers inline
+        # xhtml content verbatim, then dispatches to a _start_<name>
+        # handler if one is defined.
+        # normalize attrs
+        attrs = map(self._normalize_attributes, attrs)
+
+        # track xml:base and xml:lang
+        attrsD = dict(attrs)
+        baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri
+        if not isinstance(baseuri, unicode):
+            baseuri = baseuri.decode(self.encoding, 'ignore')
+        # ensure that self.baseuri is always an absolute URI that
+        # uses a whitelisted URI scheme (e.g. not `javscript:`)
+        if self.baseuri:
+            self.baseuri = _makeSafeAbsoluteURI(self.baseuri, baseuri) or self.baseuri
+        else:
+            self.baseuri = _urljoin(self.baseuri, baseuri)
+        lang = attrsD.get('xml:lang', attrsD.get('lang'))
+        if lang == '':
+            # xml:lang could be explicitly set to '', we need to capture that
+            lang = None
+        elif lang is None:
+            # if no xml:lang is specified, use parent lang
+            lang = self.lang
+        if lang:
+            if tag in ('feed', 'rss', 'rdf:RDF'):
+                self.feeddata['language'] = lang.replace('_','-')
+        self.lang = lang
+        self.basestack.append(self.baseuri)
+        self.langstack.append(lang)
+
+        # track namespaces
+        for prefix, uri in attrs:
+            if prefix.startswith('xmlns:'):
+                self.trackNamespace(prefix[6:], uri)
+            elif prefix == 'xmlns':
+                self.trackNamespace(None, uri)
+
+        # track inline content
+        if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', u'xml').endswith(u'xml'):
+            if tag in ['xhtml:div', 'div']:
+                return # typepad does this 10/2007
+            # element declared itself as escaped markup, but it isn't really
+            self.contentparams['type'] = u'application/xhtml+xml'
+        if self.incontent and self.contentparams.get('type') == u'application/xhtml+xml':
+            if tag.find(':') <> -1:
+                prefix, tag = tag.split(':', 1)
+                namespace = self.namespacesInUse.get(prefix, '')
+                if tag=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
+                    attrs.append(('xmlns',namespace))
+                if tag=='svg' and namespace=='http://www.w3.org/2000/svg':
+                    attrs.append(('xmlns',namespace))
+            if tag == 'svg':
+                self.svgOK += 1
+            # buffer the tag text verbatim instead of dispatching
+            return self.handle_data('<%s%s>' % (tag, self.strattrs(attrs)), escape=0)
+
+        # match namespaces
+        if tag.find(':') <> -1:
+            prefix, suffix = tag.split(':', 1)
+        else:
+            prefix, suffix = '', tag
+        prefix = self.namespacemap.get(prefix, prefix)
+        if prefix:
+            prefix = prefix + '_'
+
+        # special hack for better tracking of empty textinput/image elements in illformed feeds
+        if (not prefix) and tag not in ('title', 'link', 'description', 'name'):
+            self.intextinput = 0
+        if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'):
+            self.inimage = 0
+
+        # call special handler (if defined) or default handler
+        methodname = '_start_' + prefix + suffix
+        try:
+            method = getattr(self, methodname)
+            return method(attrsD)
+        except AttributeError:
+            # Since there's no handler or something has gone wrong we explicitly add the element and its attributes
+            unknown_tag = prefix + suffix
+            if len(attrsD) == 0:
+                # No attributes so merge it into the enclosing dictionary
+                return self.push(unknown_tag, 1)
+            else:
+                # Has attributes so create it in its own dictionary
+                context = self._getContext()
+                context[unknown_tag] = attrsD
+
+    def unknown_endtag(self, tag):
+        # Generic end-tag handler, mirroring unknown_starttag(): dispatch
+        # to _end_<name> if defined (else pop the element), then unwind
+        # inline-content, xml:base and xml:lang tracking.
+        # match namespaces
+        if tag.find(':') <> -1:
+            prefix, suffix = tag.split(':', 1)
+        else:
+            prefix, suffix = '', tag
+        prefix = self.namespacemap.get(prefix, prefix)
+        if prefix:
+            prefix = prefix + '_'
+        if suffix == 'svg' and self.svgOK:
+            self.svgOK -= 1
+
+        # call special handler (if defined) or default handler
+        methodname = '_end_' + prefix + suffix
+        try:
+            if self.svgOK:
+                # still inside inline SVG: skip special handlers entirely
+                raise AttributeError()
+            method = getattr(self, methodname)
+            method()
+        except AttributeError:
+            self.pop(prefix + suffix)
+
+        # track inline content
+        if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', u'xml').endswith(u'xml'):
+            # element declared itself as escaped markup, but it isn't really
+            if tag in ['xhtml:div', 'div']:
+                return # typepad does this 10/2007
+            self.contentparams['type'] = u'application/xhtml+xml'
+        if self.incontent and self.contentparams.get('type') == u'application/xhtml+xml':
+            tag = tag.split(':')[-1]
+            self.handle_data('</%s>' % tag, escape=0)
+
+        # track xml:base and xml:lang going out of scope
+        if self.basestack:
+            self.basestack.pop()
+            if self.basestack and self.basestack[-1]:
+                self.baseuri = self.basestack[-1]
+        if self.langstack:
+            self.langstack.pop()
+            if self.langstack: # and (self.langstack[-1] is not None):
+                self.lang = self.langstack[-1]
+
+    def handle_charref(self, ref):
+        # called for each character reference, e.g. for '&#160;', ref will be '160'
+        if not self.elementstack:
+            return
+        ref = ref.lower()
+        if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
+            # XML-special characters are kept escaped in the buffer
+            text = '&#%s;' % ref
+        else:
+            # everything else is decoded to the actual character (UTF-8)
+            if ref[0] == 'x':
+                c = int(ref[1:], 16)
+            else:
+                c = int(ref)
+            text = unichr(c).encode('utf-8')
+        self.elementstack[-1][2].append(text)
+
+    def handle_entityref(self, ref):
+        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
+        if not self.elementstack:
+            return
+        if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
+            # XML-special entities are kept escaped in the buffer
+            text = '&%s;' % ref
+        elif ref in self.entities.keys():
+            # entity defined by the document (self.entities is populated
+            # outside this chunk); numeric definitions are re-dispatched
+            text = self.entities[ref]
+            if text.startswith('&#') and text.endswith(';'):
+                return self.handle_entityref(text)
+        else:
+            # standard HTML entity: decode via name2codepoint if known,
+            # otherwise keep the reference verbatim
+            try:
+                name2codepoint[ref]
+            except KeyError:
+                text = '&%s;' % ref
+            else:
+                text = unichr(name2codepoint[ref]).encode('utf-8')
+        self.elementstack[-1][2].append(text)
+
+    def handle_data(self, text, escape=1):
+        # called for each block of plain text, i.e. outside of any tag and
+        # not containing any character or entity references; the text is
+        # appended to the innermost open element's piece list
+        if not self.elementstack:
+            return
+        if escape and self.contentparams.get('type') == u'application/xhtml+xml':
+            text = _xmlescape(text)
+        self.elementstack[-1][2].append(text)
+
+    def handle_comment(self, text):
+        # called for each comment, e.g. <!-- insert message here -->; ignored
+        pass
+
+    def handle_pi(self, text):
+        # called for each processing instruction, e.g. <?instruction>; ignored
+        pass
+
+    def handle_decl(self, text):
+        # called for each declaration; ignored
+        pass
+
+    def parse_declaration(self, i):
+        # override internal declaration handler to handle CDATA blocks;
+        # returns the index just past the declaration (or -1 when the
+        # declaration is truncated)
+        if self.rawdata[i:i+9] == '<![CDATA[':
+            k = self.rawdata.find(']]>', i)
+            if k == -1:
+                # CDATA block began but didn't finish
+                k = len(self.rawdata)
+                return k
+            # buffer the CDATA payload, escaped, without re-escaping later
+            self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
+            return k+3
+        else:
+            k = self.rawdata.find('>', i)
+            if k >= 0:
+                return k+1
+            else:
+                # incomplete declaration (no closing '>'); k is -1 here
+                return k
+
+    def mapContentType(self, contentType):
+        # Normalize shorthand content-type values to full MIME types;
+        # anything already a MIME type passes through (lower-cased).
+        contentType = contentType.lower()
+        if contentType == 'text' or contentType == 'plain':
+            contentType = u'text/plain'
+        elif contentType == 'html':
+            contentType = u'text/html'
+        elif contentType == 'xhtml':
+            contentType = u'application/xhtml+xml'
+        return contentType
+
+    def trackNamespace(self, prefix, uri):
+        # Record a namespace declaration, inferring the feed version from
+        # well-known namespace URIs when it isn't set yet.
+        loweruri = uri.lower()
+        if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version:
+            self.version = u'rss090'
+        if loweruri == 'http://purl.org/rss/1.0/' and not self.version:
+            self.version = u'rss10'
+        if loweruri == 'http://www.w3.org/2005/atom' and not self.version:
+            self.version = u'atom10'
+        if loweruri.find(u'backend.userland.com/rss') <> -1:
+            # match any backend.userland.com namespace
+            uri = u'http://backend.userland.com/rss'
+            loweruri = uri
+        if self._matchnamespaces.has_key(loweruri):
+            # known namespace: map the declared prefix to the canonical
+            # short name used by the _start_/_end_ handlers
+            self.namespacemap[prefix] = self._matchnamespaces[loweruri]
+            self.namespacesInUse[self._matchnamespaces[loweruri]] = uri
+        else:
+            self.namespacesInUse[prefix or ''] = uri
+
+    def resolveURI(self, uri):
+        # Resolve a possibly-relative URI against the current xml:base.
+        return _urljoin(self.baseuri or u'', uri)
+
+    def decodeEntities(self, element, data):
+        # No-op hook; subclasses may override to decode entities.
+        return data
+
+ def strattrs(self, attrs):
+ return ''.join([' %s="%s"' % (t[0],_xmlescape(t[1],{'"':'"'})) for t in attrs])
+
+    def push(self, element, expectingText):
+        # Open a new element frame: [name, expectingText flag, text pieces];
+        # pop() later joins the pieces and stores the result.
+        self.elementstack.append([element, expectingText, []])
+
+ def pop(self, element, stripWhitespace=1):
+ if not self.elementstack:
+ return
+ if self.elementstack[-1][0] != element:
+ return
+
+ element, expectingText, pieces = self.elementstack.pop()
+
+ if self.version == u'atom10' and self.contentparams.get('type', u'text') == u'application/xhtml+xml':
+ # remove enclosing child element, but only if it is a <div> and
+ # only if all the remaining content is nested underneath it.
+ # This means that the divs would be retained in the following:
+ # <div>foo</div><div>bar</div>
+ while pieces and len(pieces)>1 and not pieces[-1].strip():
+ del pieces[-1]
+ while pieces and len(pieces)>1 and not pieces[0].strip():
+ del pieces[0]
+ if pieces and (pieces[0] == '<div>' or pieces[0].startswith('<div ')) and pieces[-1]=='</div>':
+ depth = 0
+ for piece in pieces[:-1]:
+ if piece.startswith('</'):
+ depth -= 1
+ if depth == 0:
+ break
+ elif piece.startswith('<') and not piece.endswith('/>'):
+ depth += 1
+ else:
+ pieces = pieces[1:-1]
+
+ # Ensure each piece is a str for Python 3
+ for (i, v) in enumerate(pieces):
+ if not isinstance(v, unicode):
+ pieces[i] = v.decode('utf-8')
+
+ output = u''.join(pieces)
+ if stripWhitespace:
+ output = output.strip()
+ if not expectingText:
+ return output
+
+ # decode base64 content
+ if base64 and self.contentparams.get('base64', 0):
+ try:
+ output = _base64decode(output)
+ except binascii.Error:
+ pass
+ except binascii.Incomplete:
+ pass
+ except TypeError:
+ # In Python 3, base64 takes and outputs bytes, not str
+ # This may not be the most correct way to accomplish this
+ output = _base64decode(output.encode('utf-8')).decode('utf-8')
+
+ # resolve relative URIs
+ if (element in self.can_be_relative_uri) and output:
+ output = self.resolveURI(output)
+
+ # decode entities within embedded markup
+ if not self.contentparams.get('base64', 0):
+ output = self.decodeEntities(element, output)
+
+ # some feed formats require consumers to guess
+ # whether the content is html or plain text
+ if not self.version.startswith(u'atom') and self.contentparams.get('type') == u'text/plain':
+ if self.lookslikehtml(output):
+ self.contentparams['type'] = u'text/html'
+
+ # remove temporary cruft from contentparams
+ try:
+ del self.contentparams['mode']
+ except KeyError:
+ pass
+ try:
+ del self.contentparams['base64']
+ except KeyError:
+ pass
+
+ is_htmlish = self.mapContentType(self.contentparams.get('type', u'text/html')) in self.html_types
+ # resolve relative URIs within embedded markup
+ if is_htmlish and RESOLVE_RELATIVE_URIS:
+ if element in self.can_contain_relative_uris:
+ output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', u'text/html'))
+
+ # parse microformats
+ # (must do this before sanitizing because some microformats
+ # rely on elements that we sanitize)
+ if is_htmlish and element in ['content', 'description', 'summary']:
+ mfresults = _parseMicroformats(output, self.baseuri, self.encoding)
+ if mfresults:
+ for tag in mfresults.get('tags', []):
+ self._addTag(tag['term'], tag['scheme'], tag['label'])
+ for enclosure in mfresults.get('enclosures', []):
+ self._start_enclosure(enclosure)
+ for xfn in mfresults.get('xfn', []):
+ self._addXFN(xfn['relationships'], xfn['href'], xfn['name'])
+ vcard = mfresults.get('vcard')
+ if vcard:
+ self._getContext()['vcard'] = vcard
+
+ # sanitize embedded markup
+ if is_htmlish and SANITIZE_HTML:
+ if element in self.can_contain_dangerous_markup:
+ output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', u'text/html'))
+
+ if self.encoding and not isinstance(output, unicode):
+ output = output.decode(self.encoding, 'ignore')
+
+ # address common error where people take data that is already
+ # utf-8, presume that it is iso-8859-1, and re-encode it.
+ if self.encoding in (u'utf-8', u'utf-8_INVALID_PYTHON_3') and isinstance(output, unicode):
+ try:
+ output = output.encode('iso-8859-1').decode('utf-8')
+ except (UnicodeEncodeError, UnicodeDecodeError):
+ pass
+
+ # map win-1252 extensions to the proper code points
+ if isinstance(output, unicode):
+ output = u''.join([c in _cp1252.keys() and _cp1252[c] or c for c in output])
+
+ # categories/tags/keywords/whatever are handled in _end_category
+ if element == 'category':
+ return output
+
+ if element == 'title' and self.hasTitle:
+ return output
+
+ # store output in appropriate place(s)
+ if self.inentry and not self.insource:
+ if element == 'content':
+ self.entries[-1].setdefault(element, [])
+ contentparams = copy.deepcopy(self.contentparams)
+ contentparams['value'] = output
+ self.entries[-1][element].append(contentparams)
+ elif element == 'link':
+ if not self.inimage:
+ # query variables in urls in link elements are improperly
+ # converted from `?a=1&b=2` to `?a=1&b;=2` as if they're
+ # unhandled character references. fix this special case.
+ output = re.sub("&([A-Za-z0-9_]+);", "&\g<1>", output)
+ self.entries[-1][element] = output
+ if output:
+ self.entries[-1]['links'][-1]['href'] = output
+ else:
+ if element == 'description':
+ element = 'summary'
+ self.entries[-1][element] = output
+ if self.incontent:
+ contentparams = copy.deepcopy(self.contentparams)
+ contentparams['value'] = output
+ self.entries[-1][element + '_detail'] = contentparams
+ elif (self.infeed or self.insource):# and (not self.intextinput) and (not self.inimage):
+ context = self._getContext()
+ if element == 'description':
+ element = 'subtitle'
+ context[element] = output
+ if element == 'link':
+ # fix query variables; see above for the explanation
+ output = re.sub("&([A-Za-z0-9_]+);", "&\g<1>", output)
+ context[element] = output
+ context['links'][-1]['href'] = output
+ elif self.incontent:
+ contentparams = copy.deepcopy(self.contentparams)
+ contentparams['value'] = output
+ context[element + '_detail'] = contentparams
+ return output
+
    def pushContent(self, tag, attrsD, defaultContentType, expectingText):
        """Enter content-capture mode for *tag*, recording type/language/base."""
        # nested content elements increment the depth counter
        self.incontent += 1
        # normalize RFC 3066 language tags: 'en_US' -> 'en-US'
        if self.lang:
            self.lang=self.lang.replace('_','-')
        self.contentparams = FeedParserDict({
            'type': self.mapContentType(attrsD.get('type', defaultContentType)),
            'language': self.lang,
            'base': self.baseuri})
        # remember whether the payload must be base64-decoded when popped
        self.contentparams['base64'] = self._isBase64(attrsD, self.contentparams)
        self.push(tag, expectingText)
+
+ def popContent(self, tag):
+ value = self.pop(tag)
+ self.incontent -= 1
+ self.contentparams.clear()
+ return value
+
+ # a number of elements in a number of RSS variants are nominally plain
+ # text, but this is routinely ignored. This is an attempt to detect
+ # the most common cases. As false positives often result in silent
+ # data loss, this function errs on the conservative side.
+ @staticmethod
+ def lookslikehtml(s):
+ # must have a close tag or a entity reference to qualify
+ if not (re.search(r'</(\w+)>',s) or re.search("&#?\w+;",s)):
+ return
+
+ # all tags must be in a restricted subset of valid HTML tags
+ if filter(lambda t: t.lower() not in _HTMLSanitizer.acceptable_elements,
+ re.findall(r'</?(\w+)',s)):
+ return
+
+ # all entities must have been defined as valid HTML entities
+ if filter(lambda e: e not in entitydefs.keys(), re.findall(r'&(\w+);', s)):
+ return
+
+ return 1
+
+ def _mapToStandardPrefix(self, name):
+ colonpos = name.find(':')
+ if colonpos <> -1:
+ prefix = name[:colonpos]
+ suffix = name[colonpos+1:]
+ prefix = self.namespacemap.get(prefix, prefix)
+ name = prefix + ':' + suffix
+ return name
+
+ def _getAttribute(self, attrsD, name):
+ return attrsD.get(self._mapToStandardPrefix(name))
+
+ def _isBase64(self, attrsD, contentparams):
+ if attrsD.get('mode', '') == 'base64':
+ return 1
+ if self.contentparams['type'].startswith(u'text/'):
+ return 0
+ if self.contentparams['type'].endswith(u'+xml'):
+ return 0
+ if self.contentparams['type'].endswith(u'/xml'):
+ return 0
+ return 1
+
+ def _itsAnHrefDamnIt(self, attrsD):
+ href = attrsD.get('url', attrsD.get('uri', attrsD.get('href', None)))
+ if href:
+ try:
+ del attrsD['url']
+ except KeyError:
+ pass
+ try:
+ del attrsD['uri']
+ except KeyError:
+ pass
+ attrsD['href'] = href
+ return attrsD
+
+ def _save(self, key, value, overwrite=False):
+ context = self._getContext()
+ if overwrite:
+ context[key] = value
+ else:
+ context.setdefault(key, value)
+
    def _start_rss(self, attrsD):
        """Determine the RSS version from the <rss> element's version attribute."""
        versionmap = {'0.91': u'rss091u',
                      '0.92': u'rss092',
                      '0.93': u'rss093',
                      '0.94': u'rss094'}
        #If we're here then this is an RSS feed.
        #If we don't have a version or have a version that starts with something
        #other than RSS then there's been a mistake. Correct it.
        if not self.version or not self.version.startswith(u'rss'):
            attr_version = attrsD.get('version', '')
            version = versionmap.get(attr_version)
            if version:
                self.version = version
            elif attr_version.startswith('2.'):
                # any 2.x version is reported as rss20
                self.version = u'rss20'
            else:
                self.version = u'rss'
+
    def _start_channel(self, attrsD):
        """Enter the feed-level element; also handles CDF channel attributes."""
        self.infeed = 1
        self._cdf_common(attrsD)
+
+ def _cdf_common(self, attrsD):
+ if attrsD.has_key('lastmod'):
+ self._start_modified({})
+ self.elementstack[-1][-1] = attrsD['lastmod']
+ self._end_modified()
+ if attrsD.has_key('href'):
+ self._start_link({})
+ self.elementstack[-1][-1] = attrsD['href']
+ self._end_link()
+
    def _start_feed(self, attrsD):
        """Enter an Atom feed and record its version (atom01/atom02/atom03/atom)."""
        self.infeed = 1
        versionmap = {'0.1': u'atom01',
                      '0.2': u'atom02',
                      '0.3': u'atom03'}
        if not self.version:
            attr_version = attrsD.get('version')
            version = versionmap.get(attr_version)
            if version:
                self.version = version
            else:
                # no/unknown version attribute: generic 'atom'
                self.version = u'atom'
+
    def _end_channel(self):
        """Leave the feed-level element."""
        self.infeed = 0
    _end_feed = _end_channel
+
    def _start_image(self, attrsD):
        """Enter an image element; feed-level images get an 'image' sub-dict."""
        context = self._getContext()
        if not self.inentry:
            context.setdefault('image', FeedParserDict())
        self.inimage = 1
        # titles inside <image> don't count as the feed/entry title
        self.hasTitle = 0
        self.push('image', 0)
+
+ def _end_image(self):
+ self.pop('image')
+ self.inimage = 0
+
    def _start_textinput(self, attrsD):
        """Enter a textinput/textInput element."""
        context = self._getContext()
        context.setdefault('textinput', FeedParserDict())
        self.intextinput = 1
        # titles inside <textinput> don't count as the feed/entry title
        self.hasTitle = 0
        self.push('textinput', 0)
    _start_textInput = _start_textinput
+
    def _end_textinput(self):
        """Leave a textinput/textInput element."""
        self.pop('textinput')
        self.intextinput = 0
    _end_textInput = _end_textinput
+
    def _start_author(self, attrsD):
        """Enter an author-ish element (author/managingEditor/dc:creator/...)."""
        self.inauthor = 1
        self.push('author', 1)
        # Append a new FeedParserDict when expecting an author
        context = self._getContext()
        context.setdefault('authors', [])
        context['authors'].append(FeedParserDict())
    _start_managingeditor = _start_author
    _start_dc_author = _start_author
    _start_dc_creator = _start_author
    _start_itunes_author = _start_author
+
    def _end_author(self):
        """Leave an author-ish element and sync name/email into author_detail."""
        self.pop('author')
        self.inauthor = 0
        self._sync_author_detail()
    _end_managingeditor = _end_author
    _end_dc_author = _end_author
    _end_dc_creator = _end_author
    _end_itunes_author = _end_author
+
    def _start_itunes_owner(self, attrsD):
        """Enter an itunes:owner element (mapped to 'publisher')."""
        self.inpublisher = 1
        self.push('publisher', 0)
+
    def _end_itunes_owner(self):
        """Leave itunes:owner and sync its name/email into publisher_detail."""
        self.pop('publisher')
        self.inpublisher = 0
        self._sync_author_detail('publisher')
+
    def _start_contributor(self, attrsD):
        """Enter a contributor element; start a fresh contributors entry."""
        self.incontributor = 1
        context = self._getContext()
        context.setdefault('contributors', [])
        context['contributors'].append(FeedParserDict())
        self.push('contributor', 0)
+
+ def _end_contributor(self):
+ self.pop('contributor')
+ self.incontributor = 0
+
    def _start_dc_contributor(self, attrsD):
        """Enter a dc:contributor element; its character data is the name."""
        self.incontributor = 1
        context = self._getContext()
        context.setdefault('contributors', [])
        context['contributors'].append(FeedParserDict())
        # dc:contributor's text is the contributor name, so push 'name'
        self.push('name', 0)
+
    def _end_dc_contributor(self):
        """Leave dc:contributor; delegates the name handling to _end_name."""
        self._end_name()
        self.incontributor = 0
+
    def _start_name(self, attrsD):
        """Enter a name element (author/contributor/textinput name)."""
        self.push('name', 0)
    _start_itunes_name = _start_name
+
    def _end_name(self):
        """Route a popped name to the publisher, author, contributor, or
        textinput currently being parsed (in that priority order)."""
        value = self.pop('name')
        if self.inpublisher:
            self._save_author('name', value, 'publisher')
        elif self.inauthor:
            self._save_author('name', value)
        elif self.incontributor:
            self._save_contributor('name', value)
        elif self.intextinput:
            context = self._getContext()
            context['name'] = value
    _end_itunes_name = _end_name
+
    def _start_width(self, attrsD):
        """Enter a width element (image dimension)."""
        self.push('width', 0)
+
+ def _end_width(self):
+ value = self.pop('width')
+ try:
+ value = int(value)
+ except ValueError:
+ value = 0
+ if self.inimage:
+ context = self._getContext()
+ context['width'] = value
+
    def _start_height(self, attrsD):
        """Enter a height element (image dimension)."""
        self.push('height', 0)
+
+ def _end_height(self):
+ value = self.pop('height')
+ try:
+ value = int(value)
+ except ValueError:
+ value = 0
+ if self.inimage:
+ context = self._getContext()
+ context['height'] = value
+
    def _start_url(self, attrsD):
        """Enter a url/homepage/uri element; all map to 'href'."""
        self.push('href', 1)
    _start_homepage = _start_url
    _start_uri = _start_url
+
    def _end_url(self):
        """Route a popped href to the current author or contributor."""
        value = self.pop('href')
        if self.inauthor:
            self._save_author('href', value)
        elif self.incontributor:
            self._save_contributor('href', value)
    _end_homepage = _end_url
    _end_uri = _end_url
+
    def _start_email(self, attrsD):
        """Enter an email element."""
        self.push('email', 0)
    _start_itunes_email = _start_email
+
    def _end_email(self):
        """Route a popped email to publisher, author, or contributor."""
        value = self.pop('email')
        if self.inpublisher:
            self._save_author('email', value, 'publisher')
        elif self.inauthor:
            self._save_author('email', value)
        elif self.incontributor:
            self._save_contributor('email', value)
    _end_itunes_email = _end_email
+
+ def _getContext(self):
+ if self.insource:
+ context = self.sourcedata
+ elif self.inimage and self.feeddata.has_key('image'):
+ context = self.feeddata['image']
+ elif self.intextinput:
+ context = self.feeddata['textinput']
+ elif self.inentry:
+ context = self.entries[-1]
+ else:
+ context = self.feeddata
+ return context
+
    def _save_author(self, key, value, prefix='author'):
        """Record an author field both in <prefix>_detail and in the last
        entry of the 'authors' list."""
        context = self._getContext()
        context.setdefault(prefix + '_detail', FeedParserDict())
        context[prefix + '_detail'][key] = value
        # keep the flat 'author' string in sync with the detail dict
        self._sync_author_detail()
        context.setdefault('authors', [FeedParserDict()])
        context['authors'][-1][key] = value
+
+ def _save_contributor(self, key, value):
+ context = self._getContext()
+ context.setdefault('contributors', [FeedParserDict()])
+ context['contributors'][-1][key] = value
+
+ def _sync_author_detail(self, key='author'):
+ context = self._getContext()
+ detail = context.get('%s_detail' % key)
+ if detail:
+ name = detail.get('name')
+ email = detail.get('email')
+ if name and email:
+ context[key] = u'%s (%s)' % (name, email)
+ elif name:
+ context[key] = name
+ elif email:
+ context[key] = email
+ else:
+ author, email = context.get(key), None
+ if not author:
+ return
+ emailmatch = re.search(ur'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))(\?subject=\S+)?''', author)
+ if emailmatch:
+ email = emailmatch.group(0)
+ # probably a better way to do the following, but it passes all the tests
+ author = author.replace(email, u'')
+ author = author.replace(u'()', u'')
+ author = author.replace(u'<>', u'')
+ author = author.replace(u'<>', u'')
+ author = author.strip()
+ if author and (author[0] == u'('):
+ author = author[1:]
+ if author and (author[-1] == u')'):
+ author = author[:-1]
+ author = author.strip()
+ if author or email:
+ context.setdefault('%s_detail' % key, FeedParserDict())
+ if author:
+ context['%s_detail' % key]['name'] = author
+ if email:
+ context['%s_detail' % key]['email'] = email
+
    def _start_subtitle(self, attrsD):
        """Enter a subtitle/tagline/itunes:subtitle element (plain text)."""
        self.pushContent('subtitle', attrsD, u'text/plain', 1)
    _start_tagline = _start_subtitle
    _start_itunes_subtitle = _start_subtitle
+
    def _end_subtitle(self):
        """Leave a subtitle/tagline/itunes:subtitle element."""
        self.popContent('subtitle')
    _end_tagline = _end_subtitle
    _end_itunes_subtitle = _end_subtitle
+
    def _start_rights(self, attrsD):
        """Enter a rights/dc:rights/copyright element (plain text)."""
        self.pushContent('rights', attrsD, u'text/plain', 1)
    _start_dc_rights = _start_rights
    _start_copyright = _start_rights
+
    def _end_rights(self):
        """Leave a rights/dc:rights/copyright element."""
        self.popContent('rights')
    _end_dc_rights = _end_rights
    _end_copyright = _end_rights
+
+ def _start_item(self, attrsD):
+ self.entries.append(FeedParserDict())
+ self.push('item', 0)
+ self.inentry = 1
+ self.guidislink = 0
+ self.hasTitle = 0
+ id = self._getAttribute(attrsD, 'rdf:about')
+ if id:
+ context = self._getContext()
+ context['id'] = id
+ self._cdf_common(attrsD)
+ _start_entry = _start_item
+
    def _end_item(self):
        """Close an RSS item / Atom entry."""
        self.pop('item')
        self.inentry = 0
    _end_entry = _end_item
+
    def _start_dc_language(self, attrsD):
        """Enter a language/dc:language element."""
        self.push('language', 1)
    _start_language = _start_dc_language
+
    def _end_dc_language(self):
        """Leave a language element; remember it as the current language."""
        self.lang = self.pop('language')
    _end_language = _end_dc_language
+
    def _start_dc_publisher(self, attrsD):
        """Enter a dc:publisher/webMaster element."""
        self.push('publisher', 1)
    _start_webmaster = _start_dc_publisher
+
    def _end_dc_publisher(self):
        """Leave a publisher element and sync name/email into publisher_detail."""
        self.pop('publisher')
        self._sync_author_detail('publisher')
    _end_webmaster = _end_dc_publisher
+
    def _start_published(self, attrsD):
        """Enter a published/issued/dcterms:issued element."""
        self.push('published', 1)
    _start_dcterms_issued = _start_published
    _start_issued = _start_published
+
    def _end_published(self):
        """Leave a published element; store the parsed date unconditionally."""
        value = self.pop('published')
        self._save('published_parsed', _parse_date(value), overwrite=True)
    _end_dcterms_issued = _end_published
    _end_issued = _end_published
+
    def _start_updated(self, attrsD):
        """Enter an updated/modified/pubDate/dc:date/lastBuildDate element."""
        self.push('updated', 1)
    _start_modified = _start_updated
    _start_dcterms_modified = _start_updated
    _start_pubdate = _start_updated
    _start_dc_date = _start_updated
    _start_lastbuilddate = _start_updated
+
    def _end_updated(self):
        """Leave an updated element; store the parsed date unconditionally."""
        value = self.pop('updated')
        parsed_value = _parse_date(value)
        self._save('updated_parsed', parsed_value, overwrite=True)
    _end_modified = _end_updated
    _end_dcterms_modified = _end_updated
    _end_pubdate = _end_updated
    _end_dc_date = _end_updated
    _end_lastbuilddate = _end_updated
+
    def _start_created(self, attrsD):
        """Enter a created/dcterms:created element."""
        self.push('created', 1)
    _start_dcterms_created = _start_created
+
    def _end_created(self):
        """Leave a created element; store the parsed date unconditionally."""
        value = self.pop('created')
        self._save('created_parsed', _parse_date(value), overwrite=True)
    _end_dcterms_created = _end_created
+
    def _start_expirationdate(self, attrsD):
        """Enter an expirationDate element."""
        self.push('expired', 1)
+
+ def _end_expirationdate(self):
+ self._save('expired_parsed', _parse_date(self.pop('expired')), overwrite=True)
+
    def _start_cc_license(self, attrsD):
        """Handle a cc:license element; record it as a rel='license' link."""
        context = self._getContext()
        value = self._getAttribute(attrsD, 'rdf:resource')
        attrsD = FeedParserDict()
        attrsD['rel'] = u'license'
        if value:
            attrsD['href']=value
        context.setdefault('links', []).append(attrsD)
+
    def _start_creativecommons_license(self, attrsD):
        """Enter a creativeCommons:license element."""
        self.push('license', 1)
    _start_creativeCommons_license = _start_creativecommons_license
+
    def _end_creativecommons_license(self):
        """Leave a creativeCommons:license; convert it to a rel='license' link."""
        value = self.pop('license')
        context = self._getContext()
        attrsD = FeedParserDict()
        attrsD['rel'] = u'license'
        if value:
            attrsD['href'] = value
        context.setdefault('links', []).append(attrsD)
        # the raw 'license' key is temporary; the link entry is canonical
        del context['license']
    _end_creativeCommons_license = _end_creativecommons_license
+
+ def _addXFN(self, relationships, href, name):
+ context = self._getContext()
+ xfn = context.setdefault('xfn', [])
+ value = FeedParserDict({'relationships': relationships, 'href': href, 'name': name})
+ if value not in xfn:
+ xfn.append(value)
+
    def _addTag(self, term, scheme, label):
        """Append a (term, scheme, label) tag to the context, skipping duplicates.

        NOTE: the 'tags' list is created even when all three fields are empty;
        _end_category relies on context['tags'] existing, so the setdefault
        must stay before the early return.
        """
        context = self._getContext()
        tags = context.setdefault('tags', [])
        if (not term) and (not scheme) and (not label):
            return
        value = FeedParserDict({'term': term, 'scheme': scheme, 'label': label})
        if value not in tags:
            tags.append(value)
+
    def _start_category(self, attrsD):
        """Enter a category/dc:subject/keywords element; record its tag."""
        term = attrsD.get('term')
        # RSS uses 'domain' where Atom uses 'scheme'
        scheme = attrsD.get('scheme', attrsD.get('domain'))
        label = attrsD.get('label')
        self._addTag(term, scheme, label)
        self.push('category', 1)
    _start_dc_subject = _start_category
    _start_keywords = _start_category
+
    def _start_media_category(self, attrsD):
        """Enter a media:category element; default scheme is the MRSS schema."""
        attrsD.setdefault('scheme', u'http://search.yahoo.com/mrss/category_schema')
        self._start_category(attrsD)
+
+ def _end_itunes_keywords(self):
+ for term in self.pop('itunes_keywords').split():
+ self._addTag(term, u'http://www.itunes.com/', None)
+
    def _start_itunes_category(self, attrsD):
        """Enter an itunes:category element; its 'text' attribute is the term."""
        self._addTag(attrsD.get('text'), u'http://www.itunes.com/', None)
        self.push('category', 1)
+
    def _end_category(self):
        """Close a category element; its character data fills an empty term."""
        value = self.pop('category')
        if not value:
            return
        context = self._getContext()
        # 'tags' is guaranteed to exist because _start_category called _addTag
        tags = context['tags']
        if value and len(tags) and not tags[-1]['term']:
            # fill in the term of the tag created by the start handler
            tags[-1]['term'] = value
        else:
            self._addTag(value, None, None)
    _end_dc_subject = _end_category
    _end_keywords = _end_category
    _end_itunes_category = _end_category
    _end_media_category = _end_category
+
    def _start_cloud(self, attrsD):
        """Record the RSS <cloud> element's attributes on the current context."""
        self._getContext()['cloud'] = FeedParserDict(attrsD)
+
+ def _start_link(self, attrsD):
+ attrsD.setdefault('rel', u'alternate')
+ if attrsD['rel'] == u'self':
+ attrsD.setdefault('type', u'application/atom+xml')
+ else:
+ attrsD.setdefault('type', u'text/html')
+ context = self._getContext()
+ attrsD = self._itsAnHrefDamnIt(attrsD)
+ if attrsD.has_key('href'):
+ attrsD['href'] = self.resolveURI(attrsD['href'])
+ expectingText = self.infeed or self.inentry or self.insource
+ context.setdefault('links', [])
+ if not (self.inentry and self.inimage):
+ context['links'].append(FeedParserDict(attrsD))
+ if attrsD.has_key('href'):
+ expectingText = 0
+ if (attrsD.get('rel') == u'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types):
+ context['link'] = attrsD['href']
+ else:
+ self.push('link', expectingText)
+
+ def _end_link(self):
+ value = self.pop('link')
+ context = self._getContext()
+
    def _start_guid(self, attrsD):
        """Enter a guid element; a permalink guid may double as the link."""
        self.guidislink = (attrsD.get('ispermalink', 'true') == 'true')
        self.push('id', 1)
+
+ def _end_guid(self):
+ value = self.pop('id')
+ self._save('guidislink', self.guidislink and not self._getContext().has_key('link'))
+ if self.guidislink:
+ # guid acts as link, but only if 'ispermalink' is not present or is 'true',
+ # and only if the item doesn't already have a link element
+ self._save('link', value)
+
    def _start_title(self, attrsD):
        """Enter a title element; inside SVG it is treated as unknown markup."""
        if self.svgOK:
            return self.unknown_starttag('title', attrsD.items())
        self.pushContent('title', attrsD, u'text/plain', self.infeed or self.inentry or self.insource)
    _start_dc_title = _start_title
    _start_media_title = _start_title
+
+ def _end_title(self):
+ if self.svgOK:
+ return
+ value = self.popContent('title')
+ if not value:
+ return
+ context = self._getContext()
+ self.hasTitle = 1
+ _end_dc_title = _end_title
+
+ def _end_media_title(self):
+ hasTitle = self.hasTitle
+ self._end_title()
+ self.hasTitle = hasTitle
+
+ def _start_description(self, attrsD):
+ context = self._getContext()
+ if context.has_key('summary'):
+ self._summaryKey = 'content'
+ self._start_content(attrsD)
+ else:
+ self.pushContent('description', attrsD, u'text/html', self.infeed or self.inentry or self.insource)
+ _start_dc_description = _start_description
+
    def _start_abstract(self, attrsD):
        """Enter an abstract element; mapped to description as plain text."""
        self.pushContent('description', attrsD, u'text/plain', self.infeed or self.inentry or self.insource)
+
    def _end_description(self):
        """Close a description; dispatch to content if it was reclassified."""
        if self._summaryKey == 'content':
            self._end_content()
        else:
            value = self.popContent('description')
        self._summaryKey = None
    _end_abstract = _end_description
    _end_dc_description = _end_description
+
    def _start_info(self, attrsD):
        """Enter an Atom 0.3 info element (plain text)."""
        self.pushContent('info', attrsD, u'text/plain', 1)
    _start_feedburner_browserfriendly = _start_info
+
    def _end_info(self):
        """Leave an info element."""
        self.popContent('info')
    _end_feedburner_browserfriendly = _end_info
+
+ def _start_generator(self, attrsD):
+ if attrsD:
+ attrsD = self._itsAnHrefDamnIt(attrsD)
+ if attrsD.has_key('href'):
+ attrsD['href'] = self.resolveURI(attrsD['href'])
+ self._getContext()['generator_detail'] = FeedParserDict(attrsD)
+ self.push('generator', 1)
+
+ def _end_generator(self):
+ value = self.pop('generator')
+ context = self._getContext()
+ if context.has_key('generator_detail'):
+ context['generator_detail']['name'] = value
+
    def _start_admin_generatoragent(self, attrsD):
        """Handle admin:generatorAgent; its rdf:resource is the generator href."""
        self.push('generator', 1)
        value = self._getAttribute(attrsD, 'rdf:resource')
        if value:
            # inject the attribute value as if it were character data
            self.elementstack[-1][2].append(value)
        self.pop('generator')
        self._getContext()['generator_detail'] = FeedParserDict({'href': value})
+
    def _start_admin_errorreportsto(self, attrsD):
        """Handle admin:errorReportsTo; its rdf:resource is the value."""
        self.push('errorreportsto', 1)
        value = self._getAttribute(attrsD, 'rdf:resource')
        if value:
            # inject the attribute value as if it were character data
            self.elementstack[-1][2].append(value)
        self.pop('errorreportsto')
+
+ def _start_summary(self, attrsD):
+ context = self._getContext()
+ if context.has_key('summary'):
+ self._summaryKey = 'content'
+ self._start_content(attrsD)
+ else:
+ self._summaryKey = 'summary'
+ self.pushContent(self._summaryKey, attrsD, u'text/plain', 1)
+ _start_itunes_summary = _start_summary
+
    def _end_summary(self):
        """Close a summary; dispatch to content if it was reclassified."""
        if self._summaryKey == 'content':
            self._end_content()
        else:
            self.popContent(self._summaryKey or 'summary')
        self._summaryKey = None
    _end_itunes_summary = _end_summary
+
    def _start_enclosure(self, attrsD):
        """Record an enclosure as a rel='enclosure' link on the context."""
        attrsD = self._itsAnHrefDamnIt(attrsD)
        context = self._getContext()
        attrsD['rel'] = u'enclosure'
        context.setdefault('links', []).append(FeedParserDict(attrsD))
+
    def _start_source(self, attrsD):
        """Enter a source element; RSS 2.0 sources carry a url attribute."""
        if 'url' in attrsD:
            # This means that we're processing a source element from an RSS 2.0 feed
            self.sourcedata['href'] = attrsD[u'url']
        self.push('source', 1)
        self.insource = 1
        # titles inside <source> don't count as the feed/entry title
        self.hasTitle = 0
+
    def _end_source(self):
        """Close a source element; copy collected source data to the context."""
        self.insource = 0
        value = self.pop('source')
        if value:
            self.sourcedata['title'] = value
        self._getContext()['source'] = copy.deepcopy(self.sourcedata)
        self.sourcedata.clear()
+
    def _start_content(self, attrsD):
        """Enter a content element; remember an out-of-line src if present."""
        self.pushContent('content', attrsD, u'text/plain', 1)
        src = attrsD.get('src')
        if src:
            self.contentparams['src'] = src
        self.push('content', 1)
+
    def _start_body(self, attrsD):
        """Enter a body/xhtml:body element (XHTML content)."""
        self.pushContent('content', attrsD, u'application/xhtml+xml', 1)
    _start_xhtml_body = _start_body
+
    def _start_content_encoded(self, attrsD):
        """Enter a content:encoded/fullitem element (HTML content)."""
        self.pushContent('content', attrsD, u'text/html', 1)
    _start_fullitem = _start_content_encoded
+
    def _end_content(self):
        """Close a content element; mirror plain/HTML content into summary."""
        copyToSummary = self.mapContentType(self.contentparams.get('type')) in ([u'text/plain'] + self.html_types)
        value = self.popContent('content')
        if copyToSummary:
            self._save('summary', value)

    _end_body = _end_content
    _end_xhtml_body = _end_content
    _end_content_encoded = _end_content
    _end_fullitem = _end_content
+
    def _start_itunes_image(self, attrsD):
        """Handle itunes:image; its href attribute becomes the context image."""
        self.push('itunes_image', 0)
        if attrsD.get('href'):
            self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')})
    _start_itunes_link = _start_itunes_image
+
    def _end_itunes_block(self):
        """Close itunes:block; store 1 for 'yes', 0 for anything else."""
        value = self.pop('itunes_block', 0)
        self._getContext()['itunes_block'] = (value == 'yes') and 1 or 0
+
    def _end_itunes_explicit(self):
        """Close itunes:explicit; map its value to True/False/None."""
        value = self.pop('itunes_explicit', 0)
        # Convert 'yes' -> True, 'clean' to False, and any other value to None
        # False and None both evaluate as False, so the difference can be ignored
        # by applications that only need to know if the content is explicit.
        # index math: 'yes' -> 2 -> True; 'clean' -> True(1) -> False; else 0 -> None
        self._getContext()['itunes_explicit'] = (None, False, True)[(value == 'yes' and 2) or value == 'clean' or 0]
+
+ def _start_media_content(self, attrsD):
+ context = self._getContext()
+ context.setdefault('media_content', [])
+ context['media_content'].append(attrsD)
+
    def _start_media_thumbnail(self, attrsD):
        """Enter media:thumbnail; the url may come as an attribute or as text."""
        context = self._getContext()
        context.setdefault('media_thumbnail', [])
        self.push('url', 1) # new
        context['media_thumbnail'].append(attrsD)
+
+ def _end_media_thumbnail(self):
+ url = self.pop('url')
+ context = self._getContext()
+ if url != None and len(url.strip()) != 0:
+ if not context['media_thumbnail'][-1].has_key('url'):
+ context['media_thumbnail'][-1]['url'] = url
+
    def _start_media_player(self, attrsD):
        """Enter media:player; its attributes seed the media_player dict."""
        self.push('media_player', 0)
        self._getContext()['media_player'] = FeedParserDict(attrsD)
+
    def _end_media_player(self):
        """Close media:player; its character data becomes the 'content' field."""
        value = self.pop('media_player')
        context = self._getContext()
        context['media_player']['content'] = value
+
    def _start_newlocation(self, attrsD):
        """Enter a newLocation element (feed has moved)."""
        self.push('newlocation', 1)
+
    def _end_newlocation(self):
        """Close newLocation; store a safe absolute URI at feed level only."""
        url = self.pop('newlocation')
        context = self._getContext()
        # don't set newlocation if the context isn't right
        if context is not self.feeddata:
            return
        context['newlocation'] = _makeSafeAbsoluteURI(self.baseuri, url.strip())
+
if _XML_AVAILABLE:
    class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):
        """SAX content handler that delegates element handling to _FeedParserMixin.

        Idiom/compatibility fixes (behavior unchanged): `!=` instead of the
        removed `<>` operator, call-style `raise`, `is None`, and `in`
        instead of the Python-2-only dict.has_key().
        """

        def __init__(self, baseuri, baselang, encoding):
            xml.sax.handler.ContentHandler.__init__(self)
            _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
            self.bozo = 0      # set to 1 when the SAX driver reports an error
            self.exc = None    # last exception reported by the SAX driver
            self.decls = {}    # xmlns declarations to replay on the next start tag

        def startPrefixMapping(self, prefix, uri):
            if not uri:
                return
            # Jython uses '' instead of None; standardize on None
            prefix = prefix or None
            self.trackNamespace(prefix, uri)
            if prefix and uri == 'http://www.w3.org/1999/xlink':
                self.decls['xmlns:' + prefix] = uri

        def startElementNS(self, name, qname, attrs):
            namespace, localname = name
            lowernamespace = str(namespace or '').lower()
            if lowernamespace.find(u'backend.userland.com/rss') != -1:
                # match any backend.userland.com namespace
                namespace = u'http://backend.userland.com/rss'
                lowernamespace = namespace
            if qname and qname.find(':') > 0:
                givenprefix = qname.split(':')[0]
            else:
                givenprefix = None
            prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
            if givenprefix and (prefix is None or (prefix == '' and lowernamespace == '')) and givenprefix not in self.namespacesInUse:
                raise UndeclaredNamespace("'%s' is not associated with a namespace" % givenprefix)
            localname = str(localname).lower()

            # qname implementation is horribly broken in Python 2.1 (it
            # doesn't report any), and slightly broken in Python 2.2 (it
            # doesn't report the xml: namespace). So we match up namespaces
            # with a known list first, and then possibly override them with
            # the qnames the SAX parser gives us (if indeed it gives us any
            # at all). Thanks to MatejC for helping me test this and
            # tirelessly telling me that it didn't work yet.
            attrsD, self.decls = self.decls, {}
            if localname=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
                attrsD['xmlns']=namespace
            if localname=='svg' and namespace=='http://www.w3.org/2000/svg':
                attrsD['xmlns']=namespace

            if prefix:
                localname = prefix.lower() + ':' + localname
            elif namespace and not qname: #Expat
                for name,value in self.namespacesInUse.items():
                    if name and value == namespace:
                        localname = name + ':' + localname
                        break

            for (namespace, attrlocalname), attrvalue in attrs.items():
                lowernamespace = (namespace or '').lower()
                prefix = self._matchnamespaces.get(lowernamespace, '')
                if prefix:
                    attrlocalname = prefix + ':' + attrlocalname
                attrsD[str(attrlocalname).lower()] = attrvalue
            for qname in attrs.getQNames():
                attrsD[str(qname).lower()] = attrs.getValueByQName(qname)
            self.unknown_starttag(localname, attrsD.items())

        def characters(self, text):
            self.handle_data(text)

        def endElementNS(self, name, qname):
            namespace, localname = name
            lowernamespace = str(namespace or '').lower()
            if qname and qname.find(':') > 0:
                givenprefix = qname.split(':')[0]
            else:
                givenprefix = ''
            prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
            if prefix:
                localname = prefix + ':' + localname
            elif namespace and not qname: #Expat
                for name,value in self.namespacesInUse.items():
                    if name and value == namespace:
                        localname = name + ':' + localname
                        break
            localname = str(localname).lower()
            self.unknown_endtag(localname)

        def error(self, exc):
            """Record a recoverable parse error; parsing continues."""
            self.bozo = 1
            self.exc = exc

        # drv_libxml2 calls warning() in some cases
        warning = error

        def fatalError(self, exc):
            """Record an unrecoverable parse error and re-raise it."""
            self.error(exc)
            raise exc
+
+class _BaseHTMLProcessor(sgmllib.SGMLParser):
+ special = re.compile('''[<>'"]''')
+ bare_ampersand = re.compile("&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")
+ elements_no_end_tag = [
+ 'area', 'base', 'basefont', 'br', 'col', 'command', 'embed', 'frame',
+ 'hr', 'img', 'input', 'isindex', 'keygen', 'link', 'meta', 'param',
+ 'source', 'track', 'wbr'
+ ]
+
    def __init__(self, encoding, _type):
        """Set up the processor with the document encoding and MIME type."""
        self.encoding = encoding
        self._type = _type
        sgmllib.SGMLParser.__init__(self)
+
    def reset(self):
        """Reset parser state; 'pieces' accumulates the reconstructed output."""
        self.pieces = []
        sgmllib.SGMLParser.reset(self)
+
+ def _shorttag_replace(self, match):
+ tag = match.group(1)
+ if tag in self.elements_no_end_tag:
+ return '<' + tag + ' />'
+ else:
+ return '<' + tag + '></' + tag + '>'
+
    # By declaring these methods and overriding their compiled code
    # with the code from sgmllib, the original code will execute in
    # feedparser's scope instead of sgmllib's. This means that the
    # `tagfind` and `charref` regular expressions will be found as
    # they're declared above, not as they're declared in sgmllib.
    def goahead(self, i):
        # placeholder body; the real implementation is grafted in below
        # (func_code is the Python 2 name for __code__)
        pass
    goahead.func_code = sgmllib.SGMLParser.goahead.func_code
+
    def __parse_starttag(self, i):
        # placeholder body; rebound to sgmllib's parse_starttag below so it
        # runs against this module's regular expressions
        pass
    __parse_starttag.func_code = sgmllib.SGMLParser.parse_starttag.func_code
+
    def parse_starttag(self,i):
        """Parse a start tag; in XHTML mode, also emit the matching end tag
        for XML-style empty elements such as <br/>."""
        j = self.__parse_starttag(i)
        if self._type == 'application/xhtml+xml':
            if j>2 and self.rawdata[j-2:j]=='/>':
                self.unknown_endtag(self.lasttag)
        return j
+
+ def feed(self, data):
+ data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'<!\1', data)
+ #data = re.sub(r'<(\S+?)\s*?/>', self._shorttag_replace, data) # bug [ 1399464 ] Bad regexp for _shorttag_replace
+ data = re.sub(r'<([^<>\s]+?)\s*/>', self._shorttag_replace, data)
+ data = data.replace(''', "'")
+ data = data.replace('"', '"')
+ try:
+ bytes
+ if bytes is str:
+ raise NameError
+ self.encoding = self.encoding + u'_INVALID_PYTHON_3'
+ except NameError:
+ if self.encoding and isinstance(data, unicode):
+ data = data.encode(self.encoding)
+ sgmllib.SGMLParser.feed(self, data)
+ sgmllib.SGMLParser.close(self)
+
+ def normalize_attrs(self, attrs):
+ if not attrs:
+ return attrs
+ # utility method to be called by descendants
+ attrs = dict([(k.lower(), v) for k, v in attrs]).items()
+ attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
+ attrs.sort()
+ return attrs
+
def unknown_starttag(self, tag, attrs):
    # called for each start tag
    # attrs is a list of (attr, value) tuples
    # e.g. for <pre class='screen'>, tag='pre', attrs=[('class', 'screen')]
    uattrs = []
    strattrs=''
    if attrs:
        for key, value in attrs:
            # NOTE(review): upstream re-escapes <, > and " inside attribute
            # values here; the replace() arguments below appear entity-decoded
            # into no-ops -- confirm against upstream feedparser.
            value=value.replace('>','>').replace('<','<').replace('"','"')
            value = self.bare_ampersand.sub("&", value)
            # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
            if not isinstance(value, unicode):
                value = value.decode(self.encoding, 'ignore')
            try:
                # Currently, in Python 3 the key is already a str, and cannot be decoded again
                uattrs.append((unicode(key, self.encoding), value))
            except TypeError:
                uattrs.append((key, value))
        # serialise back to ' key="value"' form, then (best effort) encode
        # to the document encoding; failures leave the unicode form intact.
        strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs])
        if self.encoding:
            try:
                strattrs = strattrs.encode(self.encoding)
            except (UnicodeEncodeError, LookupError):
                pass
    # Void elements are re-emitted XHTML-style; everything else as a
    # plain start tag.  Reconstructed markup accumulates in self.pieces.
    if tag in self.elements_no_end_tag:
        self.pieces.append('<%(tag)s%(strattrs)s />' % locals())
    else:
        self.pieces.append('<%(tag)s%(strattrs)s>' % locals())
+
def unknown_endtag(self, tag):
    """Reconstruct an end tag (e.g. </pre>) into self.pieces.

    Void elements listed in self.elements_no_end_tag take no end tag
    and are skipped.
    """
    if tag in self.elements_no_end_tag:
        return
    self.pieces.append('</%s>' % tag)
+
def handle_charref(self, ref):
    """Re-emit a numeric character reference (e.g. ref='160' for &#160;).

    Codepoints that map through the module-level _cp1252 table are
    rewritten to their canonical Unicode equivalents.
    """
    base = 16 if ref.startswith('x') else 10
    digits = ref[1:] if base == 16 else ref
    value = unichr(int(digits, base))
    if value in _cp1252.keys():
        # swap the windows-1252 codepoint for the canonical one
        self.pieces.append('&#%s;' % hex(ord(_cp1252[value]))[1:])
    else:
        self.pieces.append('&#%s;' % ref)
+
def handle_entityref(self, ref):
    """Re-emit an entity reference, e.g. ref='copy' for '&copy;'.

    Known HTML entities are reconstructed with their terminating ';';
    unknown references are passed through without it, mirroring the
    (sloppy) input.
    """
    # dict.has_key() is deprecated (and gone in Python 3); the 'in'
    # operator is exactly equivalent on Python 2 dicts.
    if ref in name2codepoint:
        self.pieces.append('&%(ref)s;' % locals())
    else:
        self.pieces.append('&%(ref)s' % locals())
+
def handle_data(self, text):
    """Buffer a run of plain text (no tags or references) verbatim."""
    self.pieces += [text]
+
def handle_comment(self, text):
    """Reconstruct an HTML comment verbatim, e.g. <!-- text -->."""
    self.pieces.append('<!--' + text + '-->')
+
def handle_pi(self, text):
    """Reconstruct a processing instruction, e.g. <?instruction>."""
    self.pieces.append('<?' + text + '>')
+
def handle_decl(self, text):
    """Reconstruct a declaration verbatim, e.g. the original DOCTYPE:
    <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
    "http://www.w3.org/TR/html4/loose.dtd">."""
    self.pieces.append('<!' + text + '>')
+
# Laxer declaration-name pattern than sgmllib's (also allows ':' and '.')
# so real-world feed declarations parse.
_new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match

def _scan_name(self, i, declstartpos):
    # Override of sgmllib's declaration-name scanner.  Returns
    # (name, end-position), or (None, -1) at end of buffer or on an
    # unparseable name.
    rawdata = self.rawdata
    n = len(rawdata)
    if i == n:
        return None, -1
    m = self._new_declname_match(rawdata, i)
    if m:
        s = m.group()
        name = s.strip()
        if (i + len(s)) == n:
            return None, -1 # end of buffer
        return name.lower(), m.end()
    else:
        # No valid name here: dump the raw data through as text and
        # abandon the declaration.
        self.handle_data(rawdata)
#        self.updatepos(declstartpos, i)
        return None, -1
+
def convert_charref(self, name):
    """Return the markup form of a numeric character reference."""
    return '&#' + name + ';'
+
def convert_entityref(self, name):
    """Return the markup form of a named entity reference."""
    return '&' + name + ';'
+
def output(self):
    '''Return processed HTML as a single string'''
    # coerce every buffered piece to str before joining
    return ''.join(str(piece) for piece in self.pieces)
+
def parse_declaration(self, i):
    # Wrap sgmllib's declaration parser so a malformed declaration does
    # not abort the whole document.
    try:
        return sgmllib.SGMLParser.parse_declaration(self, i)
    except sgmllib.SGMLParseError:
        # escape the doctype declaration and continue parsing
        # NOTE(review): upstream emits '&lt;' here; this literal appears
        # entity-decoded during extraction -- confirm against upstream.
        self.handle_data('<')
        return i+1
+
class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):
    """sgmllib-based feed parser used when strict XML parsing fails."""

    def __init__(self, baseuri, baselang, encoding, entities):
        sgmllib.SGMLParser.__init__(self)
        _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
        _BaseHTMLProcessor.__init__(self, encoding, 'application/xhtml+xml')
        self.entities=entities

    def decodeEntities(self, element, data):
        # NOTE(review): upstream collapses double-escaped entities here
        # (&amp;lt; -> &lt;, numeric forms included); the literal arguments
        # below appear entity-decoded during extraction, which turns most
        # of these calls into no-ops -- confirm against upstream before
        # relying on this text.
        data = data.replace('<', '<')
        data = data.replace('<', '<')
        data = data.replace('<', '<')
        data = data.replace('>', '>')
        data = data.replace('>', '>')
        data = data.replace('>', '>')
        data = data.replace('&', '&')
        data = data.replace('&', '&')
        data = data.replace('"', '"')
        data = data.replace('"', '"')
        data = data.replace(''', ''')
        data = data.replace(''', ''')
        # non-XML content types get their entities fully decoded
        if self.contentparams.has_key('type') and not self.contentparams.get('type', u'xml').endswith(u'xml'):
            data = data.replace('<', '<')
            data = data.replace('>', '>')
            data = data.replace('&', '&')
            data = data.replace('"', '"')
            data = data.replace(''', "'")
        return data

    def strattrs(self, attrs):
        # serialise (name, value) pairs back to ' name="value"' form
        return ''.join([' %s="%s"' % (n,v.replace('"','"')) for n,v in attrs])
+
class _MicroformatsParser:
    """Extract microformat data from an HTML document via BeautifulSoup.

    Collects hCard vCards, rel-tag tags, rel-enclosure enclosures and
    XFN relationships.  Results accumulate on the instance attributes
    self.vcard, self.tags, self.enclosures and self.xfn.
    """

    # property-type codes consumed by getPropertyValue()
    STRING = 1
    DATE = 2
    URI = 3
    NODE = 4
    EMAIL = 5

    known_xfn_relationships = ['contact', 'acquaintance', 'friend', 'met', 'co-worker', 'coworker', 'colleague', 'co-resident', 'coresident', 'neighbor', 'child', 'parent', 'sibling', 'brother', 'sister', 'spouse', 'wife', 'husband', 'kin', 'relative', 'muse', 'crush', 'date', 'sweetheart', 'me']
    known_binary_extensions = ['zip','rar','exe','gz','tar','tgz','tbz2','bz2','z','7z','dmg','img','sit','sitx','hqx','deb','rpm','bz2','jar','rar','iso','bin','msi','mp2','mp3','ogg','ogm','mp4','m4v','m4a','avi','wma','wmv']

    def __init__(self, data, baseuri, encoding):
        self.document = BeautifulSoup.BeautifulSoup(data)
        self.baseuri = baseuri
        self.encoding = encoding
        # NOTE(review): this re-encode only rebinds the local 'data' after
        # the soup has already been built, so it has no observable effect.
        if isinstance(data, unicode):
            data = data.encode(encoding)
        self.tags = []
        self.enclosures = []
        self.xfn = []
        self.vcard = None

    def vcardEscape(self, s):
        # escape vCard-special characters (RFC 2426 text values)
        if isinstance(s, basestring):
            s = s.replace(',', '\\,').replace(';', '\\;').replace('\n', '\\n')
        return s

    def vcardFold(self, s):
        # fold long vCard lines at 75 octets; continuation lines start
        # with a space and allow 74 more characters
        s = re.sub(';+$', '', s)
        sFolded = ''
        iMax = 75
        sPrefix = ''
        while len(s) > iMax:
            sFolded += sPrefix + s[:iMax] + '\n'
            s = s[iMax:]
            sPrefix = ' '
            iMax = 74
        sFolded += sPrefix + s
        return sFolded

    def normalize(self, s):
        # collapse all whitespace runs to single spaces and trim
        return re.sub(r'\s+', ' ', s).strip()

    def unique(self, aList):
        # order-preserving de-duplication
        results = []
        for element in aList:
            if element not in results:
                results.append(element)
        return results

    def toISO8601(self, dt):
        # dt is a time tuple (as produced by _parse_date_iso8601)
        return time.strftime('%Y-%m-%dT%H:%M:%SZ', dt)

    def getPropertyValue(self, elmRoot, sProperty, iPropertyType=4, bAllowMultiple=0, bAutoEscape=0):
        # Look up a microformat property under elmRoot by class name.
        # iPropertyType selects how matched nodes are converted (STRING,
        # DATE, URI, NODE or EMAIL); bAllowMultiple returns a list of all
        # matches instead of the first; bAutoEscape applies vcardEscape.
        all = lambda x: 1
        sProperty = sProperty.lower()
        bFound = 0
        bNormalize = 1
        propertyMatch = {'class': re.compile(r'\b%s\b' % sProperty)}
        if bAllowMultiple and (iPropertyType != self.NODE):
            # prefer <li> items inside a ul/ol carrying the property class
            snapResults = []
            containers = elmRoot(['ul', 'ol'], propertyMatch)
            for container in containers:
                snapResults.extend(container('li'))
            bFound = (len(snapResults) != 0)
        if not bFound:
            snapResults = elmRoot(all, propertyMatch)
            bFound = (len(snapResults) != 0)
        if (not bFound) and (sProperty == 'value'):
            # 'value' may also be expressed as a bare <pre> block, or
            # fall back to the root element itself (unnormalized)
            snapResults = elmRoot('pre')
            bFound = (len(snapResults) != 0)
            bNormalize = not bFound
            if not bFound:
                snapResults = [elmRoot]
                bFound = (len(snapResults) != 0)
        arFilter = []
        if sProperty == 'vcard':
            # exclude vcards nested inside other vcards
            snapFilter = elmRoot(all, propertyMatch)
            for node in snapFilter:
                if node.findParent(all, propertyMatch):
                    arFilter.append(node)
        arResults = []
        for node in snapResults:
            if node not in arFilter:
                arResults.append(node)
        bFound = (len(arResults) != 0)
        if not bFound:
            # type-appropriate empty result
            if bAllowMultiple:
                return []
            elif iPropertyType == self.STRING:
                return ''
            elif iPropertyType == self.DATE:
                return None
            elif iPropertyType == self.URI:
                return ''
            elif iPropertyType == self.NODE:
                return None
            else:
                return None
        arValues = []
        for elmResult in arResults:
            sValue = None
            if iPropertyType == self.NODE:
                if bAllowMultiple:
                    arValues.append(elmResult)
                    continue
                else:
                    return elmResult
            sNodeName = elmResult.name.lower()
            # value extraction fallbacks, in priority order:
            # mailto href (EMAIL) -> abbr title -> URI attrs -> img alt
            # -> rendered text content with tags stripped
            if (iPropertyType == self.EMAIL) and (sNodeName == 'a'):
                sValue = (elmResult.get('href') or '').split('mailto:').pop().split('?')[0]
            if sValue:
                sValue = bNormalize and self.normalize(sValue) or sValue.strip()
            if (not sValue) and (sNodeName == 'abbr'):
                sValue = elmResult.get('title')
            if sValue:
                sValue = bNormalize and self.normalize(sValue) or sValue.strip()
            if (not sValue) and (iPropertyType == self.URI):
                if sNodeName == 'a':
                    sValue = elmResult.get('href')
                elif sNodeName == 'img':
                    sValue = elmResult.get('src')
                elif sNodeName == 'object':
                    sValue = elmResult.get('data')
            if sValue:
                sValue = bNormalize and self.normalize(sValue) or sValue.strip()
            if (not sValue) and (sNodeName == 'img'):
                sValue = elmResult.get('alt')
            if sValue:
                sValue = bNormalize and self.normalize(sValue) or sValue.strip()
            if not sValue:
                sValue = elmResult.renderContents()
                sValue = re.sub(r'<\S[^>]*>', '', sValue)
                sValue = sValue.replace('\r\n', '\n')
                sValue = sValue.replace('\r', '\n')
            if sValue:
                sValue = bNormalize and self.normalize(sValue) or sValue.strip()
            if not sValue:
                continue
            if iPropertyType == self.DATE:
                sValue = _parse_date_iso8601(sValue)
            if bAllowMultiple:
                arValues.append(bAutoEscape and self.vcardEscape(sValue) or sValue)
            else:
                return bAutoEscape and self.vcardEscape(sValue) or sValue
        return arValues

    def findVCards(self, elmRoot, bAgentParsing=0):
        # Serialise every hCard under elmRoot as a vCard 3.0 text block.
        # With bAgentParsing, elmRoot itself is treated as the single card
        # (used for embedded AGENT cards).
        sVCards = ''

        if not bAgentParsing:
            arCards = self.getPropertyValue(elmRoot, 'vcard', bAllowMultiple=1)
        else:
            arCards = [elmRoot]

        for elmCard in arCards:
            arLines = []

            def processSingleString(sProperty):
                # emit PROP:value for a single string property
                sValue = self.getPropertyValue(elmCard, sProperty, self.STRING, bAutoEscape=1).decode(self.encoding)
                if sValue:
                    arLines.append(self.vcardFold(sProperty.upper() + ':' + sValue))
                return sValue or u''

            def processSingleURI(sProperty):
                # emit PROP[;ENCODING=b][;TYPE=...][;VALUE=uri]:value
                sValue = self.getPropertyValue(elmCard, sProperty, self.URI)
                if sValue:
                    sContentType = ''
                    sEncoding = ''
                    sValueKey = ''
                    if sValue.startswith('data:'):
                        # inline data: URI carries its own media type
                        sEncoding = ';ENCODING=b'
                        sContentType = sValue.split(';')[0].split('/').pop()
                        sValue = sValue.split(',', 1).pop()
                    else:
                        elmValue = self.getPropertyValue(elmCard, sProperty)
                        if elmValue:
                            if sProperty != 'url':
                                sValueKey = ';VALUE=uri'
                            sContentType = elmValue.get('type', '').strip().split('/').pop().strip()
                    sContentType = sContentType.upper()
                    if sContentType == 'OCTET-STREAM':
                        sContentType = ''
                    if sContentType:
                        sContentType = ';TYPE=' + sContentType.upper()
                    arLines.append(self.vcardFold(sProperty.upper() + sEncoding + sContentType + sValueKey + ':' + sValue))

            def processTypeValue(sProperty, arDefaultType, arForceType=None):
                # emit PROP;TYPE=a,b:value for each typed sub-property
                arResults = self.getPropertyValue(elmCard, sProperty, bAllowMultiple=1)
                for elmResult in arResults:
                    arType = self.getPropertyValue(elmResult, 'type', self.STRING, 1, 1)
                    if arForceType:
                        arType = self.unique(arForceType + arType)
                    if not arType:
                        arType = arDefaultType
                    sValue = self.getPropertyValue(elmResult, 'value', self.EMAIL, 0)
                    if sValue:
                        arLines.append(self.vcardFold(sProperty.upper() + ';TYPE=' + ','.join(arType) + ':' + sValue))

            # AGENT
            # must do this before all other properties because it is destructive
            # (removes nested class="vcard" nodes so they don't interfere with
            # this vcard's other properties)
            arAgent = self.getPropertyValue(elmCard, 'agent', bAllowMultiple=1)
            for elmAgent in arAgent:
                if re.compile(r'\bvcard\b').search(elmAgent.get('class')):
                    sAgentValue = self.findVCards(elmAgent, 1) + '\n'
                    sAgentValue = sAgentValue.replace('\n', '\\n')
                    sAgentValue = sAgentValue.replace(';', '\\;')
                    if sAgentValue:
                        arLines.append(self.vcardFold('AGENT:' + sAgentValue))
                    # Completely remove the agent element from the parse tree
                    elmAgent.extract()
                else:
                    sAgentValue = self.getPropertyValue(elmAgent, 'value', self.URI, bAutoEscape=1);
                    if sAgentValue:
                        arLines.append(self.vcardFold('AGENT;VALUE=uri:' + sAgentValue))

            # FN (full name)
            sFN = processSingleString('fn')

            # N (name)
            elmName = self.getPropertyValue(elmCard, 'n')
            if elmName:
                sFamilyName = self.getPropertyValue(elmName, 'family-name', self.STRING, bAutoEscape=1)
                sGivenName = self.getPropertyValue(elmName, 'given-name', self.STRING, bAutoEscape=1)
                arAdditionalNames = self.getPropertyValue(elmName, 'additional-name', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'additional-names', self.STRING, 1, 1)
                arHonorificPrefixes = self.getPropertyValue(elmName, 'honorific-prefix', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'honorific-prefixes', self.STRING, 1, 1)
                arHonorificSuffixes = self.getPropertyValue(elmName, 'honorific-suffix', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'honorific-suffixes', self.STRING, 1, 1)
                arLines.append(self.vcardFold('N:' + sFamilyName + ';' +
                                              sGivenName + ';' +
                                              ','.join(arAdditionalNames) + ';' +
                                              ','.join(arHonorificPrefixes) + ';' +
                                              ','.join(arHonorificSuffixes)))
            elif sFN:
                # implied "N" optimization
                # http://microformats.org/wiki/hcard#Implied_.22N.22_Optimization
                arNames = self.normalize(sFN).split()
                if len(arNames) == 2:
                    bFamilyNameFirst = (arNames[0].endswith(',') or
                                        len(arNames[1]) == 1 or
                                        ((len(arNames[1]) == 2) and (arNames[1].endswith('.'))))
                    if bFamilyNameFirst:
                        arLines.append(self.vcardFold('N:' + arNames[0] + ';' + arNames[1]))
                    else:
                        arLines.append(self.vcardFold('N:' + arNames[1] + ';' + arNames[0]))

            # SORT-STRING
            sSortString = self.getPropertyValue(elmCard, 'sort-string', self.STRING, bAutoEscape=1)
            if sSortString:
                arLines.append(self.vcardFold('SORT-STRING:' + sSortString))

            # NICKNAME
            arNickname = self.getPropertyValue(elmCard, 'nickname', self.STRING, 1, 1)
            if arNickname:
                arLines.append(self.vcardFold('NICKNAME:' + ','.join(arNickname)))

            # PHOTO
            processSingleURI('photo')

            # BDAY
            dtBday = self.getPropertyValue(elmCard, 'bday', self.DATE)
            if dtBday:
                arLines.append(self.vcardFold('BDAY:' + self.toISO8601(dtBday)))

            # ADR (address)
            arAdr = self.getPropertyValue(elmCard, 'adr', bAllowMultiple=1)
            for elmAdr in arAdr:
                arType = self.getPropertyValue(elmAdr, 'type', self.STRING, 1, 1)
                if not arType:
                    arType = ['intl','postal','parcel','work'] # default adr types, see RFC 2426 section 3.2.1
                sPostOfficeBox = self.getPropertyValue(elmAdr, 'post-office-box', self.STRING, 0, 1)
                sExtendedAddress = self.getPropertyValue(elmAdr, 'extended-address', self.STRING, 0, 1)
                sStreetAddress = self.getPropertyValue(elmAdr, 'street-address', self.STRING, 0, 1)
                sLocality = self.getPropertyValue(elmAdr, 'locality', self.STRING, 0, 1)
                sRegion = self.getPropertyValue(elmAdr, 'region', self.STRING, 0, 1)
                sPostalCode = self.getPropertyValue(elmAdr, 'postal-code', self.STRING, 0, 1)
                sCountryName = self.getPropertyValue(elmAdr, 'country-name', self.STRING, 0, 1)
                arLines.append(self.vcardFold('ADR;TYPE=' + ','.join(arType) + ':' +
                                              sPostOfficeBox + ';' +
                                              sExtendedAddress + ';' +
                                              sStreetAddress + ';' +
                                              sLocality + ';' +
                                              sRegion + ';' +
                                              sPostalCode + ';' +
                                              sCountryName))

            # LABEL
            processTypeValue('label', ['intl','postal','parcel','work'])

            # TEL (phone number)
            processTypeValue('tel', ['voice'])

            # EMAIL
            processTypeValue('email', ['internet'], ['internet'])

            # MAILER
            processSingleString('mailer')

            # TZ (timezone)
            processSingleString('tz')

            # GEO (geographical information)
            elmGeo = self.getPropertyValue(elmCard, 'geo')
            if elmGeo:
                sLatitude = self.getPropertyValue(elmGeo, 'latitude', self.STRING, 0, 1)
                sLongitude = self.getPropertyValue(elmGeo, 'longitude', self.STRING, 0, 1)
                arLines.append(self.vcardFold('GEO:' + sLatitude + ';' + sLongitude))

            # TITLE
            processSingleString('title')

            # ROLE
            processSingleString('role')

            # LOGO
            processSingleURI('logo')

            # ORG (organization)
            elmOrg = self.getPropertyValue(elmCard, 'org')
            if elmOrg:
                sOrganizationName = self.getPropertyValue(elmOrg, 'organization-name', self.STRING, 0, 1)
                if not sOrganizationName:
                    # implied "organization-name" optimization
                    # http://microformats.org/wiki/hcard#Implied_.22organization-name.22_Optimization
                    sOrganizationName = self.getPropertyValue(elmCard, 'org', self.STRING, 0, 1)
                    if sOrganizationName:
                        arLines.append(self.vcardFold('ORG:' + sOrganizationName))
                else:
                    arOrganizationUnit = self.getPropertyValue(elmOrg, 'organization-unit', self.STRING, 1, 1)
                    arLines.append(self.vcardFold('ORG:' + sOrganizationName + ';' + ';'.join(arOrganizationUnit)))

            # CATEGORY
            arCategory = self.getPropertyValue(elmCard, 'category', self.STRING, 1, 1) + self.getPropertyValue(elmCard, 'categories', self.STRING, 1, 1)
            if arCategory:
                arLines.append(self.vcardFold('CATEGORIES:' + ','.join(arCategory)))

            # NOTE
            processSingleString('note')

            # REV
            processSingleString('rev')

            # SOUND
            processSingleURI('sound')

            # UID
            processSingleString('uid')

            # URL
            processSingleURI('url')

            # CLASS
            processSingleString('class')

            # KEY
            processSingleURI('key')

            if arLines:
                arLines = [u'BEGIN:vCard',u'VERSION:3.0'] + arLines + [u'END:vCard']
                # XXX - this is super ugly; properly fix this with issue 148
                for i, s in enumerate(arLines):
                    if not isinstance(s, unicode):
                        arLines[i] = s.decode('utf-8', 'ignore')
                sVCards += u'\n'.join(arLines) + u'\n'

        return sVCards.strip()

    def isProbablyDownloadable(self, elm):
        # Heuristic: does this <a>-like element point at a binary payload?
        # True for audio/video/application (non-xml) MIME types or a path
        # whose extension is in known_binary_extensions.
        attrsD = elm.attrMap
        if not attrsD.has_key('href'):
            return 0
        linktype = attrsD.get('type', '').strip()
        if linktype.startswith('audio/') or \
           linktype.startswith('video/') or \
           (linktype.startswith('application/') and not linktype.endswith('xml')):
            return 1
        path = urlparse.urlparse(attrsD['href'])[2]
        if path.find('.') == -1:
            return 0
        fileext = path.split('.').pop().lower()
        return fileext in self.known_binary_extensions

    def findTags(self):
        # Collect rel="tag" links; the tag term is the last non-empty
        # path segment, the scheme is the remaining URL with trailing '/'.
        all = lambda x: 1
        for elm in self.document(all, {'rel': re.compile(r'\btag\b')}):
            href = elm.get('href')
            if not href:
                continue
            urlscheme, domain, path, params, query, fragment = \
                       urlparse.urlparse(_urljoin(self.baseuri, href))
            segments = path.split('/')
            tag = segments.pop()
            if not tag:
                if segments:
                    tag = segments.pop()
                else:
                    # there are no tags
                    continue
            tagscheme = urlparse.urlunparse((urlscheme, domain, '/'.join(segments), '', '', ''))
            if not tagscheme.endswith('/'):
                tagscheme += '/'
            self.tags.append(FeedParserDict({"term": tag, "scheme": tagscheme, "label": elm.string or ''}))

    def findEnclosures(self):
        # Collect links that declare rel="enclosure" or look downloadable.
        all = lambda x: 1
        enclosure_match = re.compile(r'\benclosure\b')
        for elm in self.document(all, {'href': re.compile(r'.+')}):
            if not enclosure_match.search(elm.get('rel', u'')) and not self.isProbablyDownloadable(elm):
                continue
            if elm.attrMap not in self.enclosures:
                self.enclosures.append(elm.attrMap)
                if elm.string and not elm.get('title'):
                    self.enclosures[-1]['title'] = elm.string

    def findXFN(self):
        # Collect XFN relationship links (rel values from the known list).
        all = lambda x: 1
        for elm in self.document(all, {'rel': re.compile('.+'), 'href': re.compile('.+')}):
            rels = elm.get('rel', u'').split()
            xfn_rels = []
            for rel in rels:
                if rel in self.known_xfn_relationships:
                    xfn_rels.append(rel)
            if xfn_rels:
                self.xfn.append({"relationships": xfn_rels, "href": elm.get('href', ''), "name": elm.string})
+
def _parseMicroformats(htmlSource, baseURI, encoding):
    # Parse htmlSource for microformats.  Returns a dict with keys
    # 'tags', 'enclosures', 'xfn' and 'vcard', or None when BeautifulSoup
    # is unavailable (module-level flag) or the source cannot be parsed.
    if not BeautifulSoup:
        return
    try:
        p = _MicroformatsParser(htmlSource, baseURI, encoding)
    except UnicodeEncodeError:
        # sgmllib throws this exception when performing lookups of tags
        # with non-ASCII characters in them.
        return
    p.vcard = p.findVCards(p.document)
    p.findTags()
    p.findEnclosures()
    p.findXFN()
    return {"tags": p.tags, "enclosures": p.enclosures, "xfn": p.xfn, "vcard": p.vcard}
+
class _RelativeURIResolver(_BaseHTMLProcessor):
    """HTML processor that rewrites relative URI attributes into absolute,
    scheme-filtered URIs resolved against a base URI."""

    # (tag, attribute) pairs whose values are URIs that may be relative
    relative_uris = [('a', 'href'),
                     ('applet', 'codebase'),
                     ('area', 'href'),
                     ('blockquote', 'cite'),
                     ('body', 'background'),
                     ('del', 'cite'),
                     ('form', 'action'),
                     ('frame', 'longdesc'),
                     ('frame', 'src'),
                     ('iframe', 'longdesc'),
                     ('iframe', 'src'),
                     ('head', 'profile'),
                     ('img', 'longdesc'),
                     ('img', 'src'),
                     ('img', 'usemap'),
                     ('input', 'src'),
                     ('input', 'usemap'),
                     ('ins', 'cite'),
                     ('link', 'href'),
                     ('object', 'classid'),
                     ('object', 'codebase'),
                     ('object', 'data'),
                     ('object', 'usemap'),
                     ('q', 'cite'),
                     ('script', 'src')]

    def __init__(self, baseuri, encoding, _type):
        _BaseHTMLProcessor.__init__(self, encoding, _type)
        self.baseuri = baseuri

    def resolveURI(self, uri):
        # join with the base URI, then drop results whose scheme is not
        # acceptable (see _makeSafeAbsoluteURI)
        return _makeSafeAbsoluteURI(_urljoin(self.baseuri, uri.strip()))

    def unknown_starttag(self, tag, attrs):
        # resolve only the attributes known to carry URIs for this tag
        attrs = self.normalize_attrs(attrs)
        attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs]
        _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
+
def _resolveRelativeURIs(htmlSource, baseURI, encoding, _type):
    """Rewrite every relative URI in htmlSource as an absolute URI
    resolved against baseURI.

    Returns the input untouched when sgmllib is not available.
    """
    if not _SGML_AVAILABLE:
        return htmlSource
    resolver = _RelativeURIResolver(baseURI, encoding, _type)
    resolver.feed(htmlSource)
    return resolver.output()
+
def _makeSafeAbsoluteURI(base, rel=None):
    """Join *base* and *rel* and return the result only if its scheme
    is listed in ACCEPTABLE_URI_SCHEMES; otherwise return u''.

    When ACCEPTABLE_URI_SCHEMES is empty, no filtering is applied at
    all.  With no *rel*, *base* itself is validated and returned.
    """
    # empty whitelist means "accept everything"
    if not ACCEPTABLE_URI_SCHEMES:
        return _urljoin(base, rel or u'')
    if not base:
        return rel or u''
    if rel:
        joined = _urljoin(base, rel)
        if joined.strip().split(':', 1)[0] in ACCEPTABLE_URI_SCHEMES:
            return joined
        return u''
    # no rel: validate the base URI's scheme (schemeless is allowed)
    scheme = urlparse.urlparse(base)[0]
    if scheme and scheme not in ACCEPTABLE_URI_SCHEMES:
        return u''
    return base
+
+class _HTMLSanitizer(_BaseHTMLProcessor):
+ acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area',
+ 'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button',
+ 'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
+ 'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',
+ 'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',
+ 'figcaption', 'figure', 'footer', 'font', 'form', 'header', 'h1',
+ 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins',
+ 'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter',
+ 'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option',
+ 'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
+ 'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',
+ 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',
+ 'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video', 'noscript']
+
+ acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
+ 'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis',
+ 'background', 'balance', 'bgcolor', 'bgproperties', 'border',
+ 'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding',
+ 'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff',
+ 'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color', 'cols',
+ 'colspan', 'compact', 'contenteditable', 'controls', 'coords', 'data',
+ 'datafld', 'datapagesize', 'datasrc', 'datetime', 'default', 'delay',
+ 'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end', 'face', 'for',
+ 'form', 'frame', 'galleryimg', 'gutter', 'headers', 'height', 'hidefocus',
+ 'hidden', 'high', 'href', 'hreflang', 'hspace', 'icon', 'id', 'inputmode',
+ 'ismap', 'keytype', 'label', 'leftspacing', 'lang', 'list', 'longdesc',
+ 'loop', 'loopcount', 'loopend', 'loopstart', 'low', 'lowsrc', 'max',
+ 'maxlength', 'media', 'method', 'min', 'multiple', 'name', 'nohref',
+ 'noshade', 'nowrap', 'open', 'optimum', 'pattern', 'ping', 'point-size',
+ 'prompt', 'pqg', 'radiogroup', 'readonly', 'rel', 'repeat-max',
+ 'repeat-min', 'replace', 'required', 'rev', 'rightspacing', 'rows',
+ 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', 'span', 'src',
+ 'start', 'step', 'summary', 'suppress', 'tabindex', 'target', 'template',
+ 'title', 'toppadding', 'type', 'unselectable', 'usemap', 'urn', 'valign',
+ 'value', 'variable', 'volume', 'vspace', 'vrml', 'width', 'wrap',
+ 'xml:lang']
+
+ unacceptable_elements_with_end_tag = ['script', 'applet', 'style']
+
+ acceptable_css_properties = ['azimuth', 'background-color',
+ 'border-bottom-color', 'border-collapse', 'border-color',
+ 'border-left-color', 'border-right-color', 'border-top-color', 'clear',
+ 'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
+ 'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
+ 'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
+ 'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
+ 'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
+ 'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
+ 'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
+ 'white-space', 'width']
+
+ # survey of common keywords found in feeds
+ acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue',
+ 'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
+ 'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
+ 'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
+ 'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
+ 'transparent', 'underline', 'white', 'yellow']
+
+ valid_css_values = re.compile('^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|' +
+ '\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$')
+
+ mathml_elements = ['annotation', 'annotation-xml', 'maction', 'math',
+ 'merror', 'mfenced', 'mfrac', 'mi', 'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded',
+ 'mphantom', 'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle',
+ 'msub', 'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
+ 'munderover', 'none', 'semantics']
+
+ mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
+ 'columnalign', 'close', 'columnlines', 'columnspacing', 'columnspan', 'depth',
+ 'display', 'displaystyle', 'encoding', 'equalcolumns', 'equalrows',
+ 'fence', 'fontstyle', 'fontweight', 'frame', 'height', 'linethickness',
+ 'lspace', 'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant',
+ 'maxsize', 'minsize', 'open', 'other', 'rowalign', 'rowalign', 'rowalign',
+ 'rowlines', 'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
+ 'separator', 'separators', 'stretchy', 'width', 'width', 'xlink:href',
+ 'xlink:show', 'xlink:type', 'xmlns', 'xmlns:xlink']
+
+ # svgtiny - foreignObject + linearGradient + radialGradient + stop
+ svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
+ 'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'foreignObject',
+ 'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
+ 'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph', 'mpath',
+ 'path', 'polygon', 'polyline', 'radialGradient', 'rect', 'set', 'stop',
+ 'svg', 'switch', 'text', 'title', 'tspan', 'use']
+
+ # svgtiny + class + opacity + offset + xmlns + xmlns:xlink
+ svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
+ 'arabic-form', 'ascent', 'attributeName', 'attributeType',
+ 'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
+ 'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx',
+ 'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-opacity',
+ 'fill-rule', 'font-family', 'font-size', 'font-stretch', 'font-style',
+ 'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2',
+ 'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x',
+ 'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints', 'keySplines',
+ 'keyTimes', 'lang', 'mathematical', 'marker-end', 'marker-mid',
+ 'marker-start', 'markerHeight', 'markerUnits', 'markerWidth', 'max',
+ 'min', 'name', 'offset', 'opacity', 'orient', 'origin',
+ 'overline-position', 'overline-thickness', 'panose-1', 'path',
+ 'pathLength', 'points', 'preserveAspectRatio', 'r', 'refX', 'refY',
+ 'repeatCount', 'repeatDur', 'requiredExtensions', 'requiredFeatures',
+ 'restart', 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv',
+ 'stop-color', 'stop-opacity', 'strikethrough-position',
+ 'strikethrough-thickness', 'stroke', 'stroke-dasharray',
+ 'stroke-dashoffset', 'stroke-linecap', 'stroke-linejoin',
+ 'stroke-miterlimit', 'stroke-opacity', 'stroke-width', 'systemLanguage',
+ 'target', 'text-anchor', 'to', 'transform', 'type', 'u1', 'u2',
+ 'underline-position', 'underline-thickness', 'unicode', 'unicode-range',
+ 'units-per-em', 'values', 'version', 'viewBox', 'visibility', 'width',
+ 'widths', 'x', 'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
+ 'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type',
+ 'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1',
+ 'y2', 'zoomAndPan']
+
+ svg_attr_map = None
+ svg_elem_map = None
+
+ acceptable_svg_properties = [ 'fill', 'fill-opacity', 'fill-rule',
+ 'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
+ 'stroke-opacity']
+
+ def reset(self):
+ _BaseHTMLProcessor.reset(self)
+ self.unacceptablestack = 0
+ self.mathmlOK = 0
+ self.svgOK = 0
+
+ def unknown_starttag(self, tag, attrs):
+ acceptable_attributes = self.acceptable_attributes
+ keymap = {}
+ if not tag in self.acceptable_elements or self.svgOK:
+ if tag in self.unacceptable_elements_with_end_tag:
+ self.unacceptablestack += 1
+
+ # add implicit namespaces to html5 inline svg/mathml
+ if self._type.endswith('html'):
+ if not dict(attrs).get('xmlns'):
+ if tag=='svg':
+ attrs.append( ('xmlns','http://www.w3.org/2000/svg') )
+ if tag=='math':
+ attrs.append( ('xmlns','http://www.w3.org/1998/Math/MathML') )
+
+ # not otherwise acceptable, perhaps it is MathML or SVG?
+ if tag=='math' and ('xmlns','http://www.w3.org/1998/Math/MathML') in attrs:
+ self.mathmlOK += 1
+ if tag=='svg' and ('xmlns','http://www.w3.org/2000/svg') in attrs:
+ self.svgOK += 1
+
+ # chose acceptable attributes based on tag class, else bail
+ if self.mathmlOK and tag in self.mathml_elements:
+ acceptable_attributes = self.mathml_attributes
+ elif self.svgOK and tag in self.svg_elements:
+ # for most vocabularies, lowercasing is a good idea. Many
+ # svg elements, however, are camel case
+ if not self.svg_attr_map:
+ lower=[attr.lower() for attr in self.svg_attributes]
+ mix=[a for a in self.svg_attributes if a not in lower]
+ self.svg_attributes = lower
+ self.svg_attr_map = dict([(a.lower(),a) for a in mix])
+
+ lower=[attr.lower() for attr in self.svg_elements]
+ mix=[a for a in self.svg_elements if a not in lower]
+ self.svg_elements = lower
+ self.svg_elem_map = dict([(a.lower(),a) for a in mix])
+ acceptable_attributes = self.svg_attributes
+ tag = self.svg_elem_map.get(tag,tag)
+ keymap = self.svg_attr_map
+ elif not tag in self.acceptable_elements:
+ return
+
+ # declare xlink namespace, if needed
+ if self.mathmlOK or self.svgOK:
+ if filter(lambda (n,v): n.startswith('xlink:'),attrs):
+ if not ('xmlns:xlink','http://www.w3.org/1999/xlink') in attrs:
+ attrs.append(('xmlns:xlink','http://www.w3.org/1999/xlink'))
+
+ clean_attrs = []
+ for key, value in self.normalize_attrs(attrs):
+ if key in acceptable_attributes:
+ key=keymap.get(key,key)
+ # make sure the uri uses an acceptable uri scheme
+ if key == u'href':
+ value = _makeSafeAbsoluteURI(value)
+ clean_attrs.append((key,value))
+ elif key=='style':
+ clean_value = self.sanitize_style(value)
+ if clean_value:
+ clean_attrs.append((key,clean_value))
+ _BaseHTMLProcessor.unknown_starttag(self, tag, clean_attrs)
+
+ def unknown_endtag(self, tag):
+ if not tag in self.acceptable_elements:
+ if tag in self.unacceptable_elements_with_end_tag:
+ self.unacceptablestack -= 1
+ if self.mathmlOK and tag in self.mathml_elements:
+ if tag == 'math' and self.mathmlOK:
+ self.mathmlOK -= 1
+ elif self.svgOK and tag in self.svg_elements:
+ tag = self.svg_elem_map.get(tag,tag)
+ if tag == 'svg' and self.svgOK:
+ self.svgOK -= 1
+ else:
+ return
+ _BaseHTMLProcessor.unknown_endtag(self, tag)
+
+ def handle_pi(self, text):
+ pass
+
+ def handle_decl(self, text):
+ pass
+
+ def handle_data(self, text):
+ if not self.unacceptablestack:
+ _BaseHTMLProcessor.handle_data(self, text)
+
+ def sanitize_style(self, style):
+ # disallow urls
+ style=re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ',style)
+
+ # gauntlet
+ if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
+ return ''
+ # This replaced a regexp that used re.match and was prone to pathological back-tracking.
+ if re.sub("\s*[-\w]+\s*:\s*[^:;]*;?", '', style).strip():
+ return ''
+
+ clean = []
+ for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style):
+ if not value:
+ continue
+ if prop.lower() in self.acceptable_css_properties:
+ clean.append(prop + ': ' + value + ';')
+ elif prop.split('-')[0].lower() in ['background','border','margin','padding']:
+ for keyword in value.split():
+ if not keyword in self.acceptable_css_keywords and \
+ not self.valid_css_values.match(keyword):
+ break
+ else:
+ clean.append(prop + ': ' + value + ';')
+ elif self.svgOK and prop.lower() in self.acceptable_svg_properties:
+ clean.append(prop + ': ' + value + ';')
+
+ return ' '.join(clean)
+
    def parse_comment(self, i, report=1):
        # Delegate to the base parser first; a non-negative return value
        # means the comment was well-formed and consumed normally.
        ret = _BaseHTMLProcessor.parse_comment(self, i, report)
        if ret >= 0:
            return ret
        # if ret == -1, this may be a malicious attempt to circumvent
        # sanitization, or a page-destroying unclosed comment
        match = re.compile(r'--[^>]*>').search(self.rawdata, i+4)
        if match:
            return match.end()
        # unclosed comment; deliberately fail to handle_data()
        return len(self.rawdata)
+
+
def _sanitizeHTML(htmlSource, encoding, _type):
    """Sanitize an HTML fragment: run it through _HTMLSanitizer (which strips
    disallowed elements and attributes), optionally post-process with an
    installed Tidy implementation, and return the cleaned markup.
    """
    if not _SGML_AVAILABLE:
        # no SGML parser available; return the input untouched
        return htmlSource
    p = _HTMLSanitizer(encoding, _type)
    # BUGFIX: escape CDATA section openers so their contents cannot smuggle
    # markup past the sanitizer (the previous replace was a no-op that
    # replaced the string with itself).
    htmlSource = htmlSource.replace('<![CDATA[', '&lt;![CDATA[')
    p.feed(htmlSource)
    data = p.output()
    if TIDY_MARKUP:
        # loop through list of preferred Tidy interfaces looking for one that's installed,
        # then set up a common _tidy function to wrap the interface-specific API.
        _tidy = None
        for tidy_interface in PREFERRED_TIDY_INTERFACES:
            try:
                if tidy_interface == "uTidy":
                    from tidy import parseString as _utidy
                    def _tidy(data, **kwargs):
                        return str(_utidy(data, **kwargs))
                    break
                elif tidy_interface == "mxTidy":
                    from mx.Tidy import Tidy as _mxtidy
                    def _tidy(data, **kwargs):
                        nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, **kwargs)
                        return data
                    break
            except ImportError:
                # this Tidy flavor isn't installed; try the next candidate
                pass
        if _tidy:
            # Tidy operates on byte strings; round-trip unicode through UTF-8
            utf8 = isinstance(data, unicode)
            if utf8:
                data = data.encode('utf-8')
            data = _tidy(data, output_xhtml=1, numeric_entities=1, wrap=0, char_encoding="utf8")
            if utf8:
                data = unicode(data, 'utf-8')
            # Tidy emits a full document; keep only the <body> contents
            if data.count('<body'):
                data = data.split('<body', 1)[1]
                if data.count('>'):
                    data = data.split('>', 1)[1]
            if data.count('</body'):
                data = data.split('</body', 1)[0]
    data = data.strip().replace('\r\n', '\n')
    return data
+
class _FeedURLHandler(urllib2.HTTPDigestAuthHandler, urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler):
    """urllib2 handler that returns error responses instead of raising,
    records redirect status/URL, and upgrades basic auth to digest auth
    when the server demands it."""

    def http_error_default(self, req, fp, code, msg, headers):
        # The default implementation just raises HTTPError.
        # Forget that.  Hand the response back with its status attached so
        # the caller can inspect it.
        fp.status = code
        return fp

    def http_error_301(self, req, fp, code, msg, hdrs):
        # Follow the redirect, but record the status code and the final URL
        # on the result so callers can detect that a redirect happened.
        result = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp,
                                                            code, msg, hdrs)
        result.status = code
        result.newurl = result.geturl()
        return result
    # The default implementations in urllib2.HTTPRedirectHandler
    # are identical, so hardcoding a http_error_301 call above
    # won't affect anything
    http_error_300 = http_error_301
    http_error_302 = http_error_301
    http_error_303 = http_error_301
    http_error_307 = http_error_301

    def http_error_401(self, req, fp, code, msg, headers):
        # Check if
        # - server requires digest auth, AND
        # - we tried (unsuccessfully) with basic auth, AND
        # - the request we sent carried an Authorization header to mine.
        # If all conditions hold, parse authentication information
        # out of the Authorization header we sent the first time
        # (for the username and password) and the WWW-Authenticate
        # header the server sent back (for the realm) and retry
        # the request with the appropriate digest auth headers instead.
        # This evil genius hack has been brought to you by Aaron Swartz.
        host = urlparse.urlparse(req.get_full_url())[1]
        if base64 is None or 'Authorization' not in req.headers \
           or 'WWW-Authenticate' not in headers:
            return self.http_error_default(req, fp, code, msg, headers)
        auth = _base64decode(req.headers['Authorization'].split(' ')[1])
        # BUGFIX: split on the first colon only -- passwords may themselves
        # contain ':' and a plain split(':') would raise ValueError
        user, passw = auth.split(':', 1)
        realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0]
        self.add_password(realm, host, user, passw)
        retry = self.http_error_auth_reqed('www-authenticate', host, req, headers)
        self.reset_retry_count()
        return retry
+
def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers):
    """URL, filename, or string --> stream

    This function lets you define parsers that take any input source
    (URL, pathname to local or network file, or actual data as a string)
    and deal with it in a uniform manner. Returned object is guaranteed
    to have all the basic stdio read methods (read, readline, readlines).
    Just .close() the object when you're done with it.

    If the etag argument is supplied, it will be used as the value of an
    If-None-Match request header.

    If the modified argument is supplied, it can be a tuple of 9 integers
    (as returned by gmtime() in the standard Python time module) or a date
    string in any format supported by feedparser. Regardless, it MUST
    be in GMT (Greenwich Mean Time). It will be reformatted into an
    RFC 1123-compliant date and used as the value of an If-Modified-Since
    request header.

    If the agent argument is supplied, it will be used as the value of a
    User-Agent request header.

    If the referrer argument is supplied, it will be used as the value of a
    Referer[sic] request header.

    If handlers is supplied, it is a list of handlers used to build a
    urllib2 opener.

    if request_headers is supplied it is a dictionary of HTTP request headers
    that will override the values generated by FeedParser.
    """

    # already a file-like object? use it as-is
    if hasattr(url_file_stream_or_string, 'read'):
        return url_file_stream_or_string

    if url_file_stream_or_string == '-':
        return sys.stdin

    if isinstance(url_file_stream_or_string, basestring) \
       and urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp', 'file', 'feed'):
        # Deal with the feed URI scheme
        if url_file_stream_or_string.startswith('feed:http'):
            url_file_stream_or_string = url_file_stream_or_string[5:]
        elif url_file_stream_or_string.startswith('feed:'):
            url_file_stream_or_string = 'http:' + url_file_stream_or_string[5:]
        if not agent:
            agent = USER_AGENT
        # test for inline user:password for basic auth
        auth = None
        if base64:
            urltype, rest = urllib.splittype(url_file_stream_or_string)
            realhost, rest = urllib.splithost(rest)
            if realhost:
                user_passwd, realhost = urllib.splituser(realhost)
                if user_passwd:
                    # strip the credentials out of the URL and send them as a
                    # Basic Authorization header instead
                    url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest)
                    auth = base64.standard_b64encode(user_passwd).strip()

        # iri support
        if isinstance(url_file_stream_or_string, unicode):
            url_file_stream_or_string = _convert_to_idn(url_file_stream_or_string)

        # try to open with urllib2 (to use optional headers)
        request = _build_urllib2_request(url_file_stream_or_string, agent, etag, modified, referrer, auth, request_headers)
        # apply() has been deprecated since Python 2.3; use argument unpacking
        opener = urllib2.build_opener(*(handlers + [_FeedURLHandler()]))
        opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
        try:
            return opener.open(request)
        finally:
            opener.close() # JohnD

    # try to open with native open function (if url_file_stream_or_string is a filename)
    try:
        return open(url_file_stream_or_string, 'rb')
    except IOError:
        pass

    # treat url_file_stream_or_string as string
    if isinstance(url_file_stream_or_string, unicode):
        return _StringIO(url_file_stream_or_string.encode('utf-8'))
    return _StringIO(url_file_stream_or_string)
+
+def _convert_to_idn(url):
+ """Convert a URL to IDN notation"""
+ # this function should only be called with a unicode string
+ # strategy: if the host cannot be encoded in ascii, then
+ # it'll be necessary to encode it in idn form
+ parts = list(urlparse.urlsplit(url))
+ try:
+ parts[1].encode('ascii')
+ except UnicodeEncodeError:
+ # the url needs to be converted to idn notation
+ host = parts[1].rsplit(':', 1)
+ newhost = []
+ port = u''
+ if len(host) == 2:
+ port = host.pop()
+ for h in host[0].split('.'):
+ newhost.append(h.encode('idna').decode('utf-8'))
+ parts[1] = '.'.join(newhost)
+ if port:
+ parts[1] += ':' + port
+ return urlparse.urlunsplit(parts)
+ else:
+ return url
+
def _build_urllib2_request(url, agent, etag, modified, referrer, auth, request_headers):
    """Build a urllib2.Request for *url* carrying conditional-GET,
    identification, compression, and authentication headers."""
    request = urllib2.Request(url)
    request.add_header('User-Agent', agent)
    if etag:
        request.add_header('If-None-Match', etag)
    # normalize *modified* to a time tuple
    if isinstance(modified, basestring):
        modified = _parse_date(modified)
    elif isinstance(modified, datetime.datetime):
        modified = modified.utctimetuple()
    if modified:
        # format into an RFC 1123-compliant timestamp. We can't use
        # time.strftime() since the %a and %b directives can be affected
        # by the current locale, but RFC 2616 states that dates must be
        # in English.
        weekday_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
        month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
        stamp = '%s, %02d %s %04d %02d:%02d:%02d GMT' % (
            weekday_names[modified[6]], modified[2],
            month_names[modified[1] - 1], modified[0],
            modified[3], modified[4], modified[5])
        request.add_header('If-Modified-Since', stamp)
    if referrer:
        request.add_header('Referer', referrer)
    # advertise whichever decompression schemes are actually importable
    if gzip and zlib:
        accept_encoding = 'gzip, deflate'
    elif gzip:
        accept_encoding = 'gzip'
    elif zlib:
        accept_encoding = 'deflate'
    else:
        accept_encoding = ''
    request.add_header('Accept-encoding', accept_encoding)
    if auth:
        request.add_header('Authorization', 'Basic %s' % auth)
    if ACCEPT_HEADER:
        request.add_header('Accept', ACCEPT_HEADER)
    # use this for whatever -- cookies, special headers, etc
    # [('Cookie','Something'),('x-special-header','Another Value')]
    for header_name, header_value in request_headers.items():
        request.add_header(header_name, header_value)
    request.add_header('A-IM', 'feed') # RFC 3229 support
    return request
+
# Registry of date-parsing functions; _parse_date() tries them in order.
_date_handlers = []
def registerDateHandler(func):
    '''Register a date handler function (takes string, returns 9-tuple date in GMT)'''
    # insert at the front so the most recently registered handler runs first
    _date_handlers.insert(0, func)
+
+# ISO-8601 date parsing routines written by Fazal Majid.
+# The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
+# parser is beyond the scope of feedparser and would be a worthwhile addition
+# to the Python library.
+# A single regular expression cannot parse ISO 8601 date formats into groups
+# as the standard is highly irregular (for instance is 030104 2003-01-04 or
+# 0301-04-01), so we use templates instead.
+# Please note the order in templates is significant because we need a
+# greedy match.
# Templates are expanded into the regular expressions below; YYYY/YY match
# years, MM months, DD days, OOO an ordinal day-of-year, CC a bare century.
_iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-0MM?-?DD', 'YYYY-MM', 'YYYY-?OOO',
                'YY-?MM-?DD', 'YY-?OOO', 'YYYY',
                '-YY-?MM', '-OOO', '-YY',
                '--MM-?DD', '--MM',
                '---DD',
                'CC', '']
_iso8601_re = [
    tmpl.replace(
    'YYYY', r'(?P<year>\d{4})').replace(
    'YY', r'(?P<year>\d\d)').replace(
    'MM', r'(?P<month>[01]\d)').replace(
    'DD', r'(?P<day>[0123]\d)').replace(
    'OOO', r'(?P<ordinal>[0123]\d\d)').replace(
    'CC', r'(?P<century>\d\d$)')
    + r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})'
    + r'(:(?P<second>\d{2}))?'
    + r'(\.(?P<fracsecond>\d+))?'
    + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'
    for tmpl in _iso8601_tmpl]
# in Python 2 the list-comprehension loop variables leak into module scope;
# delete them so they don't pollute the namespace
try:
    del tmpl
except NameError:
    pass
_iso8601_matches = [re.compile(regex).match for regex in _iso8601_re]
try:
    del regex
except NameError:
    pass
def _parse_date_iso8601(dateString):
    '''Parse a variety of ISO-8601-compatible formats like 20040105'''
    m = None
    # try each precompiled template matcher in order (greediest first)
    for _iso8601_match in _iso8601_matches:
        m = _iso8601_match(dateString)
        if m:
            break
    if not m:
        return
    if m.span() == (0, 0):
        # only the empty-string template matched; nothing was actually parsed
        return
    params = m.groupdict()
    ordinal = params.get('ordinal', 0)
    if ordinal:
        ordinal = int(ordinal)
    else:
        ordinal = 0
    year = params.get('year', '--')
    if not year or year == '--':
        # missing year defaults to the current (GMT) year
        year = time.gmtime()[0]
    elif len(year) == 2:
        # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993
        year = 100 * int(time.gmtime()[0] / 100) + int(year)
    else:
        year = int(year)
    month = params.get('month', '-')
    if not month or month == '-':
        # ordinals are NOT normalized by mktime, we simulate them
        # by setting month=1, day=ordinal
        if ordinal:
            month = 1
        else:
            month = time.gmtime()[1]
    month = int(month)
    day = params.get('day', 0)
    if not day:
        # see above
        if ordinal:
            day = ordinal
        elif params.get('century', 0) or \
            params.get('year', 0) or params.get('month', 0):
            day = 1
        else:
            day = time.gmtime()[2]
    else:
        day = int(day)
    # special case of the century - is the first year of the 21st century
    # 2000 or 2001 ? The debate goes on...
    if 'century' in params.keys():
        year = (int(params['century']) - 1) * 100 + 1
    # in ISO 8601 most fields are optional
    for field in ['hour', 'minute', 'second', 'tzhour', 'tzmin']:
        if not params.get(field, None):
            params[field] = 0
    hour = int(params.get('hour', 0))
    minute = int(params.get('minute', 0))
    second = int(float(params.get('second', 0)))
    # weekday is normalized by mktime(), we can ignore it
    weekday = 0
    daylight_savings_flag = -1
    tm = [year, month, day, hour, minute, second, weekday,
        ordinal, daylight_savings_flag]
    # ISO 8601 time zone adjustments: undo the offset to reach UTC
    tz = params.get('tz')
    if tz and tz != 'Z':
        if tz[0] == '-':
            tm[3] += int(params.get('tzhour', 0))
            tm[4] += int(params.get('tzmin', 0))
        elif tz[0] == '+':
            tm[3] -= int(params.get('tzhour', 0))
            tm[4] -= int(params.get('tzmin', 0))
        else:
            return None
    # Python's time.mktime() is a wrapper around the ANSI C mktime(3c)
    # which is guaranteed to normalize d/m/y/h/m/s.
    # Many implementations have bugs, but we'll pretend they don't.
    return time.localtime(time.mktime(tuple(tm)))
registerDateHandler(_parse_date_iso8601)
+
# 8-bit date handling routines written by ytrewq1.
_korean_year = u'\ub144' # b3e2 in euc-kr
_korean_month = u'\uc6d4' # bff9 in euc-kr
_korean_day = u'\uc77c' # c0cf in euc-kr
_korean_am = u'\uc624\uc804' # bfc0 c0fc in euc-kr
_korean_pm = u'\uc624\ud6c4' # bfc0 c8c4 in euc-kr

# matches 'YYYY<year> MM<month> DD<day> HH:MM:SS' (OnBlog format)
_korean_onblog_date_re = \
    re.compile('(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % \
    (_korean_year, _korean_month, _korean_day))
# matches 'YYYY-MM-DD <am|pm> H:M:S' (Nate format; time fields may be 0-2 digits)
_korean_nate_date_re = \
    re.compile(u'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \
    (_korean_am, _korean_pm))
def _parse_date_onblog(dateString):
    '''Parse a string according to the OnBlog 8-bit date format'''
    match = _korean_onblog_date_re.match(dateString)
    if not match:
        return
    year, month, day, hour, minute, second = match.groups()
    # rebuild as W3DTF with the fixed Korean timezone (+09:00) and delegate
    # to the W3DTF parser
    w3dtfdate = '%s-%s-%sT%s:%s:%s+09:00' % (year, month, day,
                                             hour, minute, second)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_onblog)
+
def _parse_date_nate(dateString):
    '''Parse a string according to the Nate 8-bit date format'''
    match = _korean_nate_date_re.match(dateString)
    if not match:
        return
    # convert the 12-hour clock to 24-hour, zero-padded
    hour24 = int(match.group(5))
    if match.group(4) == _korean_pm:
        hour24 += 12
    # rebuild as W3DTF with the fixed Korean timezone (+09:00) and delegate
    # to the W3DTF parser
    w3dtfdate = '%s-%s-%sT%02d:%s:%s+09:00' % (
        match.group(1), match.group(2), match.group(3),
        hour24, match.group(6), match.group(7))
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_nate)
+
_mssql_date_re = \
    re.compile('(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?')
def _parse_date_mssql(dateString):
    '''Parse a string according to the MS SQL date format'''
    m = _mssql_date_re.match(dateString)
    if not m:
        return
    # NOTE(review): the +09:00 offset is hard-coded, matching the Korean
    # handlers above -- presumably these feeds originate from KST hosts;
    # verify before relying on it elsewhere
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
                {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
                 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
                 'zonediff': '+09:00'}
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_mssql)
+
# Unicode strings for Greek date strings
_greek_months = \
  { \
   u'\u0399\u03b1\u03bd': u'Jan', # c9e1ed in iso-8859-7
   u'\u03a6\u03b5\u03b2': u'Feb', # d6e5e2 in iso-8859-7
   u'\u039c\u03ac\u03ce': u'Mar', # ccdcfe in iso-8859-7
   u'\u039c\u03b1\u03ce': u'Mar', # cce1fe in iso-8859-7
   u'\u0391\u03c0\u03c1': u'Apr', # c1f0f1 in iso-8859-7
   u'\u039c\u03ac\u03b9': u'May', # ccdce9 in iso-8859-7
   u'\u039c\u03b1\u03ca': u'May', # cce1fa in iso-8859-7
   u'\u039c\u03b1\u03b9': u'May', # cce1e9 in iso-8859-7
   u'\u0399\u03bf\u03cd\u03bd': u'Jun', # c9effded in iso-8859-7
   u'\u0399\u03bf\u03bd': u'Jun', # c9efed in iso-8859-7
   u'\u0399\u03bf\u03cd\u03bb': u'Jul', # c9effdeb in iso-8859-7
   u'\u0399\u03bf\u03bb': u'Jul', # c9f9eb in iso-8859-7
   u'\u0391\u03cd\u03b3': u'Aug', # c1fde3 in iso-8859-7
   u'\u0391\u03c5\u03b3': u'Aug', # c1f5e3 in iso-8859-7
   u'\u03a3\u03b5\u03c0': u'Sep', # d3e5f0 in iso-8859-7
   u'\u039f\u03ba\u03c4': u'Oct', # cfeaf4 in iso-8859-7
   u'\u039d\u03bf\u03ad': u'Nov', # cdefdd in iso-8859-7
   u'\u039d\u03bf\u03b5': u'Nov', # cdefe5 in iso-8859-7
   u'\u0394\u03b5\u03ba': u'Dec', # c4e5ea in iso-8859-7
  }

_greek_wdays = \
  { \
   u'\u039a\u03c5\u03c1': u'Sun', # caf5f1 in iso-8859-7
   u'\u0394\u03b5\u03c5': u'Mon', # c4e5f5 in iso-8859-7
   u'\u03a4\u03c1\u03b9': u'Tue', # d4f1e9 in iso-8859-7
   u'\u03a4\u03b5\u03c4': u'Wed', # d4e5f4 in iso-8859-7
   u'\u03a0\u03b5\u03bc': u'Thu', # d0e5ec in iso-8859-7
   u'\u03a0\u03b1\u03c1': u'Fri', # d0e1f1 in iso-8859-7
   u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7
  }

_greek_date_format_re = \
    re.compile(u'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)')

def _parse_date_greek(dateString):
    '''Parse a string according to a Greek 8-bit date format.'''
    m = _greek_date_format_re.match(dateString)
    if not m:
        return
    # unknown weekday/month names raise KeyError here; _parse_date() catches
    # it and treats this handler as having failed
    wday = _greek_wdays[m.group(1)]
    month = _greek_months[m.group(3)]
    # rebuild as an RFC 822 date with English names and delegate
    rfc822date = '%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s' % \
                 {'wday': wday, 'day': m.group(2), 'month': month, 'year': m.group(4),\
                  'hour': m.group(5), 'minute': m.group(6), 'second': m.group(7),\
                  'zonediff': m.group(8)}
    return _parse_date_rfc822(rfc822date)
registerDateHandler(_parse_date_greek)
+
# Unicode strings for Hungarian date strings
# NOTE(review): two keys look misspelled ('febru\u00e1ri' = 'februári',
# 'm\u00e1ujus' = 'máujus'); left as-is since changing data changes behavior
_hungarian_months = \
  { \
    u'janu\u00e1r':   u'01',  # e1 in iso-8859-2
    u'febru\u00e1ri': u'02',  # e1 in iso-8859-2
    u'm\u00e1rcius':  u'03',  # e1 in iso-8859-2
    u'\u00e1prilis':  u'04',  # e1 in iso-8859-2
    u'm\u00e1ujus':   u'05',  # e1 in iso-8859-2
    u'j\u00fanius':   u'06',  # fa in iso-8859-2
    u'j\u00falius':   u'07',  # fa in iso-8859-2
    u'augusztus':     u'08',
    u'szeptember':    u'09',
    u'okt\u00f3ber':  u'10',  # f3 in iso-8859-2
    u'november':      u'11',
    u'december':      u'12',
  }

# matches 'YYYY-<monthname>-DDTHH:MM(+|-)HH:MM'
_hungarian_date_format_re = \
  re.compile(u'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))')

def _parse_date_hungarian(dateString):
    '''Parse a string according to a Hungarian 8-bit date format.'''
    m = _hungarian_date_format_re.match(dateString)
    if not m or m.group(2) not in _hungarian_months:
        return None
    month = _hungarian_months[m.group(2)]
    # zero-pad single-digit day and hour before building the W3DTF string
    day = m.group(3)
    if len(day) == 1:
        day = '0' + day
    hour = m.group(4)
    if len(hour) == 1:
        hour = '0' + hour
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s' % \
                {'year': m.group(1), 'month': month, 'day': day,\
                 'hour': hour, 'minute': m.group(5),\
                 'zonediff': m.group(6)}
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_hungarian)
+
# W3DTF-style date parsing adapted from PyXML xml.utils.iso8601, written by
# Drake and licensed under the Python license. Removed all range checking
# for month, day, hour, minute, and second, since mktime will normalize
# these later
def _parse_date_w3dtf(dateString):
    def __extract_date(m):
        # Returns (year, month, day); (0, 0, 0) marks an unusable date.
        year = int(m.group('year'))
        if year < 100:
            # two-digit years are assumed to be in the current century
            year = 100 * int(time.gmtime()[0] / 100) + int(year)
        if year < 1000:
            return 0, 0, 0
        julian = m.group('julian')
        if julian:
            # convert a day-of-year ordinal to (month, day) by successive
            # approximation: guess, ask mktime/gmtime which ordinal the
            # guess lands on, and correct until they agree
            julian = int(julian)
            month = julian / 30 + 1
            day = julian % 30 + 1
            jday = None
            while jday != julian:
                t = time.mktime((year, month, day, 0, 0, 0, 0, 0, 0))
                jday = time.gmtime(t)[-2]
                diff = abs(jday - julian)
                if jday > julian:
                    if diff < day:
                        day = day - diff
                    else:
                        month = month - 1
                        day = 31
                elif jday < julian:
                    if day + diff < 28:
                        day = day + diff
                    else:
                        month = month + 1
            return year, month, day
        month = m.group('month')
        day = 1
        if month is None:
            month = 1
        else:
            month = int(month)
            day = m.group('day')
            if day:
                day = int(day)
            else:
                day = 1
        return year, month, day

    def __extract_time(m):
        # Returns (hours, minutes, seconds), defaulting to midnight.
        if not m:
            return 0, 0, 0
        hours = m.group('hours')
        if not hours:
            return 0, 0, 0
        hours = int(hours)
        minutes = int(m.group('minutes'))
        seconds = m.group('seconds')
        if seconds:
            seconds = int(seconds)
        else:
            seconds = 0
        return hours, minutes, seconds

    def __extract_tzd(m):
        '''Return the Time Zone Designator as an offset in seconds from UTC.'''
        if not m:
            return 0
        tzd = m.group('tzd')
        if not tzd:
            return 0
        if tzd == 'Z':
            return 0
        hours = int(m.group('tzdhours'))
        minutes = m.group('tzdminutes')
        if minutes:
            minutes = int(minutes)
        else:
            minutes = 0
        offset = (hours*60 + minutes) * 60
        # sign is inverted so the offset can simply be ADDED to reach UTC
        if tzd[0] == '+':
            return -offset
        return offset

    __date_re = ('(?P<year>\d\d\d\d)'
                 '(?:(?P<dsep>-|)'
                 '(?:(?P<month>\d\d)(?:(?P=dsep)(?P<day>\d\d))?'
                 '|(?P<julian>\d\d\d)))?')
    __tzd_re = '(?P<tzd>[-+](?P<tzdhours>\d\d)(?::?(?P<tzdminutes>\d\d))|Z)'
    __tzd_rx = re.compile(__tzd_re)
    __time_re = ('(?P<hours>\d\d)(?P<tsep>:|)(?P<minutes>\d\d)'
                 '(?:(?P=tsep)(?P<seconds>\d\d)(?:[.,]\d+)?)?'
                 + __tzd_re)
    __datetime_re = '%s(?:T%s)?' % (__date_re, __time_re)
    __datetime_rx = re.compile(__datetime_re)
    m = __datetime_rx.match(dateString)
    # the whole string must be consumed, not just a prefix
    if (m is None) or (m.group() != dateString):
        return
    gmt = __extract_date(m) + __extract_time(m) + (0, 0, 0)
    if gmt[0] == 0:
        return
    # mktime interprets the tuple as local time; compensate with
    # time.timezone and the parsed zone offset to land back in UTC
    return time.gmtime(time.mktime(gmt) + __extract_tzd(m) - time.timezone)
registerDateHandler(_parse_date_w3dtf)
+
def _parse_date_rfc822(dateString):
    '''Parse an RFC822, RFC1123, RFC2822, or asctime-style date'''
    data = dateString.split()
    if not data:
        return None
    # drop a leading day-of-week token ('Fri,' / 'Fri.' / 'friday')
    if data[0][-1] in (',', '.') or data[0].lower() in rfc822._daynames:
        del data[0]
    if len(data) == 4:
        # the last token may fuse time and zone ('12:00:00+0000'); split it
        s = data[3]
        i = s.find('+')
        if i > 0:
            data[3:] = [s[:i], s[i+1:]]
        else:
            data.append('')
        dateString = " ".join(data)
    # Account for the Etc/GMT timezone by stripping 'Etc/'
    elif len(data) == 5 and data[4].lower().startswith('etc/'):
        data[4] = data[4][4:]
        dateString = " ".join(data)
    if len(data) < 5:
        # date only; assume midnight GMT
        dateString += ' 00:00:00 GMT'
    tm = rfc822.parsedate_tz(dateString)
    if tm:
        # Jython doesn't adjust for 2-digit years like CPython does,
        # so account for it by shifting the year so that it's in the
        # range 1970-2069 (1970 being the year of the Unix epoch).
        if tm[0] < 100:
            tm = (tm[0] + (1900, 2000)[tm[0] < 70],) + tm[1:]
        return time.gmtime(rfc822.mktime_tz(tm))
# rfc822.py defines several time zones, but we define some extra ones.
# 'ET' is equivalent to 'EST', etc.
_additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -800}
rfc822._timezones.update(_additional_timezones)
registerDateHandler(_parse_date_rfc822)
+
+def _parse_date_perforce(aDateString):
+ """parse a date in yyyy/mm/dd hh:mm:ss TTT format"""
+ # Fri, 2006/09/15 08:19:53 EDT
+ _my_date_pattern = re.compile( \
+ r'(\w{,3}), (\d{,4})/(\d{,2})/(\d{2}) (\d{,2}):(\d{2}):(\d{2}) (\w{,3})')
+
+ m = _my_date_pattern.search(aDateString)
+ if m is None:
+ return None
+ dow, year, month, day, hour, minute, second, tz = m.groups()
+ months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
+ dateString = "%s, %s %s %s %s:%s:%s %s" % (dow, day, months[int(month) - 1], year, hour, minute, second, tz)
+ tm = rfc822.parsedate_tz(dateString)
+ if tm:
+ return time.gmtime(rfc822.mktime_tz(tm))
registerDateHandler(_parse_date_perforce)

def _parse_date(dateString):
    '''Parses a variety of date formats into a 9-tuple in GMT'''
    if not dateString:
        return None
    # try every registered handler; first sensible result wins
    for handler in _date_handlers:
        try:
            date9tuple = handler(dateString)
        except (KeyError, OverflowError, ValueError):
            # handlers may choke on foreign formats; just try the next one
            continue
        if not date9tuple:
            continue
        if len(date9tuple) != 9:
            # malformed handler result; skip rather than propagate garbage
            continue
        return date9tuple
    return None
+
def _getCharacterEncoding(http_headers, xml_data):
    '''Get the character encoding of the XML document

    http_headers is a dictionary
    xml_data is a raw string (not Unicode)

    This is so much trickier than it sounds, it's not even funny.
    According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
    is application/xml, application/*+xml,
    application/xml-external-parsed-entity, or application/xml-dtd,
    the encoding given in the charset parameter of the HTTP Content-Type
    takes precedence over the encoding given in the XML prefix within the
    document, and defaults to 'utf-8' if neither are specified. But, if
    the HTTP Content-Type is text/xml, text/*+xml, or
    text/xml-external-parsed-entity, the encoding given in the XML prefix
    within the document is ALWAYS IGNORED and only the encoding given in
    the charset parameter of the HTTP Content-Type header should be
    respected, and it defaults to 'us-ascii' if not specified.

    Furthermore, discussion on the atom-syntax mailing list with the
    author of RFC 3023 leads me to the conclusion that any document
    served with a Content-Type of text/* and no charset parameter
    must be treated as us-ascii. (We now do this.) And also that it
    must always be flagged as non-well-formed. (We now do this too.)

    If Content-Type is unspecified (input was local file or non-HTTP source)
    or unrecognized (server just got it totally wrong), then go by the
    encoding given in the XML prefix of the document and default to
    'iso-8859-1' as per the HTTP specification (RFC 2616).

    Then, assuming we didn't find a character encoding in the HTTP headers
    (and the HTTP Content-type allowed us to look in the body), we need
    to sniff the first few bytes of the XML data and try to determine
    whether the encoding is ASCII-compatible. Section F of the XML
    specification shows the way here:
    http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info

    If the sniffed encoding is not ASCII-compatible, we need to make it
    ASCII compatible so that we can sniff further into the XML declaration
    to find the encoding attribute, which will tell us the true encoding.

    Of course, none of this guarantees that we will be able to parse the
    feed in the declared character encoding (assuming it was declared
    correctly, which many are not). CJKCodecs and iconv_codec help a lot;
    you should definitely install them if you can.
    http://cjkpython.i18n.org/
    '''

    def _parseHTTPContentType(content_type):
        '''takes HTTP Content-Type header and returns (content type, charset)

        If no charset is specified, returns (content type, '')
        If no content type is specified, returns ('', '')
        Both return parameters are guaranteed to be lowercase strings
        '''
        content_type = content_type or ''
        content_type, params = cgi.parse_header(content_type)
        charset = params.get('charset', '').replace("'", "")
        if not isinstance(charset, unicode):
            charset = charset.decode('utf-8', 'ignore')
        return content_type, charset

    sniffed_xml_encoding = u''
    xml_encoding = u''
    true_encoding = u''
    # the header dict may carry either capitalization of Content-Type
    http_content_type, http_encoding = _parseHTTPContentType(http_headers.get('content-type', http_headers.get('Content-type')))
    # Must sniff for non-ASCII-compatible character encodings before
    # searching for XML declaration. This heuristic is defined in
    # section F of the XML specification:
    # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
    try:
        if xml_data[:4] == _l2bytes([0x4c, 0x6f, 0xa7, 0x94]):
            # EBCDIC
            xml_data = _ebcdic_to_ascii(xml_data)
        elif xml_data[:4] == _l2bytes([0x00, 0x3c, 0x00, 0x3f]):
            # UTF-16BE
            sniffed_xml_encoding = u'utf-16be'
            xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
        elif (len(xml_data) >= 4) and (xml_data[:2] == _l2bytes([0xfe, 0xff])) and (xml_data[2:4] != _l2bytes([0x00, 0x00])):
            # UTF-16BE with BOM
            sniffed_xml_encoding = u'utf-16be'
            xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
        elif xml_data[:4] == _l2bytes([0x3c, 0x00, 0x3f, 0x00]):
            # UTF-16LE
            sniffed_xml_encoding = u'utf-16le'
            xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
        elif (len(xml_data) >= 4) and (xml_data[:2] == _l2bytes([0xff, 0xfe])) and (xml_data[2:4] != _l2bytes([0x00, 0x00])):
            # UTF-16LE with BOM
            sniffed_xml_encoding = u'utf-16le'
            xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
        elif xml_data[:4] == _l2bytes([0x00, 0x00, 0x00, 0x3c]):
            # UTF-32BE
            sniffed_xml_encoding = u'utf-32be'
            xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
        elif xml_data[:4] == _l2bytes([0x3c, 0x00, 0x00, 0x00]):
            # UTF-32LE
            sniffed_xml_encoding = u'utf-32le'
            xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
        elif xml_data[:4] == _l2bytes([0x00, 0x00, 0xfe, 0xff]):
            # UTF-32BE with BOM
            sniffed_xml_encoding = u'utf-32be'
            xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
        elif xml_data[:4] == _l2bytes([0xff, 0xfe, 0x00, 0x00]):
            # UTF-32LE with BOM
            sniffed_xml_encoding = u'utf-32le'
            xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
        elif xml_data[:3] == _l2bytes([0xef, 0xbb, 0xbf]):
            # UTF-8 with BOM
            sniffed_xml_encoding = u'utf-8'
            xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
        else:
            # ASCII-compatible
            pass
        xml_encoding_match = re.compile(_s2bytes('^<\?.*encoding=[\'"](.*?)[\'"].*\?>')).match(xml_data)
    except UnicodeDecodeError:
        xml_encoding_match = None
    if xml_encoding_match:
        xml_encoding = xml_encoding_match.groups()[0].decode('utf-8').lower()
        # if the declaration names a generic UCS/UTF family that the sniff
        # has already disambiguated, prefer the sniffed encoding
        if sniffed_xml_encoding and (xml_encoding in (u'iso-10646-ucs-2', u'ucs-2', u'csunicode', u'iso-10646-ucs-4', u'ucs-4', u'csucs4', u'utf-16', u'utf-32', u'utf_16', u'utf_32', u'utf16', u'u16')):
            xml_encoding = sniffed_xml_encoding
    acceptable_content_type = 0
    # RFC 3023 content-type dispatch; see the docstring for the full rules
    application_content_types = (u'application/xml', u'application/xml-dtd', u'application/xml-external-parsed-entity')
    text_content_types = (u'text/xml', u'text/xml-external-parsed-entity')
    if (http_content_type in application_content_types) or \
       (http_content_type.startswith(u'application/') and http_content_type.endswith(u'+xml')):
        acceptable_content_type = 1
        true_encoding = http_encoding or xml_encoding or u'utf-8'
    elif (http_content_type in text_content_types) or \
         (http_content_type.startswith(u'text/')) and http_content_type.endswith(u'+xml'):
        acceptable_content_type = 1
        true_encoding = http_encoding or u'us-ascii'
    elif http_content_type.startswith(u'text/'):
        true_encoding = http_encoding or u'us-ascii'
    elif http_headers and (not (http_headers.has_key('content-type') or http_headers.has_key('Content-type'))):
        true_encoding = xml_encoding or u'iso-8859-1'
    else:
        true_encoding = xml_encoding or u'utf-8'
    # some feeds claim to be gb2312 but are actually gb18030.
    # apparently MSIE and Firefox both do the following switch:
    if true_encoding.lower() == u'gb2312':
        true_encoding = u'gb18030'
    return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
+
def _toUTF8(data, encoding):
    '''Changes an XML data stream on the fly to specify a new encoding

    data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
    encoding is a string recognized by encodings.aliases
    '''
    # strip Byte Order Mark (if present)
    # a BOM, when found, overrides the caller-supplied encoding
    if (len(data) >= 4) and (data[:2] == _l2bytes([0xfe, 0xff])) and (data[2:4] != _l2bytes([0x00, 0x00])):
        encoding = 'utf-16be'
        data = data[2:]
    elif (len(data) >= 4) and (data[:2] == _l2bytes([0xff, 0xfe])) and (data[2:4] != _l2bytes([0x00, 0x00])):
        encoding = 'utf-16le'
        data = data[2:]
    elif data[:3] == _l2bytes([0xef, 0xbb, 0xbf]):
        encoding = 'utf-8'
        data = data[3:]
    elif data[:4] == _l2bytes([0x00, 0x00, 0xfe, 0xff]):
        encoding = 'utf-32be'
        data = data[4:]
    elif data[:4] == _l2bytes([0xff, 0xfe, 0x00, 0x00]):
        encoding = 'utf-32le'
        data = data[4:]
    # decode from the detected encoding, then re-emit as UTF-8 with a
    # matching XML declaration
    newdata = unicode(data, encoding)
    declmatch = re.compile('^<\?xml[^>]*?>')
    newdecl = '''<?xml version='1.0' encoding='utf-8'?>'''
    if declmatch.search(newdata):
        # replace the existing XML declaration so it claims utf-8
        newdata = declmatch.sub(newdecl, newdata)
    else:
        newdata = newdecl + u'\n' + newdata
    return newdata.encode('utf-8')
+
def _stripDoctype(data):
    '''Strips DOCTYPE from XML document, returns (rss_version, stripped_data)

    rss_version may be 'rss091n' or None
    stripped_data is the same XML document, minus the DOCTYPE

    A third element is also returned: a dict mapping the names of
    'safe' inline entity definitions to their replacement text (these
    are re-declared in the stripped output). data is a byte string
    (see _s2bytes).
    '''
    # Everything before the first element ('<' followed by a word
    # character) is the prolog, where DOCTYPE/ENTITY declarations live.
    start = re.search(_s2bytes('<\w'), data)
    start = start and start.start() or -1
    head,data = data[:start+1], data[start+1:]

    entity_pattern = re.compile(_s2bytes(r'^\s*<!ENTITY([^>]*?)>'), re.MULTILINE)
    entity_results=entity_pattern.findall(head)
    head = entity_pattern.sub(_s2bytes(''), head)
    doctype_pattern = re.compile(_s2bytes(r'^\s*<!DOCTYPE([^>]*?)>'), re.MULTILINE)
    doctype_results = doctype_pattern.findall(head)
    doctype = doctype_results and doctype_results[0] or _s2bytes('')
    # Netscape's RSS 0.91 DOCTYPE identifies that version explicitly.
    if doctype.lower().count(_s2bytes('netscape')):
        version = u'rss091n'
    else:
        version = None

    # only allow in 'safe' inline entity definitions
    # (name/value pairs whose values cannot expand into markup or
    # further entity references)
    replacement=_s2bytes('')
    if len(doctype_results)==1 and entity_results:
        safe_pattern=re.compile(_s2bytes('\s+(\w+)\s+"(&#\w+;|[^&"]*)"'))
        safe_entities=filter(lambda e: safe_pattern.match(e),entity_results)
        if safe_entities:
            replacement=_s2bytes('<!DOCTYPE feed [\n <!ENTITY') + _s2bytes('>\n <!ENTITY ').join(safe_entities) + _s2bytes('>\n]>')
    data = doctype_pattern.sub(replacement, head) + data

    # When replacement is empty the 'and' below short-circuits, so
    # safe_pattern (possibly undefined here) is never evaluated.
    return version, data, dict(replacement and [(k.decode('utf-8'), v.decode('utf-8')) for k, v in safe_pattern.findall(replacement)])
+
def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, response_headers=None):
    '''Parse a feed from a URL, file, stream, or string.

    request_headers, if given, is a dict from http header name to value to add
    to the request; this overrides internally generated values.

    etag/modified support conditional GETs; agent, referrer and
    handlers are passed through to _open_resource; response_headers,
    if given, seed/override the response headers recorded in the
    result. Returns a FeedParserDict containing (at least) 'feed',
    'entries' and 'bozo'.
    '''

    if handlers is None:
        handlers = []
    if request_headers is None:
        request_headers = {}
    if response_headers is None:
        response_headers = {}

    result = FeedParserDict()
    result['feed'] = FeedParserDict()
    result['entries'] = []
    # 'bozo' is set to 1 whenever fetching/decoding/parsing was not
    # clean; the reason is recorded in 'bozo_exception'.
    result['bozo'] = 0
    if not isinstance(handlers, list):
        handlers = [handlers]
    try:
        f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers)
        data = f.read()
    except Exception, e:
        result['bozo'] = 1
        result['bozo_exception'] = e
        data = None
        f = None

    if hasattr(f, 'headers'):
        result['headers'] = dict(f.headers)
    # overwrite existing headers using response_headers
    if 'headers' in result:
        result['headers'].update(response_headers)
    elif response_headers:
        result['headers'] = copy.deepcopy(response_headers)

    # if feed is gzip-compressed, decompress it
    if f and data and 'headers' in result:
        if gzip and 'gzip' in (result['headers'].get('content-encoding'), result['headers'].get('Content-Encoding')):
            try:
                data = gzip.GzipFile(fileobj=_StringIO(data)).read()
            except (IOError, struct.error), e:
                # IOError can occur if the gzip header is bad
                # struct.error can occur if the data is damaged
                # Some feeds claim to be gzipped but they're not, so
                # we get garbage. Ideally, we should re-request the
                # feed without the 'Accept-encoding: gzip' header,
                # but we don't.
                result['bozo'] = 1
                result['bozo_exception'] = e
                data = None
        elif zlib and 'deflate' in (result['headers'].get('content-encoding'), result['headers'].get('Content-Encoding')):
            try:
                data = zlib.decompress(data)
            except zlib.error, e:
                result['bozo'] = 1
                result['bozo_exception'] = e
                data = None

    # save HTTP headers
    if 'headers' in result:
        if 'etag' in result['headers'] or 'ETag' in result['headers']:
            etag = result['headers'].get('etag', result['headers'].get('ETag', u''))
            if not isinstance(etag, unicode):
                etag = etag.decode('utf-8', 'ignore')
            if etag:
                result['etag'] = etag
        if 'last-modified' in result['headers'] or 'Last-Modified' in result['headers']:
            modified = result['headers'].get('last-modified', result['headers'].get('Last-Modified'))
            if modified:
                result['modified'] = _parse_date(modified)
    if hasattr(f, 'url'):
        if not isinstance(f.url, unicode):
            result['href'] = f.url.decode('utf-8', 'ignore')
        else:
            result['href'] = f.url
        # Assume success; overwritten below if the handle reports a
        # real status code.
        result['status'] = 200
    if hasattr(f, 'status'):
        result['status'] = f.status
    if hasattr(f, 'close'):
        f.close()

    if data is None:
        return result

    # there are four encodings to keep track of:
    # - http_encoding is the encoding declared in the Content-Type HTTP header
    # - xml_encoding is the encoding declared in the <?xml declaration
    # - sniffed_encoding is the encoding sniffed from the first 4 bytes of the XML data
    # - result['encoding'] is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
    http_headers = result.get('headers', {})
    result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type = \
        _getCharacterEncoding(http_headers, data)
    if http_headers and (not acceptable_content_type):
        if http_headers.has_key('content-type') or http_headers.has_key('Content-type'):
            bozo_message = '%s is not an XML media type' % http_headers.get('content-type', http_headers.get('Content-type'))
        else:
            bozo_message = 'no Content-type specified'
        result['bozo'] = 1
        result['bozo_exception'] = NonXMLContentType(bozo_message)

    if data is not None:
        # Strip the DOCTYPE (keeping any safe inline entities) before
        # handing the document to a parser.
        result['version'], data, entities = _stripDoctype(data)

    # ensure that baseuri is an absolute uri using an acceptable URI scheme
    contentloc = http_headers.get('content-location', http_headers.get('Content-Location', u''))
    href = result.get('href', u'')
    baseuri = _makeSafeAbsoluteURI(href, contentloc) or _makeSafeAbsoluteURI(contentloc) or href

    baselang = http_headers.get('content-language', http_headers.get('Content-Language', None))
    if not isinstance(baselang, unicode) and baselang is not None:
        baselang = baselang.decode('utf-8', 'ignore')

    # if server sent 304, we're done
    if result.get('status', 0) == 304:
        result['version'] = u''
        result['debug_message'] = 'The feed has not changed since you last checked, ' + \
            'so the server sent no data. This is a feature, not a bug!'
        return result

    # if there was a problem downloading, we're done
    if data is None:
        return result

    # determine character encoding
    use_strict_parser = 0
    known_encoding = 0
    tried_encodings = []
    # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM
    for proposed_encoding in (result['encoding'], xml_encoding, sniffed_xml_encoding):
        if not proposed_encoding:
            continue
        if proposed_encoding in tried_encodings:
            continue
        tried_encodings.append(proposed_encoding)
        try:
            # _toUTF8 both validates the proposed encoding and
            # normalizes data to utf-8 bytes on success.
            data = _toUTF8(data, proposed_encoding)
        except (UnicodeDecodeError, LookupError):
            pass
        else:
            known_encoding = use_strict_parser = 1
            break
    # if no luck and we have auto-detection library, try that
    if (not known_encoding) and chardet:
        proposed_encoding = chardet.detect(data)['encoding']
        if proposed_encoding and (proposed_encoding not in tried_encodings):
            tried_encodings.append(proposed_encoding)
            try:
                data = _toUTF8(data, proposed_encoding)
            except (UnicodeDecodeError, LookupError):
                pass
            else:
                known_encoding = use_strict_parser = 1
    # if still no luck and we haven't tried utf-8 yet, try that
    if (not known_encoding) and (u'utf-8' not in tried_encodings):
        proposed_encoding = u'utf-8'
        tried_encodings.append(proposed_encoding)
        try:
            data = _toUTF8(data, proposed_encoding)
        except UnicodeDecodeError:
            pass
        else:
            known_encoding = use_strict_parser = 1
    # if still no luck and we haven't tried windows-1252 yet, try that
    if (not known_encoding) and (u'windows-1252' not in tried_encodings):
        proposed_encoding = u'windows-1252'
        tried_encodings.append(proposed_encoding)
        try:
            data = _toUTF8(data, proposed_encoding)
        except UnicodeDecodeError:
            pass
        else:
            known_encoding = use_strict_parser = 1
    # if still no luck and we haven't tried iso-8859-2 yet, try that.
    if (not known_encoding) and (u'iso-8859-2' not in tried_encodings):
        proposed_encoding = u'iso-8859-2'
        tried_encodings.append(proposed_encoding)
        try:
            data = _toUTF8(data, proposed_encoding)
        except UnicodeDecodeError:
            pass
        else:
            known_encoding = use_strict_parser = 1
    # if still no luck, give up
    if not known_encoding:
        result['bozo'] = 1
        result['bozo_exception'] = CharacterEncodingUnknown( \
            'document encoding unknown, I tried ' + \
            '%s, %s, utf-8, windows-1252, and iso-8859-2 but nothing worked' % \
            (result['encoding'], xml_encoding))
        result['encoding'] = u''
    elif proposed_encoding != result['encoding']:
        # The document parsed, but only with an encoding other than the
        # declared one; record the override.
        result['bozo'] = 1
        result['bozo_exception'] = CharacterEncodingOverride( \
            'document declared as %s, but parsed as %s' % \
            (result['encoding'], proposed_encoding))
        result['encoding'] = proposed_encoding

    if not _XML_AVAILABLE:
        use_strict_parser = 0
    if use_strict_parser:
        # initialize the SAX parser
        feedparser = _StrictFeedParser(baseuri, baselang, 'utf-8')
        saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)
        saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
        try:
            # disable downloading external doctype references, if possible
            saxparser.setFeature(xml.sax.handler.feature_external_ges, 0)
        except xml.sax.SAXNotSupportedException:
            pass
        saxparser.setContentHandler(feedparser)
        saxparser.setErrorHandler(feedparser)
        source = xml.sax.xmlreader.InputSource()
        source.setByteStream(_StringIO(data))
        if hasattr(saxparser, '_ns_stack'):
            # work around bug in built-in SAX parser (doesn't recognize xml: namespace)
            # PyXML doesn't have this problem, and it doesn't have _ns_stack either
            saxparser._ns_stack.append({'http://www.w3.org/XML/1998/namespace':'xml'})
        try:
            saxparser.parse(source)
        except xml.sax.SAXParseException, e:
            # Not well-formed; fall back to the loose parser below.
            result['bozo'] = 1
            result['bozo_exception'] = feedparser.exc or e
            use_strict_parser = 0
    if not use_strict_parser and _SGML_AVAILABLE:
        feedparser = _LooseFeedParser(baseuri, baselang, 'utf-8', entities)
        feedparser.feed(data.decode('utf-8', 'replace'))
    result['feed'] = feedparser.feeddata
    result['entries'] = feedparser.entries
    result['version'] = result['version'] or feedparser.version
    result['namespaces'] = feedparser.namespacesInUse
    return result
--- /dev/null
+#!/usr/bin/env python2.5
+
+# Copyright (c) 2011 Neal H. Walfield <neal@walfield.org>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import urllib2
+import httplib
+import time
+import logging
+logger = logging.getLogger(__name__)
+
class ProgressSocket(object):
    """
    Monitor what is being sent and received.

    Wraps a socket (or the file object returned by socket.makefile)
    and funnels byte counts into the owning connection's stats dict
    and its opener's stats dict, invoking the connection's callback as
    data flows.
    """
    def __init__(self, socket, connection):
        # connection is expected to expose .stats, .opener.stats and
        # .callback (see HTTPProgressConnectionBuilder).
        self.socket = socket
        self.connection = connection

    def __getattribute__(self, attr):
        # logger.debug("%s.__getattribute__(%s)"
        # % (self.__class__.__name__, attr))

        # Accounting wrapper used for both send and sendall: pushes the
        # payload out in bounded chunks so the callback observes
        # incremental progress during large uploads.
        def send(data):
            # 100k at a time.
            bs = 100 * 1024
            sent = 0
            while sent < len (data):
                remaining = len (data) - sent
                if remaining < bs:
                    amount = remaining
                else:
                    amount = bs

                self.socket.sendall(data[sent:sent+amount])
                sent += amount
                self.connection.stats['sent'] += amount
                self.connection.opener.stats['sent'] += amount

                if self.connection.callback is not None:
                    self.connection.callback ()

        # Accounting wrapper for read; meaningful on the makefile()
        # wrapper (plain sockets expose recv, which passes through
        # untouched below).
        def read(*args, **kwargs):
            data = self.socket.read (*args, **kwargs)
            # print "GOT: %s" % (data[0:240],)
            self.connection.stats['received'] += len (data)
            self.connection.opener.stats['received'] += len (data)
            if self.connection.callback is not None:
                self.connection.callback ()
            return data

        # Intercept the I/O entry points we account for.
        if attr == 'send' or attr == 'sendall':
            return send
        if attr == 'read':
            return read

        # Anything else: prefer our own attributes, then delegate to
        # the wrapped socket/file object.
        try:
            return super (ProgressSocket, self).__getattribute__(attr)
        except AttributeError:
            socket = super (ProgressSocket, self).__getattribute__('socket')
            return socket.__getattribute__(attr)

    def makefile(self, mode, bufsize):
        # Wrap the returned file object too, so reads through it are
        # also counted.
        return ProgressSocket (socket=self.socket.makefile(mode, bufsize),
                               connection=self.connection)

    def close(self):
        return self.socket.close ()
+
def HTTPProgressConnectionBuilder(callback, opener):
    """
    Return an httplib.HTTPConnection subclass whose socket traffic is
    accounted against *opener* and reported through *callback*.

    The returned value is a class (not an instance), suitable for
    passing to urllib2's do_open. NOTE(review): because callback is
    stored as a class attribute, accessing it through a connection
    instance appears to yield a bound method under Python 2, so the
    callback receives the connection as its argument (the demo at the
    bottom of this file defines callback(connection)) -- verify before
    relying on it.
    """
    class HTTPProgressConnection(httplib.HTTPConnection):
        def __init__(self, *args, **kwargs):
            # Remembered so a callback can identify the request.
            self.method = None
            self.url = None
            return httplib.HTTPConnection.__init__ (self, *args, **kwargs)

        def putrequest(self, method, url, *args, **kwargs):
            self.method = method
            self.url = url
            return httplib.HTTPConnection.putrequest (
                self, method, url, *args, **kwargs)

        def connect(self):
            httplib.HTTPConnection.connect(self)
            # Wrap the socket.
            self.sock = ProgressSocket(socket=self.sock,
                                       connection=self)

    HTTPProgressConnection.callback = callback
    HTTPProgressConnection.opener = opener
    # Class-level counters: shared by every connection created from
    # this class, cumulatively updated by ProgressSocket.
    HTTPProgressConnection.stats \
        = {'sent': 0, 'received': 0, 'started':time.time()}
    return HTTPProgressConnection
+
class HTTPProgressHandler(urllib2.HTTPHandler):
    """
    urllib2 HTTP handler that tracks bytes sent and received.

    Opener-wide totals accumulate in self.stats; callback is handed to
    each connection class built by HTTPProgressConnectionBuilder and is
    invoked as data flows.
    """
    def __init__(self, callback):
        self.callback = callback
        # Totals across every connection this handler opens.
        self.stats = {'sent': 0, 'received': 0, 'started':time.time()}
        return urllib2.HTTPHandler.__init__(self)

    def http_open(self, request):
        # Build a connection class bound to this handler's callback and
        # stats, then let urllib2 drive it.
        return self.do_open(
            HTTPProgressConnectionBuilder(self.callback, self),
            request)
+
if __name__ == '__main__':
    # Demo: fetch a page while printing live transfer statistics.
    # The callback receives the active connection (installed as a class
    # attribute, so Python 2 method binding supplies the instance).
    def callback(connection):
        # Describe the request being serviced, e.g. "GET host:80/path".
        req = ""
        if connection.method:
            req += connection.method + " "
        req += connection.host + ':' + str (connection.port)
        if connection.url:
            req += connection.url

        # Per-connection and opener-wide counters.
        cstats = connection.stats
        ostats = connection.opener.stats

        print(
            ("%s: connection: %d sent, %d received: %d kb/s; "
             + "opener: %d sent, %d received, %d kb/s")
            % (req,
               cstats['sent'], cstats['received'],
               ((cstats['sent'] + cstats['received'])
                / (time.time() - cstats['started']) / 1024),
               ostats['sent'], ostats['received'],
               ((ostats['sent'] + ostats['received'])
                / (time.time() - ostats['started']) / 1024)))

    opener = urllib2.build_opener(HTTPProgressHandler(callback))

    data = opener.open ('http://google.com')
    downloaded = 0
    for d in data:
        downloaded += len (d)
    print "Document is %d bytes in size" % (downloaded,)
--- /dev/null
+#!/usr/bin/env python2.5
+
+# Copyright (c) 2011 Neal H. Walfield <neal@walfield.org>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import threading
+import thread
+import traceback
+import heapq
+import sys
+import mainthread
+import logging
+logger = logging.getLogger(__name__)
+
def debug(*args):
    """Module trace hook; the guard is hard-wired off, so calls are no-ops.

    Flip *enabled* to True to log the space-joined arguments through
    this module's logger.
    """
    enabled = False
    if enabled:
        logger.debug(' '.join(args))
+
+# The default priority. Like nice(), a smaller numeric priority
+# corresponds to a higher priority class.
+default_priority = 0
+
class JobRunner(threading.Thread):
    # Worker thread: repeatedly pops the highest-priority job off the
    # manager's heap and runs it, retiring when the queue is empty, the
    # manager pauses/quits, or the pool is over capacity.
    def __init__(self, job_manager):
        threading.Thread.__init__(self)
        self.job_manager = job_manager

    def run (self):
        # have_lock mirrors whether we currently hold job_manager.lock,
        # so the outer finally releases it exactly once.
        have_lock = True
        self.job_manager.lock.acquire ()
        try:
            while (self.job_manager.pause == 0
                   and not self.job_manager.do_quit
                   and (len (self.job_manager.threads)
                        <= self.job_manager.num_threads)):
                try:
                    # Queue entries are ([priority, seq], key, job).
                    _, key, job = heapq.heappop (self.job_manager.queue)
                except IndexError:
                    # Queue drained: this runner exits (outer finally
                    # releases the lock and deregisters us).
                    return

                try:
                    self.job_manager.in_progress.append (key)
                    # Drop the lock while the job runs so other runners
                    # and producers can proceed.
                    self.job_manager.lock.release ()
                    have_lock = False

                    # Execute the job.
                    try:
                        job ()
                    except KeyboardInterrupt:
                        # This is handled below and doesn't require a
                        # traceback.
                        raise
                    except:
                        print ("Executing job %s (%s) from thread %s: %s"
                               % (str (key), str (job),
                                  threading.currentThread(),
                                  traceback.format_exc ()))

                    self.job_manager.lock.acquire ()
                    have_lock = True

                    assert key in self.job_manager.in_progress
                finally:
                    # May run with or without the lock held, depending
                    # on where an exception (if any) was raised.
                    try:
                        self.job_manager.in_progress.remove (key)
                    except ValueError:
                        pass

                debug("Finished executing job %s (%s)" % (key, job,))

                # Notify stats hooks; the completed job is described by
                # a {'job': ..., 'key': ...} dict.
                self.job_manager._stats_hooks_run ({'job':job, 'key':key})
        except KeyboardInterrupt:
            # Forward Ctrl-C from this worker to the main thread.
            debug("%s: KeyboardInterrupt" % threading.currentThread())
            thread.interrupt_main()
            debug("%s: Forwarded KeyboardInterrupt to main thread"
                  % threading.currentThread())
        finally:
            if have_lock:
                self.job_manager.lock.release ()

            # Deregister this runner from the pool.
            assert self in self.job_manager.threads
            self.job_manager.threads.remove (self)

            debug ("Job runner %s (%d left) exiting."
                   % (threading.currentThread(),
                      len (self.job_manager.threads)))
+
# The process-wide job manager singleton, created lazily by JobManager().
_jm = None
def JobManager(start=False):
    """
    Return the job manager instance. The job manager will not start
    executing jobs until this is called with start set to True. Note:
    you can still queue jobs.
    """
    global _jm
    if _jm is None:
        _jm = _JobManager ()
    if start and not _jm.started:
        # First start: flush pending stats and spin up runners for any
        # jobs that were queued before start was requested.
        _jm.started = True
        if _jm.jobs > 0:
            _jm._stats_hooks_run ()
        _jm.tickle ()

    return _jm
+
+class _JobManager(object):
+ def __init__(self, started=False, num_threads=4):
+ """
+ Initialize the job manager.
+
+ If started is false, jobs may be queued, but jobs will not be
+ started until start() is called.
+ """
+ # A reentrant lock so that a job runner can call stat without
+ # dropping the lock.
+ self.lock = threading.RLock()
+
+ # If we can start executing jobs.
+ self.started = started
+
+ # The maximum number of threads to use for executing jobs.
+ self._num_threads = num_threads
+
+ # List of jobs (priority, key, job) that are queued for
+ # execution.
+ self.queue = []
+ # List of keys of the jobs that are being executed.
+ self.in_progress = []
+ # List of threads.
+ self.threads = []
+
+ # If 0, jobs may execute, otherwise, job execution is paused.
+ self.pause = 0
+
+ # The total number of jobs that this manager ever executed.
+ self.jobs = 0
+
+ # A list of status hooks to execute when the stats change.
+ self._stats_hooks = []
+ self._current_stats = self.stats ()
+
+ self.do_quit = False
+
+ def _lock(f):
+ def wrapper(*args, **kwargs):
+ self = args[0]
+ self.lock.acquire ()
+ try:
+ return f(*args, **kwargs)
+ finally:
+ self.lock.release()
+ return wrapper
+
+ def get_num_threads(self):
+ return self._num_threads
+ def set_num_threads(self, value):
+ self._num_threads = value
+ self.tickle ()
+ num_threads = property(get_num_threads, set_num_threads)
+
+ @_lock
+ def start(self):
+ """
+ Start executing jobs.
+ """
+ if self.started:
+ return
+ if self.jobs > 0:
+ self._stats_hooks_run ()
+ self.tickle ()
+
+ @_lock
+ def tickle(self):
+ """
+ Ensure that there are enough job runners for the number of
+ pending jobs.
+ """
+ if self.do_quit:
+ debug("%s.quit called, not creating new threads."
+ % self.__class__.__name__)
+ return
+
+ if self.pause > 0:
+ # Job execution is paused. Don't start any new threads.
+ debug("%s.tickle(): Not doing anything: paused"
+ % (self.__class__.__name__))
+ return
+
+ debug("%s.tickle: Have %d threads (can start %d); %d jobs queued"
+ % (self.__class__.__name__,
+ len (self.threads), self.num_threads, len (self.queue)))
+ if len (self.threads) < self.num_threads:
+ for _ in range (min (len (self.queue),
+ self.num_threads - len (self.threads))):
+ thread = JobRunner (self)
+ # Setting threads as daemons means faster shutdown
+ # when the main thread exists, but it results in
+ # exceptions and occassional setfaults.
+ # thread.setDaemon(True)
+ self.threads.append (thread)
+ thread.start ()
+ debug("Now have %d threads" % len (self.threads))
+
+ @_lock
+ def execute(self, job, key=None, priority=default_priority):
+ """
+ Enqueue a job for execution. job is a function to execute.
+ If key is not None, the job is only enqueued if there is no
+ job that is inprogress or enqueued with the same key.
+ priority is the job's priority. Like nice(), a smaller
+ numeric priority corresponds to a higher priority class. Jobs
+ are executed highest priority first, in the order that they
+ were added.
+ """
+ if self.do_quit:
+ debug("%s.quit called, not enqueuing new jobs."
+ % self.__class__.__name__)
+
+ if key is not None:
+ if key in self.in_progress:
+ return
+ for item in self.queue:
+ if item[1] == key:
+ if item[0][0] < priority:
+ # Priority raised.
+ item[0][0] = priority
+ self.queue = heapq.heapify (self.queue)
+ return
+
+ # To ensure that jobs with the same priority are executed
+ # in the order they are added, we set the priority to
+ # [priority, next (monotomic counter)].
+ self.jobs += 1
+ heapq.heappush (self.queue, [[priority, self.jobs], key, job])
+
+ if self.started:
+ self._stats_hooks_run ()
+ self.tickle ()
+ else:
+ debug("%s not initialized. delaying execution of %s (%s)"
+ % (self.__class__.__name__, key, str (job),))
+
+ @_lock
+ def pause(self):
+ """
+ Increasement the pause count. When the pause count is greater
+ than 0, job execution is suspended.
+ """
+ self.pause += 1
+
+ if self.pause == 1:
+ self._stats_hooks_run ()
+
+ @_lock
+ def resume(self):
+ """
+ Decrement the pause count. If the pause count is greater than
+ 0 and this decrement brings it to 0, enqueued jobs are
+ resumed.
+ """
+ assert self.pause > 0
+ self.pause -= 1
+ if not self.paused():
+ self._stats_hooks_run ()
+ self.tickle ()
+
+ @_lock
+ def paused(self):
+ """
+ Returns whether job execution is paused.
+ """
+ return self.pause > 0
+
+ @_lock
+ def cancel(self):
+ """
+ Cancel any pending jobs.
+ """
+ self.queue = []
+ self._stats_hooks_run ()
+
+ def quit(self):
+ self.cancel ()
+ self.do_quit = True
+
+ @_lock
+ def stats(self):
+ """
+ Return a dictionary consisting of:
+
+ - 'paused': whether execution is paused
+ - 'jobs': the total number of jobs this manager has
+ executed, is executing or are queued
+ - 'jobs-completed': the numer of jobs that have completed
+ - 'jobs-in-progress': the number of jobs in progress
+ - 'jobs-queued': the number of jobs currently queued
+ """
+ return {'paused': self.paused(),
+ 'jobs': self.jobs,
+ 'jobs-completed':
+ self.jobs - len (self.in_progress) - len (self.queue),
+ 'jobs-in-progress': len (self.in_progress),
+ 'jobs-queued': len (self.queue)
+ }
+
+ def stats_hook_register(self, func, *args, **kwargs):
+ """
+ Registers a function to be called when the job status changes.
+ Passed the following parameters:
+
+ - the JobManager instance.
+ - the previous stats (as returned by stats)
+ - the current stats
+ - the job that was completed (or None)
+
+ Note: the hook may not be run in the main thread!
+ """
+ mainthread=False
+ try:
+ mainthread = kwargs['run_in_main_thread']
+ del kwargs['run_in_main_thread']
+ except KeyError:
+ pass
+ self._stats_hooks.append ([func, mainthread, args, kwargs])
+
+ def _stats_hooks_run(self, completed_job=None):
+ """
+ Run the stats hooks.
+ """
+ # if not self._stats_hooks:
+ # return
+
+ self.lock.acquire ()
+ try:
+ old_stats = self._current_stats
+ self._current_stats = self.stats ()
+ current_stats = self._current_stats
+ finally:
+ self.lock.release ()
+
+ debug("%s -> %s" % (str (old_stats), str (current_stats)))
+
+ for (f, run_in_main_thread, args, kwargs) in self._stats_hooks:
+ if run_in_main_thread:
+ debug("JobManager._stats_hooks_run: Running %s in main thread"
+ % f)
+ mainthread.execute(
+ f, self, old_stats, current_stats, completed_job,
+ async=True, *args, **kwargs)
+ else:
+ debug("JobManager._stats_hooks_run: Running %s in any thread"
+ % f)
+ f(self, old_stats, current_stats, completed_job,
+ *args, **kwargs)
--- /dev/null
+#!/usr/bin/env python2.5
+
+# Copyright (c) 2011 Neal H. Walfield <neal@walfield.org>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import threading
+import traceback
+import logging
+logger = logging.getLogger(__name__)
+
+_run_in_main_thread = None
+_main_thread = None
+
def init(run_in_main_thread=None):
    """
    Record the main thread and the dispatcher used to reach it.

    run_in_main_thread is a function that takes a single argument, a
    callable and returns False. run_in_main_thread should run the
    function in the main thread.

    If you are using glib, gobject.idle_add (the default) is
    sufficient. (gobject.idle_add is thread-safe.)

    Must be called at most once, from the main thread.
    """
    global _run_in_main_thread, _main_thread

    dispatcher = run_in_main_thread
    if dispatcher is None:
        # Default to the glib main loop's idle dispatcher.
        import gobject
        dispatcher = gobject.idle_add

    # Initializing twice is a programming error.
    assert _run_in_main_thread is None
    _run_in_main_thread = dispatcher

    _main_thread = threading.currentThread ()
+
def execute(func, *args, **kwargs):
    """
    Execute FUNC in the main thread.

    If kwargs['async'] exists and is True, the function is executed
    asynchronously (i.e., the thread does not wait for the function to
    return in which case the function's return value is discarded).
    Otherwise, this function waits until the function is executed and
    returns its return value.
    """
    # 'async' is a pseudo keyword argument; pop it before forwarding
    # the remaining kwargs to func.
    async = False
    try:
        async = kwargs['async']
        del kwargs['async']
    except KeyError:
        pass

    # Fast path: already on the main thread, so call func directly.
    if threading.currentThread() == _main_thread:
        if async:
            # Swallow (but log) exceptions, mirroring the behavior of a
            # genuinely asynchronous dispatch.
            try:
                func (*args, **kwargs)
            except:
                logger.debug("mainthread.execute: Executing %s: %s"
                             % (func, traceback.format_exc ()))
            return
        else:
            return func (*args, **kwargs)

    assert _run_in_main_thread is not None, \
        "You can't call this function from a non-main thread until you've called init()"

    if not async:
        # Condition variable used below to wait for the result.
        cond = threading.Condition()

    # Shared cell: 'done' flags completion, 'result' holds func's
    # return value (absent if func raised).
    result = {}
    result['done'] = False

    def doit():
        def it():
            # Execute the function.
            assert threading.currentThread() == _main_thread

            try:
                result['result'] = func (*args, **kwargs)
            except:
                logger.debug("mainthread.execute: Executing %s: %s"
                             % (func, traceback.format_exc ()))

            if not async:
                cond.acquire ()
            result['done'] = True
            if not async:
                cond.notify ()
                cond.release ()

            # Returning False deregisters this callable from
            # idle_add-style dispatchers, so it runs exactly once.
            return False
        return it

    if not async:
        cond.acquire ()
    _run_in_main_thread (doit())

    if async:
        # Don't wait for the method to complete execution.
        return

    # Wait for the result to become available.
    while not result['done']:
        cond.wait ()

    # 'result' key is missing if func raised; fall back to None.
    return result.get ('result', None)
+
+if __name__ == "__main__":
+ import sys
+ import gobject
+
+ init()
+
+ def in_main_thread(test_num):
+ assert threading.currentThread() == _main_thread, \
+ "Test %d failed" % (test_num,)
+ return test_num
+
+ mainloop = gobject.MainLoop()
+ gobject.threads_init()
+
+ assert execute (in_main_thread, 1) == 1
+ assert (execute (in_main_thread, 2, async=False) == 2)
+ execute (in_main_thread, 3, async=True)
+
+ class T(threading.Thread):
+ def __init__(self):
+ threading.Thread.__init__(self)
+
+ def run(self):
+ assert threading.currentThread() != _main_thread
+
+ assert execute (in_main_thread, 4) == 4
+ assert (execute (in_main_thread, 5, async=False) == 5)
+ execute (in_main_thread, 6, async=True)
+ execute (mainloop.quit, async=False)
+
+ def start_thread():
+ t = T()
+ t.start()
+ return False
+
+ gobject.idle_add (start_thread)
+ mainloop.run()
+
def mainthread(f):
    """Decorator that forces every call to *f* onto the main thread.

    Calls are forwarded through execute(), so by default the caller
    blocks and receives f's return value; pass async=True to dispatch
    without waiting.
    """
    def run_on_main_thread(*args, **kwargs):
        return execute (f, *args, **kwargs)
    return run_on_main_thread
--- /dev/null
+#!/usr/bin/env python2.5
+
+#
+# Copyright (c) 2007-2008 INdT.
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+# ============================================================================
+# Name : FeedingIt.py
+# Author : Yves Marcoz
+# Version : 0.2.2
+# Description : Simple RSS Reader
+# ============================================================================
+
+from xml.dom.minidom import parse, parseString
+import urllib2
+import gtk
+import hildon
+import gobject
+import time
+from os.path import isfile, dirname
+import gobject
+import logging
+logger = logging.getLogger(__name__)
+
class ExportOpmlData():
    """Hildon dialog flow that exports the feed list as an OPML file.

    Prompts for a destination, confirms overwrite, and writes the OPML
    document produced by getOpmlText(). listing must provide
    getListOfFeeds(), getFeedTitle(key) and getFeedUrl(key).
    """
    def __init__(self, parent, listing):
        fs = hildon.FileSystemModel()
        dialog = hildon.FileChooserDialog(parent, gtk.FILE_CHOOSER_ACTION_SAVE, fs)
        #(gtk.STOCK_CANCEL, gtk.RESPONSE_CANCEL,
        #gtk.STOCK_SAVE, gtk.RESPONSE_OK))
        #)
        #dialog = gobject.new(hildon.FileChooserDialog, \
        # action=gtk.FILE_CHOOSER_ACTION_SAVE)
        #dialog.set_default_response(gtk.RESPONSE_OK)
        #dialog.set_property('autonaming',False)
        #dialog.set_property('show-files',True)
        # Default destination: feedingit-export.opml in MyDocs.
        dialog.set_current_folder('/home/user/MyDocs/')
        dialog.set_current_name('feedingit-export')
        dialog.set_extension('opml')
        response = dialog.run()
        dialog.hide()
        if response == gtk.RESPONSE_OK:
            filename = dialog.get_filename()
            logger.debug("ExportOpmlData: %s" % filename)
            #try:

            cont = True
            if isfile(filename):
                note = "File already exists. Aborted"
                confirm = hildon.Note ("confirmation", parent, "File already exists. Are you sure you want to overwrite it?", gtk.STOCK_DIALOG_WARNING )
                confirm.set_button_texts ("Yes", "Cancel")
                response = confirm.run()
                confirm.destroy()
                if response == gtk.RESPONSE_OK:
                    cont = True
                else:
                    note = "Operation cancelled."
                    cont = False
            if cont:
                # NOTE(review): 'file' shadows the builtin and is not
                # closed if write() raises.
                file = open(filename, "w")
                file.write(self.getOpmlText(listing))
                file.close()
                note = "Feeds exported to %s" %filename
            #except:
            # NOTE(review): with the try/except above commented out,
            # the next line always runs and clobbers 'note' -- which is
            # itself unused while the notification dialog below remains
            # commented out.
            note = "Failed to export feeds"

            #dialog.destroy()
            #dialog = hildon.Note ("information", parent, note , gtk.STOCK_DIALOG_INFO )
            #dialog.run()
            #dialog.destroy()
        elif response == gtk.RESPONSE_CANCEL:
            dialog.destroy()

    def getOpmlText(self, listing):
        # Computed but not embedded in the document (no <dateCreated>
        # element is emitted).
        time_now = time.strftime('%a, %d %b %Y %H:%M:%S GMT', time.gmtime())
        opml_text = """<?xml version="1.0" encoding="UTF-8"?>
<opml version="1.0">
<head>
 <title>Feeding It Export</title>
</head>
<body>
"""
        # One <outline> per feed; the internal "Archived Articles"
        # pseudo-feed is skipped.
        for key in listing.getListOfFeeds():
            title = listing.getFeedTitle(key)
            url = listing.getFeedUrl(key)
            if not title == "Archived Articles":
                opml_text += """\n\t\t<outline type="rss" text="%s" title="%s" xmlUrl="%s"/>""" % (self.sanitize(title), self.sanitize(title), self.sanitize(url))
        opml_text += """\n</body>\n</opml>\n"""
        return opml_text

    def sanitize(self, text):
        # Escape XML special characters and render non-ASCII as numeric
        # character references so the output is pure ASCII.
        from cgi import escape
        return escape(text).encode('ascii', 'xmlcharrefreplace')
+
+
+
class GetOpmlData():
    """Obtain an OPML document from the user -- either a local file or a
    downloaded URL -- and let them pick which feeds to import."""

    def __init__(self, parent):
        # parent -- parent window for all dialogs
        self.parent = parent
        dialog = hildon.Note ("confirmation", parent, "What type of OPML?", gtk.STOCK_DIALOG_WARNING )
        dialog.set_button_texts ("File", "URL")
        response = dialog.run()
        dialog.destroy()

        if response == gtk.RESPONSE_OK:
            # "File" button: choose a local file
            self.data = self.askForFile()
        else:
            # "URL" button: download a document
            self.data = self.downloadFile()

    def getData(self):
        """Show the feed-selection dialog and return the chosen
        (title, url) pairs; empty list when nothing was loaded/selected."""
        if self.data is None:
            return []
        dialog = OpmlDialog(self.parent, self.data)
        response = dialog.run()
        items = dialog.getItems() if response == gtk.RESPONSE_ACCEPT else []
        dialog.destroy()
        return items

    def downloadFile(self):
        """Prompt for a URL and return the downloaded document, or None when
        the user cancels or the download fails."""
        dlg = gtk.Dialog("Import OPML from web", self.parent, gtk.DIALOG_DESTROY_WITH_PARENT,
                         ('Import', gtk.RESPONSE_OK,
                          gtk.STOCK_CANCEL, gtk.RESPONSE_CANCEL))
        hb = gtk.HBox(False, 5)
        hb.pack_start(gtk.Label('URL:'), expand=False)
        entry = hildon.Entry(0)
        entry.set_text("http://")
        entry.select_region(-1, -1)
        hb.pack_start(entry, expand=True)
        hb.show_all()
        dlg.vbox.pack_start(hb, False)

        resp = dlg.run()
        url = entry.get_text()
        dlg.destroy()
        if resp == gtk.RESPONSE_CANCEL:
            return None
        try:
            f = urllib2.urlopen(url)
            try:
                data = f.read()
            finally:
                # Always release the connection, even if read() raises.
                f.close()
        except Exception:
            # Best effort: a bad URL or a network error simply yields no
            # data (the bare except here previously swallowed everything
            # silently; at least record it now).
            logger.error("downloadFile: failed to fetch %s" % url)
            return None
        return data

    def askForFile(self):
        """Show a file chooser and return the selected file's contents, or
        None when the user cancels."""
        fs = hildon.FileSystemModel()
        dialog = hildon.FileChooserDialog(self.parent, gtk.FILE_CHOOSER_ACTION_OPEN, fs)

        # Two filters: everything, and typical OPML extensions.
        for name, patterns in (("All files", ("*",)), ("OPML", ("*.xml", "*.opml"))):
            fileFilter = gtk.FileFilter()
            fileFilter.set_name(name)
            for pattern in patterns:
                fileFilter.add_pattern(pattern)
            dialog.add_filter(fileFilter)

        try:
            response = dialog.run()
            if response == gtk.RESPONSE_OK:
                opmlFile = open(dialog.get_filename())
                try:
                    return opmlFile.read()
                finally:
                    opmlFile.close()
            # Cancel, delete-event, etc. all mean "no data".
            return None
        finally:
            # Bug fix: the dialog used to leak (never destroyed) when run()
            # returned anything other than OK or CANCEL.
            dialog.destroy()
+
+
class OpmlDialog(gtk.Dialog):
    """Dialog listing the feeds found in an OPML document, allowing the user
    to multi-select which ones to import."""

    def parse(self, opmlData):
        """Extract (title, url) pairs from the OPML XML into self.feeds.

        Feeds without an xmlUrl attribute fall back to htmlUrl; entries with
        neither are skipped.
        """
        self.feeds = []
        dom1 = parseString(opmlData)

        outlines = dom1.getElementsByTagName('outline')
        for outline in outlines:
            title = outline.getAttribute('text')
            url = outline.getAttribute('xmlUrl')
            if url == "":
                url = outline.getAttribute('htmlUrl')
            if not url == "":
                self.feeds.append( (title, url) )

    def getFeedLinks(self):
        # Raw (title, url) list produced by parse().
        return self.feeds

    def __init__(self, parent, opmlData):
        """Parse *opmlData* and build the selection UI."""
        self.parse(opmlData)
        gtk.Dialog.__init__(self, "Select OPML Feeds", parent, gtk.DIALOG_DESTROY_WITH_PARENT, (gtk.STOCK_OK, gtk.RESPONSE_ACCEPT))

        self.pannableArea = hildon.PannableArea()
        self.treestore = gtk.ListStore(gobject.TYPE_STRING, gobject.TYPE_STRING)
        self.treeview = gtk.TreeView(self.treestore)

        # NOTE(review): displayFeeds() immediately destroys and rebuilds the
        # treeview/treestore created just above -- the two lines above look
        # redundant; confirm before simplifying.
        self.displayFeeds()

        self.set_default_size(-1, 600)
        self.vbox.pack_start(self.pannableArea)

        button = hildon.GtkButton(gtk.HILDON_SIZE_AUTO)
        button.set_label("Select All")
        button.connect("clicked", self.button_select_all_clicked)
        self.action_area.pack_end(button)

        button = hildon.GtkButton(gtk.HILDON_SIZE_AUTO)
        button.set_label("Unselect All")
        button.connect("clicked", self.button_select_none_clicked)
        self.action_area.pack_end(button)

        self.show_all()

    def button_select_all_clicked(self, button):
        # "Select All" action-area button callback.
        self.treeview.get_selection().select_all()

    def button_select_none_clicked(self, button):
        # "Unselect All" action-area button callback.
        self.treeview.get_selection().unselect_all()

    def displayFeeds(self):
        """(Re)build the treeview showing the parsed feeds, in hildon edit
        mode with multi-selection, and select everything by default."""
        self.treeview.destroy()
        self.treestore = gtk.ListStore(gobject.TYPE_STRING, gobject.TYPE_STRING)
        self.treeview = gtk.TreeView()

        self.treeview.get_selection().set_mode(gtk.SELECTION_MULTIPLE)
        hildon.hildon_gtk_tree_view_set_ui_mode(self.treeview, gtk.HILDON_UI_MODE_EDIT)
        self.refreshList()
        # Column 0 holds the feed title; column 1 (the url) is not displayed.
        self.treeview.append_column(gtk.TreeViewColumn('Feed Name', gtk.CellRendererText(), text = 0))

        self.pannableArea.add(self.treeview)
        self.pannableArea.show_all()
        self.treeview.get_selection().select_all()

    def refreshList(self, selected=None, offset=0):
        """Repopulate the model from self.feeds, selecting each added row.

        The *selected*/*offset* parameters are currently unused.
        """
        # NOTE(review): rect/y are computed but never used -- presumably a
        # leftover from scroll-position restoration.
        rect = self.treeview.get_visible_rect()
        y = rect.y+rect.height
        self.treestore = gtk.ListStore(gobject.TYPE_STRING, gobject.TYPE_STRING)
        self.treeview.set_model(self.treestore)
        for (title, url) in self.feeds:
            item = self.treestore.append([title, url])
            self.treeview.get_selection().select_iter(item)
        #self.treeview.get_selection().select_all()
        self.pannableArea.show_all()

    def getItems(self):
        """Return the selected rows as a list of (title, url) tuples."""
        # NOTE(review): `list` shadows the builtin of the same name.
        list = []
        treeselection = self.treeview.get_selection()
        (model, pathlist) = treeselection.get_selected_rows()
        for path in pathlist:
            list.append( (model.get_value(model.get_iter(path),0), model.get_value(model.get_iter(path),1)) )
        return list
+
def showOpmlData(widget, parent, button):
    """Button callback: run the full OPML import flow and log the result."""
    importer = GetOpmlData(parent)
    selection = importer.getData()
    logger.debug("showOpmlData: %s" % selection)
    #dialog.destroy()
+
if __name__ == "__main__":
    # Stand-alone test harness: a single-button window that launches the
    # OPML import flow when the button is clicked.
    window = hildon.Window()
    window.set_title("Test App")


    button = gtk.Button("Click to confirm.")
    window.add(button)
    # Extra args (window, button) are passed through to showOpmlData.
    button.connect("clicked", showOpmlData, window, button)
    window.connect("destroy", gtk.main_quit)
    window.show_all()

    gtk.main()
    window.destroy()
--- /dev/null
import Qt 4.7
import QtWebKit 1.0
import "common" as Common

// Single-article display: a WebView inside a vertically flickable area,
// with an optional zoom slider.  Expects the instantiating context to
// provide `flipItem` (supplying the html) and a `vertPanningEnabled`
// property on the parent -- confirm against ArticleViewer's delegate.
Rectangle {
    /*x: parent.width; height: parent.height;*/
    width: parent.width;
    height: parent.height
    // Exposed so the toolbar can toggle the zoom slider / read its value.
    property alias zoomEnabled: slider.visible;
    property alias value: slider.value;
    //anchors.top: parent.top; anchors.bottom: parent.bottom;
    color: "white";

    Flickable {
        id: flickable
        //anchors.fill: screen;
        height: parent.height;
        width: parent.width;
        // Content size tracks the scaled WebView so zooming stays pannable.
        contentWidth: webView.width*webView.scale; //Math.max(screen.width,webView.width*webView.scale)
        contentHeight: Math.max(articleViewer.height,webView.height*webView.scale)
        //contentWidth: childrenRect.width; contentHeight: childrenRect.height
        // Panning is disabled while the horizontal article flip is active.
        interactive: parent.vertPanningEnabled;

        flickDeceleration: 1500;
        flickableDirection: Flickable.VerticalFlick
        WebView {
            id: webView
            //url: flipItem.url;
            html: flipItem.html;
            preferredWidth: flickable.width
            preferredHeight: flickable.height
            //scale: 1.25;
            transformOrigin: Item.TopLeft
            scale: slider.value;
            settings.defaultFontSize: 24
        }

//        onFlickStarted: {
//            console.log("start contentx"+contentX)
//            console.log("start contenty"+contentY)
//        }
    }

    Common.Slider {
        id: slider; visible: false
        minimum: 0.2;
        maximum: 2;
        property real prevScale: 1
        anchors {
            bottom: parent.bottom; bottomMargin: 65
            left: parent.left; leftMargin: 25
            right: parent.right; rightMargin: 25
        }
        onValueChanged: {
            // Keep the current viewport centre in place while zooming.
            if (webView.width * value > flickable.width) {
                var xoff = (flickable.width/2 + flickable.contentX) * value / prevScale;
                flickable.contentX = xoff - flickable.width/2;
            }
            if (webView.height * value > flickable.height) {
                var yoff = (flickable.height/2 + flickable.contentY) * value / prevScale;
                flickable.contentY = yoff - flickable.height/2;
            }
            prevScale = value;
        }
        // Nudge value so onValueChanged fires once at startup.
        Component.onCompleted: {value=0; value=1; }
    }
}
--- /dev/null
import Qt 4.7

// Article viewer for one feed: a vertical title list and a horizontally
// flickable full-article view, both driven by the same XmlListModel via a
// VisualDataModel Package ('list' and 'flip' parts).
Item {
    id: articleViewer
    //width: 480; height: 360;
    width: parent.width; height: parent.height;
    // NOTE(review): `feedid` is referenced throughout but declared by the
    // instantiating element (main screen declares it on this item) -- the
    // commented-out property below documents the intent; confirm.
    //property string feedid: "61ac1458d761423344998dc76770e36e" //articlesItem.feedid;
    //property string hideReadArticles: "";
    property alias articleShown: articleView.visible;
    property bool zoomEnabled: false;
    property bool vertPanningEnabled: true

    function modulo(x,y) {
        // Fixes modulo for negative numbers
        return ((x%y)+y)%y;
    }

    // Refresh the article model for the current feed; an empty feed id
    // yields an empty document so the model clears.
    function reload() {
        articles.xml = articleViewer.feedid == "" ? "<?xml version=\"1.0\" encoding=\"utf-8\"?><xml></xml>" : controller.getArticlesXml(articleViewer.feedid);
        articles.reload()
    }

    // Advance to the next article when the article view is open.
    function next() {
        if (articleView.visible) {
            //articleView.positionViewAtIndex(modulo(articleView.currentIndex+1, articleView.count), ListView.Contain);
            articleView.incrementCurrentIndex();
        }
    }

    // Go back to the previous article when the article view is open.
    function prev() {
        if (articleView.visible) {
            //articleView.positionViewAtIndex(modulo(articleView.currentIndex-1, articleView.count), ListView.Contain);
            articleView.decrementCurrentIndex();
        }
    }

    function markAllAsRead() {
        if (feedid!="") {
            controller.markAllAsRead(feedid)
            articles.reload();
        }
    }

    // Open the article view positioned on the article with the given id.
    function viewArticle(articleid) {
        var index = 0;
        for (var i=0; i<articleList.count; ++i) {
            // Bug fix: this previously tested articles.get(0) on every
            // iteration, so any article other than the first was never found.
            if (articles.get(i).articleid==articleid) {
                index = i;
            }
        }
        articleView.positionViewAtIndex(index, ListView.Contain); articleView.visible = true;
    }

    // Vertical list of article titles (the 'list' part of the package).
    ListView {
        id: articleList; model: visualModel.parts.list; z: 6
        width: parent.width; height: parent.height; /*x: 0;*/
        cacheBuffer: 100;
        flickDeceleration: 1500
    }

    // Horizontal, one-article-per-page view (the 'flip' part).
    ListView {
        id: articleView; model: visualModel.parts.flip; orientation: ListView.Horizontal
        width: parent.width; height: parent.height; visible: false; z:8
        //onCurrentIndexChanged: photosGridView.positionViewAtIndex(currentIndex, GridView.Contain)
        highlightRangeMode: ListView.StrictlyEnforceRange; snapMode: ListView.SnapOneItem
        //cacheBuffer: 5;
        // Suspend vertical panning while flipping horizontally.
        onMovementStarted: articleViewer.vertPanningEnabled=false;
        onMovementEnded: articleViewer.vertPanningEnabled=true;
        highlightMoveDuration: 300;
    }

    // Placeholder shown while loading or when the feed has no articles.
    Rectangle {
        id: noArticle
        //width: parent.width; height: parent.height;
        //color: "#000000"
        anchors.centerIn: parent;
        visible: false;
        z:8;
        Text { id: noText; color: "#ffffff"; anchors.centerIn: parent; text: qsTr("No articles available"); }
        Image { id: loadingImage; anchors.centerIn: parent; source: "common/images/loading.png";
            height: 96; width: 96;
            NumberAnimation on rotation {
                from: 0; to: 360; running: (loadingImage.visible == true); loops: Animation.Infinite; duration: 900
            }
        }

        states: [ State {
            name: "noArticle"; when: articles.count==0 && articles.status==XmlListModel.Ready
            PropertyChanges { target: noArticle; visible: true; }
            PropertyChanges { target: loadingImage; visible: false; }
            PropertyChanges { target: noText; visible: true; }
        }, State {
            name: "loading"; when: articles.count==0 && articles.status != XmlListModel.Ready
            PropertyChanges { target: noArticle; visible: true; }
            PropertyChanges { target: noText; visible: false; }
            PropertyChanges { target: loadingImage; visible: true; }
        }
        ]
    }

    VisualDataModel {
        id: visualModel;
        delegate: Package {
            id: packageItem
            // Full-article page; html is loaded lazily via the states below.
            Item { id: flipItem; Package.name: 'flip'; width: articleViewer.width; height: articleViewer.height;

                property string url: (articleView.visible && Math.abs(articleView.currentIndex-index)<2) ? path: ""; //http://localhost:8000/html/" + articleViewer.feedid + "/" + articleid : "";
                property string html: controller.getArticle(articleViewer.feedid, articleid)
                ArticleDisplay {
                    zoomEnabled: articleViewer.zoomEnabled;
                    property bool vertPanningEnabled: articleViewer.vertPanningEnabled;

                    states: [ State {
                        // The currently shown article: mark it read.
                        name: 'articleIsRead';
                        when: articleView.visible && articleView.currentIndex == index;
                        StateChangeScript {
                            name: "myScript"
                            script: {
                                flipItem.url=path; //"http://localhost:8000/html/" + articleViewer.feedid + "/" + articleid;
                                controller.setEntryRead(articleViewer.feedid, articleid)
                            }
                        }
                    }, State {
                        // Neighbouring articles: preload their url only.
                        name: 'articleIsClose'; when: articleView.visible && Math.abs(articleView.currentIndex-index)<2;
                        StateChangeScript {
                            script: { flipItem.url=path; } //"http://localhost:8000/html/" + articleViewer.feedid + "/" + articleid;}
                        }
                    }
                    ]
                }
            }

            // One row of the title list.
            Item { Package.name: 'list';
                id: wrapper; width: articleViewer.width; height: 86
                Item {
                    id: moveMe
                    Rectangle { id: backRect; color: "black"; opacity: index % 2 ? 0.2 : 0.4; height: 84; width: wrapper.width; y: 1 }
                    Text {
                        anchors.fill: backRect
                        anchors.margins: 5
                        verticalAlignment: Text.AlignVCenter; text: title; color: (unread=="True") ? "white" : "#7b97fd";
                        width: wrapper.width; wrapMode: Text.WordWrap; font.bold: false;
                    }
                }
                MouseArea { anchors.fill: wrapper;
                    onClicked: { articleView.positionViewAtIndex(index, ListView.Contain); articleView.visible = true; }
                }
            }
        }
        model: articles
    }

    XmlListModel {
        id: articles

        //source: articleViewer.feedid == "" ? "" : "http://localhost:8000/articles/" + feedid + "?onlyUnread=" + hideReadArticles
        //xml: articleViewer.feedid == "" ? "" : controller.getArticlesXml(articleViewer.feedid)
        query: "/xml/article"

        XmlRole { name: "title"; query: "title/string()" }
        XmlRole { name: "articleid"; query: "articleid/string()"; isKey: true }
        XmlRole { name: "path"; query: "path/string()" }
        XmlRole { name: "unread"; query: "unread/string()"; isKey: true}
    }


}
--- /dev/null
import Qt 4.7

// Article list for a single feed, fed from the local helper HTTP server.
// NOTE(review): this file looks vestigial -- `articlesModel`,
// `hideReadArticles`, `container` and `model.article.*` are not defined
// here (the XmlListModel id is `articles` and its roles are flat); the
// VisualDataModel variant in the article viewer appears to supersede it.
Item {
    //anchors.fill: parent;
    width: parent.width;
    property string feedid : ""
    property alias count: articles.count
    property alias url: articles.source

    x: parent.width; height: parent.height;
    anchors.top: parent.top; anchors.bottom: parent.bottom

    // Look up an article id by list index.
    function getArticleid(index) {
        return articles.get(index).articleid
    }

    function reload() {
        //articlesModel.reload()
    }

    ListView {
        id: articleList; model: articlesModel; delegate: articleDelegate; z: 6
        width: parent.width; height: parent.height; /*x: 0;*/
        cacheBuffer: 100;
        flickDeceleration: 1500
    }

    XmlListModel {
        id: articles

        // Empty feedid disables the source so nothing is fetched.
        source: feedid == "" ? "" : "http://localhost:8000/articles/" + feedid + "?onlyUnread=" + hideReadArticles
        query: "/xml/article"

        XmlRole { name: "title"; query: "title/string()" }
        XmlRole { name: "articleid"; query: "articleid/string()"; isKey: true }
        XmlRole { name: "path"; query: "path/string()" }
        XmlRole { name: "unread"; query: "unread/string()"; isKey: true}
    }

    Component {
        id: articleDelegate

        // One article row: striped background, title coloured by read state.
        Item {
            id: wrapper; width: wrapper.ListView.view.width; height: 86
            Item {
                id: moveMe
                Rectangle { id: backRect; color: "black"; opacity: index % 2 ? 0.2 : 0.4; height: 84; width: wrapper.width; y: 1 }
                Text {
                    anchors.fill: backRect
                    anchors.margins: 5
                    verticalAlignment: Text.AlignVCenter; text: title; color: (model.article.unread=="True") ? "white" : "#7b97fd";
                    width: wrapper.width; wrapMode: Text.WordWrap; font.bold: false;
                }
//                Rectangle {
//                    x: 3; y: 4; width: 77; height: 77; color: "#ff0000"; smooth: true

//                }

//                Column {
//                    x: 3;

//                    width: wrapper.width - 3; y: 5; spacing: 2
//                    height: parent.height;
//                    Text { Rectangle {anchors.fill: parent; color: "white"; opacity: 0.5;}
//                        verticalAlignment: Text.AlignVCenter; text: model.article.title; color: (model.article.unread=="True") ? "white" : "#7b97fd"; width: parent.width; wrapMode: Text.WordWrap; font.bold: false; /*elide: Text.ElideRight;*/ /*style: Text.Raised;*/ styleColor: "black"; }
//                    //Text { text: feedname; width: parent.width; elide: Text.ElideLeft; color: "#cccccc"; style: Text.Raised; styleColor: "black" }
//                }
            }
            MouseArea {
                anchors.fill: wrapper;
                onClicked: {
                    container.articleClicked(model.article.articleid, index)
                }
            }
        }

    }

}
--- /dev/null
import Qt 4.7

// Category list: titles from the controller's category XML, with per-row
// edit/delete buttons in edit mode.  Clicks are forwarded to the enclosing
// `container` element.
Item {
//    anchors.fill: parent;
    width: parent.width; height: parent.height;
    //anchors.top: parent.top; anchors.bottom: parent.bottom
    property bool inEditMode: true

    // Re-fetch the category XML.
    function reload() {
        categories.reload();
    }

    ListView {
        id: categoryList; model: categories; delegate: categoryDelegate; z: 6;
        cacheBuffer: 100; width: parent.width; height: parent.height;
    }

    XmlListModel {

        id: categories

        xml: controller.getCategoryXml()
        query: "/xml/category"

        XmlRole { name: "title"; query: "catname/string()" }
        XmlRole { name: "catid"; query: "catid/string()"; isKey: true }
    }

    Component {
        id: categoryDelegate

        Item {

            id: wrapper; width: wrapper.ListView.view.width; height: 86
            Item {
                id: moveMe
                height: parent.height
                Rectangle { color: "black"; opacity: index % 2 ? 0.2 : 0.4; height: 84; width: wrapper.width; y: 1 }
                Rectangle {
                    x: 6; y: 4; width: 77; height: parent.height - 9; color: "white"; smooth: true

                }
                Column {
                    x: 92; width: wrapper.ListView.view.width - 95; y: 15; spacing: 2
                    Text { text: title; color: "white"; width: parent.width; font.bold: true; elide: Text.ElideRight; style: Text.Raised; styleColor: "black" }
                    //Text { text: feedname; width: parent.width; elide: Text.ElideLeft; color: "#cccccc"; style: Text.Raised; styleColor: "black" }
                }
                // Edit button (edit mode only).
                Item {
                    x: wrapper.ListView.view.width - 128; y: 12
                    height: 58; width: 58;
                    //anchors.horizontalCenter: parent.horizontalCenter;
                    Image { source: "common/images/wmEditIcon.png" }
                    MouseArea {
                        // Bug fix: this passed `catname`, which is not a
                        // model role (the role exposing catname/string() is
                        // named "title"), so the handler raised a reference
                        // error on click.
                        anchors.fill: parent; onClicked: { container.categoryEdit(title, catid); }
                    }
                    visible: inEditMode
                }
                // Delete button (edit mode only).
                Item {
                    x: wrapper.ListView.view.width - 64; y: 12
                    height: 58; width: 58;
                    //anchors.horizontalCenter: parent.horizontalCenter;
                    Image { source: "common/images/delete.png" }
                    MouseArea {
                        anchors.fill: parent; onClicked: { container.categoryDeleted(catid); }
                    }
                    visible: inEditMode
                }
            }
            // Outside edit mode, tapping a row opens the category.
            MouseArea { enabled: !inEditMode; anchors.fill: wrapper; onClicked: { container.categoryClicked(catid); } }
        }
    }
}
--- /dev/null
import Qt 4.7
import "common" as Common
// Depends on qt4-declarative-qmlviewer

// Top-level UI: three stacked "screens" (categories -> feeds -> article
// viewer) that slide over each other, plus a toolbar, a slide-down config
// menu, confirmation dialogs, and a polling Timer that talks to the local
// helper HTTP server on port 8000.
Item {
    width: 480
    height: 640
    anchors.fill: parent
    id: screen

    Rectangle {
        id: container
        anchors.fill: parent; color: "#343434";
        anchors.centerIn: parent
        //transformOrigin: Item.Center
        property bool editMode: false
        property bool inPortrait: width < height

        // Drill down from the category list to that category's feed list.
        function categoryClicked(catid) {
            feedsItem.catid = catid;
            feedsItem.reload();
            categoriesItem.isShown = false;
            feedsItem.visible = true;
        }

        // Drill down from the feed list to the article viewer.
        function feedClicked(feedid, updating) {
            flipper.feedid = feedid;
            flipper.reload();
            toolBar.feedUpdating = updating;
            flipper.visible = true;
        }

        // Back navigation: article -> article list -> feed list ->
        // categories -> quit.
        function backClicked() {
            if (flipper.visible && flipper.articleShown) {
                // We're viewing an article, and going back to article listing
                flipper.articleShown = false;
                flipper.reload()
                //flipper.articleid = "";
                //flipper.value = 1;
                //articlesItem.reload()
                return;
            }
            if (flipper.visible) {
                feedsItem.reload();
                toolBar.feedUpdating = false;
                flipper.visible = false;
                flipper.feedid = "";
                flipper.reload();
                return;
            }

            if (feedsItem.visible) {
                // Viewing feeds, going back to categories
                //feedsItem.catid = "";
                feedsItem.visible = false;
                //feedsItem.reload();
                categoriesItem.isShown = true;
                return;
            }
            if (!feedsItem.visible) {
                // Viewing categories, quitting
                Qt.quit();
            }
        }

        // Ask for confirmation before deleting a category.
        function categoryDeleted(catid) {
            confirmationMessage.catid=catid;
            confirmationMessage.state="deleteCat";
        }

        // Ask for confirmation before deleting a feed.
        // NOTE(review): Feeds.qml invokes this with a single argument --
        // verify the call sites pass both catid and feedid.
        function feedDeleted(catid, feedid) {
            confirmationMessage.catid=catid;
            confirmationMessage.feedid=feedid;
            confirmationMessage.state="deleteFeed";
        }

        // NOTE(review): these assignments target `addFeed`, which is the
        // *function* declared below; the add-feed dialog's id is
        // `addFeedDialog` -- this looks like it addresses the wrong object;
        // confirm before relying on the edit flow.
        function feedEdit(feedname, feedid, url) {
            addFeed.feedEdit = true;
            addFeed.feedName = feedname;
            addFeed.feedUrl = url;
            addFeed.visible = true;
        }

        // Create a category via the controller, then refresh and close the
        // add-category dialog.
        function addCategory(categoryName) {
            controller.addCategory(categoryName)
            categoriesItem.reload();
            addCat.visible=false;
        }

        // Create a feed via the controller, then refresh and close the
        // add-feed dialog.  NOTE(review): the XMLHttpRequest below is
        // created but never used.
        function addFeed(catid, feedName, feedURL) {
            controller.addFeed(feedName, feedURL, catid)
            var doc = new XMLHttpRequest();
            feedsItem.reload();
            addFeedDialog.visible=false;
        }

        function updateClicked(feedid) {
            controller.updateFeed(feedid);
        }

        function updateAllClicked() {
            controller.updateAll();
        }

        // Slide-down configuration menu.
        Common.Menu {
            id: config
            z: 5
            property string hideReadFeeds : "False"
            property string hideReadArticles : "False"

            property bool isShown: false;

            //width: parent.width; height: parent.height;

            //height: 0
            states: State {
                name: "shown"; when: config.isShown == true
                PropertyChanges { target: config; y: 66 }
            }

            transitions: Transition {
                NumberAnimation { properties: "y"; duration: 300; easing.type: "InOutQuad" }
            }

        }

        // Shared yes/no dialog for mark-all-read and delete operations;
        // the pending operation is encoded in `state`.
        Common.ConfirmationMessage {
            id: confirmationMessage;
            property string catid: "";
            property string feedid: "";

            // Execute whichever operation the current state represents.
            function action() {
                if (state=="markAll") {
                    flipper.markAllAsRead();
                    state="hidden"
                    feedsItem.reload()
                    return;
                }
                if (state=="deleteCat") {
                    // Deletion goes through the local helper HTTP server.
                    var doc = new XMLHttpRequest();
                    var url = "http://localhost:8000/deleteCat/"+catid
                    doc.open("GET", url);
                    doc.send();
                    categoriesItem.reload();
                    state="hidden";
                    return;
                }
                if (state=="deleteFeed") {
                    var doc = new XMLHttpRequest();
                    var url = "http://localhost:8000/deleteFeed/"+catid+"/"+feedid
                    doc.open("GET", url);
                    doc.send();
                    feedsItem.reload();
                    state="hidden";
                    return;
                }
            }
            visible: false
            onOkClicked: action()
            onCancelClicked: visible=false
            state: "hidden"
            states: [ State {name: "markAll";
                    PropertyChanges { target: confirmationMessage; text: qsTr("Do you want to mark all items as read?") }
                    PropertyChanges { target: confirmationMessage; visible: true; }

                }, State {name: "deleteCat";
                    PropertyChanges { target: confirmationMessage; text: qsTr("Do you want to delete this category?") }
                    PropertyChanges { target: confirmationMessage; visible: true; }
                }, State {name: "deleteFeed";
                    PropertyChanges { target: confirmationMessage; text: qsTr("Do you want to delete this feed and all its articles?") }
                    PropertyChanges { target: confirmationMessage; visible: true; }
                }, State {name: "hidden";
                    PropertyChanges { target: confirmationMessage; visible: false; }
                }
            ]

        }

        // Top toolbar; which buttons show depends on the visible screen
        // (see the states at the bottom of this element).
        Common.ToolBar {
            id: toolBar; z: 7
            height: 66; anchors.top: parent.top; width: parent.width; opacity: 0.9
            menuLabel: qsTr("Config"); backLabel: qsTr("Back")
            nextLabel: qsTr("Next"); prevLabel: qsTr("Previous")
            markAllLabel: qsTr("Mark All As Read"); zoomLabel: qsTr("Zoom")
            taskSwitcherLabel: qsTr("Task Switch")
            onMenuClicked: config.isShown = !config.isShown;
            onBackClicked: container.backClicked()
            onPrevClicked: flipper.prev();
            onNextClicked: flipper.next();
            onMarkAllClicked: {
                confirmationMessage.state = "markAll";
            }
            onZoomClicked: { flipper.zoomEnabled = !flipper.zoomEnabled; }
            // Task switching is delegated to the helper server.
            onTaskSwitcherClicked: {
                var doc = new XMLHttpRequest();
                var url = "http://localhost:8000/task"
                doc.open("GET", url);
                doc.send();
            }
            // "Add" targets whichever list is currently visible.
            onAddClicked: {
                if (feedsItem.visible) {
                    addFeedDialog.feedName="";
                    addFeedDialog.catid = feedsItem.catid;
                    addFeedDialog.visible = true;
                    return;
                }
                if (categoriesItem.visible) {
                    addCat.catName="";
                    addCat.visible=true;
                    return;
                }
            }
            // "Update" refreshes the open feed, or everything.
            onUpdateClicked: {
                if (flipper.visible) {
                    toolBar.feedUpdating = true
                    container.updateClicked(flipper.feedid);
                } else {
                    container.updateAllClicked();
                }
            }

            states: [ State {
                    name: "navButtons"; when: flipper.articleShown
                    PropertyChanges { target: toolBar; nextVisible: !container.inPortrait; }
                    PropertyChanges { target: toolBar; prevVisible: !container.inPortrait; }
                    //PropertyChanges { target: toolBar; zoomVisible: true; }
                    PropertyChanges { target: toolBar; addVisible: false; }
                },
                State {
                    name: "feedButtons"; when: (flipper.visible)&&(!flipper.articleShown)
                    PropertyChanges { target: toolBar; markAllVisible: true; }
                    PropertyChanges { target: toolBar; addVisible: false; }
                    PropertyChanges { target: toolBar; updateVisible: true; }
                },
                State {
                    name: "quitButton"; when: (!feedsItem.visible)
                    PropertyChanges { target: toolBar; quitVisible: true;}
                    PropertyChanges { target: toolBar; updateVisible: true; }
                    //PropertyChanges { target: toolBar; addVisible: true; }
                }
            ]
        }

        // Container for the three screens and the modal add dialogs.
        Item {
            id: views
            //x: 2;
            //y:66;
            width: parent.width // - 4
            height: parent.height-toolBar.height;
            anchors.top: toolBar.bottom; anchors.bottom: parent.bottom
            y: toolBar.height;

            Common.AddCat {
                visible: false;
                id: addCat
                width: parent.width;
                height: parent.height;
                z: 10;
            }

            Common.AddFeed {
                visible: false;
                id: addFeedDialog
                width: parent.width;
                height: parent.height;
                z: 10;
            }

            // Polls the helper server for update status and queued commands
            // (open feed/article, add feed) every 2 seconds.
            Timer {
                function checkUpdates() {
                    if (categoriesItem.visible && !feedsItem.visible) {
                        var doc = new XMLHttpRequest();
                        var url = "http://localhost:8000/isUpdating/"
                        doc.onreadystatechange = function() {
                            if (doc.readyState == XMLHttpRequest.DONE) {
                                var xmlDoc = doc.responseXML.documentElement;
                                //var els = xmlDoc.getElementsByTagName("updating");
                                var isUpdating = xmlDoc.firstChild.firstChild.nodeValue;

                                //console.log(isUpdating);
                                if (isUpdating=="True") {
                                    toolBar.feedUpdating = true;
                                } else {
                                    if (toolBar.feedUpdating) {
                                        // We changed from updating to not updating, so we reload the listing
                                        toolBar.feedUpdating = false;
                                        categoriesItem.reload();
                                    }
                                }
                                var commands = xmlDoc.lastChild.childNodes;
                                for (var ii = 0; ii < commands.length; ++ii) {
                                    // process the commands
                                    var command = commands[ii].attributes[0].value; //("c")
                                    //console.log(command)
                                    if (command=="openFeed") {
                                        // Open feed feed
                                        var catid = commands[ii].attributes[1].value;
                                        var feedid = commands[ii].firstChild.nodeValue;
                                        if (!flipper.visible) {
                                            container.categoryClicked(catid);
                                            container.feedClicked(feedid,false);
                                            console.log("feedid: " + feedid);
                                        }
                                    }
                                    if (command=="openArticle") {
                                        // Open feed and article
                                        var catid = commands[ii].attributes[1].value;
                                        var feedid = commands[ii].attributes[2].value; //("key");
                                        var articleid = commands[ii].firstChild.nodeValue;
                                        if (!flipper.visible) {
                                            container.categoryClicked(catid);
                                            container.feedClicked(feedid,false);
                                            flipper.viewArticle(articleid)
                                        }
                                    }
                                    if (command=="addFeed") {
                                        // Open the addFeed dialog
                                        var url = commands[ii].firstChild.nodeValue;
                                        //console.log("add: "+url)

                                    }
                                }

                            }
                        }
                        doc.open("GET", url);
                        doc.send();
                        //categoriesItem.reload()
                    }
                    if (feedsItem.visible && !flipper.visible) {
                        //feedsItem.reload()
                    }
                    if (flipper.visible) {
                        // Same poll, scoped to the open feed.
                        var doc = new XMLHttpRequest();
                        var url = "http://localhost:8000/isUpdating/" + flipper.feedid
                        doc.onreadystatechange = function() {
                            if (doc.readyState == XMLHttpRequest.DONE) {
                                var xmlDoc = doc.responseXML.documentElement;
                                var isUpdating = xmlDoc.firstChild.firstChild.nodeValue;
                                //console.log(isUpdating);
                                if (isUpdating=="True") {
                                    toolBar.feedUpdating = true;
                                } else {
                                    if (toolBar.feedUpdating) {
                                        // We changed from updating to not updating, so we reload the listing
                                        toolBar.feedUpdating = false;
                                        flipper.reload();
                                    }
                                }
                            }
                        }
                        doc.open("GET", url);
                        doc.send();

                        //flipper.reload()
                    }
                }
                interval: 2000; running: false; repeat: true
                onTriggered: checkUpdates();
            }

            Categories {
                // Loads the categoryList view and delegate
                id: categoriesItem
                property bool isShown: true;
                inEditMode: container.editMode;

                // Slides off-screen to the left when a category is opened.
                states: State {
                    name: "shown"; when: categoriesItem.isShown == false
                    PropertyChanges { target: categoriesItem; x: -screen.width }
                }

                transitions: Transition {
                    NumberAnimation { properties: "x"; duration: 300; easing.type: "InOutQuad" }
                }

            }

            Feeds {

                // Loads the feedList view and delegate
                id: feedsItem;
                property string hideReadFeeds: config.hideReadFeeds
                visible: false;
                inEditMode: container.editMode;

                states: [
                    State { name: "articlesShown"; when: flipper.visible; PropertyChanges { target: feedsItem; x: -parent.width } },
                    State { name: "shown"; when: feedsItem.visible; PropertyChanges { target: feedsItem; x: 0 } }
                ]

                transitions: Transition {
                    NumberAnimation { properties: "x"; duration: 300; easing.type: "InOutQuad" }
                }

            }

            ArticleViewer {
                id: flipper
                visible: false;
                property string hideReadArticles: config.hideReadArticles
                property string feedid: ""
                x: parent.width

                states: State { name: "shown"; when: flipper.visible; PropertyChanges { target: flipper; x: 0 }
                }

                transitions: Transition {
                    NumberAnimation { properties: "x"; duration: 300; easing.type: "InOutQuad" }
                }
            }
        }

//        Text {
//            x: container.width/2
//            y:container.height/2
//            text: runtime.orientation;
//        }

    }
}
--- /dev/null
import Qt 4.7

// Feed list for one category: icon, title and unread count per feed, with
// a delete button in edit mode.  Clicks are forwarded to the enclosing
// `container` element.
Item {
    //anchors.fill: parent;
    width: parent.width;
    property string catid : ""
    property bool inEditMode: true
    x: parent.width; height: parent.height;
    anchors.top: parent.top; anchors.bottom: parent.bottom

    // Refresh the feed XML for the current category; empty catid clears it.
    function reload() {
        feeds.xml = catid == "" ? "" : controller.getFeedsXml(catid);
        //feeds.reload()
    }

    //Component.onCompleted: { console.log(x + " /") }

    ListView {
        id: feedList; model: feeds; delegate: feedDelegate; z: 6
        width: parent.width; height: parent.height; /*x: 0;*/
        cacheBuffer: 100;
        flickDeceleration: 1500
    }

    XmlListModel {

        id: feeds

        //source: catid == "" ? "" : "http://localhost:8000/feeds/" + catid //+ "?onlyUnread=" + parent.hideReadArticles
        //xml: catid == "" ? "" : controller.getFeedsXml(catid)
        query: "/xml/feed"

        XmlRole { name: "title"; query: "feedname/string()" }
        XmlRole { name: "feedid"; query: "feedid/string()"; isKey: true }
        XmlRole { name: "unread"; query: "unread/string()"; isKey: true }
        XmlRole { name: "updatedDate"; query: "updatedDate/string()" }
        XmlRole { name: "icon"; query: "icon/string()" }
        XmlRole { name: "updating"; query: "updating/string()"; isKey: true }
        //XmlRole { name: "url"; query: "url/string()"; }
    }

    Component {
        id: feedDelegate

        Item {
            id: wrapper; width: wrapper.ListView.view.width;
            // Rows with no unread items collapse when "hide read" is on
            // (hideReadFeeds is set on the feedsItem instance by the main
            // screen).
            visible: (unread == "0" && feedsItem.hideReadFeeds=="True") ? false : true
            height: (visible) ? 86 : 0

            Item {
                id: moveMe
                Rectangle { color: "black"; opacity: index % 2 ? 0.2 : 0.4; height: 84; width: wrapper.width; y: 1 }
                Rectangle {
                    x: 3; y: 4; width: 77; height: 77; color: "#000000"; smooth: true
                    // Feed icon; a spinning placeholder while updating.
                    Image { width:32; height: 32; anchors.verticalCenter: parent.verticalCenter; anchors.horizontalCenter: parent.horizontalCenter;
                        source: (updating=="True")? "common/images/loading.png" : (icon == "False") ? "common/images/feedingit.png" : icon;
                        NumberAnimation on rotation {
                            from: 0; to: 360; running: (updating=="True"); loops: Animation.Infinite; duration: 900
                        }
                    }
                }

                Column {
                    x: 92; width: wrapper.ListView.view.width - 95; y: 5; spacing: 2
                    Text { text: title; color: "white"; width: parent.width; font.bold: true; elide: Text.ElideRight; style: Text.Raised; styleColor: "black" }
                    Text { text: updatedDate + " / " + qsTr("%1 unread items").arg(unread); color: (unread=="0") ? "white" : "#7b97fd"; width: parent.width; font.bold: false; elide: Text.ElideRight; style: Text.Raised; styleColor: "black" }
                    //Text { text: feedname; width: parent.width; elide: Text.ElideLeft; color: "#cccccc"; style: Text.Raised; styleColor: "black" }
                }
//                Item {
//                    x: wrapper.ListView.view.width - 128; y: 12
//                    height: 58; width: 58;
//                    //anchors.horizontalCenter: parent.horizontalCenter;
//                    Image { source: "common/images/wmEditIcon.png" }
//                    MouseArea {
//                        anchors.fill: parent; onClicked: { container.feedEdit(feedname, feedid, url); }
//                    }
//                    visible: inEditMode
//                }
                // Delete button (edit mode only).
                Item {
                    x: wrapper.ListView.view.width - 64; y: 12
                    height: 58; width: 58;
                    //anchors.horizontalCenter: parent.horizontalCenter;
                    Image { source: "common/images/delete.png" }
                    MouseArea {
                        // Bug fix: feedDeleted(catid, feedid) takes two
                        // arguments (see the main screen); the category id
                        // was missing, so the delete request was built with
                        // an undefined feed id.
                        anchors.fill: parent; onClicked: { container.feedDeleted(catid, feedid); }
                    }
                    visible: inEditMode
                }
            }
            MouseArea {
                anchors.fill: wrapper;
                onClicked: {
                    controller.feedClicked(model.feed)
                    container.feedClicked(feedid, updating=="True")

                }
            }
        }

    }

}
--- /dev/null
import QtQuick 1.1
import com.meego 1.0

// Simple demo page: pressing the button reveals the greeting label.
Page {
    id: mainPage
    tools: commonTools

    Label {
        id: label
        anchors.centerIn: parent
        visible: false
        text: qsTr("Hello world!")
    }

    Button {
        anchors {
            horizontalCenter: parent.horizontalCenter
            top: label.bottom
            topMargin: 10
        }
        text: qsTr("Click here!")
        onClicked: label.visible = true
    }
}
--- /dev/null
import Qt 4.7
import QtWebKit 1.0
import "common" as Common

// Test page: a zoomable WebView inside a Flickable, with a zoom slider
// overlaid near the bottom of the screen.
Rectangle {
    width: 380
    height: 480
    color: "white";

    property string url: "";

    Flickable {
        id: flickable
        height: parent.height;
        width: parent.width;
        // The flickable content must track the *scaled* web view size.
        // Note: webView.scale must be qualified - an unqualified "scale"
        // here resolves to the Flickable's own scale property (always
        // 1.0), which left contentWidth stuck at the unscaled width.
        contentWidth: webView.width*webView.scale;
        contentHeight: Math.max(parent.height,webView.height*webView.scale)

        WebView {
            id: webView
            url: "http://www.google.com";
            preferredWidth: flickable.width
            preferredHeight: flickable.height
            settings.defaultFontSize: 32
            scale: slider.value;
            onLoadFinished: {console.log(url); }
        }
    }
    Common.Slider {
        id: slider; visible: true
        minimum: 0.2;
        maximum: 2;
        value: 1
        // Previous zoom factor, used to keep the viewport centre fixed.
        property real prevScale: 1
        anchors {
            bottom: parent.bottom; bottomMargin: 65
            left: parent.left; leftMargin: 25
            right: parent.right; rightMargin: 25
        }
        // Re-centre the flickable so the point in the middle of the
        // viewport stays put while the scale changes.
        onValueChanged: {
            if (webView.width * value > flickable.width) {
                var xoff = (flickable.width/2 + flickable.contentX) * value / prevScale;
                flickable.contentX = xoff - flickable.width/2;
            }
            if (webView.height * value > flickable.height) {
                var yoff = (flickable.height/2 + flickable.contentY) * value / prevScale;
                flickable.contentY = yoff - flickable.height/2;
            }
            prevScale = value;
        }
        // Bounce the value so the change handler runs once at startup.
        Component.onCompleted: { value=0; value=1; }
    }
}
--- /dev/null
import Qt 4.7

// "Add category" panel: a single name field plus OK/Cancel buttons.
// The enclosing view is expected to provide container.addCategory().
Rectangle {
    id: addCat;
    width: 200 //parent.width
    height: 172
    color: "white"
    property alias catName: categoryName.text
    // Swallow clicks so they do not fall through to items underneath.
    MouseArea { anchors.fill: parent; onClicked: {} }
    Column {
        Row {
            width: addCat.width
            height: 86;
            Text { anchors.verticalCenter: parent.verticalCenter; text: qsTr("Category name:") }
            LineInput{
                id: categoryName
                // Children of a Row are positioned by the Row itself, so
                // horizontal anchors (centerIn) are not allowed here;
                // only vertical anchoring is legal.
                anchors.verticalCenter: parent.verticalCenter
                width: 140
                focus: true
            }
        }
        Row {
            width: addCat.width
            Button {
                id: ok
                text: qsTr("OK")
                anchors.margins: 5; y: 3; width: 80; height: 60
                onClicked: container.addCategory(categoryName.text)
            }

            Button {
                id: cancel
                text: qsTr("Cancel")
                anchors.margins: 5; y: 3; width: 80; height: 60
                onClicked: addCat.visible=false;
            }
        }
    }

}
--- /dev/null
import Qt 4.7

// "Add feed" panel: name and URL fields plus OK/Cancel buttons.  The
// enclosing view is expected to provide container.addFeed().
Rectangle {
    id: addFeed;
    width: 500 //parent.width
    height: 172
    color: "white"
    property alias feedName: feedName.text
    property string catid
    property string feedUrl: feedURL.text
    //property boolean feedEdit: false;

    // Swallow clicks so they do not fall through to items underneath.
    MouseArea { anchors.fill: parent; onClicked: {} }
    Column {
        Row {
            width: addFeed.width
            height: 86;
            Text { anchors.verticalCenter: parent.verticalCenter; text: qsTr("Feed name:") }
            LineInput{
                id: feedName
                // Row children may not use horizontal anchors
                // (centerIn); only vertical anchoring is legal here.
                anchors.verticalCenter: parent.verticalCenter
                width: 140
                focus: true
            }
        }
        Row {
            width: addFeed.width
            height: 86;
            Text { anchors.verticalCenter: parent.verticalCenter; text: qsTr("Feed URL:") }
            LineInput{
                id: feedURL
                anchors.verticalCenter: parent.verticalCenter
                width: 140
                focus: true
                text: "http://"
            }
        }
        Row {
            width: addFeed.width
            Button {
                id: ok
                text: qsTr("OK")
                anchors.margins: 5; y: 3; width: 80; height: 60
                onClicked: container.addFeed(catid, feedName.text, feedURL.text)
            }
            Button {
                id: cancel
                text: qsTr("Cancel")
                anchors.margins: 5; y: 3; width: 80; height: 60
                onClicked: addFeed.visible=false;
            }
        }
    }
}
--- /dev/null
import Qt 4.7

// Push button: shows either an icon (imageSource) or a text label on a
// bordered background, and emits clicked() on release.
Item {
    id: container

    signal clicked

    property string text
    property string imageSource: ""
    property int imageRotation: 0

    // Exposed so owners can animate the icon (e.g. a spinning
    // "updating" indicator in the toolbar).
    property alias iconRotation: icon.rotation

    // Normal background.
    BorderImage {
        id: buttonImage
        source: "images/toolbutton.sci"
        width: container.width; height: container.height
        //visible: (container.imageSource=="")
    }
    // Pressed background; faded in by the "Pressed" state below.
    BorderImage {
        id: pressed
        opacity: 0
        source: "images/toolbutton.sci"
        width: container.width; height: container.height
        //visible: (container.imageSource=="")
    }
    Image {
        id: icon
        source: container.imageSource
        rotation: container.imageRotation
        //fillMode: Image.PreserveAspectFit
        smooth: true
        anchors.centerIn: buttonImage;
        //width: container.width; height: container.height
    }
    MouseArea {
        id: mouseRegion
        anchors.fill: buttonImage
        onClicked: { container.clicked(); }
    }
    // The text label is only shown when no icon was supplied.
    Text {
        color: "white"
        anchors.centerIn: buttonImage; font.bold: true
        text: container.text; style: Text.Raised; styleColor: "black"
        visible: (container.imageSource=="")
    }
    states: [
        State {
            name: "Pressed"
            when: mouseRegion.pressed == true
            PropertyChanges { target: pressed; opacity: 1 }
        }
    ]
}
--- /dev/null
import Qt 4.7

// Yes/no confirmation overlay.  Emits okClicked()/cancelClicked(); the
// owner hides or destroys the dialog in response.
Rectangle {
    id: confirmationMessage
    signal okClicked
    signal cancelClicked

    // The prompt shown above the two buttons.
    property alias text: question.text

    border.color: "black";
    border.width : 4;
    radius: 10;
    color: "white"
    // NOTE(review): height/width are overridden by anchors.fill below,
    // which stretches the dialog over its whole parent - confirm this
    // is intended.
    height: 160;
    width: 160;
    z: 10;
    anchors.fill: parent

    Text {
        id: question
        text: qsTr("Are you sure?")
        width: parent.width; height: 80
        horizontalAlignment: Text.AlignHCenter
        verticalAlignment: Text.AlignVCenter
        anchors.top: parent.top
        //anchors.bottom: parent.bottom
        anchors.margins: 10;
        //anchors.verticalCenter: parent.verticalCenter
    }

    Button {
        id: ok
        text: qsTr("OK")
        width: parent.width/2 - 10;
        anchors.left: parent.left; anchors.margins: 5; y: 3; height: 60
        anchors.top: question.bottom
        //anchors.bottom: parent.bottom
        onClicked: confirmationMessage.okClicked()
    }

    Button {
        id: cancel
        text: qsTr("Cancel")
        width: parent.width/2 - 10;
        anchors.right: parent.right; anchors.margins: 5; y: 3; height: 60
        anchors.top: question.bottom
        //anchors.bottom: parent.bottom
        anchors.left: ok.right
        onClicked: confirmationMessage.cancelClicked()
    }

}
--- /dev/null
import Qt 4.7

// Single-line text entry drawn over the lineedit border image.
FocusScope {
    width: 180
    height: 28

    // Expose the embedded TextInput's text and length limit.
    property alias text: input.text
    property alias maximumLength: input.maximumLength

    BorderImage {
        anchors.fill: parent
        source: "images/lineedit.sci"
    }

    TextInput {
        id: input
        focus: true
        width: parent.width-16
        anchors.centerIn: parent
        color: "#151515"
        selectionColor: "green"
        font.bold: true
        font.pixelSize: 16
    }
}
--- /dev/null
import Qt 4.7

// Top-level category list view.
// NOTE(review): neither `categories` (the model) nor `categoryDelegate`
// is defined in this file - presumably they are provided by the
// embedding context (context properties or sibling components); verify.
Rectangle {
    width: 640
    height: 480

    ListView {
        id: categoryList; model: categories; delegate: categoryDelegate; z: 6;
        cacheBuffer: 100; width: parent.width; height: parent.height;
    }



}
--- /dev/null
import Qt 4.7

// Slide-in configuration panel.  Reads and writes settings through the
// `controller` and `config` context objects, and toggles rotation/edit
// state on the enclosing `container` item - all of these ids are
// defined outside this file.  Settings values are the strings
// "True"/"False", not booleans.
Item {
//    anchors.fill: parent;
    width: 300; //height: 0;
    //anchors.top: parent.top; anchors.bottom: parent.bottom
    // Parked above the visible area until the owner slides it in.
    y: -parent.height

    // Pull the current backend settings into `config`.
    function getConfig() {
        config.hideReadFeeds = controller.getConfig("hideReadFeeds");
        config.hideReadArticles = controller.getConfig("hideReadArticles");

    }

    Switch {
        id: hideReadFeedsSwitch;
        text: qsTr("Hide Read Feeds");
        value: config.hideReadFeeds
        // Toggle the string-typed flag.
        onClicked: config.hideReadFeeds = (config.hideReadFeeds == "False") ? "True" : "False"
    }

    Switch {
        id: hideReadArticlesSwitch;
        text: qsTr("Hide Read Articles");
        value: config.hideReadArticles
        onClicked: config.hideReadArticles = (config.hideReadArticles == "False") ? "True" : "False"
        anchors.top: hideReadFeedsSwitch.bottom
    }

    Switch {
        id: lockRotation;
        text: qsTr("Lock Rotation");
        value: container.lockRotation ? "True" : "False"
        // When locking, pin the current orientation; when unlocking,
        // fall back to automatic (unknown) orientation.
        onClicked: { container.lockRotation=!container.lockRotation;
            container.selectedOrientation = (container.lockRotation) ? container.activeOrientation : Orientation.UnknownOrientation }
        anchors.top: hideReadArticlesSwitch.bottom
    }

    Switch {
        id: editMode;
        text: qsTr("Enter Edit Mode");
        value: container.editMode ? "True" : "False"
        onClicked: { container.editMode=!container.editMode; }
        anchors.top: lockRotation.bottom
    }

    // Bottom bar that closes the panel when tapped.
    Rectangle {
        id: closeButton
        height: 50;
        gradient: Gradient {
            GradientStop {
                position: 0.00;
                color: "#343434";
            }
            GradientStop {
                position: 1.00;
                color: "#ffffff";
            }
        }
        radius: 10;
        width: parent.width
        anchors.top: editMode.bottom

        MouseArea {
            id: mouseRegion
            anchors.fill: closeButton
            onClicked: { config.isShown = false }
        }
    }

//    ListView {
//        id: configList; model: configs; delegate: configDelegate; z: 6;
//        cacheBuffer: 100; width: parent.width; height: parent.height;
//    }

//    XmlListModel {

//        id: configs

//        //source: "http://api.flickr.com/services/feeds/photos_public.gne?"+(tags ? "tags="+tags+"&" : "")+"format=rss2"
//        //source: "/home/ymarcoz/feedlist.xml"
//        source: "http://localhost:8000/config"
//        query: "/xml/config"
//        //namespaceDeclarations: "declare namespace media=\"http://search.yahoo.com/mrss/\";"

//        XmlRole { name: "hideReadFeeds"; query: "hideReadFeeds/string()" }
//        XmlRole { name: "hideReadArticles"; query: "hideReadArticles/string()" }
//        //XmlRole { name: "catid"; query: "catid/string()"; isKey: true }

//    }

//    Component {
//        id: configDelegate

//        Item {

//            id: wrapper; width: wrapper.ListView.view.width; height: 86
//            Item {
//                id: moveMe
//                height: parent.height
//                Rectangle { color: "black"; opacity: index % 2 ? 0.2 : 0.4; height: 84; width: wrapper.width; y: 1 }
//                Rectangle {
//                    x: 6; y: 4; width: 77; height: parent.height - 9; color: "white"; smooth: true

//                }
//                Column {
//                    x: 92; width: wrapper.ListView.view.width - 95; y: 15; spacing: 2
//                    Text { text: title; color: "white"; width: parent.width; font.bold: true; elide: Text.ElideRight; style: Text.Raised; styleColor: "black" }
//                    //Text { text: feedname; width: parent.width; elide: Text.ElideLeft; color: "#cccccc"; style: Text.Raised; styleColor: "black" }
//                }
//            }
//            MouseArea { anchors.fill: wrapper; onClicked: { container.categoryClicked(catid); } }
//        }
//    }

    // Load the persisted settings as soon as the panel exists.
    Component.onCompleted: getConfig();
}
--- /dev/null
import Qt 4.7

// Horizontal slider.  `value` is read/write and is kept in sync with
// the draggable handle's x position.
Item {
    id: slider; width: 340; height: 48

    // value is read/write.
    property real value
    property real maximum: 1
    property real minimum: 1
    // Rightmost handle position (2px padding on each side).
    property int xMax: slider.width - handle.width - 4

    // Reposition the handle when value is assigned programmatically.
    // Guard against maximum == minimum (the declared defaults), which
    // previously divided by zero and set handle.x to NaN.
    onValueChanged: {
        if (maximum > minimum)
            handle.x = 2 + (value - minimum) * slider.xMax / (maximum - minimum);
        else
            handle.x = 2;
    }

    // Groove.
    Rectangle {
        anchors.fill: parent
        border.color: "white"; border.width: 0; radius: 8
        gradient: Gradient {
            GradientStop { position: 0.0; color: "#66343434" }
            GradientStop { position: 1.0; color: "#66000000" }
        }
    }

    // Draggable handle.
    Rectangle {
        id: handle; smooth: true
        x: slider.width / 2 - handle.width / 2; y: 2; width: 30; height: slider.height-4; radius: 6
        gradient: Gradient {
            GradientStop { position: 0.0; color: "lightgray" }
            GradientStop { position: 1.0; color: "gray" }
        }

        MouseArea {
            anchors.fill: parent; drag.target: parent
            drag.axis: "XAxis"; drag.minimumX: 2; drag.maximumX: slider.xMax+2
            // Dragging drives value; same divide-by-zero guard as above.
            onPositionChanged: {
                if (maximum > minimum)
                    value = (maximum - minimum) * (handle.x-2) / slider.xMax + minimum;
            }
        }
    }
}
--- /dev/null
import Qt 4.7

// Settings row: a label on the left and a red/green ON-OFF indicator on
// the right.  `value` holds the strings "True"/"False" (matching the
// backend's string-typed settings).  Clicking only emits clicked(); the
// owner is responsible for flipping `value`.
Item {
    id: container

    signal clicked

    property string text
    property string value

    width: parent.width;
    height: 86;
    //anchors.fill: parent;

//    BorderImage {
//        id: buttonImage
//        source: "images/toolbutton.sci"
//        width: container.width; height: container.height
//    }
//    BorderImage {
//        id: pressed
//        opacity: 0
//        source: "images/toolbutton.sci"
//        width: container.width; height: container.height
//    }

    // Row background.
    Rectangle {
        id: back
        width: parent.width;
        height: 82;
        color: "#343434";
        border.width : 4;
        border.color: "black";
        radius: 10;
    }

    // Indicator: red/OFF while value is "False", green/ON otherwise.
    Rectangle {
        id: valueSwitch
        color: (value=="False") ? "red" : "green";
        border.width : 4;
        border.color: "black";
        radius: 10;
        height: 40;
        width: 40;
        anchors.verticalCenter: back.verticalCenter
        //anchors.verticalCenter: parent.verticalCenter
        anchors.margins: 10;
        anchors.right: back.right;
        Text {
            color: "white"
            anchors.centerIn: valueSwitch; font.bold: true
            text: (container.value == "False") ? "OFF" : "ON"; style: Text.Raised; styleColor: "black"
        }
    }

    MouseArea {
        id: mouseRegion
        anchors.fill: back
        onClicked: { container.clicked(); }
    }
    // Left-hand label.
    Text {
        color: "white"
        /*anchors.centerIn: back;*/ font.bold: true
        anchors.left: parent.left;
        anchors.margins: 10
        anchors.verticalCenter: back.verticalCenter
        text: container.text; style: Text.Raised; styleColor: "black"
    }
//    states: [
//        State {
//            name: "Pressed"
//            when: mouseRegion.pressed == true
//            PropertyChanges { target: pressed; opacity: 1 }
//        }
//    ]
}
--- /dev/null
import Qt 4.7

// Application toolbar.  All buttons are declared here; per-screen code
// toggles their visibility through the *Visible aliases and reacts to
// the corresponding *Clicked signals.
Item {
    id: toolbar

    property alias menuLabel: menuButton.text
    property alias backLabel: backButton.text
    property alias prevLabel: prevButton.text
    property alias nextLabel: nextButton.text
    property alias markAllLabel: markAllButton.text
    property alias zoomLabel: zoomButton.text
    property alias taskSwitcherLabel: taskSwitcherButton.text

    property alias nextVisible: nextButton.visible
    property alias prevVisible: prevButton.visible
    property alias markAllVisible: markAllButton.visible
    property alias zoomVisible: zoomButton.visible
    property alias quitVisible: quitButton.visible
    property alias addVisible: addButton.visible
    property alias updateVisible: updateFeedButton.visible

    // Drives the spinning update icon below.
    property bool feedUpdating: false

    signal menuClicked
    signal backClicked
    signal prevClicked
    signal nextClicked
    signal markAllClicked
    signal zoomClicked
    signal taskSwitcherClicked
    signal addClicked
    signal updateClicked
    //signal rotateClicked

    //BorderImage { source: "images/titlebar.sci"; width: parent.width; height: parent.height + 14; y: -7 }
    Rectangle {
        anchors.fill: parent; color: "#343434";
        border.color: "black"
        gradient: Gradient {
            GradientStop {
                position: 0.00;
                color: "#343434";
            }
            GradientStop {
                position: 1.00;
                color: "#ffffff";
            }
        }

        // Buttons are laid out left-to-right by the Row; the disabled
        // anchors in comments predate that layout.
        Row {
            anchors.fill: parent
            Button {
                id: taskSwitcherButton
                /*anchors.left: parent.left;*/ anchors.leftMargin: 5; y: 3; width: 116; height: 60
                onClicked: toolbar.taskSwitcherClicked()
                imageSource: "images/wmTaskLauncherIcon.png"
                visible: false
            }

            Button {
                id: menuButton
                /*anchors.left: taskSwitcherButton.right;*/ anchors.leftMargin: 5; y: 3; width: 60; height: 60
                onClicked: toolbar.menuClicked()
                imageSource: "images/wmEditIcon.png"
            }

            Button {
                id: addButton
                visible: true; /*anchors.left: menuButton.right;*/
                anchors.rightMargin: 5; y: 3; width: 60; height: 60
                onClicked: toolbar.addClicked()
                imageSource: "images/plus.png"

            }

            // Update button: spins its icon while a feed update runs;
            // the state machine swaps the icon between the static
            // "rotate" arrow and the loading spinner.
            Button {
                id: updateFeedButton
                visible: false; /*anchors.left: menuButton.right;*/
                anchors.rightMargin: 5; y: 3; width: 60; height: 60
                onClicked: toolbar.updateClicked()
                //imageSource: (!feedUpdating) ? "images/rotate.png" : "images/loading.png"
                NumberAnimation on iconRotation {
                    from: 0; to: 360; running: (visible == true) && (feedUpdating); loops: Animation.Infinite; duration: 900
                }
                state: "update"
                states : [State {name: "loading"; when: (feedUpdating);
                        PropertyChanges {target: updateFeedButton; imageSource: "images/loading2.png" }
                    }, State { name: "update"; when: (!feedUpdating);
                        PropertyChanges {target: updateFeedButton; iconRotation: 0}
                        PropertyChanges {target: updateFeedButton; imageSource: "images/rotate.png"}
                    }
                ]
            }

            Button {
                id: markAllButton
                visible: false
                /*anchors.left: updateFeedButton.right;*/ anchors.rightMargin: 5; y: 3; width: 60; height: 60
                onClicked: toolbar.markAllClicked()
                imageSource: "images/checkmark.png"
            }

            Button {
                id: prevButton
                visible: false
                /*anchors.left: menuButton.right;*/ anchors.rightMargin: 5; y: 3; width: 120; height: 60
                onClicked: toolbar.prevClicked()
                imageSource: "images/InputMethodShiftButtonNormal.png"
                imageRotation: -90;
            }

            Button {
                id: zoomButton
                visible: false
                /*anchors.right: backButton.left; */anchors.rightMargin: 5; y: 3; width: 80; height: 60
                onClicked: toolbar.zoomClicked()
                imageSource: "images/Zoom-In-icon.png"
            }

            Button {
                id: nextButton
                visible: false
                /*anchors.right: zoomButton.left;*/ anchors.rightMargin: 5; y: 3; width: 120; height: 60
                onClicked: toolbar.nextClicked()
                imageSource: "images/InputMethodShiftButtonNormal.png"
                imageRotation: 90
            }

            // Back and quit are mutually exclusive: back hides itself
            // whenever quit is shown.  Both emit backClicked.
            Button {
                id: backButton
                anchors.rightMargin: 5; y: 3; width: 116; height: 60
                anchors.right: parent.right
                onClicked: toolbar.backClicked()
                imageSource: "images/wmBackIcon.png"
                visible: !quitButton.visible
            }

            Button {
                id: quitButton
                visible: false
                anchors.rightMargin: 5; y: 3; width: 116; height: 60
                anchors.right: parent.right
                onClicked: toolbar.backClicked()
                imageSource: "images/wmCloseIcon.png"
            }
        }
    }
}
--- /dev/null
+border.left: 10
+border.top: 10
+border.bottom: 10
+border.right: 10
+source: lineedit.png
--- /dev/null
+border.left: 15
+border.top: 4
+border.bottom: 4
+border.right: 15
+source: toolbutton.png
--- /dev/null
+<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE TS>
+<TS version="2.0" language="en_CA">
+<context>
+ <name>FeedingItUI2</name>
+ <message>
+ <location filename="../FeedingItUI2.qml" line="53"/>
+ <source>Back</source>
+ <translation type="unfinished">Back 2</translation>
+ </message>
+</context>
+</TS>
--- /dev/null
+<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE TS>
+<TS version="2.0" language="en_CA">
+<context>
+ <name>FeedingItUI2</name>
+ <message>
+ <location filename="../FeedingItUI2.qml" line="75"/>
+ <source>Back</source>
+ <translation>Back 2</translation>
+ </message>
+ <message>
+ <location filename="../FeedingItUI2.qml" line="75"/>
+ <source>Config</source>
+ <translation>Config</translation>
+ </message>
+</context>
+<context>
+ <name>Feeds</name>
+ <message>
+ <source>unreadItems</source>
+ <translation type="obsolete"> %1 unread items</translation>
+ </message>
+ <message>
+ <location filename="../Feeds.qml" line="55"/>
+ <source>%1 unread items</source>
+ <translation>%1 unread items</translation>
+ </message>
+</context>
+</TS>
--- /dev/null
import QtQuick 1.0
import com.nokia.meego 1.0

// Application window: hosts a single page that loads the real UI
// (FeedingIt.qml) dynamically once the page has been instantiated.
PageStackWindow {
    initialPage: mainPage

    Page{
        id: mainPage
        Component.onCompleted: {
            var component = Qt.createComponent("FeedingIt.qml");
            component.createObject(mainPage);
        }
    }
}
--- /dev/null
+#!/usr/bin/env python2.5
+
+#
+# Copyright (c) 2007-2008 INdT.
+# Copyright (c) 2011 Neal H. Walfield
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+# ============================================================================
+# Name : FeedingIt.py
+# Author : Yves Marcoz
+# Version : 0.5.4
+# Description : Simple RSS Reader
+# ============================================================================
+
+from __future__ import with_statement
+
+import sqlite3
+from os.path import isfile, isdir
+from shutil import rmtree
+from os import mkdir, remove, utime
+import os
+import md5
+import feedparser
+import time
+import urllib2
+from BeautifulSoup import BeautifulSoup
+from urlparse import urljoin
+from calendar import timegm
+import threading
+import traceback
+from wc import wc, wc_init, woodchuck
+import subprocess
+import dbus
+from updatedbus import update_server_object
+
+from jobmanager import JobManager
+import mainthread
+from httpprogresshandler import HTTPProgressHandler
+import random
+import sys
+import logging
+logger = logging.getLogger(__name__)
+
def getId(string):
    """Return the hexadecimal MD5 digest of *string* (used as a stable cache key)."""
    digest = md5.new(string)
    return digest.hexdigest()
+
def download_callback(connection):
    """Abort an in-progress transfer once the job manager starts shutting down.

    Installed as the progress callback of HTTPProgressHandler; raising
    KeyboardInterrupt here unwinds the download.
    """
    if not JobManager().do_quit:
        return
    raise KeyboardInterrupt
+
def downloader(progress_handler=None, proxy=None):
    """Build a urllib2 opener for feed and image downloads.

    Uses *progress_handler* when supplied; otherwise installs a default
    HTTPProgressHandler that aborts the transfer on shutdown (see
    download_callback).  A proxy handler is appended when given.
    """
    if progress_handler is not None:
        handlers = [progress_handler]
    else:
        handlers = [HTTPProgressHandler(download_callback)]

    if proxy:
        handlers.append(proxy)

    return urllib2.build_opener(*handlers)
+
def transfer_stats(sent, received, **kwargs):
    """
    Snapshot transfer statistics and return a continuation.

    *sent* / *received* are the byte counters at the time of the call.
    The returned function, invoked later with the then-current
    counters, yields a tuple (bytes sent since the snapshot, bytes
    received since the snapshot, seconds elapsed since the snapshot).
    Extra keyword arguments are accepted and ignored so callers can
    splat a whole stats dict.
    """
    t0 = time.time()
    base_sent, base_received = sent, received

    def delta(sent, received, **kwargs):
        return (sent - base_sent,
                received - base_received,
                time.time() - t0)

    return delta
+
# If not None, a subprocess.Popen object corresponding to a
# update_feeds.py process.
update_feed_process = None

# Cached dbus.Interface proxy for org.marcoz.feedingit.  Reset to None
# whenever a call fails so the next update request re-binds it (see
# Feed.updateFeed).
update_feeds_iface = None

# NOTE(review): not referenced in the visible part of this file;
# presumably records the job count at startup - confirm before relying
# on it.
jobs_at_start = 0
+
class BaseObject(object):
    """
    Mixin adding a small, time-limited column cache on top of sqlite.

    Subclasses must provide a ``db`` attribute (a sqlite3 connection)
    and may list the (table, column) pairs to cache in
    ``cached_columns``.
    """

    # Columns to cache.  Classes that inherit from this and use the
    # cache mechanism should set this to a list of tuples, each of
    # which contains two entries: the table and the column.  Note that
    # both are case sensitive.
    cached_columns = ()

    def cache_invalidate(self, table=None):
        """
        Invalidate the cache.

        If table is not None, invalidate only the specified table.
        Otherwise, drop the whole cache.
        """
        if not hasattr(self, 'cache'):
            return

        if table is None:
            del self.cache
        else:
            if table in self.cache:
                del self.cache[table]

    def lookup(self, table, column, id=None):
        """
        Look up a column or value.  Uses a cache for columns in
        cached_columns.  Note: the column is returned unsorted.

        If *id* is given, return the single value of *column* for that
        row (None when the id is unknown); otherwise return all values
        of *column* in *table*.
        """
        if not hasattr(self, 'cache'):
            self.cache = {}

        # Cache data for at most 60 seconds.
        now = time.time()
        try:
            cache = self.cache[table]

            if time.time() - cache[None] > 60:
                # logger.debug("%s: Cache too old: clearing" % (table,))
                del self.cache[table]
                cache = None
        except KeyError:
            cache = None

        if (cache is None
            or (table, column) not in self.cached_columns):
            # The cache is empty or the caller wants a column that we
            # don't cache.
            if (table, column) in self.cached_columns:
                # logger.debug("%s: Rebuilding cache" % (table,))

                do_cache = True

                self.cache[table] = cache = {}
                columns = []
                for t, c in self.cached_columns:
                    if table == t:
                        cache[c] = {}
                        columns.append(c)

                # id is selected last, so each result row looks like
                # (value0, value1, ..., id).
                columns.append('id')
                where = ""
                params = ()
            else:
                do_cache = False

                # Fixed: this read `colums` (a NameError on every
                # non-cached lookup).
                columns = (column,)
                if id is not None:
                    # Parameterized to avoid quoting/injection problems
                    # with ids derived from feed content.
                    where = "where id = ?"
                    params = (id,)
                else:
                    where = ""
                    params = ()

            results = self.db.execute(
                "SELECT %s FROM %s %s" % (','.join(columns), table, where),
                params)

            if do_cache:
                for r in results:
                    values = list(r)
                    i = values.pop()
                    for index, value in enumerate(values):
                        cache[columns[index]][i] = value

                cache[None] = now
                # Fall through: answer the query from the fresh cache.
            else:
                # Not cacheable: answer straight from the result set.
                # (Previously this iterated a freshly-emptied list and
                # referenced an undefined name, always yielding [].)
                found = []
                for r in results:
                    if id is not None:
                        return r[0]
                    found.append(r[0])
                if id is not None:
                    # No matching row; mirror the cached path's None.
                    return None
                return found
        else:
            cache = self.cache[table]

        try:
            if id is not None:
                value = cache[column][id]
                # logger.debug("%s.%s:%s -> %s" % (table, column, id, value))
                return value
            else:
                return cache[column].values()
        except KeyError:
            # logger.debug("%s.%s:%s -> Not found" % (table, column, id))
            return None
+
+class Feed(BaseObject):
+ # Columns to cache.
+ cached_columns = (('feed', 'read'),
+ ('feed', 'title'))
+
+ serial_execution_lock = threading.Lock()
+
    def _getdb(self):
        """
        Return this feed's sqlite connection for the calling thread.

        sqlite3 connections must not be shared between threads, so one
        connection per thread is created lazily and stowed in the
        thread-local ``self.tls``.
        """
        try:
            db = self.tls.db
        except AttributeError:
            # First use on this thread: open and cache a connection.
            db = sqlite3.connect("%s/%s.db" % (self.dir, self.key), timeout=120)
            self.tls.db = db
        return db
    # Read-only property so callers can simply say `self.db`.
    db = property(_getdb)
+
    def __init__(self, configdir, key):
        """
        Bind to the feed identified by *key* under *configdir*.

        Creates the per-feed directory and its sqlite database (tables
        ``feed`` and ``images``) on first use.
        """
        self.key = key
        self.configdir = configdir
        self.dir = "%s/%s.d" %(self.configdir, self.key)
        # Per-thread sqlite connections live here (see _getdb).
        self.tls = threading.local ()

        if not isdir(self.dir):
            mkdir(self.dir)
        if not isfile("%s/%s.db" %(self.dir, self.key)):
            self.db.execute("CREATE TABLE feed (id text, title text, contentLink text, date float, updated float, link text, read int);")
            self.db.execute("CREATE TABLE images (id text, imagePath text);")
            self.db.commit()
+
+ def addImage(self, configdir, key, baseurl, url, proxy=None, opener=None):
+ filename = configdir+key+".d/"+getId(url)
+ if not isfile(filename):
+ try:
+ if not opener:
+ opener = downloader(proxy=proxy)
+
+ abs_url = urljoin(baseurl,url)
+ f = opener.open(abs_url)
+ try:
+ with open(filename, "w") as outf:
+ for data in f:
+ outf.write(data)
+ finally:
+ f.close()
+ except (urllib2.HTTPError, urllib2.URLError, IOError), exception:
+ logger.info("Could not download image %s: %s"
+ % (abs_url, str (exception)))
+ return None
+ except:
+ exception = sys.exc_info()[0]
+
+ logger.info("Downloading image %s: %s" %
+ (abs_url, traceback.format_exc()))
+ try:
+ remove(filename)
+ except OSError:
+ pass
+
+ return None
+ else:
+ #open(filename,"a").close() # "Touch" the file
+ file = open(filename,"a")
+ utime(filename, None)
+ file.close()
+ return filename
+
    def updateFeed(self, configdir, url, etag, modified, expiryTime=24, proxy=None, imageCache=False, priority=0, postFeedUpdateFunc=None, *postFeedUpdateFuncArgs):
        """
        Request an update of this feed.

        When running inside the update_feeds.py daemon, the update is
        queued directly on the JobManager.  Otherwise the request is
        forwarded over D-Bus to the daemon, spawning the daemon first
        if it is not already running.
        """
        if (os.path.basename(sys.argv[0]) == 'update_feeds.py'):
            # We *are* the daemon: run the update via the job queue.
            def doit():
                def it():
                    self._updateFeed(configdir, url, etag, modified, expiryTime, proxy, imageCache, postFeedUpdateFunc, *postFeedUpdateFuncArgs)
                return it
            JobManager().execute(doit(), self.key, priority=priority)
        else:
            def send_update_request():
                # Ask the daemon over the session bus to update this
                # feed.  Returns True on success; on failure drops the
                # cached proxy so the next attempt re-binds.
                global update_feeds_iface
                if update_feeds_iface is None:
                    bus=dbus.SessionBus()
                    remote_object = bus.get_object(
                        "org.marcoz.feedingit", # Connection name
                        "/org/marcoz/feedingit/update" # Object's path
                        )
                    update_feeds_iface = dbus.Interface(
                        remote_object, 'org.marcoz.feedingit')

                try:
                    update_feeds_iface.Update(self.key)
                except Exception, e:
                    logger.error("Invoking org.marcoz.feedingit.Update: %s"
                                 % str(e))
                    update_feeds_iface = None
                else:
                    return True

            if send_update_request():
                # Success!  It seems we were able to start the update
                # daemon via dbus (or, it was already running).
                return

            global update_feed_process
            if (update_feed_process is None
                or update_feed_process.poll() is not None):
                # The update_feeds process is not running.  Start it.
                update_feeds = os.path.join(os.path.dirname(__file__),
                                            'update_feeds.py')
                argv = ['/usr/bin/env', 'python', update_feeds, '--daemon' ]
                logger.debug("Starting update_feeds: running %s"
                             % (str(argv),))
                update_feed_process = subprocess.Popen(argv)
                # Make sure the dbus calls go to the right process:
                # rebind.
                update_feeds_iface = None

            # Give the freshly spawned daemon a few seconds to claim
            # the bus name before giving up.
            for _ in xrange(5):
                if send_update_request():
                    break
                time.sleep(1)
+
+ def _updateFeed(self, configdir, url, etag, modified, expiryTime=24, proxy=None, imageCache=False, postFeedUpdateFunc=None, *postFeedUpdateFuncArgs):
+ logger.debug("Updating %s" % url)
+
+ success = False
+ have_serial_execution_lock = False
+ try:
+ update_start = time.time ()
+
+ progress_handler = HTTPProgressHandler(download_callback)
+
+ openers = [progress_handler]
+ if proxy:
+ openers.append (proxy)
+ kwargs = {'handlers':openers}
+
+ feed_transfer_stats = transfer_stats(0, 0)
+
+ tmp=feedparser.parse(url, etag=etag, modified=modified, **kwargs)
+ download_duration = time.time () - update_start
+
+ opener = downloader(progress_handler, proxy)
+
+ if JobManager().do_quit:
+ raise KeyboardInterrupt
+
+ process_start = time.time()
+
+ # Expiry time is in hours
+ expiry = float(expiryTime) * 3600.
+
+ currentTime = 0
+
+ updated_objects = 0
+ new_objects = 0
+
+ def wc_success():
+ try:
+ wc().stream_register (self.key, "", 6 * 60 * 60)
+ except woodchuck.ObjectExistsError:
+ pass
+ try:
+ wc()[self.key].updated (
+ indicator=(woodchuck.Indicator.ApplicationVisual
+ |woodchuck.Indicator.StreamWide),
+ transferred_down=progress_handler.stats['received'],
+ transferred_up=progress_handler.stats['sent'],
+ transfer_time=update_start,
+ transfer_duration=download_duration,
+ new_objects=new_objects,
+ updated_objects=updated_objects,
+ objects_inline=new_objects + updated_objects)
+ except KeyError:
+ logger.warn(
+ "Failed to register update of %s with woodchuck!"
+ % (self.key))
+
+ http_status = tmp.get ('status', 200)
+
+ # Check if the parse was succesful. If the http status code
+ # is 304, then the download was successful, but there is
+ # nothing new. Indeed, no content is returned. This make a
+ # 304 look like an error because there are no entries and the
+ # parse fails. But really, everything went great! Check for
+ # this first.
+ if http_status == 304:
+ logger.debug("%s: No changes to feed." % (self.key,))
+ mainthread.execute(wc_success, async=True)
+ success = True
+ elif len(tmp["entries"])==0 and not tmp.version:
+ # An error occured fetching or parsing the feed. (Version
+ # will be either None if e.g. the connection timed our or
+ # '' if the data is not a proper feed)
+ logger.error(
+ "Error fetching %s: version is: %s: error: %s"
+ % (url, str (tmp.version),
+ str (tmp.get ('bozo_exception', 'Unknown error'))))
+ logger.debug(tmp)
+ def register_stream_update_failed(http_status):
+ def doit():
+ logger.debug("%s: stream update failed!" % self.key)
+
+ try:
+ # It's not easy to get the feed's title from here.
+ # At the latest, the next time the application is
+ # started, we'll fix up the human readable name.
+ wc().stream_register (self.key, "", 6 * 60 * 60)
+ except woodchuck.ObjectExistsError:
+ pass
+ ec = woodchuck.TransferStatus.TransientOther
+ if 300 <= http_status and http_status < 400:
+ ec = woodchuck.TransferStatus.TransientNetwork
+ if 400 <= http_status and http_status < 500:
+ ec = woodchuck.TransferStatus.FailureGone
+ if 500 <= http_status and http_status < 600:
+ ec = woodchuck.TransferStatus.TransientNetwork
+ wc()[self.key].update_failed(ec)
+ return doit
+ if wc().available:
+ mainthread.execute(
+ register_stream_update_failed(
+ http_status=http_status),
+ async=True)
+ else:
+ currentTime = time.time()
+ # The etag and modified value should only be updated if the content was not null
+ try:
+ etag = tmp["etag"]
+ except KeyError:
+ etag = None
+ try:
+ modified = tmp["modified"]
+ except KeyError:
+ modified = None
+ try:
+ abs_url = urljoin(tmp["feed"]["link"],"/favicon.ico")
+ f = opener.open(abs_url)
+ data = f.read()
+ f.close()
+ outf = open(self.dir+"/favicon.ico", "w")
+ outf.write(data)
+ outf.close()
+ del data
+ except (urllib2.HTTPError, urllib2.URLError), exception:
+ logger.debug("Could not download favicon %s: %s"
+ % (abs_url, str (exception)))
+
+ self.serial_execution_lock.acquire ()
+ have_serial_execution_lock = True
+
+ #reversedEntries = self.getEntries()
+ #reversedEntries.reverse()
+
+ ids = self.getIds()
+
+ tmp["entries"].reverse()
+ for entry in tmp["entries"]:
+ # Yield so as to make the main thread a bit more
+ # responsive.
+ time.sleep(0)
+
+ entry_transfer_stats = transfer_stats(
+ *feed_transfer_stats(**progress_handler.stats)[0:2])
+
+ if JobManager().do_quit:
+ raise KeyboardInterrupt
+
+ object_size = 0
+
+ date = self.extractDate(entry)
+ try:
+ entry["title"]
+ except KeyError:
+ entry["title"] = "No Title"
+ try :
+ entry["link"]
+ except KeyError:
+ entry["link"] = ""
+ try:
+ entry["author"]
+ except KeyError:
+ entry["author"] = None
+ if(not(entry.has_key("id"))):
+ entry["id"] = None
+ content = self.extractContent(entry)
+ object_size = len (content)
+ tmpEntry = {"title":entry["title"], "content":content,
+ "date":date, "link":entry["link"], "author":entry["author"], "id":entry["id"]}
+ id = self.generateUniqueId(tmpEntry)
+
+ current_version \
+ = self.db.execute('select date from feed where id=?',
+ (id,)).fetchone()
+ if (current_version is not None
+ and current_version[0] == date):
+ logger.debug("ALREADY DOWNLOADED %s (%s)"
+ % (entry["title"], entry["link"]))
+ continue
+
+ if current_version is not None:
+ # The version was updated. Mark it as unread.
+ logger.debug("UPDATED: %s (%s)"
+ % (entry["title"], entry["link"]))
+ self.setEntryUnread(id)
+ updated_objects += 1
+ else:
+ logger.debug("NEW: %s (%s)"
+ % (entry["title"], entry["link"]))
+ new_objects += 1
+
+ #articleTime = time.mktime(self.entries[id]["dateTuple"])
+ soup = BeautifulSoup(self.getArticle(tmpEntry)) #tmpEntry["content"])
+ images = soup('img')
+ baseurl = tmpEntry["link"]
+ #if not id in ids:
+ if imageCache and len(images) > 0:
+ self.serial_execution_lock.release ()
+ have_serial_execution_lock = False
+ for img in images:
+ filename = self.addImage(
+ configdir, self.key, baseurl, img['src'],
+ opener=opener)
+ if filename:
+ img['src']="file://%s" %filename
+ count = self.db.execute("SELECT count(1) FROM images where id=? and imagePath=?;", (id, filename )).fetchone()[0]
+ if count == 0:
+ self.db.execute("INSERT INTO images (id, imagePath) VALUES (?, ?);", (id, filename) )
+ self.db.commit()
+
+ try:
+ object_size += os.path.getsize (filename)
+ except os.error, exception:
+ logger.error ("Error getting size of %s: %s"
+ % (filename, exception))
+ self.serial_execution_lock.acquire ()
+ have_serial_execution_lock = True
+
+ tmpEntry["contentLink"] = configdir+self.key+".d/"+id+".html"
+ file = open(tmpEntry["contentLink"], "w")
+ file.write(soup.prettify())
+ file.close()
+ if id in ids:
+ self.db.execute("UPDATE feed SET updated=? WHERE id=?;", (currentTime, id) )
+ self.db.commit()
+ else:
+ values = (id, tmpEntry["title"], tmpEntry["contentLink"], tmpEntry["date"], currentTime, tmpEntry["link"], 0)
+ self.db.execute("INSERT INTO feed (id, title, contentLink, date, updated, link, read) VALUES (?, ?, ?, ?, ?, ?, ?);", values)
+ self.db.commit()
+# else:
+# try:
+# self.db.execute("UPDATE feed SET updated=? WHERE id=?;", (currentTime, id) )
+# self.db.commit()
+# filename = configdir+self.key+".d/"+id+".html"
+# file = open(filename,"a")
+# utime(filename, None)
+# file.close()
+# images = self.db.execute("SELECT imagePath FROM images where id=?;", (id, )).fetchall()
+# for image in images:
+# file = open(image[0],"a")
+# utime(image[0], None)
+# file.close()
+# except:
+# pass
+
+ # Register the object with Woodchuck and mark it as
+ # downloaded.
+ def register_object_transferred(
+ id, title, publication_time,
+ sent, received, object_size):
+ def doit():
+ logger.debug("Registering transfer of object %s"
+ % title)
+ try:
+ obj = wc()[self.key].object_register(
+ object_identifier=id,
+ human_readable_name=title)
+ except woodchuck.ObjectExistsError:
+ obj = wc()[self.key][id]
+ else:
+ obj.publication_time = publication_time
+ obj.transferred(
+ indicator=(
+ woodchuck.Indicator.ApplicationVisual
+ |woodchuck.Indicator.StreamWide),
+ transferred_down=received,
+ transferred_up=sent,
+ object_size=object_size)
+ return doit
+ if wc().available:
+ # If the entry does not contain a publication
+ # time, the attribute won't exist.
+ pubtime = entry.get('date_parsed', None)
+ if pubtime:
+ publication_time = time.mktime (pubtime)
+ else:
+ publication_time = None
+
+ sent, received, _ \
+ = entry_transfer_stats(**progress_handler.stats)
+ # sent and received are for objects (in
+ # particular, images) associated with this
+ # item. We also want to attribute the data
+ # transferred for the item's content. This is
+ # a good first approximation.
+ received += len(content)
+
+ mainthread.execute(
+ register_object_transferred(
+ id=id,
+ title=tmpEntry["title"],
+ publication_time=publication_time,
+ sent=sent, received=received,
+ object_size=object_size),
+ async=True)
+ self.db.commit()
+
+ sent, received, _ \
+ = feed_transfer_stats(**progress_handler.stats)
+ logger.debug (
+ "%s: Update successful: transferred: %d/%d; objects: %d)"
+ % (url, sent, received, len (tmp.entries)))
+ mainthread.execute (wc_success, async=True)
+ success = True
+
+ rows = self.db.execute("SELECT id FROM feed WHERE (read=0 AND updated<?) OR (read=1 AND updated<?);", (currentTime-2*expiry, currentTime-expiry))
+ for row in rows:
+ self.removeEntry(row[0])
+
+ from glob import glob
+ from os import stat
+ for file in glob(configdir+self.key+".d/*"):
+ #
+ stats = stat(file)
+ #
+ # put the two dates into matching format
+ #
+ lastmodDate = stats[8]
+ #
+ expDate = time.time()-expiry*3
+ # check if image-last-modified-date is outdated
+ #
+ if expDate > lastmodDate:
+ #
+ try:
+ #
+ #print 'Removing', file
+ #
+ # XXX: Tell woodchuck.
+ remove(file) # commented out for testing
+ #
+ except OSError, exception:
+ #
+ logger.error('Could not remove %s: %s'
+ % (file, str (exception)))
+ logger.debug("updated %s: %fs in download, %fs in processing"
+ % (self.key, download_duration,
+ time.time () - process_start))
+ except:
+ logger.error("Updating %s: %s" % (self.key, traceback.format_exc()))
+ finally:
+ self.db.commit ()
+
+ if have_serial_execution_lock:
+ self.serial_execution_lock.release ()
+
+ updateTime = 0
+ try:
+ rows = self.db.execute("SELECT MAX(date) FROM feed;")
+ for row in rows:
+ updateTime=row[0]
+ except Exception, e:
+ logger.error("Fetching update time: %s: %s"
+ % (str(e), traceback.format_exc()))
+ finally:
+ if not success:
+ etag = None
+ modified = None
+ title = None
+ try:
+ title = tmp.feed.title
+ except (AttributeError, UnboundLocalError), exception:
+ pass
+ if postFeedUpdateFunc is not None:
+ postFeedUpdateFunc (self.key, updateTime, etag, modified,
+ title, *postFeedUpdateFuncArgs)
+
+ self.cache_invalidate()
+
+ def setEntryRead(self, id):
+ self.db.execute("UPDATE feed SET read=1 WHERE id=?;", (id,) )
+ self.db.commit()
+
+ def doit():
+ try:
+ wc()[self.key][id].used()
+ except KeyError:
+ pass
+ if wc().available():
+ mainthread.execute(doit, async=True)
+ self.cache_invalidate('feed')
+
+ def setEntryUnread(self, id):
+ self.db.execute("UPDATE feed SET read=0 WHERE id=?;", (id,) )
+ self.db.commit()
+ self.cache_invalidate('feed')
+
+ def markAllAsRead(self):
+ self.db.execute("UPDATE feed SET read=1 WHERE read=0;")
+ self.db.commit()
+ self.cache_invalidate('feed')
+
+ def isEntryRead(self, id):
+ return self.lookup('feed', 'read', id) == 1
+
+ def getTitle(self, id):
+ return self.lookup('feed', 'title', id)
+
+ def getContentLink(self, id):
+ return self.db.execute("SELECT contentLink FROM feed WHERE id=?;", (id,) ).fetchone()[0]
+
+ def getExternalLink(self, id):
+ return self.db.execute("SELECT link FROM feed WHERE id=?;", (id,) ).fetchone()[0]
+
+ def getDate(self, id):
+ dateStamp = self.db.execute("SELECT date FROM feed WHERE id=?;", (id,) ).fetchone()[0]
+ return time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime(dateStamp))
+
+ def getDateTuple(self, id):
+ dateStamp = self.db.execute("SELECT date FROM feed WHERE id=?;", (id,) ).fetchone()[0]
+ return time.localtime(dateStamp)
+
+ def getDateStamp(self, id):
+ return self.db.execute("SELECT date FROM feed WHERE id=?;", (id,) ).fetchone()[0]
+
+ def generateUniqueId(self, entry):
+ """
+ Generate a stable identifier for the article. For the same
+ entry, this should result in the same identifier. If
+ possible, the identifier should remain the same even if the
+ article is updated.
+ """
+ # Prefer the entry's id, which is supposed to be globally
+ # unique.
+ key = entry.get('id', None)
+ if not key:
+ # Next, try the link to the content.
+ key = entry.get('link', None)
+ if not key:
+ # Ok, the title and the date concatenated are likely to be
+ # relatively stable.
+ key = entry.get('title', None) + entry.get('date', None)
+ if not key:
+ # Hmm, the article's content will at least guarantee no
+ # false negatives (i.e., missing articles)
+ key = entry.get('content', None)
+ if not key:
+ # If all else fails, just use a random number.
+ key = str (random.random ())
+ return getId (key)
+
+ def getIds(self, onlyUnread=False):
+ if onlyUnread:
+ rows = self.db.execute("SELECT id FROM feed where read=0 ORDER BY date DESC;").fetchall()
+ else:
+ rows = self.db.execute("SELECT id FROM feed ORDER BY date DESC;").fetchall()
+ ids = []
+ for row in rows:
+ ids.append(row[0])
+ #ids.reverse()
+ return ids
+
+ def getNextId(self, id, forward=True):
+ if forward:
+ delta = 1
+ else:
+ delta = -1
+ ids = self.getIds()
+ index = ids.index(id)
+ return ids[(index + delta) % len(ids)]
+
+ def getPreviousId(self, id):
+ return self.getNextId(id, forward=False)
+
+ def getNumberOfUnreadItems(self):
+ return self.db.execute("SELECT count(*) FROM feed WHERE read=0;").fetchone()[0]
+
+ def getNumberOfEntries(self):
+ return self.db.execute("SELECT count(*) FROM feed;").fetchone()[0]
+
+ def getArticle(self, entry):
+ #self.setEntryRead(id)
+ #entry = self.entries[id]
+ title = entry['title']
+ #content = entry.get('content', entry.get('summary_detail', {}))
+ content = entry["content"]
+
+ link = entry['link']
+ author = entry['author']
+ date = time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime(entry["date"]) )
+
+ #text = '''<div style="color: black; background-color: white;">'''
+ text = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">'
+ text += "<html><head><title>" + title + "</title>"
+ text += '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>\n'
+ #text += '<style> body {-webkit-user-select: none;} </style>'
+ text += '</head><body bgcolor=\"#ffffff\"><div><a href=\"' + link + '\">' + title + "</a>"
+ if author != None:
+ text += "<BR /><small><i>Author: " + author + "</i></small>"
+ text += "<BR /><small><i>Date: " + date + "</i></small></div>"
+ text += "<BR /><BR />"
+ text += content
+ text += "</body></html>"
+ return text
+
+ def getContent(self, id):
+ contentLink = self.db.execute("SELECT contentLink FROM feed WHERE id=?;", (id,)).fetchone()[0]
+ try:
+ file = open(self.entries[id]["contentLink"])
+ content = file.read()
+ file.close()
+ except:
+ content = "Content unavailable"
+ return content
+
+ def extractDate(self, entry):
+ if entry.has_key("updated_parsed"):
+ return timegm(entry["updated_parsed"])
+ elif entry.has_key("published_parsed"):
+ return timegm(entry["published_parsed"])
+ else:
+ return time.time()
+
+ def extractContent(self, entry):
+ content = ""
+ if entry.has_key('summary'):
+ content = entry.get('summary', '')
+ if entry.has_key('content'):
+ if len(entry.content[0].value) > len(content):
+ content = entry.content[0].value
+ if content == "":
+ content = entry.get('description', '')
+ return content
+
+ def removeEntry(self, id):
+ contentLink = self.db.execute("SELECT contentLink FROM feed WHERE id=?;", (id,)).fetchone()[0]
+ if contentLink:
+ try:
+ remove(contentLink)
+ except OSError, exception:
+ logger.error("Deleting %s: %s" % (contentLink, str (exception)))
+ self.db.execute("DELETE FROM feed WHERE id=?;", (id,) )
+ self.db.execute("DELETE FROM images WHERE id=?;", (id,) )
+ self.db.commit()
+
+ def doit():
+ try:
+ wc()[self.key][id].files_deleted (
+ woodchuck.DeletionResponse.Deleted)
+ del wc()[self.key][id]
+ except KeyError:
+ pass
+ if wc().available():
+ mainthread.execute (doit, async=True)
+
class ArchivedArticles(Feed):
    """A pseudo-feed holding articles the user explicitly archived."""

    def addArchivedArticle(self, title, link, date, configdir):
        """Insert a placeholder row for an archived article.

        The row is created with updated=0, which updateFeed() uses to
        find articles whose content still needs to be downloaded.
        """
        id = self.generateUniqueId({"date":date, "title":title})
        values = (id, title, link, date, 0, link, 0)
        self.db.execute("INSERT INTO feed (id, title, contentLink, date, updated, link, read) VALUES (?, ?, ?, ?, ?, ?, ?);", values)
        self.db.commit()

    def updateFeed(self, configdir, url, etag, modified, expiryTime=24, proxy=None, imageCache=False):
        """Download the content of any archived article not yet fetched.

        Unlike Feed.updateFeed this does not parse an RSS feed: each
        pending row's link is fetched directly and stored as the
        article's content.  Returns (currentTime, None, None) to mimic
        the base class's return convention.
        """
        currentTime = 0
        # fetchall(): the loop below updates the feed table, and
        # iterating a live cursor over a table being modified can skip
        # rows.
        rows = self.db.execute("SELECT id, link FROM feed WHERE updated=0;").fetchall()
        for row in rows:
            currentTime = time.time()
            id = row[0]
            link = row[1]
            f = urllib2.urlopen(link)
            html = f.read()
            f.close()
            soup = BeautifulSoup(html)
            # Rewrite image references to locally cached copies.
            images = soup('img')
            baseurl = link
            for img in images:
                filename = self.addImage(configdir, self.key, baseurl, img['src'], proxy=proxy)
                img['src']=filename
                self.db.execute("INSERT INTO images (id, imagePath) VALUES (?, ?);", (id, filename) )
                self.db.commit()
            contentLink = configdir+self.key+".d/"+id+".html"
            file = open(contentLink, "w")
            file.write(soup.prettify())
            file.close()

            self.db.execute("UPDATE feed SET read=0, contentLink=?, updated=? WHERE id=?;", (contentLink, time.time(), id) )
            self.db.commit()
        return (currentTime, None, None)

    def purgeReadArticles(self):
        """Remove all archived articles that have been marked as read."""
        # Materialize the ids first: removeArticle() deletes from the
        # same table the cursor would otherwise still be reading.
        rows = self.db.execute("SELECT id FROM feed WHERE read=1;").fetchall()
        for row in rows:
            self.removeArticle(row[0])

    def removeArticle(self, id):
        """Delete an archived article and any image files that no other
        article still references."""
        rows = self.db.execute("SELECT imagePath FROM images WHERE id=?;", (id,) ).fetchall()
        for row in rows:
            try:
                count = self.db.execute("SELECT count(*) FROM images WHERE id!=? and imagePath=?;", (id,row[0]) ).fetchone()[0]
                if count == 0:
                    os.remove(row[0])
            except (OSError, sqlite3.Error):
                # Best effort: a missing file or a database hiccup must
                # not prevent the article row itself from being removed.
                pass
        self.removeEntry(id)
+
+class Listing(BaseObject):
+ # Columns to cache.
+ cached_columns = (('feeds', 'updateTime'),
+ ('feeds', 'unread'),
+ ('feeds', 'title'),
+ ('categories', 'title'))
+
+ def _getdb(self):
+ try:
+ db = self.tls.db
+ except AttributeError:
+ db = sqlite3.connect("%s/feeds.db" % self.configdir, timeout=120)
+ self.tls.db = db
+ return db
+ db = property(_getdb)
+
+ # Lists all the feeds in a dictionary, and expose the data
+ def __init__(self, config, configdir):
+ self.config = config
+ self.configdir = configdir
+
+ self.tls = threading.local ()
+
+ try:
+ table = self.db.execute("SELECT sql FROM sqlite_master").fetchone()
+ if table == None:
+ self.db.execute("CREATE TABLE feeds(id text, url text, title text, unread int, updateTime float, rank int, etag text, modified text, widget int, category int);")
+ self.db.execute("CREATE TABLE categories(id text, title text, unread int, rank int);")
+ self.addCategory("Default Category")
+ if isfile(self.configdir+"feeds.pickle"):
+ self.importOldFormatFeeds()
+ else:
+ self.addFeed("Maemo News", "http://maemo.org/news/items.xml")
+ else:
+ from string import find, upper
+ if find(upper(table[0]), "WIDGET")<0:
+ self.db.execute("ALTER TABLE feeds ADD COLUMN widget int;")
+ self.db.execute("UPDATE feeds SET widget=1;")
+ self.db.commit()
+ if find(upper(table[0]), "CATEGORY")<0:
+ self.db.execute("CREATE TABLE categories(id text, title text, unread int, rank int);")
+ self.addCategory("Default Category")
+ self.db.execute("ALTER TABLE feeds ADD COLUMN category int;")
+ self.db.execute("UPDATE feeds SET category=1;")
+ self.db.commit()
+ except:
+ pass
+
+ # Check that Woodchuck's state is up to date with respect our
+ # state.
+ updater = os.path.basename(sys.argv[0]) == 'update_feeds.py'
+ wc_init (self, True if updater else False)
+ if wc().available() and updater:
+ # The list of known streams.
+ streams = wc().streams_list ()
+ stream_ids = [s.identifier for s in streams]
+
+ # Register any unknown streams. Remove known streams from
+ # STREAMS_IDS.
+ for key in self.getListOfFeeds():
+ title = self.getFeedTitle(key)
+ # XXX: We should also check whether the list of
+ # articles/objects in each feed/stream is up to date.
+ if key not in stream_ids:
+ logger.debug(
+ "Registering previously unknown channel: %s (%s)"
+ % (key, title,))
+ # Use a default refresh interval of 6 hours.
+ wc().stream_register (key, title, 6 * 60 * 60)
+ else:
+ # Make sure the human readable name is up to date.
+ if wc()[key].human_readable_name != title:
+ wc()[key].human_readable_name = title
+ stream_ids.remove (key)
+
+
+ # Unregister any streams that are no longer subscribed to.
+ for id in stream_ids:
+ logger.debug("Unregistering %s" % (id,))
+ w.stream_unregister (id)
+
+ def importOldFormatFeeds(self):
+ """This function loads feeds that are saved in an outdated format, and converts them to sqlite"""
+ import rss
+ listing = rss.Listing(self.configdir)
+ rank = 0
+ for id in listing.getListOfFeeds():
+ try:
+ rank += 1
+ values = (id, listing.getFeedTitle(id) , listing.getFeedUrl(id), 0, time.time(), rank, None, "None", 1)
+ self.db.execute("INSERT INTO feeds (id, title, url, unread, updateTime, rank, etag, modified, widget, category) VALUES (?, ?, ? ,? ,? ,?, ?, ?, ?, 1);", values)
+ self.db.commit()
+
+ feed = listing.getFeed(id)
+ new_feed = self.getFeed(id)
+
+ items = feed.getIds()[:]
+ items.reverse()
+ for item in items:
+ if feed.isEntryRead(item):
+ read_status = 1
+ else:
+ read_status = 0
+ date = timegm(feed.getDateTuple(item))
+ title = feed.getTitle(item)
+ newId = new_feed.generateUniqueId({"date":date, "title":title})
+ values = (newId, title , feed.getContentLink(item), date, tuple(time.time()), feed.getExternalLink(item), read_status)
+ new_feed.db.execute("INSERT INTO feed (id, title, contentLink, date, updated, link, read) VALUES (?, ?, ?, ?, ?, ?, ?);", values)
+ new_feed.db.commit()
+ try:
+ images = feed.getImages(item)
+ for image in images:
+ new_feed.db.execute("INSERT INTO images (id, imagePath) VALUES (?, ?);", (item, image) )
+ new_feed.db.commit()
+ except:
+ pass
+ self.updateUnread(id)
+ except:
+ logger.error("importOldFormatFeeds: %s"
+ % (traceback.format_exc(),))
+ remove(self.configdir+"feeds.pickle")
+
+
+ def addArchivedArticle(self, key, index):
+ feed = self.getFeed(key)
+ title = feed.getTitle(index)
+ link = feed.getExternalLink(index)
+ date = feed.getDate(index)
+ count = self.db.execute("SELECT count(*) FROM feeds where id=?;", ("ArchivedArticles",) ).fetchone()[0]
+ if count == 0:
+ self.addFeed("Archived Articles", "", id="ArchivedArticles")
+
+ archFeed = self.getFeed("ArchivedArticles")
+ archFeed.addArchivedArticle(title, link, date, self.configdir)
+ self.updateUnread("ArchivedArticles")
+
+ def updateFeed(self, key, expiryTime=None, proxy=None, imageCache=None,
+ priority=0):
+ if expiryTime is None:
+ expiryTime = self.config.getExpiry()
+ if not expiryTime:
+ # Default to 24 hours
+ expriyTime = 24
+ if proxy is None:
+ (use_proxy, proxy) = self.config.getProxy()
+ if not use_proxy:
+ proxy = None
+ if imageCache is None:
+ imageCache = self.config.getImageCache()
+
+ feed = self.getFeed(key)
+ (url, etag, modified) = self.db.execute("SELECT url, etag, modified FROM feeds WHERE id=?;", (key,) ).fetchone()
+ try:
+ modified = time.struct_time(eval(modified))
+ except:
+ modified = None
+ feed.updateFeed(
+ self.configdir, url, etag, modified, expiryTime, proxy, imageCache,
+ priority, postFeedUpdateFunc=self._queuePostFeedUpdate)
+
+ def _queuePostFeedUpdate(self, *args, **kwargs):
+ mainthread.execute (self._postFeedUpdate, async=True, *args, **kwargs)
+
+ def _postFeedUpdate(self, key, updateTime, etag, modified, title):
+ if modified==None:
+ modified="None"
+ else:
+ modified=str(tuple(modified))
+ if updateTime > 0:
+ self.db.execute("UPDATE feeds SET updateTime=?, etag=?, modified=? WHERE id=?;", (updateTime, etag, modified, key) )
+ else:
+ self.db.execute("UPDATE feeds SET etag=?, modified=? WHERE id=?;", (etag, modified, key) )
+
+ if title is not None:
+ self.db.execute("UPDATE feeds SET title=(case WHEN title=='' THEN ? ELSE title END) where id=?;",
+ (title, key))
+ self.db.commit()
+ self.cache_invalidate('feeds')
+ self.updateUnread(key)
+
+ update_server_object().ArticleCountUpdated()
+
+ stats = JobManager().stats()
+ global jobs_at_start
+ completed = stats['jobs-completed'] - jobs_at_start
+ in_progress = stats['jobs-in-progress']
+ queued = stats['jobs-queued']
+
+ try:
+ percent = (100 * ((completed + in_progress / 2.))
+ / (completed + in_progress + queued))
+ except ZeroDivisionError:
+ percent = 100
+
+ update_server_object().UpdateProgress(
+ percent, completed, in_progress, queued, 0, 0, 0, key)
+
+ if in_progress == 0 and queued == 0:
+ jobs_at_start = stats['jobs-completed']
+
+ def getFeed(self, key):
+ if key == "ArchivedArticles":
+ return ArchivedArticles(self.configdir, key)
+ return Feed(self.configdir, key)
+
+ def editFeed(self, key, title, url, category=None):
+ if category:
+ self.db.execute("UPDATE feeds SET title=?, url=?, category=? WHERE id=?;", (title, url, category, key))
+ else:
+ self.db.execute("UPDATE feeds SET title=?, url=? WHERE id=?;", (title, url, key))
+ self.db.commit()
+ self.cache_invalidate('feeds')
+
+ if wc().available():
+ try:
+ wc()[key].human_readable_name = title
+ except KeyError:
+ logger.debug("Feed %s (%s) unknown." % (key, title))
+
+ def getFeedUpdateTime(self, key):
+ update_time = self.lookup('feeds', 'updateTime', key)
+
+ if not update_time:
+ return "Never"
+
+ delta = time.time() - update_time
+
+ delta_hours = delta / (60. * 60.)
+ if delta_hours < .1:
+ return "A few minutes ago"
+ if delta_hours < .75:
+ return "Less than an hour ago"
+ if delta_hours < 1.5:
+ return "About an hour ago"
+ if delta_hours < 18:
+ return "About %d hours ago" % (int(delta_hours + 0.5),)
+
+ delta_days = delta_hours / 24.
+ if delta_days < 1.5:
+ return "About a day ago"
+ if delta_days < 18:
+ return "%d days ago" % (int(delta_days + 0.5),)
+
+ delta_weeks = delta_days / 7.
+ if delta_weeks <= 8:
+ return "%d weeks ago" % int(delta_weeks + 0.5)
+
+ delta_months = delta_days / 30.
+ if delta_months <= 30:
+ return "%d months ago" % int(delta_months + 0.5)
+
+ return time.strftime("%x", time.gmtime(update_time))
+
+ def getFeedNumberOfUnreadItems(self, key):
+ return self.lookup('feeds', 'unread', key)
+
+ def getFeedTitle(self, key):
+ title = self.lookup('feeds', 'title', key)
+ if title:
+ return title
+
+ return self.getFeedUrl(key)
+
+ def getFeedUrl(self, key):
+ return self.db.execute("SELECT url FROM feeds WHERE id=?;", (key,)).fetchone()[0]
+
+ def getFeedCategory(self, key):
+ return self.db.execute("SELECT category FROM feeds WHERE id=?;", (key,)).fetchone()[0]
+
+ def getListOfFeeds(self, category=None):
+ if category:
+ rows = self.db.execute("SELECT id FROM feeds WHERE category=? ORDER BY rank;", (category, ) )
+ else:
+ rows = self.db.execute("SELECT id FROM feeds ORDER BY rank;" )
+ keys = []
+ for row in rows:
+ if row[0]:
+ keys.append(row[0])
+ return keys
+
+ def getListOfCategories(self):
+ return list(row[0] for row in self.db.execute(
+ "SELECT id FROM categories ORDER BY rank;"))
+
+ def getCategoryTitle(self, id):
+ return self.lookup('categories', 'title', id)
+
+ def getSortedListOfKeys(self, order, onlyUnread=False, category=1):
+ if order == "Most unread":
+ tmp = "ORDER BY unread DESC"
+ #keyorder = sorted(feedInfo, key = lambda k: feedInfo[k][1], reverse=True)
+ elif order == "Least unread":
+ tmp = "ORDER BY unread"
+ #keyorder = sorted(feedInfo, key = lambda k: feedInfo[k][1])
+ elif order == "Most recent":
+ tmp = "ORDER BY updateTime DESC"
+ #keyorder = sorted(feedInfo, key = lambda k: feedInfo[k][2], reverse=True)
+ elif order == "Least recent":
+ tmp = "ORDER BY updateTime"
+ #keyorder = sorted(feedInfo, key = lambda k: feedInfo[k][2])
+ else: # order == "Manual" or invalid value...
+ tmp = "ORDER BY rank"
+ #keyorder = sorted(feedInfo, key = lambda k: feedInfo[k][0])
+ if onlyUnread:
+ sql = "SELECT id FROM feeds WHERE unread>0 AND category=%s " %category + tmp
+ else:
+ sql = "SELECT id FROM feeds WHERE category=%s " %category + tmp
+ rows = self.db.execute(sql)
+ keys = []
+ for row in rows:
+ if row[0]:
+ keys.append(row[0])
+ return keys
+
+ def getFavicon(self, key):
+ filename = "%s%s.d/favicon.ico" % (self.configdir, key)
+ if isfile(filename):
+ return filename
+ else:
+ return False
+
+ def updateUnread(self, key):
+ feed = self.getFeed(key)
+ self.db.execute("UPDATE feeds SET unread=? WHERE id=?;", (feed.getNumberOfUnreadItems(), key))
+ self.db.commit()
+ self.cache_invalidate('feeds')
+
+ def addFeed(self, title, url, id=None, category=1):
+ if not id:
+ id = getId(url)
+ count = self.db.execute("SELECT count(*) FROM feeds WHERE id=?;", (id,) ).fetchone()[0]
+ if count == 0:
+ max_rank = self.db.execute("SELECT MAX(rank) FROM feeds;").fetchone()[0]
+ if max_rank == None:
+ max_rank = 0
+ values = (id, title, url, 0, 0, max_rank+1, None, "None", 1, category)
+ self.db.execute("INSERT INTO feeds (id, title, url, unread, updateTime, rank, etag, modified, widget, category) VALUES (?, ?, ? ,? ,? ,?, ?, ?, ?,?);", values)
+ self.db.commit()
+ # Ask for the feed object, it will create the necessary tables
+ self.getFeed(id)
+
+ if wc().available():
+ # Register the stream with Woodchuck. Update approximately
+ # every 6 hours.
+ wc().stream_register(stream_identifier=id,
+ human_readable_name=title,
+ freshness=6*60*60)
+
+ return True
+ else:
+ return False
+
+ def addCategory(self, title):
+ rank = self.db.execute("SELECT MAX(rank)+1 FROM categories;").fetchone()[0]
+ if rank==None:
+ rank=1
+ id = self.db.execute("SELECT MAX(id)+1 FROM categories;").fetchone()[0]
+ if id==None:
+ id=1
+ self.db.execute("INSERT INTO categories (id, title, unread, rank) VALUES (?, ?, 0, ?)", (id, title, rank))
+ self.db.commit()
+
+ def removeFeed(self, key):
+ if wc().available ():
+ try:
+ del wc()[key]
+ except KeyError:
+ logger.debug("Removing unregistered feed %s failed" % (key,))
+
+ rank = self.db.execute("SELECT rank FROM feeds WHERE id=?;", (key,) ).fetchone()[0]
+ self.db.execute("DELETE FROM feeds WHERE id=?;", (key, ))
+ self.db.execute("UPDATE feeds SET rank=rank-1 WHERE rank>?;", (rank,) )
+ self.db.commit()
+
+ if isdir(self.configdir+key+".d/"):
+ rmtree(self.configdir+key+".d/")
+
+ def removeCategory(self, key):
+ if self.db.execute("SELECT count(*) FROM categories;").fetchone()[0] > 1:
+ rank = self.db.execute("SELECT rank FROM categories WHERE id=?;", (key,) ).fetchone()[0]
+ self.db.execute("DELETE FROM categories WHERE id=?;", (key, ))
+ self.db.execute("UPDATE categories SET rank=rank-1 WHERE rank>?;", (rank,) )
+ self.db.execute("UPDATE feeds SET category=1 WHERE category=?;", (key,) )
+ self.db.commit()
+
+ #def saveConfig(self):
+ # self.listOfFeeds["feedingit-order"] = self.sortedKeys
+ # file = open(self.configdir+"feeds.pickle", "w")
+ # pickle.dump(self.listOfFeeds, file)
+ # file.close()
+
+ def moveUp(self, key):
+ rank = self.db.execute("SELECT rank FROM feeds WHERE id=?;", (key,)).fetchone()[0]
+ if rank>0:
+ self.db.execute("UPDATE feeds SET rank=? WHERE rank=?;", (rank, rank-1) )
+ self.db.execute("UPDATE feeds SET rank=? WHERE id=?;", (rank-1, key) )
+ self.db.commit()
+
+ def moveCategoryUp(self, key):
+ rank = self.db.execute("SELECT rank FROM categories WHERE id=?;", (key,)).fetchone()[0]
+ if rank>0:
+ self.db.execute("UPDATE categories SET rank=? WHERE rank=?;", (rank, rank-1) )
+ self.db.execute("UPDATE categories SET rank=? WHERE id=?;", (rank-1, key) )
+ self.db.commit()
+
+ def moveDown(self, key):
+ rank = self.db.execute("SELECT rank FROM feeds WHERE id=?;", (key,)).fetchone()[0]
+ max_rank = self.db.execute("SELECT MAX(rank) FROM feeds;").fetchone()[0]
+ if rank<max_rank:
+ self.db.execute("UPDATE feeds SET rank=? WHERE rank=?;", (rank, rank+1) )
+ self.db.execute("UPDATE feeds SET rank=? WHERE id=?;", (rank+1, key) )
+ self.db.commit()
+
+ def moveCategoryDown(self, key):
+ rank = self.db.execute("SELECT rank FROM categories WHERE id=?;", (key,)).fetchone()[0]
+ max_rank = self.db.execute("SELECT MAX(rank) FROM categories;").fetchone()[0]
+ if rank<max_rank:
+ self.db.execute("UPDATE categories SET rank=? WHERE rank=?;", (rank, rank+1) )
+ self.db.execute("UPDATE categories SET rank=? WHERE id=?;", (rank+1, key) )
+ self.db.commit()
+
+
--- /dev/null
+#!/usr/bin/env python2.5
+
+#
+# Copyright (c) 2007-2008 INdT.
+# Copyright (c) 2011 Neal H. Walfield
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+# ============================================================================
+# Name : update_feeds.py
+# Author : Yves Marcoz
+# Version : 0.6.1
+# Description : Simple RSS Reader
+# ============================================================================
+
+from rss_sqlite import Listing
+from config import Config
+from updatedbus import UpdateServerObject
+
+import os
+import traceback
+import sys
+import dbus
+
+from jobmanager import JobManager
+import mainthread
+
+import gobject
+gobject.threads_init()
+
+import logging
+logger = logging.getLogger(__name__)
+import debugging
+debugging.init(dot_directory=".feedingit", program_name="update_feeds")
+
#CONFIGDIR="/home/user/.feedingit/"
# Per-user configuration and cache directory for FeedingIt.
CONFIGDIR = os.environ.get("HOME", "/home/user") + "/.feedingit/"
#DESKTOP_FILE = "/usr/share/applications/hildon-status-menu/feedingit_status.desktop"

# Apply a global socket timeout so a stalled server cannot hang a feed
# update indefinitely.
from socket import setdefaulttimeout
timeout = 5
setdefaulttimeout(timeout)
del timeout
+
+class FeedUpdate(UpdateServerObject):
    def __init__(self, bus_name):
        """Set up the updater: load configuration, hook the job manager,
        then either wait for D-Bus commands (--daemon) or update all
        feeds immediately.

        :param bus_name: D-Bus bus name handed to UpdateServerObject.
        """
        UpdateServerObject.__init__(self, bus_name)

        self.config = Config(self, CONFIGDIR+"config.ini")
        self.listing = Listing(self.config, CONFIGDIR)

        # Own the job manager and get stats callbacks on the main thread.
        jm = JobManager(True)
        jm.stats_hook_register (self.job_manager_update,
                                run_in_main_thread=True)

        # Whether or not an update is in progress.
        self.am_updating = False

        # After an update has finished, we start the inactivity timer.
        # If this fires before a new job arrives, we quit.
        self.inactivity_timer = 0

        # Whether we started in daemon mode, or not.
        self.daemon = '--daemon' in sys.argv

        if self.daemon:
            logger.debug("Running in daemon mode: waiting for commands.")
            # Quit after five idle minutes if no work ever arrives.
            self.inactivity_timer = gobject.timeout_add(
                5 * 60 * 1000, self.inactivity_cb)
        else:
            # Update all feeds.
            logger.debug("Not running in daemon mode: updating all feeds.")
            gobject.idle_add(self.UpdateAll)
+
+# # If the system becomes idle
+# bus = dbus.SystemBus()
+#
+# mce_request_proxy = bus.get_object(
+# 'com.nokia.mce', '/com/nokia/mce/request')
+# mce_request_iface = dbus.Interface(
+# mce_request_proxy, 'com.nokia.mce.request')
+# system_idle = mce_request_iface.get_inactivity_status()
+# # Force self.system_inactivity_ind to run: ensure that a state
+# # change occurs.
+# self.system_idle = not system_idle
+# self.system_inactivity_ind(system_idle)
+#
+# mce_signal_proxy = bus.get_object(
+# 'com.nokia.mce', '/com/nokia/mce/signal')
+# mce_signal_iface = dbus.Interface(
+# mce_signal_proxy, 'com.nokia.mce.signal')
+# mce_signal_iface.connect_to_signal(
+# 'system_inactivity_ind', self.system_inactivity_ind)
+
+ def increase_download_parallelism(self):
+ # The system has been idle for a while. Enable parallel
+ # downloads.
+ logger.debug("Increasing parallelism to 4 workers.")
+ JobManager().num_threads = 4
+ gobject.source_remove (self.increase_download_parallelism_id)
+ del self.increase_download_parallelism_id
+ return False
+
+ def system_inactivity_ind(self, idle):
+ # The system's idle state changed.
+ if (self.system_idle and idle) or (not self.system_idle and not idle):
+ # No change.
+ return
+
+ if not idle:
+ if hasattr (self, 'increase_download_parallelism_id'):
+ gobject.source_remove (self.increase_download_parallelism_id)
+ del self.increase_download_parallelism_id
+ else:
+ self.increase_download_parallelism_id = \
+ gobject.timeout_add_seconds(
+ 60, self.increase_download_parallelism)
+
+ if not idle:
+ logger.debug("Reducing parallelism to 1 worker.")
+ JobManager().num_threads = 1
+
+ self.system_idle = idle
+
    def job_manager_update(self, jm, old_stats, new_stats, updated_feed):
        """JobManager stats hook (registered to run in the main thread).

        Tracks whether an update is in progress, emits the D-Bus
        start/finish signals, and manages the daemon inactivity timer.
        """
        queued = new_stats['jobs-queued']
        in_progress = new_stats['jobs-in-progress']

        # First job appeared: emit the start signals exactly once.
        if (queued or in_progress) and not self.am_updating:
            logger.debug("new update started")
            self.am_updating = True
            self.UpdateStarted()
            self.UpdateProgress(0, 0, in_progress, queued, 0, 0, 0, "")

        # Queue drained: the update is over.
        if not queued and not in_progress:
            logger.debug("update finished!")
            self.am_updating = False
            self.UpdateFinished()
            self.ArticleCountUpdated()

            if self.daemon:
                # Stay resident for a while in case more requests
                # arrive; inactivity_cb quits if nothing happens.
                self.inactivity_timer = gobject.timeout_add(
                    60 * 1000, self.inactivity_cb)
            else:
                logger.debug("update finished, not running in daemon mode: "
                             "quitting")
                mainloop.quit()

        # Work is pending again: quitting would be premature, so cancel
        # any armed inactivity timer.
        if (queued or in_progress) and self.inactivity_timer:
            gobject.source_remove(self.inactivity_timer)
            self.inactivity_timer = 0
+
    def inactivity_cb(self):
        """
        The updater has been inactive for a while. Quit.
        """
        # Only ever invoked by the timer we armed ourselves.
        assert self.inactivity_timer
        # One-shot timeout (we implicitly return None, which glib
        # treats as False), so forget the source id.
        self.inactivity_timer = 0

        if not self.am_updating:
            logger.info("Nothing to do for a while. Quitting.")
            mainloop.quit()
+
+ def StopUpdate(self):
+ """
+ Stop updating.
+ """
+ super(FeedUpdate, self).stopUpdate()
+
+ JobManager().quit()
+
+ def UpdateAll(self):
+ """
+ Update all feeds.
+ """
+ logger.info("starting update.")
+ super(FeedUpdate, self).UpdateAll()
+
+ feeds = self.listing.getListOfFeeds()
+ for k in feeds:
+ self.listing.updateFeed(k)
+ logger.debug("Queued all feeds (%d) for update." % len(feeds))
+
+ def Update(self, feed):
+ """
+ Update a particular feed.
+ """
+ super(FeedUpdate, self).Update(feed)
+
+ # We got a request via dbus. If we weren't in daemon mode
+ # before, enter it now.
+ self.daemon = True
+
+ self.listing.updateFeed(feed)
+
+
import dbus.mainloop.glib
dbus.mainloop.glib.DBusGMainLoop(set_as_default=True)

mainloop = gobject.MainLoop()
mainthread.init()

# Acquire our name on the session bus.  If this doesn't work, most
# likely another update_feeds instance is already running.  In this
# case, just quit.
try:
    bus_name = dbus.service.BusName('org.marcoz.feedingit',
                                    bus=dbus.SessionBus(),
                                    do_not_queue=True)
except Exception:
    # We failed to acquire our bus name.  Die.
    try:
        # Best effort: report the pid of the process that owns the name.
        dbus_proxy = dbus.SessionBus().get_object(
            'org.freedesktop.DBus', '/org/freedesktop/DBus')
        dbus_iface = dbus.Interface(dbus_proxy, 'org.freedesktop.DBus')
        pid = dbus_iface.GetConnectionUnixProcessID('org.marcoz.feedingit')
        logger.error("update_feeds already running: pid %d." % pid)
    except Exception, e:
        logger.error("Getting pid associated with org.marcoz.feedingit: %s"
                     % str(e))
        logger.error("update_feeds already running.")

    sys.exit(1)

# Run the updater.  Note: we run this until feed.am_updating is false.
# Only in this case have all worker threads exited.  If the main
# thread exits before all threads have exited and the process gets a
# signal, the Python interpreter is unable to handle the signal and it
# runs really slow (rescheduling after every single instruction instead
# of every few thousand).
feed = FeedUpdate(bus_name)
while True:
    try:
        mainloop.run()
    except KeyboardInterrupt:
        logger.error("Interrupted. Quitting.")
        JobManager().quit()

    if not feed.am_updating:
        break
+
--- /dev/null
+#!/usr/bin/env python2.5
+
+#
+# Copyright (c) 2007-2008 INdT.
+# Copyright (c) 2011 Neal H. Walfield
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+# ============================================================================
+# Name : FeedingIt.py
+# Author : Yves Marcoz
+# Version : 0.6.1
+# Description : Simple RSS Reader
+# ============================================================================
+
+import dbus.service
+import logging
+logger = logging.getLogger(__name__)
+
# Module-level singleton, set by UpdateServerObject.__init__.
_update_server_object = None

def update_server_object():
    """Return the singleton UpdateServerObject instance.

    Raises AssertionError if none has been instantiated yet.
    """
    obj = _update_server_object
    assert obj is not None, "No UpdateServerObject instantiated!"
    return obj
+
class UpdateServerObject(dbus.service.Object):
    """D-Bus object exported at /org/marcoz/feedingit/update on the
    org.marcoz.feedingit interface.

    The method handlers here only log; a subclass overrides them to do
    the real work.  The signal methods are emitted to inform clients
    of update progress.
    """
    def __init__(self, bus_name):
        """
        Start listening for requests.
        """
        # Enforce and record the module-level singleton returned by
        # update_server_object().
        global _update_server_object
        assert _update_server_object is None, \
            "Attempt to instantiate multiple UpdateServerObject objects."
        _update_server_object = self

        dbus.service.Object.__init__(self, bus_name,
                                     '/org/marcoz/feedingit/update')

    @dbus.service.method('org.marcoz.feedingit')
    def StopUpdate(self):
        # D-Bus method: abort any update in progress.
        logger.debug("Stop update called.")

    @dbus.service.method('org.marcoz.feedingit')
    def UpdateAll(self):
        # D-Bus method: refresh every feed.
        logger.debug("UpdateAll called.")

    @dbus.service.method('org.marcoz.feedingit', in_signature='s')
    def Update(self, feed):
        # D-Bus method: refresh the single feed named by `feed`.
        logger.debug("Update(%s) called." % feed)

    # A signal that will be exported to dbus
    @dbus.service.signal('org.marcoz.feedingit', signature='')
    def ArticleCountUpdated(self):
        pass

    # A signal that will be exported to dbus
    @dbus.service.signal('org.marcoz.feedingit', signature='uuuuttus')
    def UpdateProgress(self, percent_complete,
                       feeds_downloaded, feeds_downloading, feeds_pending,
                       bytes_downloaded, bytes_uploaded, bytes_per_second,
                       updated_feed):
        pass

    # A signal that will be exported to dbus
    @dbus.service.signal('org.marcoz.feedingit', signature='')
    def UpdateStarted(self):
        pass

    # A signal that will be exported to dbus
    @dbus.service.signal('org.marcoz.feedingit', signature='')
    def UpdateFinished(self):
        pass
+
--- /dev/null
+# Copyright (c) 2011 Neal H. Walfield
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import logging
+logger = logging.getLogger(__name__)
+import traceback
+
+# Don't fail if the Woodchuck modules are not available. Just disable
+# Woodchuck's functionality.
+
+# Whether we imported the woodchuck modules successfully.
# Whether we imported the woodchuck modules successfully.
woodchuck_imported = True
try:
    import pywoodchuck
    from pywoodchuck import PyWoodchuck
    from pywoodchuck import woodchuck
except ImportError, exception:
    logger.info(
        "Unable to load Woodchuck modules: disabling Woodchuck support: %s"
        % traceback.format_exc ())
    woodchuck_imported = False
    # Stub stand-in so mywoodchuck below can still subclass something;
    # available() always reports False, which disables all Woodchuck
    # functionality.
    class PyWoodchuck (object):
        def available(self):
            return False
    woodchuck = None

# The default channel refresh interval: 6 hours.
refresh_interval = 6 * 60 * 60
+
+class mywoodchuck (PyWoodchuck):
+ def __init__(self, listing, human_readable_name, identifier,
+ request_feedback):
+ try:
+ PyWoodchuck.__init__ (self, human_readable_name, identifier,
+ request_feedback)
+ except Exception, e:
+ logger.error(
+ "Failed to establish a connection to the Woodchuck server: %s"
+ % (str(e),))
+ self.available = self.not_available
+ return
+
+ self.listing = listing
+
+ def not_available(self):
+ return False
+
+ # Woodchuck upcalls.
+ def stream_update_cb(self, stream):
+ logger.debug("stream update called on %s (%s)"
+ % (stream.human_readable_name, stream.identifier,))
+
+ # Make sure no one else is concurrently updating this
+ # feed.
+ try:
+ self.listing.updateFeed(stream.identifier)
+ except:
+ logger.debug("Updating %s: %s"
+ % (stream.identifier, traceback.format_exc ()))
+
+ def object_transfer_cb(self, stream, object,
+ version, filename, quality):
+ log ("object transfer called on %s (%s) in stream %s (%s)"
+ % (object.human_readable_name, object.identifier,
+ stream.human_readable_name, stream.identifier))
+
# The singleton Woodchuck connection, created by wc_init().
_w = None

def wc_init(listing, request_feedback=False):
    """Connect to the woodchuck server and initialize any state."""
    global _w
    assert _w is None

    _w = mywoodchuck (listing, "FeedingIt", "org.marcoz.feedingit",
                      request_feedback)

    if woodchuck_imported and _w.available ():
        logger.debug("Woodchuck appears to be available.")
    else:
        logger.info("Unable to contact Woodchuck server.")
+
def wc():
    """Return the Woodchuck singleton (wc_init must have been called)."""
    # Reading a module global needs no `global` declaration.
    assert _w is not None
    return _w
--- /dev/null
+Metadata-Version: 1.0
+Name: feedingit
+Version: 0.1.0
+Summary: FeedingIt - RSS Reader
+Home-page: UNKNOWN
+Author: Yves
+Author-email: yves@marcoz.org
+License: UNKNOWN
+Description: This file should contain a writeup describing what your application does,
+ and how to use it. The content of this file goes into the long_description
+ field of setup.py, which in turn becomes the long version of the Description
+ field in the debian/control file of the project.
+
+Platform: UNKNOWN
--- /dev/null
#! /bin/sh -e
# Maemo Python runtime-update hook: when the Python runtime changes,
# discard and rebuild this package's byte-compiled files.
if [ "$1" = rtupdate ]; then
    pyclean /usr/share/feedingit
    pycompile -V 2.6 /usr/share/feedingit
fi
\ No newline at end of file
--- /dev/null
+feedingit_0.1.0-1_all.deb user/development optional
--- /dev/null
#!/usr/bin/make -f

# This file was automatically generated by stdeb 0.6.0+git at
# Fri, 07 Oct 2011 20:59:08 -0700

# Catch-all target: delegate every debhelper build step to dh, using
# the python2 addon and the distutils build system.
%:
	dh $@ --with python2 --buildsystem=python_distutils


--- /dev/null
+3.0 (quilt)
--- /dev/null
+psa build-deb
+#scp /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit_harmattan/feedingit/deb_dist/feedingit_0.1.0-1_all.deb root@192.168.1.136:
+scp /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit_0.1.0-1_all.deb root@192.168.1.136:
+ssh root@192.168.1.136 "dpkg -i --force-depends-version feedingit_0.1.0-1_all.deb"
--- /dev/null
#!/bin/sh
# Launcher for FeedingIt: "feedingit dbus" starts the background feed
# updater; any other invocation starts the GUI.

case "$1" in
dbus)
    nice python /usr/share/feedingit/update_feeds.py
    ;;
*)
    cd /usr/share/feedingit
    # Discard all output.  The previous redirection, `2>&1 >/dev/null`,
    # pointed stderr at the terminal and silenced only stdout; stdout
    # must be redirected before stderr is duplicated onto it.
    python feedingit.py >/dev/null 2>&1
    ;;

esac
--- /dev/null
+begin-base64 600 /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/feedingit.png
+iVBORw0KGgoAAAANSUhEUgAAAEAAAABACAYAAACqaXHeAAAAGXRFWHRTb2Z0
+d2FyZQBBZG9iZSBJbWFnZVJlYWR5ccllPAAAEptJREFUeNrkWwl0FFXWvlW9
+pLN0kiaEEAiYsAaQNXBYZJmBYfGI/8jyH/k9uCC4/IAMjMgIDDMIItsoKKsH
+GVEQxWERGRgDCOIgQYUEkElAZTGSBQPpJJ2k93pzX/Wr7qrqqk4Hg2fOmTrn
+ppZUVb/73f2+VxwhBP6bNyP9w3HcL/V70g/x7Fg6J4wEtv9lAbiLzFJGDex3
+TGwvkUEGAGXcj+RTkZddJ3cLGONdYpq+18IojlEsUgy99vupkN3cBklofSIA
+PA/CuUIo2XUQivHUg+SUUT2SC8nNQBGadMDUBzSBCXCM6RjGrBUpidLGJTC6
+fVvI7pQFXVumQhtLjAiK7lZVA/byCiguuQlXPz4KJ954B/Lx8m2kSqQaBobw
+nwIAx1SZSjcBqRlSyvrFMHbEIBiV3R56SjcSjxH8zhgQ6s2iMvtrYkU2qA/m
+eD8Y4jzisTHZBZzFC7zFJz7n9oDzu+vw1ab34NWNO+BLihEzDWgKs7hTACRV
+j2GMp/btDh1WzIPHc+6FXyUngo0IPPgd8eCvs4C/Oh6IjxeHSwSZq2PHRLJy
+f+j/vNkHhmQnmFrVgrGZE34ogdOZQ+Fx/K+dmYlPtie/pA+Q1J2qegpS632b
+4ekxQ2EiVW/iM4Lnpg38tTKmgemJQCNO4A1EUL5QOuckbNxGEMqs4gkF4OBx
++Jya1KxZszpcvHix9tixYxUMDAfzD/47AaKxGkBvRB2GRKSWsx6H4Qumwwtp
+zaE1ZdxbZQOfwxoahjywyaUvyM79Kk1QHScMKoYar89u6w1TqP07HI7tZrM5
+/uTJk5+MGDFiHV4rZT6ilmlEo0DgGyn5GGbnme/+Baa//idY26IZ39pXYwNX
+WRu08QTgDCRAPN0zyRsCv0TPOfk5HzqW/i8dUzK1cgAf64O9ubCX2v2iRYsG
+JyQkpCIAccOHDx9fVVW1Z/369dPwf1nUDJlWysNrk2mAgvmCA7CqV1e4T/Ca
+UeqpQPfBTR6xiVzanML2FdKWaULwOu7jet8Ah+Ch0n+CSreoqGhFdnZ2T/Xg
+CgoKTj3zzDMrvv7660I8rWChMyqT4Bsp+az8A7CaMk+l7bGn44DNQcnKKShZ
+dky1Qk/aHK86xl802FxgSPDAsTz4Bw2Bjz76aLIW83Tr3bv3oAMHDmyaM2fO
+/XjaijnmqDSBj9LmbVTyyPyq3l1hkN+VAF5HamC0RpmaG0J5XxgQPAOBjwAC
+FzqOaWsHlxtcyzbAbrx6y2az1ZSWlhbqDTQtLa31K6+8svq5554bg6fp0YIQ
+yQQkb0+Zz0K1f1WUPGW+NjWYydNH+ZQJwFsHAGfOoAEe/PbD4L+1R6nygtzB
+cSFn6A+pvbhH4i0eiOt2Aw7/E/aPfgL+hFfL2FjSly5d+sC0adOebdmyZSut
+Qbtwmzdv3vx169Z9whxkHTOHRgNgZBldWwxzcx8aCZP9bmS+LjUYysCYCKZ2
+bwKf0D/sYVJfCJ7LjwDx1oTFeAUI/hDjEgiWthVgTHFAr7Hw8PkiOMvCHWGm
+SDPM9Nzc3NmjRo36rR4I8+fPn7d27dpDeFrOUmqhMSYg2X3zpb+H/6HMC15L
+gHmZXRtb/U6TefEFcV3B1OHNkJob1CpPFCov1Yd8jE9kPv9fkIfMl7FxtKBj
+YUKhYFwZPXr0yy/hRplV/7YFtxdffPEPffv27cY02KRnCnoA0AeSMLvr8txj
+MBsIj8ynKUOYJQMMzaco8/iqKkDUAccF27ZtQ7PoD1x8V00npwBBRuZ0u/iu
+l96A7VTqa9asGePxeM589tlny5ChPswZ0+yvZPHixXvmzp27UAsE6hN27Njx
+EnOKVjby8E2jIUKHmIzU6+SHcJhcwTT+fBpx5WcR1zmkC1nEfTGLeEvmEvXW
+q1cvIguEZPbs2cRXvpa4CvC5M1nE+RVSXhapP4l0AulYFqk72o7U5SIdQjqY
+SejvndkPefj8SKQRly9fzpfej3zWY9xfhddzkDJoMobUbebMmXOcuBGNDX3B
+arwnmzlFLhoNoNJPnHg/9L4vB0YKnjgQfHFKz03VOSZD8RBKCM6dO6e4RrUh
+KF255DmZRnAhLTA1rxafw7SXOjDn+PHjszp16tRbel9MTEzsjBkzXvjiiy9e
+wdM2LELdRlCOPP/884u0BPzUU0/NQM3pxLTA2BAAPKvsmmOKSzMs8DlTgoOX
+qzJxfRldqiXUaKq/WvU5g4A5fzVcvwHf/XktHKfJDGrQb7ReOWjQoFGY/Kxm
+WkCZur1x48ZP9+7du1N9LwVt+fLlU5gPsai1gNfy/Cj9nmK8R69PiDE0aBkQ
+xHM6wBzbMjMzITk5WfGyYUP6YFTcE2JczrDq3JDkwPsEeHcfvMdy+5rY2Fi3
+Hq5obvchCKtYzKdbxYQJEzZfu3bte/W9gwcPHpOTk9NeSwt4Dc9vWzgdpgak
+bwsfsEwL/JUvKAA4fvw4HZh4/tBDD8GeLeizSI0+43K7S66GagdUovQ/Zc0P
+e79+/ZZjdjfr5s2bJXogbN++fQpzjLQQKkNTWKUVFVauXPkYiwgx8l+X5wEG
+Vt52dRbCP8xcnIV6/lAqq9xLx1zcSDAkovkZQj6B+G6AULkUBMeRYIwPxnof
+hF0zxNaCuXkFbN8Hmx+bC+g44AZresQx1b3nzJkzL6MUB2jF/ClTpjz9wQcf
+fM66RW3Onj37Rp8+fQaqIpQds0maN1xmGuZXawBVjYQNL8H9tK73e+K1swMV
+Ec8RzPyGglA1FvxVj6BWIJXjufOIQtKc1juC0reLrTBk/gPW8XEzABwsm7uM
+jmxefn5+npZ0MfN7lgmPbpXLli3bqr4PzdOGfuIBFg2MWiZAPWrCwD4wWPRd
+7jjthrZO2kT8hZj1nQbiPt2ohoQhrh44ow/2H4W/0ZyfqbKF9RykfJ46mx9Q
+AxZo2TgthiZOnNiTPVePzvAiVo4X1PcNGDBgCHunWQ2AVPRYszKgC2WeEJV/
+JCqSXSd+ZauLQPi9ROtdVBQJ1WLR88Y2yGXMJ7IQ14FRBrtG09mScePG/VEr
+8Vm4cOFUlibTUVSjyZxS35OVldVFBgAnB4CibHn4AWhP+3mCNybACAlnnBBl
+P0/R0fcrOz66JKEf4xLp868gF1NfmvbGTp48edD58+ffRJs9TOnChQubsRQe
+zECow/99d+LEicNq5rp06dKHJXCUlzrMII9qmcGkSZPaM0fIywEQG5zD+kNX
+Uf09Fn3GibKpEXRuTAsUmhAJQNrriw9If95KeJ8+hYPr+dZbb23t0aPHkKSk
+pGaUunfvPnTLli1/RRD6sSStGgud7VrxHm1/CDMDH4bIsvLy8lL1fd26dctk
+ABjkAFCnYO6UBZ0CAJj1pacqbSVvTlRaoOj4aLyH431gsNTD2YvwTyx6aJgz
+LFmyZDZlRIs5zPmnszjuQeZ+LCws/EZ9X//+/XMYc/RXnCUlJcUaSVQvLQDo
+3uT1Bbwj8fOaKk/UNbwOEbUpaJiF0Rooel7fBjuZkzN07Nixj56z7Ny5c18W
+FumbHJcuXQoDICMjI0sGgNtut5er7/F6vdI0XRgAxux2kC24LCHGhXBbV9b0
+2gR+HU1ge1H6GPtpyfu3Q/AN6+j6I0ULphkc0zMXasAljXviZSHOi5HgigaQ
+Xdg9vDoKhMpFlfS0VF4hdZ+GFugwT8kQFyh6Fr0Gu1jHhjYx64uLiy/rAfDt
+t98WsEhAcwQPFl4lGpUtJ5ta8Ffjphd91VGAC2SFYrKq3cZSaUOYxH0RtED+
+LP6h0r9dBTWHPhMTnSTmuDw7d+58Rw+A/fv30ySpmoVK2LNnT1kDU/BEp/zn
+5Pfx8siMGTFRtLA1WtZEx+Y1fYKgancxAEDgISUZEivz4dW/rhSrznbUwaF3
+z9uwYcNqFFylNOCamprbW7dufRmzvY9ZlkjfZkCPr+cvJMZ5QRDCyn1OrL9D
+4jDKHvLLTYAqQ+DewD/lcBEiK2zUWaI6/MkjgWgaRnDdaINmUAvJNnvylIm+
+aeNGwYR9h2HPk3+Av8+cOfM40jH2pIv5BzujeiliWa1Wmw5zfqljieG0g/qe
+Gzdu/CCfT+RlVu+7fBWK+FinsnEZQfoNOkF/uBYFzjnwVVvBebUtuEtTISnW
+aJsyEabZC+DtrStgAvPSdWySo4ylyLVs4GLShsVODzVzGPauy2aOzVgmJGkU
+T7UyrxUEgJ54TSbwKZwgGyz4NWJ+FKTtDzgFMD67FeqLEIhiBMJitD35vzAV
+gXjn3b/AZDbdlSRbXCG1VQGl21vN3PXr179jTpJqgIV5fMWGpXUp8yNhALhP
+5YPY0+ItLhXTnC5zIvlkpAMSZVyhUarQ6r1lxTy3LbiupEJijNH26Dh4FoH4
+CIH4HevptWEVn+XBBx+0lJaWnlczd+jQoVMsUlCg4tPT0zPU92CNcEk2mxzs
+B9AH0iaNhRHvvw7vespSwGtPUrTAQv08olnS6hVOotMlGrPCGnlF8Bz3plQH
+mO+xiwslaKl84FPYxcplO1PzBKwA750zZ84kLJUHV1ZWXkWG/w+v/0T/h37k
+t+vWrXtNw0/8CndFzLyIBADHuiqdEfW/WyHe5ipOU/btNXp5XADCsChEdFJo
+9RS5VmgNAyLNATHt7eIsMQWCls1PvCBOl9VIEWHYsGFxAwcONKxYseICAyct
+Ly9vDZa/Y+Qjw+zxPBZNT9J5BfY8kXeErGzyc1Ovzvyg+ouZGtLXaGhwOkFI
+qwgStIHQWxsgB8LcCoHohEDEhQFhZ8zUsAYKHXFLj8dTYDKZFF2dHTt2bMKi
+ag0e/sgijKIhQh1DbV4+nKQdWoO1TtfrQyOdINHLDTRqB73s0f2DFWpy20L9
+WfQRRqPt8fHwNGrrLvQRU9nkh43V+jSCOKdPn/7Y6dOnP3G73U6JQcwxDsii
+iWZPkPbfutCeoNEZb3FdSVP28bWkH6UGaGlB+KRpAyYhm90z3+MAS5eQRjAf
+8T7r90n5QgL6hzYLFiwYhqrfBmk+8xG10kjlAHAMwXZoBhtpW7z+QlssjY0K
+5rlI6q/hCEkUviCiP5Afa2w6QOxineV62VulHqNTlisoTIAwM6jaths+FJuV
+Laojhz+95MgXZdocDfNC5FWBHmoan4RMg4XPvWga/8+mznjGuASIL9L0uDQv
+2PHaCdiemQEd6/IDWhCVA4xgBrrpcSQtkJtLlJtcI2qw5jp4HNY8MhveZqbh
+bmhqjDCUKnZ+DGJlZs6wayY+emVwg07Qr/TuTcm8XCOcF1IgMQFSfj0AJmlN
+iUUCgNpH1cJXIbegEE6ZWjgwIriiMoVo1V8zCmhFhZ+xDpRqAJto/SjSYkqt
+FSJSizy1Tzfod2o3vGcSjLF1ZzLEVhkHjcgDACInRZGiws/YjKkuSBhSCljc
+ncseCU/jpauyUrrB6XHJGVbn/wuK3toF62k6Gtu5QlvyPv1aQC9nIIJ2bhDJ
+20e97s8kQPyAcnGNMdr+ErZEpraxS2QICxcVMxfDri/OwhFjah1YOtyOOglS
++4iIUaMJVF5iPmFoqbhfvQWWowALZYurozYBtSnQGqFd0WHYQFd/O79JBU+p
+NWQKOvCRhpIiQWem6Wcyb0jySJOsm1jK61CHvmjXCQbzAqRifOGim7egJLZ7
+BZjTHfpOMIqWeVjbvAmZ/+gI7MCxbmGTqg12m6NZKsuzMEKXxPc6uBXWt0iB
+DM+PVnCeS9XWAqJqj0HTSlvt7eMHlgeZH/csvE4F1pDqN3atcHDpDJ2rx1R5
+OU2Vfbdioe7LNCBeXh8EgLv2GZSpVR3E5VSIGsDUfgtbWyB1jxv85cYsl5c0
+gVZdGfs2w0y6fpAyT5MOmoD8UhtlmGZ7MR2qRW9PHd6i12A/U/uaaCR/JwDI
+HSPt09HvBX4tfS9AtcFVZANfheWuMk9T3dget0UQaJynoY55+59kpW7UOncn
+n8xIa4jjWYRofXQ7zLkvJ7CyhALh/j4RvKXxTSpxyjiVOLV5upZox0ewFUP0
+btY1rmJhu9FfjTTFN0NU95vn3Asd6TdDg/vCGAqEUG8UQfCWxd+RVlCmjalO
+MKXXi8zTjTL+8VH4EG19J0twKpnUvXeaQjXFV2PSN4IUiGYYKdqhadw/agg8
+SE1DupFqBgWC+gx/dYx2CtvcGWScevVgL+8KnD+WB7kz/gwH2ByBxLgH7vBb
+oaYCQAuIBLaaI/nhB6Dz2OEwoFsn6JmeCpktU6FVNC+79iN8X/oTXDtfBAVv
+74ZTZ74RPXsVc3B1TcF4UwMAqllmMwMjFkJfjdK95flp0DUlGZKlr0aD07UG
+8H94EC6jQ7sNyq9GpWOXTNWbLLA2NQBaYMi/GzbJFieoP54WQPnNsFd2ftc+
+qr6bAOhNW/MR2qkC3MUPpbW2fwswAOLT2NCG5vJpAAAAAElFTkSuQmCC
+====
--- /dev/null
+[Desktop Entry]
+Encoding=UTF-8
+Version=1.0
+Type=Application
+Name=FeedingIt RSS Reader
+Exec=invoker --single-instance --type=e /usr/bin/feedingit
+Icon=/usr/share/icons/hicolor/64x64/apps/feedingit.png
+Categories=Development;
--- /dev/null
+This file should contain a writeup describing what your application does,
+and how to use it. The content of this file goes into the long_description
+field of setup.py, which in turn becomes the long version of the Description
+field in the debian/control file of the project.
--- /dev/null
+[Project]
+category = Development
+maintainer = Yves
+appname = PySide app
+section = development
+pyversion = 2.7
+project = feedingit
+email = email@example.com
+desc = A PySide example
+template = harmattan
+
--- /dev/null
+[D-BUS Service]
+Name=org.marcoz.feedingit
+Exec=/usr/bin/feedingit dbus
--- /dev/null
+"""Beautiful Soup
+Elixir and Tonic
+"The Screen-Scraper's Friend"
+http://www.crummy.com/software/BeautifulSoup/
+
+Beautiful Soup parses a (possibly invalid) XML or HTML document into a
+tree representation. It provides methods and Pythonic idioms that make
+it easy to navigate, search, and modify the tree.
+
+A well-formed XML/HTML document yields a well-formed data
+structure. An ill-formed XML/HTML document yields a correspondingly
+ill-formed data structure. If your document is only locally
+well-formed, you can use this library to find and process the
+well-formed part of it.
+
+Beautiful Soup works with Python 2.2 and up. It has no external
+dependencies, but you'll have more success at converting data to UTF-8
+if you also install these three packages:
+
+* chardet, for auto-detecting character encodings
+ http://chardet.feedparser.org/
+* cjkcodecs and iconv_codec, which add more encodings to the ones supported
+ by stock Python.
+ http://cjkpython.i18n.org/
+
+Beautiful Soup defines classes for two main parsing strategies:
+
+ * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
+ language that kind of looks like XML.
+
+ * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
+ or invalid. This class has web browser-like heuristics for
+ obtaining a sensible parse tree in the face of common HTML errors.
+
+Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
+the encoding of an HTML or XML document, and converting it to
+Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
+
+For more than you ever wanted to know about Beautiful Soup, see the
+documentation:
+http://www.crummy.com/software/BeautifulSoup/documentation.html
+
+Here, have some legalese:
+
+Copyright (c) 2004-2010, Leonard Richardson
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following
+ disclaimer in the documentation and/or other materials provided
+ with the distribution.
+
+ * Neither the name of the the Beautiful Soup Consortium and All
+ Night Kosher Bakery nor the names of its contributors may be
+ used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
+
+"""
+from __future__ import generators
+
+__author__ = "Leonard Richardson (leonardr@segfault.org)"
+__version__ = "3.2.0"
+__copyright__ = "Copyright (c) 2004-2010 Leonard Richardson"
+__license__ = "New-style BSD"
+
+from sgmllib import SGMLParser, SGMLParseError
+import codecs
+import markupbase
+import types
+import re
+import sgmllib
+try:
+ from htmlentitydefs import name2codepoint
+except ImportError:
+ name2codepoint = {}
+try:
+ set
+except NameError:
+ from sets import Set as set
+
#These hacks make Beautiful Soup able to parse XML with namespaces:
#the stock sgmllib/markupbase regexes do not accept ':' in tag or
#declaration names, so they are replaced module-wide here.
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match

# Encoding used by default when rendering the tree back to bytes.
DEFAULT_OUTPUT_ENCODING = "utf-8"
+
+def _match_css_class(str):
+ """Build a RE to match the given CSS class."""
+ return re.compile(r"(^|.*\s)%s($|\s)" % str)
+
+# First, the classes that represent markup elements.
+
class PageElement(object):
    """Contains the navigational information for some part of the page
    (either a tag or a piece of text)

    Every element participates in two link structures: the sibling
    chain under its parent (previousSibling/nextSibling) and the
    document-order chain (previous/next).  The mutation methods below
    keep both consistent.
    """

    def setup(self, parent=None, previous=None):
        """Sets up the initial relations between this element and
        other elements."""
        self.parent = parent
        self.previous = previous
        self.next = None
        self.previousSibling = None
        self.nextSibling = None
        if self.parent and self.parent.contents:
            # Link ourselves in as the newest sibling under the parent.
            self.previousSibling = self.parent.contents[-1]
            self.previousSibling.nextSibling = self

    def replaceWith(self, replaceWith):
        # Swap this element for `replaceWith` at the same position
        # under the same parent.
        oldParent = self.parent
        myIndex = self.parent.index(self)
        if hasattr(replaceWith, "parent")\
           and replaceWith.parent is self.parent:
            # We're replacing this element with one of its siblings.
            index = replaceWith.parent.index(replaceWith)
            if index and index < myIndex:
                # Furthermore, it comes before this element. That
                # means that when we extract it, the index of this
                # element will change.
                myIndex = myIndex - 1
        self.extract()
        oldParent.insert(myIndex, replaceWith)

    def replaceWithChildren(self):
        # Splice this element's children into its place.  Inserting at
        # a fixed index in reverse order preserves their order.
        myParent = self.parent
        myIndex = self.parent.index(self)
        self.extract()
        reversedChildren = list(self.contents)
        reversedChildren.reverse()
        for child in reversedChildren:
            myParent.insert(myIndex, child)

    def extract(self):
        """Destructively rips this element out of the tree."""
        if self.parent:
            try:
                del self.parent.contents[self.parent.index(self)]
            except ValueError:
                pass

        #Find the two elements that would be next to each other if
        #this element (and any children) hadn't been parsed. Connect
        #the two.
        lastChild = self._lastRecursiveChild()
        nextElement = lastChild.next

        if self.previous:
            self.previous.next = nextElement
        if nextElement:
            nextElement.previous = self.previous
        self.previous = None
        lastChild.next = None

        self.parent = None
        if self.previousSibling:
            self.previousSibling.nextSibling = self.nextSibling
        if self.nextSibling:
            self.nextSibling.previousSibling = self.previousSibling
        self.previousSibling = self.nextSibling = None
        return self

    def _lastRecursiveChild(self):
        "Finds the last element beneath this object to be parsed."
        lastChild = self
        while hasattr(lastChild, 'contents') and lastChild.contents:
            lastChild = lastChild.contents[-1]
        return lastChild

    def insert(self, position, newChild):
        # Insert newChild among this element's contents at `position`,
        # rewiring the sibling and document-order chains.
        if isinstance(newChild, basestring) \
            and not isinstance(newChild, NavigableString):
            newChild = NavigableString(newChild)

        position = min(position, len(self.contents))
        if hasattr(newChild, 'parent') and newChild.parent is not None:
            # We're 'inserting' an element that's already one
            # of this object's children.
            if newChild.parent is self:
                index = self.index(newChild)
                if index > position:
                    # Furthermore we're moving it further down the
                    # list of this object's children. That means that
                    # when we extract this element, our target index
                    # will jump down one.
                    position = position - 1
            newChild.extract()

        newChild.parent = self
        previousChild = None
        if position == 0:
            newChild.previousSibling = None
            newChild.previous = self
        else:
            previousChild = self.contents[position-1]
            newChild.previousSibling = previousChild
            newChild.previousSibling.nextSibling = newChild
            newChild.previous = previousChild._lastRecursiveChild()
        if newChild.previous:
            newChild.previous.next = newChild

        newChildsLastElement = newChild._lastRecursiveChild()

        if position >= len(self.contents):
            # Appending at the end: the next element in document order
            # is the first following sibling of an ancestor, if any.
            newChild.nextSibling = None

            parent = self
            parentsNextSibling = None
            while not parentsNextSibling:
                parentsNextSibling = parent.nextSibling
                parent = parent.parent
                if not parent: # This is the last element in the document.
                    break
            if parentsNextSibling:
                newChildsLastElement.next = parentsNextSibling
            else:
                newChildsLastElement.next = None
        else:
            nextChild = self.contents[position]
            newChild.nextSibling = nextChild
            if newChild.nextSibling:
                newChild.nextSibling.previousSibling = newChild
            newChildsLastElement.next = nextChild

        if newChildsLastElement.next:
            newChildsLastElement.next.previous = newChildsLastElement
        self.contents.insert(position, newChild)

    def append(self, tag):
        """Appends the given tag to the contents of this tag."""
        self.insert(len(self.contents), tag)

    def findNext(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears after this Tag in the document."""
        return self._findOne(self.findAllNext, name, attrs, text, **kwargs)

    def findAllNext(self, name=None, attrs={}, text=None, limit=None,
                    **kwargs):
        """Returns all items that match the given criteria and appear
        after this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.nextGenerator,
                             **kwargs)

    def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears after this Tag in the document."""
        return self._findOne(self.findNextSiblings, name, attrs, text,
                             **kwargs)

    def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
                         **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear after this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.nextSiblingGenerator, **kwargs)
    fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x

    def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears before this Tag in the document."""
        return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)

    def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
                        **kwargs):
        """Returns all items that match the given criteria and appear
        before this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.previousGenerator,
                             **kwargs)
    fetchPrevious = findAllPrevious # Compatibility with pre-3.x

    def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears before this Tag in the document."""
        return self._findOne(self.findPreviousSiblings, name, attrs, text,
                             **kwargs)

    def findPreviousSiblings(self, name=None, attrs={}, text=None,
                             limit=None, **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear before this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.previousSiblingGenerator, **kwargs)
    fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x

    def findParent(self, name=None, attrs={}, **kwargs):
        """Returns the closest parent of this Tag that matches the given
        criteria."""
        # NOTE: We can't use _findOne because findParents takes a different
        # set of arguments.
        r = None
        l = self.findParents(name, attrs, 1)
        if l:
            r = l[0]
        return r

    def findParents(self, name=None, attrs={}, limit=None, **kwargs):
        """Returns the parents of this Tag that match the given
        criteria."""

        return self._findAll(name, attrs, None, limit, self.parentGenerator,
                             **kwargs)
    fetchParents = findParents # Compatibility with pre-3.x

    #These methods do the real heavy lifting.

    def _findOne(self, method, name, attrs, text, **kwargs):
        # Run the matching find-all method with limit=1 and return the
        # single result (or None).
        r = None
        l = method(name, attrs, text, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def _findAll(self, name, attrs, text, limit, generator, **kwargs):
        "Iterates over a generator looking for things that match."

        if isinstance(name, SoupStrainer):
            strainer = name
        # (Possibly) special case some findAll*(...) searches
        elif text is None and not limit and not attrs and not kwargs:
            # findAll*(True)
            if name is True:
                return [element for element in generator()
                        if isinstance(element, Tag)]
            # findAll*('tag-name')
            elif isinstance(name, basestring):
                return [element for element in generator()
                        if isinstance(element, Tag) and
                        element.name == name]
            else:
                strainer = SoupStrainer(name, attrs, text, **kwargs)
        # Build a SoupStrainer
        else:
            strainer = SoupStrainer(name, attrs, text, **kwargs)
        results = ResultSet(strainer)
        g = generator()
        while True:
            try:
                i = g.next()
            except StopIteration:
                break
            if i:
                found = strainer.search(i)
                if found:
                    results.append(found)
                    if limit and len(results) >= limit:
                        break
        return results

    #These Generators can be used to navigate starting from both
    #NavigableStrings and Tags.
    def nextGenerator(self):
        i = self
        while i is not None:
            i = i.next
            yield i

    def nextSiblingGenerator(self):
        i = self
        while i is not None:
            i = i.nextSibling
            yield i

    def previousGenerator(self):
        i = self
        while i is not None:
            i = i.previous
            yield i

    def previousSiblingGenerator(self):
        i = self
        while i is not None:
            i = i.previousSibling
            yield i

    def parentGenerator(self):
        i = self
        while i is not None:
            i = i.parent
            yield i

    # Utility methods
    def substituteEncoding(self, str, encoding=None):
        # Replace the %SOUP-ENCODING% placeholder (used in meta tags)
        # with the actual output encoding.
        encoding = encoding or "utf-8"
        return str.replace("%SOUP-ENCODING%", encoding)

    def toEncoding(self, s, encoding=None):
        """Encodes an object to a string in some encoding, or to Unicode
        when no encoding is given."""
        if isinstance(s, unicode):
            if encoding:
                s = s.encode(encoding)
        elif isinstance(s, str):
            if encoding:
                s = s.encode(encoding)
            else:
                s = unicode(s)
        else:
            # Not a string at all: stringify first, then recurse.
            if encoding:
                s = self.toEncoding(str(s), encoding)
            else:
                s = unicode(s)
        return s
+
class NavigableString(unicode, PageElement):

    """A Unicode string that also participates in the parse tree."""

    def __new__(cls, value):
        """Create a new NavigableString.

        When unpickling a NavigableString, this method is called with
        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
        passed in to the superclass's __new__ or the superclass won't know
        how to handle non-ASCII characters.
        """
        if isinstance(value, unicode):
            return unicode.__new__(cls, value)
        return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)

    def __getnewargs__(self):
        # Pickle support: recreate the object from its encoded form.
        return (NavigableString.__str__(self),)

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper."""
        if attr != 'string':
            raise AttributeError("'%s' object has no attribute '%s'"
                                 % (self.__class__.__name__, attr))
        return self

    def __unicode__(self):
        return str(self).decode(DEFAULT_OUTPUT_ENCODING)

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        # With no encoding requested, hand back the Unicode object.
        if not encoding:
            return self
        return self.encode(encoding)
+
class CData(NavigableString):

    """A CDATA section; renders wrapped in the <![CDATA[...]]> markers."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        rendered = NavigableString.__str__(self, encoding)
        return "<![CDATA[" + rendered + "]]>"
+
class ProcessingInstruction(NavigableString):

    """A processing instruction, e.g. an <?xml ...?> declaration."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        payload = self
        if "%SOUP-ENCODING%" in payload:
            # Plug the real document encoding into the placeholder.
            payload = self.substituteEncoding(payload, encoding)
        return "<?%s?>" % self.toEncoding(payload, encoding)
+
class Comment(NavigableString):

    """An HTML/XML comment; renders wrapped in <!-- and -->."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        rendered = NavigableString.__str__(self, encoding)
        return "<!--" + rendered + "-->"
+
class Declaration(NavigableString):

    """A declaration such as a DOCTYPE; renders wrapped in <! and >."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        rendered = NavigableString.__str__(self, encoding)
        return "<!" + rendered + ">"
+
class Tag(PageElement):

    """Represents a found HTML tag with its attributes and contents."""

    def _invert(h):
        "Cheap function to invert a hash."
        # NOTE: called below at class-definition time, while this is
        # still a plain function rather than a method.
        i = {}
        for k, v in h.items():
            i[v] = k
        return i

    # The five predefined XML entities and the characters they stand for.
    XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
                                      "quot" : '"',
                                      "amp" : "&",
                                      "lt" : "<",
                                      "gt" : ">" }

    XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)

    def _convertEntities(self, match):
        """Used in a call to re.sub to replace HTML, XML, and numeric
        entities with the appropriate Unicode characters. If HTML
        entities are being converted, any unrecognized entities are
        escaped."""
        x = match.group(1)
        if self.convertHTMLEntities and x in name2codepoint:
            return unichr(name2codepoint[x])
        elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
            if self.convertXMLEntities:
                return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
            else:
                return u'&%s;' % x
        elif len(x) > 0 and x[0] == '#':
            # Handle numeric (decimal or hexadecimal) entities.
            if len(x) > 1 and x[1] == 'x':
                return unichr(int(x[2:], 16))
            else:
                return unichr(int(x[1:]))

        elif self.escapeUnrecognizedEntities:
            # Bug fix: escape the ampersand so the unrecognized entity
            # survives as literal text. This branch previously returned
            # the same string as the fallback below, which made
            # escapeUnrecognizedEntities a no-op.
            return u'&amp;%s;' % x
        else:
            return u'&%s;' % x

    def __init__(self, parser, name, attrs=None, parent=None,
                 previous=None):
        "Basic constructor."

        # We don't actually store the parser object: that lets extracted
        # chunks be garbage-collected.
        self.parserClass = parser.__class__
        self.isSelfClosing = parser.isSelfClosingTag(name)
        self.name = name
        if attrs is None:
            attrs = []
        elif isinstance(attrs, dict):
            attrs = attrs.items()
        self.attrs = attrs
        self.contents = []
        self.setup(parent, previous)
        self.hidden = False
        self.containsSubstitutions = False
        self.convertHTMLEntities = parser.convertHTMLEntities
        self.convertXMLEntities = parser.convertXMLEntities
        self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities

        # Convert any HTML, XML, or numeric entities in the attribute
        # values. (A list comprehension replaces the old tuple-unpacking
        # lambda, which is a syntax error on Python 3; behavior is the
        # same on Python 2.)
        self.attrs = [(key, re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
                                   self._convertEntities, val))
                      for key, val in self.attrs]

    def getString(self):
        """Return the lone NavigableString child, or None when this tag
        has anything other than exactly one string child."""
        if (len(self.contents) == 1
            and isinstance(self.contents[0], NavigableString)):
            return self.contents[0]

    def setString(self, string):
        """Replace the contents of the tag with a string"""
        self.clear()
        self.append(string)

    string = property(getString, setString)

    def getText(self, separator=u""):
        """Concatenate every string in this subtree, stripped and
        joined by *separator*."""
        if not len(self.contents):
            return u""
        stopNode = self._lastRecursiveChild().next
        strings = []
        current = self.contents[0]
        while current is not stopNode:
            if isinstance(current, NavigableString):
                strings.append(current.strip())
            current = current.next
        return separator.join(strings)

    text = property(getText)

    def get(self, key, default=None):
        """Returns the value of the 'key' attribute for the tag, or
        the value given for 'default' if it doesn't have that
        attribute."""
        return self._getAttrMap().get(key, default)

    def clear(self):
        """Extract all children."""
        # Iterate over a copy: extract() mutates self.contents.
        for child in self.contents[:]:
            child.extract()

    def index(self, element):
        """Return the position of *element* in contents (matched by
        identity, not equality); raises ValueError when absent."""
        for i, child in enumerate(self.contents):
            if child is element:
                return i
        raise ValueError("Tag.index: element not in tag")

    def has_key(self, key):
        return key in self._getAttrMap()

    def __getitem__(self, key):
        """tag[key] returns the value of the 'key' attribute for the tag,
        and throws an exception if it's not there."""
        return self._getAttrMap()[key]

    def __iter__(self):
        "Iterating over a tag iterates over its contents."
        return iter(self.contents)

    def __len__(self):
        "The length of a tag is the length of its list of contents."
        return len(self.contents)

    def __contains__(self, x):
        return x in self.contents

    def __nonzero__(self):
        "A tag is non-None even if it has no contents."
        return True

    def __setitem__(self, key, value):
        """Setting tag[key] sets the value of the 'key' attribute for the
        tag."""
        self._getAttrMap()[key] = value
        found = False
        for i in range(0, len(self.attrs)):
            if self.attrs[i][0] == key:
                # Bad HTML can repeat an attribute; update every copy.
                self.attrs[i] = (key, value)
                found = True
        if not found:
            self.attrs.append((key, value))

    def __delitem__(self, key):
        "Deleting tag[key] deletes all 'key' attributes for the tag."
        # Bug fix: rebuild the list instead of calling remove() while
        # iterating over it, which could skip adjacent duplicates (bad
        # HTML can define the same attribute multiple times).
        self.attrs = [item for item in self.attrs if item[0] != key]
        attrMap = self._getAttrMap()
        if key in attrMap:
            del attrMap[key]

    def __call__(self, *args, **kwargs):
        """Calling a tag like a function is the same as calling its
        findAll() method. Eg. tag('a') returns a list of all the A tags
        found within this tag."""
        return self.findAll(*args, **kwargs)

    def __getattr__(self, tag):
        """Lets you write tag.subtag (or tag.subtagTag) as shorthand
        for tag.find('subtag')."""
        #print "Getattr %s.%s" % (self.__class__, tag)
        if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
            return self.find(tag[:-3])
        elif tag.find('__') != 0:
            return self.find(tag)
        raise AttributeError("'%s' object has no attribute '%s'"
                             % (self.__class__, tag))

    def __eq__(self, other):
        """Returns true iff this tag has the same name, the same attributes,
        and the same contents (recursively) as the given tag.

        NOTE: right now this will return false if two tags have the
        same attributes in a different order. Should this be fixed?"""
        if other is self:
            return True
        if not hasattr(other, 'name') or not hasattr(other, 'attrs') \
               or not hasattr(other, 'contents') \
               or self.name != other.name or self.attrs != other.attrs \
               or len(self) != len(other):
            return False
        for i in range(0, len(self.contents)):
            if self.contents[i] != other.contents[i]:
                return False
        return True

    def __ne__(self, other):
        """Returns true iff this tag is not identical to the other tag,
        as defined in __eq__."""
        return not self == other

    def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Renders this tag as a string."""
        return self.__str__(encoding)

    def __unicode__(self):
        return self.__str__(None)

    # Matches angle brackets, and ampersands that are not already part
    # of a character reference.
    BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
                                           + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
                                           + ")")

    def _sub_entity(self, x):
        """Used with a regular expression to substitute the
        appropriate XML entity for an XML special character."""
        return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
                prettyPrint=False, indentLevel=0):
        """Returns a string or Unicode representation of this tag and
        its contents. To get Unicode, pass None for encoding.

        NOTE: since Python's HTML parser consumes whitespace, this
        method is not certain to reproduce the whitespace present in
        the original string."""

        encodedName = self.toEncoding(self.name, encoding)

        attrs = []
        if self.attrs:
            for key, val in self.attrs:
                fmt = '%s="%s"'
                if isinstance(val, basestring):
                    if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
                        val = self.substituteEncoding(val, encoding)

                    # The attribute value either:
                    #
                    # * Contains no embedded double quotes or single quotes.
                    #   No problem: we enclose it in double quotes.
                    # * Contains embedded single quotes. No problem:
                    #   double quotes work here too.
                    # * Contains embedded double quotes. No problem:
                    #   we enclose it in single quotes.
                    # * Embeds both single _and_ double quotes. This
                    #   can't happen naturally, but it can happen if
                    #   you modify an attribute value after parsing
                    #   the document. Now we have a bit of a
                    #   problem. We solve it by enclosing the
                    #   attribute in single quotes, and escaping any
                    #   embedded single quotes to XML entities.
                    if '"' in val:
                        fmt = "%s='%s'"
                        if "'" in val:
                            # TODO: replace with apos when
                            # appropriate.
                            val = val.replace("'", "&squot;")

                    # Now we're okay w/r/t quotes. But the attribute
                    # value might also contain angle brackets, or
                    # ampersands that aren't part of entities. We need
                    # to escape those to XML entities too.
                    val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)

                attrs.append(fmt % (self.toEncoding(key, encoding),
                                    self.toEncoding(val, encoding)))
        close = ''
        closeTag = ''
        if self.isSelfClosing:
            close = ' /'
        else:
            closeTag = '</%s>' % encodedName

        indentTag, indentContents = 0, 0
        if prettyPrint:
            indentTag = indentLevel
            space = (' ' * (indentTag-1))
            indentContents = indentTag + 1
        contents = self.renderContents(encoding, prettyPrint, indentContents)
        if self.hidden:
            s = contents
        else:
            s = []
            attributeString = ''
            if attrs:
                attributeString = ' ' + ' '.join(attrs)
            if prettyPrint:
                s.append(space)
            s.append('<%s%s%s>' % (encodedName, attributeString, close))
            if prettyPrint:
                s.append("\n")
            s.append(contents)
            if prettyPrint and contents and contents[-1] != "\n":
                s.append("\n")
            if prettyPrint and closeTag:
                s.append(space)
            s.append(closeTag)
            if prettyPrint and closeTag and self.nextSibling:
                s.append("\n")
            s = ''.join(s)
        return s

    def decompose(self):
        """Recursively destroys the contents of this tree."""
        self.extract()
        if len(self.contents) == 0:
            return
        current = self.contents[0]
        while current is not None:
            next = current.next
            if isinstance(current, Tag):
                del current.contents[:]
            current.parent = None
            current.previous = None
            current.previousSibling = None
            current.next = None
            current.nextSibling = None
            current = next

    def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Render this tag indented, one element per line."""
        return self.__str__(encoding, True)

    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                       prettyPrint=False, indentLevel=0):
        """Renders the contents of this tag as a string in the given
        encoding. If encoding is None, returns a Unicode string."""
        s = []
        for c in self:
            text = None
            if isinstance(c, NavigableString):
                text = c.__str__(encoding)
            elif isinstance(c, Tag):
                s.append(c.__str__(encoding, prettyPrint, indentLevel))
            if text and prettyPrint:
                text = text.strip()
            if text:
                if prettyPrint:
                    s.append(" " * (indentLevel-1))
                s.append(text)
                if prettyPrint:
                    s.append("\n")
        return ''.join(s)

    #Soup methods

    def find(self, name=None, attrs={}, recursive=True, text=None,
             **kwargs):
        """Return only the first child of this Tag matching the given
        criteria."""
        r = None
        l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
        if l:
            r = l[0]
        return r
    findChild = find

    def findAll(self, name=None, attrs={}, recursive=True, text=None,
                limit=None, **kwargs):
        """Extracts a list of Tag objects that match the given
        criteria. You can specify the name of the Tag and any
        attributes you want the Tag to have.

        The value of a key-value pair in the 'attrs' map can be a
        string, a list of strings, a regular expression object, or a
        callable that takes a string and returns whether or not the
        string matches for some custom definition of 'matches'. The
        same is true of the tag name."""
        generator = self.recursiveChildGenerator
        if not recursive:
            generator = self.childGenerator
        return self._findAll(name, attrs, text, limit, generator, **kwargs)
    findChildren = findAll

    # Pre-3.x compatibility methods
    first = find
    fetch = findAll

    def fetchText(self, text=None, recursive=True, limit=None):
        return self.findAll(text=text, recursive=recursive, limit=limit)

    def firstText(self, text=None, recursive=True):
        return self.find(text=text, recursive=recursive)

    #Private methods

    def _getAttrMap(self):
        """Initializes a map representation of this tag's attributes,
        if not already initialized."""
        # NOTE: before initialization the attribute lookup falls
        # through to __getattr__, which performs find('attrMap') and
        # normally returns None, so this truthiness test works.
        if not getattr(self, 'attrMap'):
            self.attrMap = {}
            for (key, value) in self.attrs:
                self.attrMap[key] = value
        return self.attrMap

    #Generator methods
    def childGenerator(self):
        # Just use the iterator from the contents
        return iter(self.contents)

    def recursiveChildGenerator(self):
        """Yield every descendant of this tag in document order."""
        if not len(self.contents):
            # 'return' ends the generator cleanly; the old
            # 'raise StopIteration' becomes a RuntimeError under
            # PEP 479 (Python 3.7+) and is identical on Python 2.
            return
        stopNode = self._lastRecursiveChild().next
        current = self.contents[0]
        while current is not stopNode:
            yield current
            current = current.next
+
+
+# Next, a couple classes to represent queries and their results.
class SoupStrainer:
    """Encapsulates a number of ways of matching a markup element (tag or
    text).

    Each criterion (name, attribute value, text) may be a string, a
    regular expression object, a list, a callable, or True; see
    _matches for the exact semantics of each form."""

    def __init__(self, name=None, attrs={}, text=None, **kwargs):
        self.name = name
        if isinstance(attrs, basestring):
            # A bare string in the attrs slot is shorthand for a CSS
            # class match.
            kwargs['class'] = _match_css_class(attrs)
            attrs = None
        if kwargs:
            if attrs:
                # Copy before updating so neither the caller's dict nor
                # the shared {} default is mutated.
                attrs = attrs.copy()
                attrs.update(kwargs)
            else:
                attrs = kwargs
        self.attrs = attrs
        self.text = text

    def __str__(self):
        if self.text:
            return self.text
        else:
            return "%s|%s" % (self.name, self.attrs)

    def searchTag(self, markupName=None, markupAttrs={}):
        """Match a tag. May be called either with a Tag object (as
        markupName) or with a name string plus an attribute list."""
        found = None
        markup = None
        if isinstance(markupName, Tag):
            markup = markupName
            # The Tag object doubles as its own attribute container:
            # lookups below go through its get()/has-key interface.
            markupAttrs = markup
        # A callable name is treated as a custom matcher taking
        # (name, attrs) -- but only when we weren't handed a Tag.
        callFunctionWithTagData = callable(self.name) \
                                  and not isinstance(markupName, Tag)

        if (not self.name) \
               or callFunctionWithTagData \
               or (markup and self._matches(markup, self.name)) \
               or (not markup and self._matches(markupName, self.name)):
            if callFunctionWithTagData:
                match = self.name(markupName, markupAttrs)
            else:
                match = True
                markupAttrMap = None
                for attr, matchAgainst in self.attrs.items():
                    if not markupAttrMap:
                        if hasattr(markupAttrs, 'get'):
                            markupAttrMap = markupAttrs
                        else:
                            # Attribute list of (key, value) pairs:
                            # build a dict lazily, on first use.
                            markupAttrMap = {}
                            for k,v in markupAttrs:
                                markupAttrMap[k] = v
                    attrValue = markupAttrMap.get(attr)
                    if not self._matches(attrValue, matchAgainst):
                        match = False
                        break
            if match:
                if markup:
                    found = markup
                else:
                    found = markupName
        return found

    def search(self, markup):
        """Match any kind of markup object: a list, a Tag, or a
        string; returns the matched object or None."""
        #print 'looking for %s in %s' % (self, markup)
        found = None
        # If given a list of items, scan it for a text element that
        # matches.
        if hasattr(markup, "__iter__") \
                and not isinstance(markup, Tag):
            for element in markup:
                if isinstance(element, NavigableString) \
                       and self.search(element):
                    found = element
                    break
        # If it's a Tag, make sure its name or attributes match.
        # Don't bother with Tags if we're searching for text.
        elif isinstance(markup, Tag):
            if not self.text:
                found = self.searchTag(markup)
        # If it's text, make sure the text matches.
        elif isinstance(markup, NavigableString) or \
                 isinstance(markup, basestring):
            if self._matches(markup, self.text):
                found = markup
        else:
            raise Exception, "I don't know how to match against a %s" \
                  % markup.__class__
        return found

    def _matches(self, markup, matchAgainst):
        """Core matching rule: True matches anything non-None; a
        callable is applied to the markup; a regexp is searched; a list
        is membership-tested; otherwise compare as strings."""
        #print "Matching %s against %s" % (markup, matchAgainst)
        result = False
        if matchAgainst is True:
            result = markup is not None
        elif callable(matchAgainst):
            result = matchAgainst(markup)
        else:
            #Custom match methods take the tag as an argument, but all
            #other ways of matching match the tag name as a string.
            if isinstance(markup, Tag):
                markup = markup.name
            if markup and not isinstance(markup, basestring):
                markup = unicode(markup)
            #Now we know that chunk is either a string, or None.
            if hasattr(matchAgainst, 'match'):
                # It's a regexp object.
                result = markup and matchAgainst.search(markup)
            elif hasattr(matchAgainst, '__iter__'): # list-like
                result = markup in matchAgainst
            elif hasattr(matchAgainst, 'items'):
                # NOTE(review): matching a dict criterion tests whether
                # the markup (a Tag-like object) has the dict itself as
                # a key -- looks suspicious, but preserved as-is.
                result = markup.has_key(matchAgainst)
            elif matchAgainst and isinstance(markup, basestring):
                # Compare like with like: coerce the criterion to the
                # markup's string type before the equality test below.
                if isinstance(markup, unicode):
                    matchAgainst = unicode(matchAgainst)
                else:
                    matchAgainst = str(matchAgainst)

            if not result:
                result = matchAgainst == markup
        return result
+
class ResultSet(list):
    """A ResultSet is just a list that keeps track of the SoupStrainer
    that created it."""
    def __init__(self, source):
        # Bug fix: the old code called list.__init__([]), initializing
        # a throwaway list instead of this instance. (Harmless in
        # practice -- a list instance starts empty -- but wrong.)
        list.__init__(self)
        # The SoupStrainer that produced these results.
        self.source = source
+
+# Now, some helper functions.
+
def buildTagMap(default, *args):
    """Turns a list of maps, lists, or scalars into a single map.
    Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
    NESTING_RESET_TAGS maps out of lists and partial maps."""
    mapping = {}
    for source in args:
        if hasattr(source, 'items'):
            # A map: merge its explicit key/value pairs.
            for key, value in source.items():
                mapping[key] = value
        elif hasattr(source, '__iter__'):
            # A list: every entry maps to the shared default.
            for key in source:
                mapping[key] = default
        else:
            # A scalar: map it to the default.
            mapping[source] = default
    return mapping
+
+# Now, the parser classes.
+
class BeautifulStoneSoup(Tag, SGMLParser):

    """This class contains the basic parser and search code. It defines
    a parser that knows nothing about tag behavior except for the
    following:

      You can't close a tag without closing all the tags it encloses.
      That is, "<foo><bar></foo>" actually means
      "<foo><bar></bar></foo>".

    [Another possible explanation is "<foo><bar /></foo>", but since
    this class defines no SELF_CLOSING_TAGS, it will never use that
    explanation.]

    This class is useful for parsing XML or made-up markup languages,
    or when BeautifulSoup makes an assumption counter to what you were
    expecting."""

    # Tag-behavior tables; all empty here, overridden by subclasses
    # (e.g. BeautifulSoup) that know about HTML.
    SELF_CLOSING_TAGS = {}
    NESTABLE_TAGS = {}
    RESET_NESTING_TAGS = {}
    QUOTE_TAGS = {}
    PRESERVE_WHITESPACE_TAGS = []

    # Pre-parse regex fixups: insert a space before "/>" in self-closing
    # tags, and strip whitespace after "<!" in declarations -- both
    # forms choke sgmllib.
    MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
                       lambda x: x.group(1) + ' />'),
                      (re.compile('<!\s+([^<>]*)>'),
                       lambda x: '<!' + x.group(1) + '>')
                      ]

    ROOT_TAG_NAME = u'[document]'

    HTML_ENTITIES = "html"
    XML_ENTITIES = "xml"
    XHTML_ENTITIES = "xhtml"
    # TODO: This only exists for backwards-compatibility
    ALL_ENTITIES = XHTML_ENTITIES

    # Used when determining whether a text node is all whitespace and
    # can be replaced with a single space. A text node that contains
    # fancy Unicode spaces (usually non-breaking) should be left
    # alone.
    STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }

    def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
                 markupMassage=True, smartQuotesTo=XML_ENTITIES,
                 convertEntities=None, selfClosingTags=None, isHTML=False):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser.

        sgmllib will process most bad HTML, and the BeautifulSoup
        class has some tricks for dealing with some HTML that kills
        sgmllib, but Beautiful Soup can nonetheless choke or lose data
        if your data uses self-closing tags or declarations
        incorrectly.

        By default, Beautiful Soup uses regexes to sanitize input,
        avoiding the vast majority of these problems. If the problems
        don't apply to you, pass in False for markupMassage, and
        you'll get better performance.

        The default parser massage techniques fix the two most common
        instances of invalid HTML that choke sgmllib:

         <br/> (No space between name of closing tag and tag close)
         <! --Comment--> (Extraneous whitespace in declaration)

        You can pass in a custom list of (RE object, replace method)
        tuples to get Beautiful Soup to scrub your input the way you
        want."""

        self.parseOnlyThese = parseOnlyThese
        self.fromEncoding = fromEncoding
        self.smartQuotesTo = smartQuotesTo
        self.convertEntities = convertEntities
        # Set the rules for how we'll deal with the entities we
        # encounter
        if self.convertEntities:
            # It doesn't make sense to convert encoded characters to
            # entities even while you're converting entities to Unicode.
            # Just convert it all to Unicode.
            self.smartQuotesTo = None
            if convertEntities == self.HTML_ENTITIES:
                self.convertXMLEntities = False
                self.convertHTMLEntities = True
                self.escapeUnrecognizedEntities = True
            elif convertEntities == self.XHTML_ENTITIES:
                self.convertXMLEntities = True
                self.convertHTMLEntities = True
                self.escapeUnrecognizedEntities = False
            elif convertEntities == self.XML_ENTITIES:
                self.convertXMLEntities = True
                self.convertHTMLEntities = False
                self.escapeUnrecognizedEntities = False
        else:
            self.convertXMLEntities = False
            self.convertHTMLEntities = False
            self.escapeUnrecognizedEntities = False

        self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
        SGMLParser.__init__(self)

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        self.markup = markup
        self.markupMassage = markupMassage
        try:
            self._feed(isHTML=isHTML)
        except StopParsing:
            pass
        self.markup = None                 # The markup can now be GCed

    def convert_charref(self, name):
        """This method fixes a bug in Python's SGMLParser."""
        try:
            n = int(name)
        except ValueError:
            return
        if not 0 <= n <= 127 : # ASCII ends at 127, not 255
            return
        return self.convert_codepoint(n)

    def _feed(self, inDocumentEncoding=None, isHTML=False):
        """Decode the stored markup to Unicode (via UnicodeDammit),
        apply the markupMassage fixups, and run it through the SGML
        parser, closing any tags left open at the end."""
        # Convert the document to Unicode.
        markup = self.markup
        if isinstance(markup, unicode):
            if not hasattr(self, 'originalEncoding'):
                self.originalEncoding = None
        else:
            dammit = UnicodeDammit\
                     (markup, [self.fromEncoding, inDocumentEncoding],
                      smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
            markup = dammit.unicode
            self.originalEncoding = dammit.originalEncoding
            self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
        if markup:
            if self.markupMassage:
                if not hasattr(self.markupMassage, "__iter__"):
                    self.markupMassage = self.MARKUP_MASSAGE
                for fix, m in self.markupMassage:
                    markup = fix.sub(m, markup)
                # TODO: We get rid of markupMassage so that the
                # soup object can be deepcopied later on. Some
                # Python installations can't copy regexes. If anyone
                # was relying on the existence of markupMassage, this
                # might cause problems.
                del(self.markupMassage)
        self.reset()

        SGMLParser.feed(self, markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def __getattr__(self, methodName):
        """This method routes method call requests to either the SGMLParser
        superclass or the Tag superclass, depending on the method name."""
        #print "__getattr__ called on %s.%s" % (self.__class__, methodName)

        if methodName.startswith('start_') or methodName.startswith('end_') \
               or methodName.startswith('do_'):
            return SGMLParser.__getattr__(self, methodName)
        elif not methodName.startswith('__'):
            return Tag.__getattr__(self, methodName)
        else:
            raise AttributeError

    def isSelfClosingTag(self, name):
        """Returns true iff the given string is the name of a
        self-closing tag according to this parser."""
        return self.SELF_CLOSING_TAGS.has_key(name) \
               or self.instanceSelfClosingTags.has_key(name)

    def reset(self):
        """Reset both parser superclasses and reinitialize the tag
        stack with this soup object as the (hidden) root tag."""
        Tag.__init__(self, self, self.ROOT_TAG_NAME)
        self.hidden = 1
        SGMLParser.reset(self)
        self.currentData = []
        self.currentTag = None
        self.tagStack = []
        self.quoteStack = []
        self.pushTag(self)

    def popTag(self):
        """Pop the top tag off the stack and return the new current tag."""
        tag = self.tagStack.pop()

        #print "Pop", tag.name
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        """Append *tag* to the current tag's contents and make it the
        new current tag."""
        #print "Push", tag.name
        if self.currentTag:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]

    def endData(self, containerClass=NavigableString):
        """Flush accumulated character data into the tree as a node of
        *containerClass* (NavigableString by default)."""
        if self.currentData:
            currentData = u''.join(self.currentData)
            # Collapse all-ASCII-whitespace text to a single space (or
            # newline), unless inside a whitespace-preserving tag.
            if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
                not set([tag.name for tag in self.tagStack]).intersection(
                    self.PRESERVE_WHITESPACE_TAGS)):
                if '\n' in currentData:
                    currentData = '\n'
                else:
                    currentData = ' '
            self.currentData = []
            # When parsing only selected parts, drop top-level text that
            # the strainer doesn't want.
            if self.parseOnlyThese and len(self.tagStack) <= 1 and \
                   (not self.parseOnlyThese.text or \
                    not self.parseOnlyThese.search(currentData)):
                return
            o = containerClass(currentData)
            o.setup(self.currentTag, self.previous)
            if self.previous:
                self.previous.next = o
            self.previous = o
            self.currentTag.contents.append(o)


    def _popToTag(self, name, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but *not* including the most recent instance of
        the given tag."""
        #print "Popping to %s" % name
        if name == self.ROOT_TAG_NAME:
            return

        numPops = 0
        mostRecentTag = None
        for i in range(len(self.tagStack)-1, 0, -1):
            if name == self.tagStack[i].name:
                numPops = len(self.tagStack)-i
                break
        if not inclusivePop:
            numPops = numPops - 1

        for i in range(0, numPops):
            mostRecentTag = self.popTag()
        return mostRecentTag

    def _smartPop(self, name):

        """We need to pop up to the previous tag of this type, unless
        one of this tag's nesting reset triggers comes between this
        tag and the previous tag of this type, OR unless this tag is a
        generic nesting trigger and another generic nesting trigger
        comes between this tag and the previous tag of this type.

        Examples:
         <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
         <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
         <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.

         <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
         <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
         <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
        """

        nestingResetTriggers = self.NESTABLE_TAGS.get(name)
        isNestable = nestingResetTriggers != None
        isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
        popTo = None
        inclusive = True
        for i in range(len(self.tagStack)-1, 0, -1):
            p = self.tagStack[i]
            if (not p or p.name == name) and not isNestable:
                #Non-nestable tags get popped to the top or to their
                #last occurrence.
                popTo = name
                break
            if (nestingResetTriggers is not None
                and p.name in nestingResetTriggers) \
                or (nestingResetTriggers is None and isResetNesting
                    and self.RESET_NESTING_TAGS.has_key(p.name)):

                #If we encounter one of the nesting reset triggers
                #peculiar to this tag, or we encounter another tag
                #that causes nesting to reset, pop up to but not
                #including that tag.
                popTo = p.name
                inclusive = False
                break
            # NOTE(review): this assignment is overwritten at the top of
            # the next iteration; it appears to have no effect.
            p = p.parent
        if popTo:
            self._popToTag(popTo, inclusive)

    def unknown_starttag(self, name, attrs, selfClosing=0):
        """SGMLParser callback: build a Tag node for an opening tag."""
        #print "Start tag %s: %s" % (name, attrs)
        if self.quoteStack:
            #This is not a real tag.
            #print "<%s> is not real!" % name
            attrs = ''.join([' %s="%s"' % (x, y) for x, y in attrs])
            self.handle_data('<%s%s>' % (name, attrs))
            return
        self.endData()

        if not self.isSelfClosingTag(name) and not selfClosing:
            self._smartPop(name)

        # When parsing only selected parts, skip top-level tags the
        # strainer doesn't want.
        if self.parseOnlyThese and len(self.tagStack) <= 1 \
               and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
            return

        tag = Tag(self, name, attrs, self.currentTag, self.previous)
        if self.previous:
            self.previous.next = tag
        self.previous = tag
        self.pushTag(tag)
        if selfClosing or self.isSelfClosingTag(name):
            self.popTag()
        if name in self.QUOTE_TAGS:
            #print "Beginning quote (%s)" % name
            self.quoteStack.append(name)
            self.literal = 1
        return tag

    def unknown_endtag(self, name):
        """SGMLParser callback: close the matching open tag."""
        #print "End tag %s" % name
        if self.quoteStack and self.quoteStack[-1] != name:
            #This is not a real end tag.
            #print "</%s> is not real!" % name
            self.handle_data('</%s>' % name)
            return
        self.endData()
        self._popToTag(name)
        if self.quoteStack and self.quoteStack[-1] == name:
            self.quoteStack.pop()
            self.literal = (len(self.quoteStack) > 0)

    def handle_data(self, data):
        # Character data is buffered until endData() flushes it.
        self.currentData.append(data)

    def _toStringSubclass(self, text, subclass):
        """Adds a certain piece of text to the tree as a NavigableString
        subclass."""
        self.endData()
        self.handle_data(text)
        self.endData(subclass)

    def handle_pi(self, text):
        """Handle a processing instruction as a ProcessingInstruction
        object, possibly one with a %SOUP-ENCODING% slot into which an
        encoding will be plugged later."""
        if text[:3] == "xml":
            text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
        self._toStringSubclass(text, ProcessingInstruction)

    def handle_comment(self, text):
        "Handle comments as Comment objects."
        self._toStringSubclass(text, Comment)

    def handle_charref(self, ref):
        "Handle character references as data."
        if self.convertEntities:
            data = unichr(int(ref))
        else:
            data = '&#%s;' % ref
        self.handle_data(data)

    def handle_entityref(self, ref):
        """Handle entity references as data, possibly converting known
        HTML and/or XML entity references to the corresponding Unicode
        characters."""
        data = None
        if self.convertHTMLEntities:
            try:
                data = unichr(name2codepoint[ref])
            except KeyError:
                pass

        if not data and self.convertXMLEntities:
            data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)

        if not data and self.convertHTMLEntities and \
            not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
            # TODO: We've got a problem here. We're told this is
            # an entity reference, but it's not an XML entity
            # reference or an HTML entity reference. Nonetheless,
            # the logical thing to do is to pass it through as an
            # unrecognized entity reference.
            #
            # Except: when the input is "&carol;" this function
            # will be called with input "carol". When the input is
            # "AT&T", this function will be called with input
            # "T". We have no way of knowing whether a semicolon
            # was present originally, so we don't know whether
            # this is an unknown entity or just a misplaced
            # ampersand.
            #
            # The more common case is a misplaced ampersand, so I
            # escape the ampersand and omit the trailing semicolon.
            data = "&%s" % ref
        if not data:
            # This case is different from the one above, because we
            # haven't already gone through a supposedly comprehensive
            # mapping of entities to Unicode characters. We might not
            # have gone through any mapping at all. So the chances are
            # very high that this is a real entity, and not a
            # misplaced ampersand.
            data = "&%s;" % ref
        self.handle_data(data)

    def handle_decl(self, data):
        "Handle DOCTYPEs and the like as Declaration objects."
        self._toStringSubclass(data, Declaration)

    def parse_declaration(self, i):
        """Treat a bogus SGML declaration as raw data. Treat a CDATA
        declaration as a CData object."""
        j = None
        if self.rawdata[i:i+9] == '<![CDATA[':
            k = self.rawdata.find(']]>', i)
            if k == -1:
                # Unterminated CDATA: consume to end of input.
                k = len(self.rawdata)
            data = self.rawdata[i+9:k]
            j = k+3
            self._toStringSubclass(data, CData)
        else:
            try:
                j = SGMLParser.parse_declaration(self, i)
            except SGMLParseError:
                toHandle = self.rawdata[i:]
                self.handle_data(toHandle)
                j = i + len(toHandle)
        return j
+
class BeautifulSoup(BeautifulStoneSoup):

    """This parser knows the following facts about HTML:

    * Some tags have no closing tag and should be interpreted as being
      closed as soon as they are encountered.

    * The text inside some tags (ie. 'script') may contain tags which
      are not really part of the document and which should be parsed
      as text, not tags. If you want to parse the text as tags, you can
      always fetch it and parse it explicitly.

    * Tag nesting rules:

      Most tags can't be nested at all. For instance, the occurance of
      a <p> tag should implicitly close the previous <p> tag.

       <p>Para1<p>Para2
        should be transformed into:
       <p>Para1</p><p>Para2

      Some tags can be nested arbitrarily. For instance, the occurance
      of a <blockquote> tag should _not_ implicitly close the previous
      <blockquote> tag.

       Alice said: <blockquote>Bob said: <blockquote>Blah
        should NOT be transformed into:
       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah

      Some tags can be nested, but the nesting is reset by the
      interposition of other tags. For instance, a <tr> tag should
      implicitly close the previous <tr> tag within the same <table>,
      but not close a <tr> tag in another table.

       <table><tr>Blah<tr>Blah
        should be transformed into:
       <table><tr>Blah</tr><tr>Blah
       but,
       <tr>Blah<table><tr>Blah
        should NOT be transformed into
       <tr>Blah<table></tr><tr>Blah

    Differing assumptions about tag nesting rules are a major source
    of problems with the BeautifulSoup class. If BeautifulSoup is not
    treating as nestable a tag your page author treats as nestable,
    try ICantBelieveItsBeautifulSoup, MinimalSoup, or
    BeautifulStoneSoup before writing your own subclass."""

    def __init__(self, *args, **kwargs):
        # Default smart-quote handling to HTML entities, and flag the
        # markup as HTML so the base class enables META-tag charset
        # sniffing (see start_meta below).
        if not kwargs.has_key('smartQuotesTo'):
            kwargs['smartQuotesTo'] = self.HTML_ENTITIES
        kwargs['isHTML'] = True
        BeautifulStoneSoup.__init__(self, *args, **kwargs)

    # Tags with no closing tag: treated as closed as soon as they are
    # encountered.
    SELF_CLOSING_TAGS = buildTagMap(None,
                                    ('br' , 'hr', 'input', 'img', 'meta',
                                     'spacer', 'link', 'frame', 'base', 'col'))

    # Whitespace inside these tags is significant and must be kept.
    PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])

    # The text inside these tags is treated as raw data, not markup.
    QUOTE_TAGS = {'script' : None, 'textarea' : None}

    #According to the HTML standard, each of these inline tags can
    #contain another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_INLINE_TAGS = ('span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
                            'center')

    #According to the HTML standard, these block tags can contain
    #another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_BLOCK_TAGS = ('blockquote', 'div', 'fieldset', 'ins', 'del')

    #Lists can contain other lists, but there are restrictions.
    # (A key's list names the ancestor tags inside which nesting of
    # that key is allowed; an empty list means "nestable anywhere".)
    NESTABLE_LIST_TAGS = { 'ol' : [],
                           'ul' : [],
                           'li' : ['ul', 'ol'],
                           'dl' : [],
                           'dd' : ['dl'],
                           'dt' : ['dl'] }

    #Tables can contain other tables, but there are restrictions.
    NESTABLE_TABLE_TAGS = {'table' : [],
                           'tr' : ['table', 'tbody', 'tfoot', 'thead'],
                           'td' : ['tr'],
                           'th' : ['tr'],
                           'thead' : ['table'],
                           'tbody' : ['table'],
                           'tfoot' : ['table'],
                           }

    # These block tags implicitly close any previous open tag of the
    # same name.
    NON_NESTABLE_BLOCK_TAGS = ('address', 'form', 'p', 'pre')

    #If one of these tags is encountered, all tags up to the next tag of
    #this type are popped.
    RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
                                     NON_NESTABLE_BLOCK_TAGS,
                                     NESTABLE_LIST_TAGS,
                                     NESTABLE_TABLE_TAGS)

    NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
                                NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)

    # Used to detect the charset in a META tag; see start_meta
    # (group 1 is everything up to and including "charset=", group 3
    # is the charset value itself).
    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)

    def start_meta(self, attrs):
        """Beautiful Soup can detect a charset included in a META tag,
        try to convert the document to that charset, and re-parse the
        document from the beginning."""
        httpEquiv = None
        contentType = None
        contentTypeIndex = None
        tagNeedsEncodingSubstitution = False

        # Find the http-equiv and content attributes, remembering where
        # the content attribute sits so it can be rewritten in place.
        for i in range(0, len(attrs)):
            key, value = attrs[i]
            key = key.lower()
            if key == 'http-equiv':
                httpEquiv = value
            elif key == 'content':
                contentType = value
                contentTypeIndex = i

        if httpEquiv and contentType: # It's an interesting meta tag.
            match = self.CHARSET_RE.search(contentType)
            if match:
                if (self.declaredHTMLEncoding is not None or
                    self.originalEncoding == self.fromEncoding):
                    # An HTML encoding was sniffed while converting
                    # the document to Unicode, or an HTML encoding was
                    # sniffed during a previous pass through the
                    # document, or an encoding was specified
                    # explicitly and it worked. Rewrite the meta tag.
                    def rewrite(match):
                        return match.group(1) + "%SOUP-ENCODING%"
                    newAttr = self.CHARSET_RE.sub(rewrite, contentType)
                    attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
                                               newAttr)
                    tagNeedsEncodingSubstitution = True
                else:
                    # This is our first pass through the document.
                    # Go through it again with the encoding information.
                    newCharset = match.group(3)
                    if newCharset and newCharset != self.originalEncoding:
                        self.declaredHTMLEncoding = newCharset
                        self._feed(self.declaredHTMLEncoding)
                        # Abort this pass; _feed has re-parsed the
                        # whole document with the new encoding.
                        raise StopParsing
                    pass
        tag = self.unknown_starttag("meta", attrs)
        if tag and tagNeedsEncodingSubstitution:
            # Flag the tag so the serializer substitutes the real
            # output encoding for %SOUP-ENCODING%.
            tag.containsSubstitutions = True
+
class StopParsing(Exception):
    """Raised internally to abandon the current parsing pass, e.g.
    when a META tag reveals an encoding and the document must be
    re-parsed from the beginning."""
+
class ICantBelieveItsBeautifulSoup(BeautifulSoup):

    """The BeautifulSoup class is oriented towards skipping over
    common HTML errors like unclosed tags. However, sometimes it makes
    errors of its own. For instance, consider this fragment:

     <b>Foo<b>Bar</b></b>

    This is perfectly valid (if bizarre) HTML. However, the
    BeautifulSoup class will implicitly close the first b tag when it
    encounters the second 'b'. It will think the author wrote
    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
    there's no real-world reason to bold something that's already
    bold. When it encounters '</b></b>' it will close two more 'b'
    tags, for a grand total of three tags closed instead of two. This
    can throw off the rest of your document structure. The same is
    true of a number of other tags, listed below.

    It's much more common for someone to forget to close a 'b' tag
    than to actually use nested 'b' tags, and the BeautifulSoup class
    handles the common case. This class handles the not-co-common
    case: where you can't believe someone wrote what they did, but
    it's valid HTML and BeautifulSoup screwed up by assuming it
    wouldn't be."""

    # Inline tags that valid HTML allows (and real pages use) nested
    # inside a tag of the same name. The original tuple listed
    # 'strong' and 'big' twice; the duplicates are removed here.
    # buildTagMap keys on the tag name, so the resulting map is
    # unchanged.
    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
        ('em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
         'cite', 'code', 'dfn', 'kbd', 'samp', 'var', 'b')

    # Block-level tags treated the same way.
    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ('noscript',)

    NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
+
class MinimalSoup(BeautifulSoup):
    """A parser for pathologically bad markup.

    It makes no assumptions at all about tag nesting, while still
    knowing which tags are self-closing, that the contents of <script>
    are Javascript and must not be parsed as markup, that META tags
    may carry encoding information, and so on.

    Because it imposes so few assumptions, it is also a better base
    for subclassing than BeautifulStoneSoup or BeautifulSoup."""

    RESET_NESTING_TAGS = buildTagMap('noscript')
    NESTABLE_TAGS = {}
+
class BeautifulSOAP(BeautifulStoneSoup):
    """This class will push a tag with only a single string child into
    the tag's parent as an attribute. The attribute's name is the tag
    name, and the value is the string child. An example should give
    the flavor of the change:

    <foo><bar>baz</bar></foo>
     =>
    <foo bar="baz"><bar>baz</bar></foo>

    You can then access fooTag['bar'] instead of fooTag.barTag.string.

    This is, of course, useful for scraping structures that tend to
    use subelements instead of attributes, such as SOAP messages. Note
    that it modifies its input, so don't print the modified version
    out.

    I'm not sure how many people really want to use this class; let me
    know if you do. Mainly I like the name."""

    def popTag(self):
        # Before the normal pop, promote a single-string child tag to
        # an attribute of its parent (unless the parent already has an
        # attribute of that name).
        if len(self.tagStack) > 1:
            top = self.tagStack[-1]
            holder = self.tagStack[-2]
            holder._getAttrMap()
            promotable = (isinstance(top, Tag)
                          and len(top.contents) == 1
                          and isinstance(top.contents[0], NavigableString)
                          and top.name not in holder.attrMap)
            if promotable:
                holder[top.name] = top.contents[0]
        BeautifulStoneSoup.popTag(self)
+
+#Enterprise class names! It has come to our attention that some people
+#think the names of the Beautiful Soup parser classes are too silly
+#and "unprofessional" for use in enterprise screen-scraping. We feel
+#your pain! For such-minded folk, the Beautiful Soup Consortium And
+#All-Night Kosher Bakery recommends renaming this file to
+#"RobustParser.py" (or, in cases of extreme enterprisiness,
+#"RobustParserBeanInterface.class") and using the following
+#enterprise-friendly class aliases:
class RobustXMLParser(BeautifulStoneSoup):
    """Enterprise-friendly alias for BeautifulStoneSoup."""
class RobustHTMLParser(BeautifulSoup):
    """Enterprise-friendly alias for BeautifulSoup."""
class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
    """Enterprise-friendly alias for ICantBelieveItsBeautifulSoup."""
class RobustInsanelyWackAssHTMLParser(MinimalSoup):
    """Enterprise-friendly alias for MinimalSoup."""
class SimplifyingSOAPParser(BeautifulSOAP):
    """Enterprise-friendly alias for BeautifulSOAP."""
+
+######################################################
+#
+# Bonus library: Unicode, Dammit
+#
+# This class forces XML data into a standard format (usually to UTF-8
+# or Unicode). It is heavily based on code from Mark Pilgrim's
+# Universal Feed Parser. It does not rewrite the XML or HTML to
+# reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
+# (XML) and BeautifulSoup.start_meta (HTML).
+
+# Autodetects character encodings.
+# Download from http://chardet.feedparser.org/
+try:
+ import chardet
+# import chardet.constants
+# chardet.constants._debug = 1
+except ImportError:
+ chardet = None
+
+# cjkcodecs and iconv_codec make Python know about more character encodings.
+# Both are available from http://cjkpython.i18n.org/
+# They're built in if you use Python 2.4.
+try:
+ import cjkcodecs.aliases
+except ImportError:
+ pass
+try:
+ import iconv_codec
+except ImportError:
+ pass
+
+class UnicodeDammit:
+ """A class for detecting the encoding of a *ML document and
+ converting it to a Unicode string. If the source encoding is
+ windows-1252, can replace MS smart quotes with their HTML or XML
+ equivalents."""
+
+ # This dictionary maps commonly seen values for "charset" in HTML
+ # meta tags to the corresponding Python codec names. It only covers
+ # values that aren't in Python's aliases and can't be determined
+ # by the heuristics in find_codec.
+ CHARSET_ALIASES = { "macintosh" : "mac-roman",
+ "x-sjis" : "shift-jis" }
+
+ def __init__(self, markup, overrideEncodings=[],
+ smartQuotesTo='xml', isHTML=False):
+ self.declaredHTMLEncoding = None
+ self.markup, documentEncoding, sniffedEncoding = \
+ self._detectEncoding(markup, isHTML)
+ self.smartQuotesTo = smartQuotesTo
+ self.triedEncodings = []
+ if markup == '' or isinstance(markup, unicode):
+ self.originalEncoding = None
+ self.unicode = unicode(markup)
+ return
+
+ u = None
+ for proposedEncoding in overrideEncodings:
+ u = self._convertFrom(proposedEncoding)
+ if u: break
+ if not u:
+ for proposedEncoding in (documentEncoding, sniffedEncoding):
+ u = self._convertFrom(proposedEncoding)
+ if u: break
+
+ # If no luck and we have auto-detection library, try that:
+ if not u and chardet and not isinstance(self.markup, unicode):
+ u = self._convertFrom(chardet.detect(self.markup)['encoding'])
+
+ # As a last resort, try utf-8 and windows-1252:
+ if not u:
+ for proposed_encoding in ("utf-8", "windows-1252"):
+ u = self._convertFrom(proposed_encoding)
+ if u: break
+
+ self.unicode = u
+ if not u: self.originalEncoding = None
+
+ def _subMSChar(self, orig):
+ """Changes a MS smart quote character to an XML or HTML
+ entity."""
+ sub = self.MS_CHARS.get(orig)
+ if isinstance(sub, tuple):
+ if self.smartQuotesTo == 'xml':
+ sub = '&#x%s;' % sub[1]
+ else:
+ sub = '&%s;' % sub[0]
+ return sub
+
+ def _convertFrom(self, proposed):
+ proposed = self.find_codec(proposed)
+ if not proposed or proposed in self.triedEncodings:
+ return None
+ self.triedEncodings.append(proposed)
+ markup = self.markup
+
+ # Convert smart quotes to HTML if coming from an encoding
+ # that might have them.
+ if self.smartQuotesTo and proposed.lower() in("windows-1252",
+ "iso-8859-1",
+ "iso-8859-2"):
+ markup = re.compile("([\x80-\x9f])").sub \
+ (lambda(x): self._subMSChar(x.group(1)),
+ markup)
+
+ try:
+ # print "Trying to convert document to %s" % proposed
+ u = self._toUnicode(markup, proposed)
+ self.markup = u
+ self.originalEncoding = proposed
+ except Exception, e:
+ # print "That didn't work!"
+ # print e
+ return None
+ #print "Correct encoding: %s" % proposed
+ return self.markup
+
+ def _toUnicode(self, data, encoding):
+ '''Given a string and its encoding, decodes the string into Unicode.
+ %encoding is a string recognized by encodings.aliases'''
+
+ # strip Byte Order Mark (if present)
+ if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
+ and (data[2:4] != '\x00\x00'):
+ encoding = 'utf-16be'
+ data = data[2:]
+ elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
+ and (data[2:4] != '\x00\x00'):
+ encoding = 'utf-16le'
+ data = data[2:]
+ elif data[:3] == '\xef\xbb\xbf':
+ encoding = 'utf-8'
+ data = data[3:]
+ elif data[:4] == '\x00\x00\xfe\xff':
+ encoding = 'utf-32be'
+ data = data[4:]
+ elif data[:4] == '\xff\xfe\x00\x00':
+ encoding = 'utf-32le'
+ data = data[4:]
+ newdata = unicode(data, encoding)
+ return newdata
+
+ def _detectEncoding(self, xml_data, isHTML=False):
+ """Given a document, tries to detect its XML encoding."""
+ xml_encoding = sniffed_xml_encoding = None
+ try:
+ if xml_data[:4] == '\x4c\x6f\xa7\x94':
+ # EBCDIC
+ xml_data = self._ebcdic_to_ascii(xml_data)
+ elif xml_data[:4] == '\x00\x3c\x00\x3f':
+ # UTF-16BE
+ sniffed_xml_encoding = 'utf-16be'
+ xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
+ elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
+ and (xml_data[2:4] != '\x00\x00'):
+ # UTF-16BE with BOM
+ sniffed_xml_encoding = 'utf-16be'
+ xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
+ elif xml_data[:4] == '\x3c\x00\x3f\x00':
+ # UTF-16LE
+ sniffed_xml_encoding = 'utf-16le'
+ xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
+ elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
+ (xml_data[2:4] != '\x00\x00'):
+ # UTF-16LE with BOM
+ sniffed_xml_encoding = 'utf-16le'
+ xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
+ elif xml_data[:4] == '\x00\x00\x00\x3c':
+ # UTF-32BE
+ sniffed_xml_encoding = 'utf-32be'
+ xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
+ elif xml_data[:4] == '\x3c\x00\x00\x00':
+ # UTF-32LE
+ sniffed_xml_encoding = 'utf-32le'
+ xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
+ elif xml_data[:4] == '\x00\x00\xfe\xff':
+ # UTF-32BE with BOM
+ sniffed_xml_encoding = 'utf-32be'
+ xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
+ elif xml_data[:4] == '\xff\xfe\x00\x00':
+ # UTF-32LE with BOM
+ sniffed_xml_encoding = 'utf-32le'
+ xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
+ elif xml_data[:3] == '\xef\xbb\xbf':
+ # UTF-8 with BOM
+ sniffed_xml_encoding = 'utf-8'
+ xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
+ else:
+ sniffed_xml_encoding = 'ascii'
+ pass
+ except:
+ xml_encoding_match = None
+ xml_encoding_match = re.compile(
+ '^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
+ if not xml_encoding_match and isHTML:
+ regexp = re.compile('<\s*meta[^>]+charset=([^>]*?)[;\'">]', re.I)
+ xml_encoding_match = regexp.search(xml_data)
+ if xml_encoding_match is not None:
+ xml_encoding = xml_encoding_match.groups()[0].lower()
+ if isHTML:
+ self.declaredHTMLEncoding = xml_encoding
+ if sniffed_xml_encoding and \
+ (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
+ 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
+ 'utf-16', 'utf-32', 'utf_16', 'utf_32',
+ 'utf16', 'u16')):
+ xml_encoding = sniffed_xml_encoding
+ return xml_data, xml_encoding, sniffed_xml_encoding
+
+
+ def find_codec(self, charset):
+ return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
+ or (charset and self._codec(charset.replace("-", ""))) \
+ or (charset and self._codec(charset.replace("-", "_"))) \
+ or charset
+
+ def _codec(self, charset):
+ if not charset: return charset
+ codec = None
+ try:
+ codecs.lookup(charset)
+ codec = charset
+ except (LookupError, ValueError):
+ pass
+ return codec
+
+ EBCDIC_TO_ASCII_MAP = None
+ def _ebcdic_to_ascii(self, s):
+ c = self.__class__
+ if not c.EBCDIC_TO_ASCII_MAP:
+ emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
+ 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
+ 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
+ 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
+ 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
+ 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
+ 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
+ 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
+ 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
+ 201,202,106,107,108,109,110,111,112,113,114,203,204,205,
+ 206,207,208,209,126,115,116,117,118,119,120,121,122,210,
+ 211,212,213,214,215,216,217,218,219,220,221,222,223,224,
+ 225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
+ 73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
+ 82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
+ 90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
+ 250,251,252,253,254,255)
+ import string
+ c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
+ ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
+ return s.translate(c.EBCDIC_TO_ASCII_MAP)
+
+ MS_CHARS = { '\x80' : ('euro', '20AC'),
+ '\x81' : ' ',
+ '\x82' : ('sbquo', '201A'),
+ '\x83' : ('fnof', '192'),
+ '\x84' : ('bdquo', '201E'),
+ '\x85' : ('hellip', '2026'),
+ '\x86' : ('dagger', '2020'),
+ '\x87' : ('Dagger', '2021'),
+ '\x88' : ('circ', '2C6'),
+ '\x89' : ('permil', '2030'),
+ '\x8A' : ('Scaron', '160'),
+ '\x8B' : ('lsaquo', '2039'),
+ '\x8C' : ('OElig', '152'),
+ '\x8D' : '?',
+ '\x8E' : ('#x17D', '17D'),
+ '\x8F' : '?',
+ '\x90' : '?',
+ '\x91' : ('lsquo', '2018'),
+ '\x92' : ('rsquo', '2019'),
+ '\x93' : ('ldquo', '201C'),
+ '\x94' : ('rdquo', '201D'),
+ '\x95' : ('bull', '2022'),
+ '\x96' : ('ndash', '2013'),
+ '\x97' : ('mdash', '2014'),
+ '\x98' : ('tilde', '2DC'),
+ '\x99' : ('trade', '2122'),
+ '\x9a' : ('scaron', '161'),
+ '\x9b' : ('rsaquo', '203A'),
+ '\x9c' : ('oelig', '153'),
+ '\x9d' : '?',
+ '\x9e' : ('#x17E', '17E'),
+ '\x9f' : ('Yuml', ''),}
+
+#######################################################################
+
+
+#By default, act as an HTML pretty-printer.
if __name__ == '__main__':
    import sys
    # Parse an HTML document from standard input and emit an
    # indented, pretty-printed rendering of the parse tree.
    soup = BeautifulSoup(sys.stdin)
    print soup.prettify()
--- /dev/null
+import sys
+from rss_sqlite import Listing
+from xml import sax
+from cgi import escape
+from re import sub
+from htmlentitydefs import name2codepoint
+from gconf import client_get_default
+
+import logging
+logger = logging.getLogger(__name__)
+
def unescape(text):
    """Replace HTML character references and named entities in *text*
    with the characters they stand for.

    Handles decimal ("&#65;") and hexadecimal ("&#x41;") character
    references as well as named entities ("&amp;"). Anything that
    cannot be resolved is left untouched.
    """
    def fixup(m):
        text = m.group(0)
        if text[:2] == "&#":
            # Numeric character reference (decimal or hexadecimal).
            try:
                if text[:3] == "&#x":
                    return unichr(int(text[3:-1], 16))
                else:
                    return unichr(int(text[2:-1]))
            except ValueError:
                pass
        else:
            # Named entity, e.g. "&amp;".
            try:
                text = unichr(name2codepoint[text[1:-1]])
            except KeyError:
                pass
        return text  # leave as is
    # Raw string fixes the "\w" escape-sequence lint in the pattern.
    return sub(r"&#?\w+;", fixup, text)
+
def sanitize(text):
    """HTML-escape *text* and force it to pure ASCII, turning any
    non-ASCII characters into numeric character references."""
    from cgi import escape
    escaped = escape(text)
    return escaped.encode('ascii', 'xmlcharrefreplace')
+
class XmlHandler():
    """Renders the feed listing as XML documents for the front-end.

    NOTE(review): do_GET reads request-handler attributes (self.path,
    send_response, wfile, ...) that this class does not define, so it
    appears to be designed for mixing into a BaseHTTPRequestHandler
    subclass -- confirm before using it stand-alone.
    """

    def __init__(self, listing):
        # Listing object giving access to categories, feeds and articles.
        self.listing = listing

    def getConfigXml(self):
        """Return the front-end configuration as an XML document."""
        xml = "<?xml version=\"1.0\" encoding=\"utf-8\"?><xml>"
        xml += "<hideReadFeed>True</hideReadFeed>"
        xml += "<hideReadArticles>True</hideReadArticles>"
        xml += "</xml>"
        return xml

    def generateCategoryXml(self):
        """Return all categories (name and id) as an XML document."""
        xml = "<?xml version=\"1.0\" encoding=\"utf-8\"?><xml>"
        for cat in self.listing.getListOfCategories():
            xml += "<category>"
            xml += "<catname>%s</catname>" %sanitize(self.listing.getCategoryTitle(cat))
            xml += "<catid>%s</catid>" % cat
            xml += "</category>"
        xml += "</xml>"
        return xml

    def fix_title(self, title):
        """Strip markup that breaks the list view out of an article
        title, then re-escape the result for embedding in XML."""
        return escape(unescape(title).replace("<em>","").replace("</em>","").replace("<nobr>","").replace("</nobr>","").replace("<wbr>","").replace("—","-"))

    def generateFeedsXml(self, catid):
        """Return the feeds of category *catid* as an XML document."""
        xml = "<?xml version=\"1.0\" encoding=\"utf-8\"?><xml>"
        for key in self.listing.getSortedListOfKeys("Manual", category=catid):
            xml += "<feed>"
            xml += "<feedname>%s</feedname>" %sanitize(self.listing.getFeedTitle(key))
            xml += "<feedid>%s</feedid>" %key
            xml += "<unread>%s</unread>" %self.listing.getFeedNumberOfUnreadItems(key)
            xml += "<updatedDate>%s</updatedDate>" %self.listing.getFeedUpdateTime(key)
            xml += "<icon>%s</icon>" %self.listing.getFavicon(key)
            xml += "<updating>False</updating>"
            xml += "</feed>"
        xml += "</xml>"
        return xml

    def generateArticlesXml(self, key, onlyUnread, markAllAsRead="False"):
        """Return the articles of feed *key* as an XML document.

        markAllAsRead is accepted because do_GET passes it, but it is
        not acted upon yet. TODO(review): implement it or drop the
        query parameter. (Bug fix: do_GET previously called this
        method with three arguments although it accepted only two,
        raising TypeError on every /articles request.)
        """
        feed = self.listing.getFeed(key)
        xml = "<?xml version=\"1.0\" encoding=\"utf-8\"?><xml>"
        if onlyUnread == "False":
            onlyUnread = False
        for id in feed.getIds(onlyUnread):
            xml += "<article>"
            xml += "<title>%s</title>" %self.fix_title(feed.getTitle(id))
            xml += "<articleid>%s</articleid>" %id
            xml += "<unread>%s</unread>" %str(feed.isEntryRead(id))
            xml += "<updatedDate>%s</updatedDate>" %feed.getDateStamp(id)
            xml += "<path>%s</path>" %feed.getContentLink(id)
            xml += "</article>"
        xml += "</xml>"
        return xml

    def do_GET(self):
        """Dispatch an HTTP GET request path to the XML generators.

        Bug fix: references to the undefined global 'listing' now use
        self.listing. NOTE(review): 'updatingFeeds', self.getCommands,
        self.openTaskSwitch and self.updateAll are still unresolved in
        this file -- presumably supplied by a subclass or mixin
        target; verify before relying on those branches.
        """
        (req, sep, arg) = self.path.partition("?")
        request = req.split("/")
        arguments = {}
        if arg != "":
            # Parse "a=1&b=2"-style query arguments into a dict.
            for pair in arg.split("&"):
                ele = pair.split("=")
                arguments[ele[0]] = ele[1]
        if request[1] == "categories":
            xml = self.generateCategoryXml()
        elif request[1] == "feeds":
            catid = request[2]
            xml = self.generateFeedsXml(catid)
        elif request[1] == "articles":
            key = request[2]
            onlyUnread = arguments.get("onlyUnread","False")
            markAllAsRead = arguments.get("markAllAsRead", "False")
            xml = self.generateArticlesXml(key, onlyUnread, markAllAsRead)
        elif request[1] == "html":
            key = request[2]
            article = request[3]
            feed = self.listing.getFeed(key)
            try:
                file = open(feed.getContentLink(article))
                html = file.read().replace("body", "body bgcolor='#ffffff'", 1)
                file.close()
            except Exception:
                # Best-effort: serve an error page rather than a 500.
                html = "<html><body>Error retrieving article</body></html>"
            self.send_response(200)
            self.send_header("Content-type", "text/html")
            self.end_headers()
            self.wfile.write(html)
            return
        elif request[1] == "isUpdating":
            xml = "<xml>"
            key = request[2]
            if (key in updatingFeeds) or ((key=="") and (len(updatingFeeds)>0)):
                xml += "<updating>True</updating>"
            else:
                xml += "<updating>False</updating>"
            xml += self.getCommands()
            xml += "</xml>"
        elif request[1] == "read":
            key = request[2]
            article = request[3]
            feed = self.listing.getFeed(key)
            feed.setEntryRead(article)
            self.listing.updateUnread(key)
            self.send_response(200)
            self.send_header("Content-type", "text/html")
            self.end_headers()
            self.wfile.write("OK")
            return
        elif request[1] == "config":
            xml = self.getConfigXml()
        elif request[1] == "home":
            file = open(self.path)
            self.send_response(200)
            self.send_header("Content-type", "text/html")
            self.end_headers()
            self.wfile.write(file.read())
            file.close()
            return
        elif request[1] == "task":
            self.openTaskSwitch()
            xml = "<xml>OK</xml>"
        elif request[1] == "deleteCat":
            key = request[2]
            self.listing.removeCategory(key)
            xml = "<xml>OK</xml>"
        elif request[1] == "deleteFeed":
            # NOTE(review): the feed id comes from request[3], not
            # request[2] -- presumably /deleteFeed/<cat>/<feed>; confirm.
            key = request[3]
            self.listing.removeFeed(key)
            xml = "<xml>OK</xml>"
        elif request[1] == "addFeed":
            cat = request[2]
            name = request[3]
            url = arguments.get("url","")
            self.listing.addFeed(name, url, category=cat)
            xml = "<xml>OK</xml>"
        elif request[1] == "updateFeed":
            key = request[2]
            self.listing.updateFeed (key, priority=-1)
            xml = "<xml>OK</xml>"
        elif request[1]=="updateAll":
            self.updateAll()
            xml = "<xml>OK</xml>"
        elif request[1] == "addCat":
            catName = request[2]
            self.listing.addCategory(catName)
            xml = "<xml>OK</xml>"
        else:
            self.send_error(404, "File not found")
            return
        self.send_response(200)
        self.send_header("Content-type", "text/xml")
        self.end_headers()
        self.wfile.write(xml.encode("utf-8"))
--- /dev/null
+#!/usr/bin/env python2.5
+
+#
+# Copyright (c) 2007-2008 INdT.
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+# ============================================================================
+# Name : FeedingIt.py
+# Author : Yves Marcoz
+# Version : 0.6.1
+# Description : Simple RSS Reader
+# ============================================================================
+#try:
+# import gtk
+# import hildon
+# from gobject import idle_add
+#except:
+# pass
+
+from ConfigParser import RawConfigParser
+from gconf import client_get_default
+from urllib2 import ProxyHandler
+from mainthread import mainthread
+import logging
+logger = logging.getLogger(__name__)
+
+VERSION = "52"
+
+section = "FeedingIt"
+ranges = { "updateInterval":[0.5, 1, 2, 4, 12, 24], "expiry":[24, 48, 72, 144, 288], "fontSize":range(12,24), "orientation":["Automatic", "Landscape", "Portrait"], "artFontSize":[10, 12, 14, 16, 18, 20], "feedsort":["Manual", "Most unread", "Least unread", "Most recent", "Least recent"] }
+titles = {"updateInterval":"Auto-update interval", "expiry":"Delete articles", "fontSize":"List font size", "orientation":"Display orientation", "artFontSize":"Article font size","feedsort":"Feed sort order"}
+subtitles = {"updateInterval":"Every %s hours", "expiry":"After %s hours", "fontSize":"%s pixels", "orientation":"%s", "artFontSize":"%s pixels", "feedsort":"%s"}
+
+class Config():
+ def __init__(self, parent, configFilename):
+ self.configFilename = configFilename
+ self.parent = parent
+ # Load config
+ self.loadConfig()
+
+ # Backup current settings for later restore
+ self.config_backup = dict(self.config)
+ self.do_restore_backup = True
+
+ def on_save_button_clicked(self, button):
+ self.do_restore_backup = False
+ self.window.destroy()
+
+ def createDialog(self):
+ import gtk
+ import hildon
+ from gobject import idle_add
+ self.window = gtk.Dialog("Settings", self.parent)
+ self.window.set_geometry_hints(min_height=600)
+
+ save_button = self.window.add_button(gtk.STOCK_SAVE, gtk.RESPONSE_OK)
+ save_button.connect('clicked', self.on_save_button_clicked)
+ #self.window.set_default_size(-1, 600)
+ panArea = hildon.PannableArea()
+
+ vbox = gtk.VBox(False, 2)
+ self.buttons = {}
+
+ def heading(text):
+ l = gtk.Label()
+ l.set_size_request(-1, 6)
+ vbox.pack_start(l, expand=False)
+ vbox.pack_start(gtk.Frame(text), expand=False)
+
+ def add_setting(setting):
+ picker = hildon.PickerButton(gtk.HILDON_SIZE_FINGER_HEIGHT, hildon.BUTTON_ARRANGEMENT_VERTICAL)
+ selector = self.create_selector(ranges[setting], setting)
+ picker.set_selector(selector)
+ picker.set_title(titles[setting])
+ picker.set_text(titles[setting], subtitles[setting] % self.config[setting])
+ picker.set_name('HildonButton-finger')
+ picker.set_alignment(0,0,1,1)
+ self.buttons[setting] = picker
+ vbox.pack_start(picker, expand=False)
+
+ button = hildon.Button(gtk.HILDON_SIZE_FINGER_HEIGHT, hildon.BUTTON_ARRANGEMENT_VERTICAL)
+ button.set_label("View Known Issues and Tips")
+ button.connect("clicked", self.button_tips_clicked)
+ button.set_alignment(0,0,1,1)
+ vbox.pack_start(button, expand=False)
+
+ heading('Display')
+ add_setting('fontSize')
+ add_setting('artFontSize')
+ add_setting('orientation')
+ add_setting('feedsort')
+ button = hildon.CheckButton(gtk.HILDON_SIZE_FINGER_HEIGHT)
+ button.set_label("Hide read feeds")
+ button.set_active(self.config["hidereadfeeds"])
+ button.connect("toggled", self.button_toggled, "hidereadfeeds")
+ vbox.pack_start(button, expand=False)
+
+ button = hildon.CheckButton(gtk.HILDON_SIZE_FINGER_HEIGHT)
+ button.set_label("Hide read articles")
+ button.set_active(self.config["hidereadarticles"])
+ button.connect("toggled", self.button_toggled, "hidereadarticles")
+ vbox.pack_start(button, expand=False)
+
+
+ heading('Updating')
+ button = hildon.CheckButton(gtk.HILDON_SIZE_FINGER_HEIGHT)
+ button.set_label("Automatically update feeds")
+ button.set_active(self.config["autoupdate"])
+ button.connect("toggled", self.button_toggled, "autoupdate")
+ vbox.pack_start(button, expand=False)
+ add_setting('updateInterval')
+ add_setting('expiry')
+
+ heading('Network')
+ button = hildon.CheckButton(gtk.HILDON_SIZE_FINGER_HEIGHT)
+ button.set_label('Cache images')
+ button.set_active(self.config["imageCache"])
+ button.connect("toggled", self.button_toggled, "imageCache")
+ vbox.pack_start(button, expand=False)
+
+ button = hildon.CheckButton(gtk.HILDON_SIZE_FINGER_HEIGHT)
+ button.set_label("Use HTTP proxy")
+ button.set_active(self.config["proxy"])
+ button.connect("toggled", self.button_toggled, "proxy")
+ vbox.pack_start(button, expand=False)
+
+ button = hildon.CheckButton(gtk.HILDON_SIZE_FINGER_HEIGHT)
+ button.set_label('Open links in external browser')
+ button.set_active(self.config["extBrowser"])
+ button.connect("toggled", self.button_toggled, "extBrowser")
+ vbox.pack_start(button, expand=False)
+
+ panArea.add_with_viewport(vbox)
+
+ self.window.vbox.add(panArea)
+ self.window.connect("destroy", self.onExit)
+ #self.window.add(self.vbox)
+ self.window.set_default_size(-1, 600)
+ self.window.show_all()
+ return self.window
+
+ def button_tips_clicked(self, *widget):
+ import dbus
+ bus = dbus.SessionBus()
+ proxy = bus.get_object("com.nokia.osso_browser", "/com/nokia/osso_browser/request")
+ iface = dbus.Interface(proxy, 'com.nokia.osso_browser')
+ iface.open_new_window("http://feedingit.marcoz.org/news/?page_id=%s" % VERSION)
+
    def onExit(self, *widget):
        """Close the settings dialog, reverting unsaved changes first.

        Order matters: the backup must replace self.config *before*
        saveConfig() writes the file, otherwise the discarded edits would
        be persisted anyway.
        """
        # When the dialog is closed without hitting
        # the "Save" button, restore the configuration
        if self.do_restore_backup:
            logger.debug('Restoring configuration')
            self.config = self.config_backup

        self.saveConfig()
        self.window.destroy()
+
+ def button_toggled(self, widget, configName):
+ #print "widget", widget.get_active()
+ if (widget.get_active()):
+ self.config[configName] = True
+ else:
+ self.config[configName] = False
+ #print "autoup", self.autoupdate
+ self.saveConfig()
+
+ def selection_changed(self, selector, button, setting):
+ from gobject import idle_add
+ current_selection = selector.get_current_text()
+ if current_selection:
+ self.config[setting] = current_selection
+ idle_add(self.updateButton, setting)
+ self.saveConfig()
+
    def updateButton(self, setting):
        # Refresh the picker button's title/subtitle to show the current value
        # of `setting` (titles/subtitles are module-level lookup tables).
        self.buttons[setting].set_text(titles[setting], subtitles[setting] % self.config[setting])
+
+ def loadConfig(self):
+ self.config = {}
+ try:
+ configParser = RawConfigParser()
+ configParser.read(self.configFilename)
+ self.config["fontSize"] = configParser.getint(section, "fontSize")
+ self.config["artFontSize"] = configParser.getint(section, "artFontSize")
+ self.config["expiry"] = configParser.getint(section, "expiry")
+ self.config["autoupdate"] = configParser.getboolean(section, "autoupdate")
+ self.config["updateInterval"] = configParser.getfloat(section, "updateInterval")
+ self.config["orientation"] = configParser.get(section, "orientation")
+ self.config["imageCache"] = configParser.getboolean(section, "imageCache")
+ except:
+ self.config["fontSize"] = 17
+ self.config["artFontSize"] = 14
+ self.config["expiry"] = 24
+ self.config["autoupdate"] = False
+ self.config["updateInterval"] = 4
+ self.config["orientation"] = "Automatic"
+ self.config["imageCache"] = False
+ try:
+ self.config["proxy"] = configParser.getboolean(section, "proxy")
+ except:
+ self.config["proxy"] = True
+ try:
+ self.config["hidereadfeeds"] = configParser.getboolean(section, "hidereadfeeds")
+ self.config["hidereadarticles"] = configParser.getboolean(section, "hidereadarticles")
+ except:
+ self.config["hidereadfeeds"] = False
+ self.config["hidereadarticles"] = False
+ try:
+ self.config["extBrowser"] = configParser.getboolean(section, "extBrowser")
+ except:
+ self.config["extBrowser"] = False
+ try:
+ self.config["feedsort"] = configParser.get(section, "feedsort")
+ except:
+ self.config["feedsort"] = "Manual"
+
+ def saveConfig(self):
+ configParser = RawConfigParser()
+ configParser.add_section(section)
+ configParser.set(section, 'fontSize', str(self.config["fontSize"]))
+ configParser.set(section, 'artFontSize', str(self.config["artFontSize"]))
+ configParser.set(section, 'expiry', str(self.config["expiry"]))
+ configParser.set(section, 'autoupdate', str(self.config["autoupdate"]))
+ configParser.set(section, 'updateInterval', str(self.config["updateInterval"]))
+ configParser.set(section, 'orientation', str(self.config["orientation"]))
+ configParser.set(section, 'imageCache', str(self.config["imageCache"]))
+ configParser.set(section, 'proxy', str(self.config["proxy"]))
+ configParser.set(section, 'hidereadfeeds', str(self.config["hidereadfeeds"]))
+ configParser.set(section, 'hidereadarticles', str(self.config["hidereadarticles"]))
+ configParser.set(section, 'extBrowser', str(self.config["extBrowser"]))
+ configParser.set(section, 'feedsort', str(self.config["feedsort"]))
+
+ # Writing our configuration file
+ file = open(self.configFilename, 'wb')
+ configParser.write(file)
+ file.close()
+
+ def create_selector(self, choices, setting):
+ import gtk
+ import hildon
+ from gobject import idle_add
+ #self.pickerDialog = hildon.PickerDialog(self.parent)
+ selector = hildon.TouchSelector(text=True)
+ index = 0
+ for item in choices:
+ iter = selector.append_text(str(item))
+ if str(self.config[setting]) == str(item):
+ selector.set_active(0, index)
+ index += 1
+ selector.connect("changed", self.selection_changed, setting)
+ #self.pickerDialog.set_selector(selector)
+ return selector
+ #self.pickerDialog.show_all()
+
    def getFontSize(self):
        # Font size (points) for the feed list.
        return self.config["fontSize"]
    def getArtFontSize(self):
        # Font size (points) for the article view.
        return self.config["artFontSize"]
    def getExpiry(self):
        # Article expiry threshold (presumably hours -- confirm against
        # updateFeed callers).
        return self.config["expiry"]
    def isAutoUpdateEnabled(self):
        return self.config["autoupdate"]
    def getUpdateInterval(self):
        # Normalized to float; the stored value may be str or number.
        return float(self.config["updateInterval"])
    def getReadFont(self):
        # Pango-style font string for already-read items (italic variant).
        return "sans italic %s" % self.config["fontSize"]
    def getUnreadFont(self):
        # Pango-style font string for unread items.
        return "sans %s" % self.config["fontSize"]
    def getOrientation(self):
        # Index of the configured orientation within the module-level
        # ranges["orientation"] choices list.
        return ranges["orientation"].index(self.config["orientation"])
    def getImageCache(self):
        return self.config["imageCache"]
    @mainthread
    def getProxy(self):
        # Return (use_proxy, handler): reads the system HTTP proxy from GConf
        # and wraps it in a urllib2 ProxyHandler.  Decorated @mainthread,
        # presumably because the GConf client must be used from the main
        # thread.
        if self.config["proxy"] == False:
            return (False, None)
        if client_get_default().get_bool('/system/http_proxy/use_http_proxy'):
            port = client_get_default().get_int('/system/http_proxy/port')
            http = client_get_default().get_string('/system/http_proxy/host')
            proxy = ProxyHandler( {"http":"http://%s:%s/"% (http,port)} )
            return (True, proxy)
        return (False, None)
    def getHideReadFeeds(self):
        return self.config["hidereadfeeds"]
    def getHideReadArticles(self):
        return self.config["hidereadarticles"]
    def getOpenInExternalBrowser(self):
        return self.config["extBrowser"]
    def getFeedSortOrder(self):
        return self.config["feedsort"]
--- /dev/null
+# Copyright (c) 2011 Neal H. Walfield
+#
+# This software is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This software is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+from __future__ import with_statement
+import os
+import logging
+import itertools
+import sys
+import string
+import traceback
+import time
+import errno
+import glob
+
# Module state filled in by init(): the module logger, and the excepthook
# that was active before init() installed my_excepthook.
logger = None
original_excepthook = None
+
def my_excepthook(exctype, value, tb):
    """Write an uncaught exception to the log, then defer to the saved hook."""
    formatted = ''.join(traceback.format_exception(exctype, value, tb))
    logger.error("Uncaught exception: %s" % (formatted,))
    original_excepthook(exctype, value, tb)
+
+def init(dot_directory, debug=False, max_logfiles=1, program_name=None):
+ if not os.path.isabs(dot_directory):
+ dot_directory = os.path.join(os.path.expanduser("~"), dot_directory)
+
+ logging_directory = os.path.join(dot_directory, "logging")
+ try:
+ os.makedirs(logging_directory)
+ except OSError, e:
+ if e.errno != errno.EEXIST:
+ raise
+
+ if program_name is None:
+ program_name = os.path.basename(sys.argv[0])
+ string.translate(program_name, string.maketrans(' .', '__'))
+
+ timestamp = time.strftime("%Y%m%d")
+
+ logfiles = glob.glob(os.path.join(logging_directory,
+ program_name + '-*.log'))
+ if len(logfiles) >= max_logfiles:
+ logfiles.sort()
+ for f in logfiles[:-(max_logfiles+1)]:
+ print "Purging old log file %s" % (f,)
+ try:
+ os.remove(f)
+ except OSError, e:
+ print "Removing %s: %s" % (f, str(e))
+
+ logfile = os.path.join(logging_directory,
+ program_name + '-' + timestamp + '.log')
+
+ print "Sending output to %s" % logfile
+
+ global logger
+ logger = logging.getLogger(__name__)
+
+ if debug:
+ level = logging.DEBUG
+ else:
+ level = logging.INFO
+
+ logging.basicConfig(
+ level=level,
+ format=('%(asctime)s (pid: ' + str(os.getpid()) + ') '
+ + '%(levelname)-8s %(message)s'),
+ filename=logfile,
+ filemode='a')
+
+ # Log uncaught exceptions.
+ global original_excepthook
+ original_excepthook = sys.excepthook
+ sys.excepthook = my_excepthook
+
+ def redirect(thing):
+ filename = os.path.join(logging_directory, program_name + '.' + thing)
+ try:
+ with open(filename, "r") as fhandle:
+ contents = fhandle.read()
+ except IOError, e:
+ if e.errno in (errno.ENOENT,):
+ fhandle = None
+ contents = ""
+ else:
+ logging.error("Reading %s: %s" % (filename, str(e)))
+ raise
+
+ logging.error("std%s of last run: %s" % (thing, contents))
+
+ if fhandle is not None:
+ os.remove(filename)
+
+ print "Redirecting std%s to %s" % (thing, filename)
+ return open(filename, "w", 0)
+
+ sys.stderr = redirect('err')
+ sys.stdout = redirect('out')
+
--- /dev/null
from threading import Thread
# BUG FIX: was `import Thread`, which fails with ImportError (Thread is a
# class in the threading module, not a module of its own).


class Download(Thread):
    """Background thread that refreshes a single feed.

    `listing` is the feed store, `key` identifies the feed to update and
    `config` supplies proxy/expiry/image-cache settings.
    """

    def __init__(self, listing, key, config):
        Thread.__init__(self)
        self.listing = listing
        self.key = key
        self.config = config

    def run(self):
        (use_proxy, proxy) = self.config.getProxy()
        # NOTE(review): get_lock is not defined or imported in this file --
        # confirm where it comes from (it looks like a per-feed update lock
        # whose release is tied to the object's lifetime).
        key_lock = get_lock(self.key)
        if key_lock is not None:
            if use_proxy:
                self.listing.updateFeed(self.key, self.config.getExpiry(),
                                        proxy=proxy,
                                        imageCache=self.config.getImageCache())
            else:
                self.listing.updateFeed(self.key, self.config.getExpiry(),
                                        imageCache=self.config.getImageCache())
            # Dropping the reference releases the lock.
            del key_lock
\ No newline at end of file
--- /dev/null
+#!/usr/bin/python
+
+import sys
+
+from PySide import QtGui
+from PySide import QtDeclarative
+import os
+from os import mkdir, remove, stat, environ
+from os.path import isfile, isdir, exists
+
+# Comment the line below if you don't want to use OpenGL for QML rendering or if it is not supported
+from PySide import QtOpenGL, QtCore
+
+from rss_sqlite import Listing
# Per-user data/config directory (falls back to /home/user if $HOME is unset).
CONFIGDIR = environ.get("HOME", "/home/user") + "/.feedingit"
#CONFIGDIR = "/home/user/.feedingit"
+
+import logging
+#logger = logging.getLogger(__name__)
+
+import debugging
+debugging.init(dot_directory=".feedingit", program_name="feedingit-pyside")
+
+from cgi import escape
+from re import sub
+
class FeedWrapper(QtCore.QObject):
    """QML-facing read-only wrapper exposing one feed's metadata.

    All getters delegate to the module-global `listing` using the feed key.
    """
    def __init__(self, key):
        QtCore.QObject.__init__(self)
        self._key = key
    def _name(self):
        return listing.getFeedTitle(self._key)
    def _unread(self):
        return listing.getFeedNumberOfUnreadItems(self._key)
    def _updatedDate(self):
        return listing.getFeedUpdateTime(self._key)
    def _icon(self):
        return listing.getFavicon(self._key)
    def _feedid(self):
        return self._key
    def _updating(self):
        # BUG FIX: was `return false`, which raises NameError (Python has no
        # lowercase `false`).  Update state is not tracked yet, so report False.
        return False

    changed = QtCore.Signal()

    title = QtCore.Property(unicode, _name, notify=changed)
    feedid = QtCore.Property(unicode, _feedid, notify=changed)
    unread = QtCore.Property(unicode, _unread, notify=changed)
    updatedDate= QtCore.Property(unicode, _updatedDate, notify=changed)
    icon = QtCore.Property(unicode, _icon, notify=changed)
    # BUG FIX: `updating` was bound to _icon (copy-paste); bind it to _updating.
    updating = QtCore.Property(bool, _updating, notify=changed)
+
class FeedsModel(QtCore.QAbstractListModel):
    """List model exposing every feed of `_category` under the 'feed' role."""
    COLUMNS = ('feed', )
    # Category whose feeds are listed; None here -- NOTE(review): confirm
    # what Listing.getListOfFeeds does with a None category.
    _category = None

    def __init__(self):
        QtCore.QAbstractListModel.__init__(self)
        # Snapshot of feed keys taken once; not refreshed after construction.
        self._feeds = listing.getListOfFeeds(self._category)
        # Role-number -> role-name mapping required for QML access (role 0 = 'feed').
        self.setRoleNames(dict(enumerate(FeedsModel.COLUMNS)))

    def rowCount(self, parent=QtCore.QModelIndex()):
        return len(self._feeds)

    def data(self, index, role):
        # A fresh wrapper is built per request; debug print left as-is.
        if index.isValid() and role == FeedsModel.COLUMNS.index('feed'):
            print self._feeds[index.row()]
            return FeedWrapper(self._feeds[index.row()])
        return None
+
class ArticleWrapper(QtCore.QObject):
    """QML-facing read-only wrapper around one article of `feed`."""
    def __init__(self, feed, articleid):
        QtCore.QObject.__init__(self)
        self._feed = feed
        self._articleid = articleid
    def _name(self):
        # NOTE(review): fix_title is not defined on this class or anywhere in
        # this file -- confirm where it is meant to come from (calling _name
        # as written would raise AttributeError).
        return self.fix_title(self._feed.getTitle(self._articleid))
    def _unread(self):
        # NOTE(review): despite the property name `unread`, this returns the
        # stringified *read* state (isEntryRead) -- confirm what the QML
        # consumers expect.
        return str(self._feed.isEntryRead(self._articleid))
    def _getarticleid(self):
        return self._articleid
    def _updatedDate(self):
        return self._feed.getDateStamp(self._articleid)
    def _path(self):
        # Filesystem path (or link) to the cached article content.
        return self._feed.getContentLink(self._articleid)

    changed = QtCore.Signal()

    title = QtCore.Property(unicode, _name, notify=changed)
    articleid = QtCore.Property(unicode, _getarticleid, notify=changed)
    unread = QtCore.Property(unicode, _unread, notify=changed)
    updatedDate= QtCore.Property(unicode, _updatedDate, notify=changed)
    path = QtCore.Property(unicode, _path, notify=changed)
+
+class ArticlesModel(QtCore.QAbstractListModel):
+ COLUMNS = ('article', )
+ _articles = []
+ _key = None
+ _feed = None
+
+ def __init__(self,):
+ QtCore.QAbstractListModel.__init__(self)
+ self.setRoleNames(dict(enumerate(ArticlesModel.COLUMNS)))
+
+ def updateModel(self, key):
+ self._key = key
+ self._feed = listing.getFeed(self._key)
+ self._articles = self._feed.getIds()
+
+ def rowCount(self, parent=QtCore.QModelIndex()):
+ print "art " + str(len(self._articles))
+ return len(self._articles)
+
+ def data(self, index, role):
+ print "data" + str(index) + " " + str(role)
+ if index.isValid() and role == ArticlesModel.COLUMNS.index('article'):
+ return ArticleWrapper(self._articles[index.row()])
+ return None
+
class Controller(QtCore.QObject):
    """Bridge object exposed to QML; every @Slot below is callable from QML."""

    def __init__(self, listing):
        QtCore.QObject.__init__(self)
        from XmlHandler import XmlHandler
        self._handler = XmlHandler(listing)

    @QtCore.Slot(str,str, result=str)
    def getArticle(self, key, article):
        # Return the article's cached HTML, forcing a white background on the
        # first occurrence of "body".
        feed = listing.getFeed(key)
        try:
            file = open(feed.getContentLink(article))
            html = file.read().replace("body", "body bgcolor='#ffffff'", 1)
            file.close()
        except:
            # NOTE(review): bare except also hides programming errors; a
            # narrower IOError clause would be safer.
            html = "<html><body>Error retrieving article</body></html>"
        return html

    @QtCore.Slot(str, result=str)
    def getFeedsXml(self, catid):
        # XML listing of the feeds in category `catid`.
        return self._handler.generateFeedsXml(catid)

    @QtCore.Slot(str,result=str)
    def getArticlesXml(self, key):
        #onlyUnread = arguments.get("onlyUnread","False")
        # Read articles are always included ("False" = don't filter them out).
        return self._handler.generateArticlesXml(key, "False")

    @QtCore.Slot(result=str)
    def getCategoryXml(self):
        return self._handler.generateCategoryXml()

    @QtCore.Slot(QtCore.QObject)
    def feedClicked(self, wrapper):
        # Currently a no-op; kept so the QML side has something to call.
        #print 'User clicked on:', wrapper._key
        #articlesModel.updateModel(wrapper._key)
        pass

    @QtCore.Slot(str)
    def updateFeed(self, key):
        print 'updating feed ', key
        listing.updateFeed(key)

    @QtCore.Slot()
    def updateAll(self):
        # NOTE(review): "Manual" is passed where other callers pass a
        # category id -- confirm getListOfFeeds' parameter semantics.
        for feed in listing.getListOfFeeds("Manual"):
            listing.updateFeed(feed)

    @QtCore.Slot(str,str,str)
    def addFeed(self, title, url, catid):
        listing.addFeed(title,url, category=catid)

    @QtCore.Slot(str)
    def addCategory(self, name):
        listing.addCategory(name)

    @QtCore.Slot(str)
    def markAllAsRead(self, key):
        feed = listing.getFeed(key)
        feed.markAllAsRead()

    @QtCore.Slot(str, str)
    def setEntryRead(self, key, articleid):
        feed = listing.getFeed(key)
        feed.setEntryRead(articleid)
        listing.updateUnread(key)

    @QtCore.Slot(str, result=str)
    def getConfig(self, item):
        # Hard-coded settings for now; booleans are stringified for QML.
        if (item == "hideReadFeed"):
            return "True"
        if (item == "hideReadArticles"):
            return "False"
        return ""
+
def main():
    """Application entry point: load config, build models, run the QML UI."""

    if not isdir(CONFIGDIR):
        try:
            mkdir(CONFIGDIR)
        except OSError:
            # BUG FIX: used an undefined module-level `logger` (its creation
            # is commented out above), which raised NameError on this path;
            # log through the logging module directly instead.
            logging.error("Error: Can't create configuration directory")
            from sys import exit
            exit(1)

    from config import Config
    global config
    # BUG FIX: was CONFIGDIR+"config.ini" -- CONFIGDIR has no trailing slash,
    # so the config landed at "~/.feedingitconfig.ini" outside the directory
    # created above.
    config = Config(None, os.path.join(CONFIGDIR, "config.ini"))

    global listing
    listing = Listing(config, CONFIGDIR)

    import mainthread
    mainthread.init()

    from jobmanager import JobManager
    JobManager(True)

    app = QtGui.QApplication(sys.argv)
    view = QtDeclarative.QDeclarativeView()

    global articlesModel
    feedsModel = FeedsModel()
    articlesModel = ArticlesModel()

    controller = Controller(listing)

    rc = view.rootContext()

    rc.setContextProperty('controller', controller)
    rc.setContextProperty('feedsModel', feedsModel)
    rc.setContextProperty('articlesModel', articlesModel)

    # Comment the two lines below if you don't want to use OpenGL for QML
    # rendering or if it is not supported
    glw = QtOpenGL.QGLWidget()
    view.setViewport(glw)

    # Prefer the installed QML; fall back to the in-tree copy for development.
    if os.path.exists('/usr/share/feedingit/qml'):
        view.setSource('/usr/share/feedingit/qml/main.qml')
    else:
        #view.setSource(os.path.join('qml','main.qml'))
        view.setSource(os.path.join('qml','FeedingIt.qml'))

    #view.showFullScreen()
    view.show()
    sys.exit(app.exec_())
+
# Script entry point.
if __name__ == "__main__":

    main()
--- /dev/null
+#!/usr/bin/env python
+"""Universal feed parser
+
+Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds
+
+Visit http://feedparser.org/ for the latest version
+Visit http://feedparser.org/docs/ for the latest documentation
+
+Required: Python 2.4 or later
+Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
+"""
+
+__version__ = "5.0.1"
+__license__ = """Copyright (c) 2002-2008, Mark Pilgrim, All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE."""
+__author__ = "Mark Pilgrim <http://diveintomark.org/>"
+__contributors__ = ["Jason Diamond <http://injektilo.org/>",
+ "John Beimler <http://john.beimler.org/>",
+ "Fazal Majid <http://www.majid.info/mylos/weblog/>",
+ "Aaron Swartz <http://aaronsw.com/>",
+ "Kevin Marks <http://epeus.blogspot.com/>",
+ "Sam Ruby <http://intertwingly.net/>",
+ "Ade Oshineye <http://blog.oshineye.com/>",
+ "Martin Pool <http://sourcefrog.net/>",
+ "Kurt McKee <http://kurtmckee.org/>"]
+
+# HTTP "User-Agent" header to send to servers when downloading feeds.
+# If you are embedding feedparser in a larger application, you should
+# change this to your application name and URL.
+USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % __version__
+
+# HTTP "Accept" header to send to servers when downloading feeds. If you don't
+# want to send an Accept header, set this to None.
+ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"
+
+# List of preferred XML parsers, by SAX driver name. These will be tried first,
+# but if they're not installed, Python will keep searching through its own list
+# of pre-installed parsers until it finds one that supports everything we need.
+PREFERRED_XML_PARSERS = ["drv_libxml2"]
+
+# If you want feedparser to automatically run HTML markup through HTML Tidy, set
+# this to 1. Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
+# or utidylib <http://utidylib.berlios.de/>.
+TIDY_MARKUP = 0
+
+# List of Python interfaces for HTML Tidy, in order of preference. Only useful
+# if TIDY_MARKUP = 1
+PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"]
+
+# If you want feedparser to automatically resolve all relative URIs, set this
+# to 1.
+RESOLVE_RELATIVE_URIS = 1
+
+# If you want feedparser to automatically sanitize all potentially unsafe
+# HTML content, set this to 1.
+SANITIZE_HTML = 1
+
+# ---------- Python 3 modules (make it work if possible) ----------
+try:
+ import rfc822
+except ImportError:
+ from email import _parseaddr as rfc822
+
+try:
+ # Python 3.1 introduces bytes.maketrans and simultaneously
+ # deprecates string.maketrans; use bytes.maketrans if possible
+ _maketrans = bytes.maketrans
+except (NameError, AttributeError):
+ import string
+ _maketrans = string.maketrans
+
+# base64 support for Atom feeds that contain embedded binary data
+try:
+ import base64, binascii
+except ImportError:
+ base64 = binascii = None
+else:
+ # Python 3.1 deprecates decodestring in favor of decodebytes
+ _base64decode = getattr(base64, 'decodebytes', base64.decodestring)
+
def _s2bytes(s):
    """Return `s` encoded as UTF-8 bytes on Python 3; unchanged on Python 2."""
    # Convert a UTF-8 str to bytes if the interpreter is Python 3
    try:
        return bytes(s, 'utf8')
    except (NameError, TypeError):
        # In Python 2.5 and below, bytes doesn't exist (NameError)
        # In Python 2.6 and above, bytes and str are the same (TypeError)
        return s
+
def _l2bytes(l):
    """Return the byte string whose values are the ints in `l` (py2 and py3)."""
    # Convert a list of ints to bytes if the interpreter is Python 3
    try:
        if bytes is not str:
            # In Python 2.6 and above, this call won't raise an exception
            # but it will return bytes([65]) as '[65]' instead of 'A'
            return bytes(l)
        # Force the Python 2 fallback below (bytes is str on 2.6+).
        raise NameError
    except NameError:
        return ''.join(map(chr, l))
+
+# If you want feedparser to allow all URL schemes, set this to ()
+# List culled from Python's urlparse documentation at:
+# http://docs.python.org/library/urlparse.html
+# as well as from "URI scheme" at Wikipedia:
+# https://secure.wikimedia.org/wikipedia/en/wiki/URI_scheme
+# Many more will likely need to be added!
+ACCEPTABLE_URI_SCHEMES = (
+ 'file', 'ftp', 'gopher', 'h323', 'hdl', 'http', 'https', 'imap', 'mailto',
+ 'mms', 'news', 'nntp', 'prospero', 'rsync', 'rtsp', 'rtspu', 'sftp',
+ 'shttp', 'sip', 'sips', 'snews', 'svn', 'svn+ssh', 'telnet', 'wais',
+ # Additional common-but-unofficial schemes
+ 'aim', 'callto', 'cvs', 'facetime', 'feed', 'git', 'gtalk', 'irc', 'ircs',
+ 'irc6', 'itms', 'mms', 'msnim', 'skype', 'ssh', 'smb', 'svn', 'ymsg',
+)
+#ACCEPTABLE_URI_SCHEMES = ()
+
+# ---------- required modules (should come with any Python distribution) ----------
+import cgi
+import copy
+import datetime
+import re
+import struct
+import sys
+import time
+import types
+import urllib
+import urllib2
+import urlparse
+
+from htmlentitydefs import name2codepoint, codepoint2name, entitydefs
+
+try:
+ from io import BytesIO as _StringIO
+except ImportError:
+ try:
+ from cStringIO import StringIO as _StringIO
+ except ImportError:
+ from StringIO import StringIO as _StringIO
+
+# ---------- optional modules (feedparser will work without these, but with reduced functionality) ----------
+
+# gzip is included with most Python distributions, but may not be available if you compiled your own
+try:
+ import gzip
+except ImportError:
+ gzip = None
+try:
+ import zlib
+except ImportError:
+ zlib = None
+
+# If a real XML parser is available, feedparser will attempt to use it. feedparser has
+# been tested with the built-in SAX parser, PyXML, and libxml2. On platforms where the
+# Python distribution does not come with an XML parser (such as Mac OS X 10.2 and some
+# versions of FreeBSD), feedparser will quietly fall back on regex-based parsing.
try:
    import xml.sax
    from xml.sax.saxutils import escape as _xmlescape
except ImportError:
    _XML_AVAILABLE = 0
    def _xmlescape(data,entities={}):
        # Minimal stand-in for xml.sax.saxutils.escape.
        # BUG FIX: the entity text had been lost in this copy (each replace
        # mapped a character to itself, a no-op); restored per upstream
        # feedparser / saxutils semantics.
        data = data.replace('&', '&amp;')
        data = data.replace('>', '&gt;')
        data = data.replace('<', '&lt;')
        for char, entity in entities:
            data = data.replace(char, entity)
        return data
else:
    try:
        xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers
    except xml.sax.SAXReaderNotAvailable:
        _XML_AVAILABLE = 0
    else:
        _XML_AVAILABLE = 1
+
# sgmllib is not available by default in Python 3; if the end user doesn't have
# it available then we'll lose illformed XML parsing, content sanitizing, and
# microformat support (at least while feedparser depends on BeautifulSoup).
try:
    import sgmllib
except ImportError:
    # This is probably Python 3, which doesn't include sgmllib anymore
    _SGML_AVAILABLE = 0

    # Mock sgmllib enough to allow subclassing later on
    class sgmllib(object):
        class SGMLParser(object):
            def goahead(self, i):
                pass
            def parse_starttag(self, i):
                pass
else:
    _SGML_AVAILABLE = 1

    # sgmllib defines a number of module-level regular expressions that are
    # insufficient for the XML parsing feedparser needs. Rather than modify
    # the variables directly in sgmllib, they're defined here using the same
    # names, and the compiled code objects of several sgmllib.SGMLParser
    # methods are copied into _BaseHTMLProcessor so that they execute in
    # feedparser's scope instead of sgmllib's scope.
    charref = re.compile('&#(\d+|[xX][0-9a-fA-F]+);')
    tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
    attrfind = re.compile(
        r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)[$]?(\s*=\s*'
        r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?'
    )

    # Unfortunately, these must be copied over to prevent NameError exceptions
    entityref = sgmllib.entityref
    incomplete = sgmllib.incomplete
    interesting = sgmllib.interesting
    shorttag = sgmllib.shorttag
    shorttagopen = sgmllib.shorttagopen
    starttagopen = sgmllib.starttagopen

    class _EndBracketRegEx:
        def __init__(self):
            # Overriding the built-in sgmllib.endbracket regex allows the
            # parser to find angle brackets embedded in element attributes.
            self.endbracket = re.compile('''([^'"<>]|"[^"]*"(?=>|/|\s|\w+=)|'[^']*'(?=>|/|\s|\w+=))*(?=[<>])|.*?(?=[<>])''')
        def search(self, target, index=0):
            match = self.endbracket.match(target, index)
            if match is not None:
                # Returning a new object in the calling thread's context
                # resolves a thread-safety issue.
                return EndBracketMatch(match)
            return None
    class EndBracketMatch:
        def __init__(self, match):
            self.match = match
        def start(self, n):
            # NOTE(review): returns the *end* of group n; this matches the
            # upstream feedparser code, whose callers expect the position
            # just past the match -- confirm before "fixing".
            return self.match.end(n)
    endbracket = _EndBracketRegEx()
+
+
+# cjkcodecs and iconv_codec provide support for more character encodings.
+# Both are available from http://cjkpython.i18n.org/
+try:
+ import cjkcodecs.aliases
+except ImportError:
+ pass
+try:
+ import iconv_codec
+except ImportError:
+ pass
+
+# chardet library auto-detects character encodings
+# Download from http://chardet.feedparser.org/
+try:
+ import chardet
+except ImportError:
+ chardet = None
+
+# BeautifulSoup parser used for parsing microformats from embedded HTML content
+# http://www.crummy.com/software/BeautifulSoup/
+# feedparser is tested with BeautifulSoup 3.0.x, but it might work with the
+# older 2.x series. If it doesn't, and you can figure out why, I'll accept a
+# patch and modify the compatibility statement accordingly.
+try:
+ import BeautifulSoup
+except ImportError:
+ BeautifulSoup = None
+
# ---------- don't touch these ----------
# Marker exception types for character-encoding and content-type conditions
# detected while parsing; UndeclaredNamespace signals a namespace prefix that
# was used without being declared.
class ThingsNobodyCaresAboutButMe(Exception): pass
class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass
class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe): pass
class NonXMLContentType(ThingsNobodyCaresAboutButMe): pass
class UndeclaredNamespace(Exception): pass
+
+SUPPORTED_VERSIONS = {'': u'unknown',
+ 'rss090': u'RSS 0.90',
+ 'rss091n': u'RSS 0.91 (Netscape)',
+ 'rss091u': u'RSS 0.91 (Userland)',
+ 'rss092': u'RSS 0.92',
+ 'rss093': u'RSS 0.93',
+ 'rss094': u'RSS 0.94',
+ 'rss20': u'RSS 2.0',
+ 'rss10': u'RSS 1.0',
+ 'rss': u'RSS (unknown version)',
+ 'atom01': u'Atom 0.1',
+ 'atom02': u'Atom 0.2',
+ 'atom03': u'Atom 0.3',
+ 'atom10': u'Atom 1.0',
+ 'atom': u'Atom (unknown version)',
+ 'cdf': u'CDF',
+ }
+
class FeedParserDict(dict):
    """Result dictionary with attribute access and legacy key aliasing.

    Old feedparser key names (left side of `keymap`) remain readable and
    writable but are stored under their modern equivalents; a list value
    means several modern keys are tried in order.  A few keys ('category',
    'enclosures', 'license', 'categories') are synthesized on the fly from
    the 'tags'/'links' entries.
    """
    keymap = {'channel': 'feed',
              'items': 'entries',
              'guid': 'id',
              'date': 'updated',
              'date_parsed': 'updated_parsed',
              'description': ['summary', 'subtitle'],
              'url': ['href'],
              'modified': 'updated',
              'modified_parsed': 'updated_parsed',
              'issued': 'published',
              'issued_parsed': 'published_parsed',
              'copyright': 'rights',
              'copyright_detail': 'rights_detail',
              'tagline': 'subtitle',
              'tagline_detail': 'subtitle_detail'}
    def __getitem__(self, key):
        if key == 'category':
            # First tag's term doubles as the single 'category'.
            try:
                return dict.__getitem__(self, 'tags')[0]['term']
            except IndexError:
                raise KeyError, "object doesn't have key 'category'"
        elif key == 'enclosures':
            # Enclosures are the rel='enclosure' links, minus their 'rel' entry.
            norel = lambda link: FeedParserDict([(name,value) for (name,value) in link.items() if name!='rel'])
            return [norel(link) for link in dict.__getitem__(self, 'links') if link['rel']==u'enclosure']
        elif key == 'license':
            # First rel='license' link that actually carries an href.
            for link in dict.__getitem__(self, 'links'):
                if link['rel']==u'license' and link.has_key('href'):
                    return link['href']
        elif key == 'categories':
            return [(tag['scheme'], tag['term']) for tag in dict.__getitem__(self, 'tags')]
        else:
            # Try the modern alias(es) first, then fall through to the key
            # exactly as given.
            realkey = self.keymap.get(key, key)
            if isinstance(realkey, list):
                for k in realkey:
                    if dict.__contains__(self, k):
                        return dict.__getitem__(self, k)
            elif dict.__contains__(self, realkey):
                return dict.__getitem__(self, realkey)
        return dict.__getitem__(self, key)

    def __contains__(self, key):
        # Membership follows __getitem__, so aliases and synthesized keys
        # count as present.
        try:
            self.__getitem__(key)
        except KeyError:
            return False
        else:
            return True

    has_key = __contains__  # Python 2 compatibility alias

    def get(self, key, default=None):
        try:
            return self.__getitem__(key)
        except KeyError:
            return default

    def __setitem__(self, key, value):
        # Writes through a legacy alias land on the modern key (first of a list).
        key = self.keymap.get(key, key)
        if isinstance(key, list):
            key = key[0]
        return dict.__setitem__(self, key, value)

    def setdefault(self, key, value):
        if key not in self:
            self[key] = value
            return value
        return self[key]

    def __getattr__(self, key):
        # __getattribute__() is called first; this will be called
        # only if an attribute was not already found
        try:
            return self.__getitem__(key)
        except KeyError:
            raise AttributeError, "object has no attribute '%s'" % key
+
+
# Translation table built lazily on first use (256-entry byte map).
_ebcdic_to_ascii_map = None
def _ebcdic_to_ascii(s):
    """Translate the EBCDIC byte string `s` to its ASCII equivalent."""
    global _ebcdic_to_ascii_map
    if not _ebcdic_to_ascii_map:
        emap = (
            0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
            16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
            128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
            144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
            32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
            38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
            45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
            186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
            195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,201,
            202,106,107,108,109,110,111,112,113,114,203,204,205,206,207,208,
            209,126,115,116,117,118,119,120,121,122,210,211,212,213,214,215,
            216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,
            123,65,66,67,68,69,70,71,72,73,232,233,234,235,236,237,
            125,74,75,76,77,78,79,80,81,82,238,239,240,241,242,243,
            92,159,83,84,85,86,87,88,89,90,244,245,246,247,248,249,
            48,49,50,51,52,53,54,55,56,57,250,251,252,253,254,255
            )
        _ebcdic_to_ascii_map = _maketrans( \
            _l2bytes(range(256)), _l2bytes(emap))
    return s.translate(_ebcdic_to_ascii_map)
+
+_cp1252 = {
+ unichr(128): unichr(8364), # euro sign
+ unichr(130): unichr(8218), # single low-9 quotation mark
+ unichr(131): unichr( 402), # latin small letter f with hook
+ unichr(132): unichr(8222), # double low-9 quotation mark
+ unichr(133): unichr(8230), # horizontal ellipsis
+ unichr(134): unichr(8224), # dagger
+ unichr(135): unichr(8225), # double dagger
+ unichr(136): unichr( 710), # modifier letter circumflex accent
+ unichr(137): unichr(8240), # per mille sign
+ unichr(138): unichr( 352), # latin capital letter s with caron
+ unichr(139): unichr(8249), # single left-pointing angle quotation mark
+ unichr(140): unichr( 338), # latin capital ligature oe
+ unichr(142): unichr( 381), # latin capital letter z with caron
+ unichr(145): unichr(8216), # left single quotation mark
+ unichr(146): unichr(8217), # right single quotation mark
+ unichr(147): unichr(8220), # left double quotation mark
+ unichr(148): unichr(8221), # right double quotation mark
+ unichr(149): unichr(8226), # bullet
+ unichr(150): unichr(8211), # en dash
+ unichr(151): unichr(8212), # em dash
+ unichr(152): unichr( 732), # small tilde
+ unichr(153): unichr(8482), # trade mark sign
+ unichr(154): unichr( 353), # latin small letter s with caron
+ unichr(155): unichr(8250), # single right-pointing angle quotation mark
+ unichr(156): unichr( 339), # latin small ligature oe
+ unichr(158): unichr( 382), # latin small letter z with caron
+ unichr(159): unichr( 376)} # latin capital letter y with diaeresis
+
+# Matches a URI scheme plus any run of extra slashes right after '://';
+# used to normalize e.g. 'http:////host/x' before joining.
+_urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')
+def _urljoin(base, uri):
+    """Join *uri* against *base*, always returning a unicode string."""
+    # Drop redundant slashes after the scheme (group 2 is discarded).
+    uri = _urifixer.sub(r'\1\3', uri)
+    #try:
+    uri = urlparse.urljoin(base, uri)
+    if not isinstance(uri, unicode):
+        return uri.decode('utf-8', 'ignore')
+    return uri
+    #except:
+    #    uri = urlparse.urlunparse([urllib.quote(part) for part in urlparse.urlparse(uri)])
+    #    return urlparse.urljoin(base, uri)
+
+class _FeedParserMixin:
+    """Shared element-handling logic for the strict and loose feed parsers.
+
+    Subclasses provide the actual SAX/SGML event plumbing; this mixin turns
+    start/end/data events into the FeedParserDict result structure.
+    """
+    # Maps known namespace URIs to the short prefix used when building
+    # handler method names (e.g. the 'dc' prefix routes to _start_dc_*).
+    # An empty-string prefix marks the core RSS/Atom vocabularies.
+    namespaces = {'': '',
+        'http://backend.userland.com/rss': '',
+        'http://blogs.law.harvard.edu/tech/rss': '',
+        'http://purl.org/rss/1.0/': '',
+        'http://my.netscape.com/rdf/simple/0.9/': '',
+        'http://example.com/newformat#': '',
+        'http://example.com/necho': '',
+        'http://purl.org/echo/': '',
+        'uri/of/echo/namespace#': '',
+        'http://purl.org/pie/': '',
+        'http://purl.org/atom/ns#': '',
+        'http://www.w3.org/2005/Atom': '',
+        'http://purl.org/rss/1.0/modules/rss091#': '',
+
+        'http://webns.net/mvcb/': 'admin',
+        'http://purl.org/rss/1.0/modules/aggregation/': 'ag',
+        'http://purl.org/rss/1.0/modules/annotate/': 'annotate',
+        'http://media.tangent.org/rss/1.0/': 'audio',
+        'http://backend.userland.com/blogChannelModule': 'blogChannel',
+        'http://web.resource.org/cc/': 'cc',
+        'http://backend.userland.com/creativeCommonsRssModule': 'creativeCommons',
+        'http://purl.org/rss/1.0/modules/company': 'co',
+        'http://purl.org/rss/1.0/modules/content/': 'content',
+        'http://my.theinfo.org/changed/1.0/rss/': 'cp',
+        'http://purl.org/dc/elements/1.1/': 'dc',
+        'http://purl.org/dc/terms/': 'dcterms',
+        'http://purl.org/rss/1.0/modules/email/': 'email',
+        'http://purl.org/rss/1.0/modules/event/': 'ev',
+        'http://rssnamespace.org/feedburner/ext/1.0': 'feedburner',
+        'http://freshmeat.net/rss/fm/': 'fm',
+        'http://xmlns.com/foaf/0.1/': 'foaf',
+        'http://www.w3.org/2003/01/geo/wgs84_pos#': 'geo',
+        'http://postneo.com/icbm/': 'icbm',
+        'http://purl.org/rss/1.0/modules/image/': 'image',
+        'http://www.itunes.com/DTDs/PodCast-1.0.dtd': 'itunes',
+        'http://example.com/DTDs/PodCast-1.0.dtd': 'itunes',
+        'http://purl.org/rss/1.0/modules/link/': 'l',
+        'http://search.yahoo.com/mrss': 'media',
+        #Version 1.1.2 of the Media RSS spec added the trailing slash on the namespace
+        'http://search.yahoo.com/mrss/': 'media',
+        'http://madskills.com/public/xml/rss/module/pingback/': 'pingback',
+        'http://prismstandard.org/namespaces/1.2/basic/': 'prism',
+        'http://www.w3.org/1999/02/22-rdf-syntax-ns#': 'rdf',
+        'http://www.w3.org/2000/01/rdf-schema#': 'rdfs',
+        'http://purl.org/rss/1.0/modules/reference/': 'ref',
+        'http://purl.org/rss/1.0/modules/richequiv/': 'reqv',
+        'http://purl.org/rss/1.0/modules/search/': 'search',
+        'http://purl.org/rss/1.0/modules/slash/': 'slash',
+        'http://schemas.xmlsoap.org/soap/envelope/': 'soap',
+        'http://purl.org/rss/1.0/modules/servicestatus/': 'ss',
+        'http://hacks.benhammersley.com/rss/streaming/': 'str',
+        'http://purl.org/rss/1.0/modules/subscription/': 'sub',
+        'http://purl.org/rss/1.0/modules/syndication/': 'sy',
+        'http://schemas.pocketsoap.com/rss/myDescModule/': 'szf',
+        'http://purl.org/rss/1.0/modules/taxonomy/': 'taxo',
+        'http://purl.org/rss/1.0/modules/threading/': 'thr',
+        'http://purl.org/rss/1.0/modules/textinput/': 'ti',
+        'http://madskills.com/public/xml/rss/module/trackback/':'trackback',
+        'http://wellformedweb.org/commentAPI/': 'wfw',
+        'http://purl.org/rss/1.0/modules/wiki/': 'wiki',
+        'http://www.w3.org/1999/xhtml': 'xhtml',
+        'http://www.w3.org/1999/xlink': 'xlink',
+        'http://www.w3.org/XML/1998/namespace': 'xml'
+    }
+    # Lazily-built lowercase view of `namespaces`; filled in by __init__
+    # and shared by all instances (class-level cache).
+    _matchnamespaces = {}
+
+    # Elements whose text is a URI that may be relative and must be resolved.
+    can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'icon', 'logo']
+    # Elements whose markup content may itself contain relative URIs.
+    can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
+    # Elements whose markup content must be sanitized before being returned.
+    can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
+    # MIME types treated as HTML-ish for resolution/sanitization purposes.
+    html_types = [u'text/html', u'application/xhtml+xml']
+
+    def __init__(self, baseuri=None, baselang=None, encoding=u'utf-8'):
+        """Initialize all parser state.
+
+        :param baseuri: base URI for resolving relative links
+        :param baselang: document-level default language (xml:lang)
+        :param encoding: character encoding of the source document
+        """
+        if not self._matchnamespaces:
+            # Populate the shared lowercase namespace table exactly once.
+            for k, v in self.namespaces.items():
+                self._matchnamespaces[k.lower()] = v
+        self.feeddata = FeedParserDict() # feed-level data
+        self.encoding = encoding # character encoding
+        self.entries = [] # list of entry-level data
+        self.version = u'' # feed type/version, see SUPPORTED_VERSIONS
+        self.namespacesInUse = {} # dictionary of namespaces defined by the feed
+
+        # the following are used internally to track state;
+        # this is really out of control and should be refactored
+        self.infeed = 0
+        self.inentry = 0
+        self.incontent = 0
+        self.intextinput = 0
+        self.inimage = 0
+        self.inauthor = 0
+        self.incontributor = 0
+        self.inpublisher = 0
+        self.insource = 0
+        self.sourcedata = FeedParserDict()
+        self.contentparams = FeedParserDict()
+        self._summaryKey = None
+        self.namespacemap = {}
+        self.elementstack = []   # stack of [element, expectingText, pieces]
+        self.basestack = []      # xml:base scope stack
+        self.langstack = []      # xml:lang scope stack
+        self.baseuri = baseuri or u''
+        self.lang = baselang or None
+        self.svgOK = 0
+        self.hasTitle = 0
+        if baselang:
+            self.feeddata['language'] = baselang.replace('_','-')
+
+ def _normalize_attributes(self, kv):
+ k = kv[0].lower()
+ v = k in ('rel', 'type') and kv[1].lower() or kv[1]
+ # the sgml parser doesn't handle entities in attributes, nor
+ # does it pass the attribute values through as unicode, while
+ # strict xml parsers do -- account for this difference
+ if isinstance(self, _LooseFeedParser):
+ v = v.replace('&', '&')
+ if not isinstance(v, unicode):
+ v = v.decode('utf-8')
+ return (k, v)
+
+    def unknown_starttag(self, tag, attrs):
+        """Dispatch a start-tag event to its _start_* handler.
+
+        Also tracks xml:base / xml:lang scoping, namespace declarations,
+        and pass-through of inline XHTML content.
+        """
+        # normalize attrs
+        attrs = map(self._normalize_attributes, attrs)
+
+        # track xml:base and xml:lang
+        attrsD = dict(attrs)
+        baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri
+        if not isinstance(baseuri, unicode):
+            baseuri = baseuri.decode(self.encoding, 'ignore')
+        # ensure that self.baseuri is always an absolute URI that
+        # uses a whitelisted URI scheme (e.g. not `javscript:`)
+        if self.baseuri:
+            self.baseuri = _makeSafeAbsoluteURI(self.baseuri, baseuri) or self.baseuri
+        else:
+            self.baseuri = _urljoin(self.baseuri, baseuri)
+        lang = attrsD.get('xml:lang', attrsD.get('lang'))
+        if lang == '':
+            # xml:lang could be explicitly set to '', we need to capture that
+            lang = None
+        elif lang is None:
+            # if no xml:lang is specified, use parent lang
+            lang = self.lang
+        if lang:
+            if tag in ('feed', 'rss', 'rdf:RDF'):
+                self.feeddata['language'] = lang.replace('_','-')
+        self.lang = lang
+        self.basestack.append(self.baseuri)
+        self.langstack.append(lang)
+
+        # track namespaces
+        for prefix, uri in attrs:
+            if prefix.startswith('xmlns:'):
+                self.trackNamespace(prefix[6:], uri)
+            elif prefix == 'xmlns':
+                self.trackNamespace(None, uri)
+
+        # track inline content
+        if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', u'xml').endswith(u'xml'):
+            if tag in ['xhtml:div', 'div']:
+                return # typepad does this 10/2007
+            # element declared itself as escaped markup, but it isn't really
+            self.contentparams['type'] = u'application/xhtml+xml'
+        if self.incontent and self.contentparams.get('type') == u'application/xhtml+xml':
+            if tag.find(':') <> -1:
+                prefix, tag = tag.split(':', 1)
+                namespace = self.namespacesInUse.get(prefix, '')
+                # re-declare MathML/SVG namespaces so the embedded fragment
+                # stays valid when extracted on its own
+                if tag=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
+                    attrs.append(('xmlns',namespace))
+                if tag=='svg' and namespace=='http://www.w3.org/2000/svg':
+                    attrs.append(('xmlns',namespace))
+            if tag == 'svg':
+                self.svgOK += 1
+            # inside inline XHTML content: emit the tag as literal text
+            return self.handle_data('<%s%s>' % (tag, self.strattrs(attrs)), escape=0)
+
+        # match namespaces
+        if tag.find(':') <> -1:
+            prefix, suffix = tag.split(':', 1)
+        else:
+            prefix, suffix = '', tag
+        prefix = self.namespacemap.get(prefix, prefix)
+        if prefix:
+            prefix = prefix + '_'
+
+        # special hack for better tracking of empty textinput/image elements in illformed feeds
+        if (not prefix) and tag not in ('title', 'link', 'description', 'name'):
+            self.intextinput = 0
+        if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'):
+            self.inimage = 0
+
+        # call special handler (if defined) or default handler
+        methodname = '_start_' + prefix + suffix
+        try:
+            method = getattr(self, methodname)
+            return method(attrsD)
+        except AttributeError:
+            # Since there's no handler or something has gone wrong we explicitly add the element and its attributes
+            unknown_tag = prefix + suffix
+            if len(attrsD) == 0:
+                # No attributes so merge it into the encosing dictionary
+                return self.push(unknown_tag, 1)
+            else:
+                # Has attributes so create it in its own dictionary
+                context = self._getContext()
+                context[unknown_tag] = attrsD
+
+    def unknown_endtag(self, tag):
+        """Dispatch an end-tag event to its _end_* handler.
+
+        Mirrors unknown_starttag: pops xml:base / xml:lang scope and
+        closes inline XHTML content.
+        """
+        # match namespaces
+        if tag.find(':') <> -1:
+            prefix, suffix = tag.split(':', 1)
+        else:
+            prefix, suffix = '', tag
+        prefix = self.namespacemap.get(prefix, prefix)
+        if prefix:
+            prefix = prefix + '_'
+        if suffix == 'svg' and self.svgOK:
+            self.svgOK -= 1
+
+        # call special handler (if defined) or default handler
+        methodname = '_end_' + prefix + suffix
+        try:
+            if self.svgOK:
+                # while inside inline SVG, suppress element handlers
+                raise AttributeError()
+            method = getattr(self, methodname)
+            method()
+        except AttributeError:
+            self.pop(prefix + suffix)
+
+        # track inline content
+        if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', u'xml').endswith(u'xml'):
+            # element declared itself as escaped markup, but it isn't really
+            if tag in ['xhtml:div', 'div']:
+                return # typepad does this 10/2007
+            self.contentparams['type'] = u'application/xhtml+xml'
+        if self.incontent and self.contentparams.get('type') == u'application/xhtml+xml':
+            tag = tag.split(':')[-1]
+            self.handle_data('</%s>' % tag, escape=0)
+
+        # track xml:base and xml:lang going out of scope
+        if self.basestack:
+            self.basestack.pop()
+            if self.basestack and self.basestack[-1]:
+                self.baseuri = self.basestack[-1]
+        if self.langstack:
+            self.langstack.pop()
+            if self.langstack: # and (self.langstack[-1] is not None):
+                self.lang = self.langstack[-1]
+
+ def handle_charref(self, ref):
+ # called for each character reference, e.g. for ' ', ref will be '160'
+ if not self.elementstack:
+ return
+ ref = ref.lower()
+ if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
+ text = '&#%s;' % ref
+ else:
+ if ref[0] == 'x':
+ c = int(ref[1:], 16)
+ else:
+ c = int(ref)
+ text = unichr(c).encode('utf-8')
+ self.elementstack[-1][2].append(text)
+
+    def handle_entityref(self, ref):
+        """Buffer a named entity reference (e.g. ref 'copy' for '&copy;')."""
+        if not self.elementstack:
+            return
+        if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
+            # XML-significant entities stay escaped
+            text = '&%s;' % ref
+        elif ref in self.entities.keys():
+            text = self.entities[ref]
+            if text.startswith('&#') and text.endswith(';'):
+                # entity expands to a character reference; recurse to decode it
+                return self.handle_entityref(text)
+        else:
+            try:
+                name2codepoint[ref]
+            except KeyError:
+                # unknown entity: pass it through unchanged
+                text = '&%s;' % ref
+            else:
+                text = unichr(name2codepoint[ref]).encode('utf-8')
+        self.elementstack[-1][2].append(text)
+
+ def handle_data(self, text, escape=1):
+ # called for each block of plain text, i.e. outside of any tag and
+ # not containing any character or entity references
+ if not self.elementstack:
+ return
+ if escape and self.contentparams.get('type') == u'application/xhtml+xml':
+ text = _xmlescape(text)
+ self.elementstack[-1][2].append(text)
+
+    def handle_comment(self, text):
+        # called for each comment, e.g. <!-- insert message here -->
+        # comments carry no feed data; deliberately ignored
+        pass
+
+    def handle_pi(self, text):
+        # called for each processing instruction, e.g. <?instruction>
+        # deliberately ignored
+        pass
+
+    def handle_decl(self, text):
+        # called for the DOCTYPE declaration; deliberately ignored
+        pass
+
+    def parse_declaration(self, i):
+        """Handle '<!...' markup starting at index *i* in self.rawdata.
+
+        Overrides the base parser so CDATA sections are captured as escaped
+        character data.  Returns the index of the first character after the
+        declaration (or the truncation point for incomplete input).
+        """
+        # override internal declaration handler to handle CDATA blocks
+        if self.rawdata[i:i+9] == '<![CDATA[':
+            k = self.rawdata.find(']]>', i)
+            if k == -1:
+                # CDATA block began but didn't finish
+                k = len(self.rawdata)
+                return k
+            self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
+            return k+3
+        else:
+            k = self.rawdata.find('>', i)
+            if k >= 0:
+                return k+1
+            else:
+                # We have an incomplete CDATA block.
+                return k
+
+ def mapContentType(self, contentType):
+ contentType = contentType.lower()
+ if contentType == 'text' or contentType == 'plain':
+ contentType = u'text/plain'
+ elif contentType == 'html':
+ contentType = u'text/html'
+ elif contentType == 'xhtml':
+ contentType = u'application/xhtml+xml'
+ return contentType
+
+    def trackNamespace(self, prefix, uri):
+        """Record a namespace declaration and infer the feed version from it."""
+        loweruri = uri.lower()
+        if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version:
+            self.version = u'rss090'
+        if loweruri == 'http://purl.org/rss/1.0/' and not self.version:
+            self.version = u'rss10'
+        if loweruri == 'http://www.w3.org/2005/atom' and not self.version:
+            self.version = u'atom10'
+        if loweruri.find(u'backend.userland.com/rss') <> -1:
+            # match any backend.userland.com namespace
+            uri = u'http://backend.userland.com/rss'
+            loweruri = uri
+        if self._matchnamespaces.has_key(loweruri):
+            # known namespace: remember the canonical short prefix for it
+            self.namespacemap[prefix] = self._matchnamespaces[loweruri]
+            self.namespacesInUse[self._matchnamespaces[loweruri]] = uri
+        else:
+            self.namespacesInUse[prefix or ''] = uri
+
+    def resolveURI(self, uri):
+        """Resolve *uri* against the current xml:base."""
+        return _urljoin(self.baseuri or u'', uri)
+
+    def decodeEntities(self, element, data):
+        """Hook for subclasses to decode entities in element content; no-op here."""
+        return data
+
+ def strattrs(self, attrs):
+ return ''.join([' %s="%s"' % (t[0],_xmlescape(t[1],{'"':'"'})) for t in attrs])
+
+    def push(self, element, expectingText):
+        """Open *element*: start accumulating its text pieces on the stack."""
+        self.elementstack.append([element, expectingText, []])
+
+    def pop(self, element, stripWhitespace=1):
+        """Close *element*: post-process its accumulated text and store it.
+
+        Runs the full output pipeline (base64 decode, relative-URI
+        resolution, entity decoding, HTML detection, microformat parsing,
+        sanitization, encoding repair) and files the result into the feed,
+        entry or source context as appropriate.  Returns the final value.
+        """
+        if not self.elementstack:
+            return
+        if self.elementstack[-1][0] != element:
+            # mismatched close tag: ignore rather than corrupt the stack
+            return
+
+        element, expectingText, pieces = self.elementstack.pop()
+
+        if self.version == u'atom10' and self.contentparams.get('type', u'text') == u'application/xhtml+xml':
+            # remove enclosing child element, but only if it is a <div> and
+            # only if all the remaining content is nested underneath it.
+            # This means that the divs would be retained in the following:
+            # <div>foo</div><div>bar</div>
+            while pieces and len(pieces)>1 and not pieces[-1].strip():
+                del pieces[-1]
+            while pieces and len(pieces)>1 and not pieces[0].strip():
+                del pieces[0]
+            if pieces and (pieces[0] == '<div>' or pieces[0].startswith('<div ')) and pieces[-1]=='</div>':
+                depth = 0
+                for piece in pieces[:-1]:
+                    if piece.startswith('</'):
+                        depth -= 1
+                        if depth == 0:
+                            break
+                    elif piece.startswith('<') and not piece.endswith('/>'):
+                        depth += 1
+                else:
+                    # loop completed without the wrapper div closing early:
+                    # safe to strip the enclosing <div>...</div>
+                    pieces = pieces[1:-1]
+
+        # Ensure each piece is a str for Python 3
+        for (i, v) in enumerate(pieces):
+            if not isinstance(v, unicode):
+                pieces[i] = v.decode('utf-8')
+
+        output = u''.join(pieces)
+        if stripWhitespace:
+            output = output.strip()
+        if not expectingText:
+            return output
+
+        # decode base64 content
+        if base64 and self.contentparams.get('base64', 0):
+            try:
+                output = _base64decode(output)
+            except binascii.Error:
+                pass
+            except binascii.Incomplete:
+                pass
+            except TypeError:
+                # In Python 3, base64 takes and outputs bytes, not str
+                # This may not be the most correct way to accomplish this
+                output = _base64decode(output.encode('utf-8')).decode('utf-8')
+
+        # resolve relative URIs
+        if (element in self.can_be_relative_uri) and output:
+            output = self.resolveURI(output)
+
+        # decode entities within embedded markup
+        if not self.contentparams.get('base64', 0):
+            output = self.decodeEntities(element, output)
+
+        # some feed formats require consumers to guess
+        # whether the content is html or plain text
+        if not self.version.startswith(u'atom') and self.contentparams.get('type') == u'text/plain':
+            if self.lookslikehtml(output):
+                self.contentparams['type'] = u'text/html'
+
+        # remove temporary cruft from contentparams
+        try:
+            del self.contentparams['mode']
+        except KeyError:
+            pass
+        try:
+            del self.contentparams['base64']
+        except KeyError:
+            pass
+
+        is_htmlish = self.mapContentType(self.contentparams.get('type', u'text/html')) in self.html_types
+        # resolve relative URIs within embedded markup
+        if is_htmlish and RESOLVE_RELATIVE_URIS:
+            if element in self.can_contain_relative_uris:
+                output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', u'text/html'))
+
+        # parse microformats
+        # (must do this before sanitizing because some microformats
+        # rely on elements that we sanitize)
+        if is_htmlish and element in ['content', 'description', 'summary']:
+            mfresults = _parseMicroformats(output, self.baseuri, self.encoding)
+            if mfresults:
+                for tag in mfresults.get('tags', []):
+                    self._addTag(tag['term'], tag['scheme'], tag['label'])
+                for enclosure in mfresults.get('enclosures', []):
+                    self._start_enclosure(enclosure)
+                for xfn in mfresults.get('xfn', []):
+                    self._addXFN(xfn['relationships'], xfn['href'], xfn['name'])
+                vcard = mfresults.get('vcard')
+                if vcard:
+                    self._getContext()['vcard'] = vcard
+
+        # sanitize embedded markup
+        if is_htmlish and SANITIZE_HTML:
+            if element in self.can_contain_dangerous_markup:
+                output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', u'text/html'))
+
+        if self.encoding and not isinstance(output, unicode):
+            output = output.decode(self.encoding, 'ignore')
+
+        # address common error where people take data that is already
+        # utf-8, presume that it is iso-8859-1, and re-encode it.
+        if self.encoding in (u'utf-8', u'utf-8_INVALID_PYTHON_3') and isinstance(output, unicode):
+            try:
+                output = output.encode('iso-8859-1').decode('utf-8')
+            except (UnicodeEncodeError, UnicodeDecodeError):
+                pass
+
+        # map win-1252 extensions to the proper code points
+        if isinstance(output, unicode):
+            output = u''.join([c in _cp1252.keys() and _cp1252[c] or c for c in output])
+
+        # categories/tags/keywords/whatever are handled in _end_category
+        if element == 'category':
+            return output
+
+        # keep only the first title seen for the current feed/entry
+        if element == 'title' and self.hasTitle:
+            return output
+
+        # store output in appropriate place(s)
+        if self.inentry and not self.insource:
+            if element == 'content':
+                self.entries[-1].setdefault(element, [])
+                contentparams = copy.deepcopy(self.contentparams)
+                contentparams['value'] = output
+                self.entries[-1][element].append(contentparams)
+            elif element == 'link':
+                if not self.inimage:
+                    # query variables in urls in link elements are improperly
+                    # converted from `?a=1&b=2` to `?a=1&b;=2` as if they're
+                    # unhandled character references. fix this special case.
+                    output = re.sub("&([A-Za-z0-9_]+);", "&\g<1>", output)
+                    self.entries[-1][element] = output
+                    if output:
+                        self.entries[-1]['links'][-1]['href'] = output
+            else:
+                if element == 'description':
+                    element = 'summary'
+                self.entries[-1][element] = output
+                if self.incontent:
+                    contentparams = copy.deepcopy(self.contentparams)
+                    contentparams['value'] = output
+                    self.entries[-1][element + '_detail'] = contentparams
+        elif (self.infeed or self.insource):# and (not self.intextinput) and (not self.inimage):
+            context = self._getContext()
+            if element == 'description':
+                element = 'subtitle'
+            context[element] = output
+            if element == 'link':
+                # fix query variables; see above for the explanation
+                output = re.sub("&([A-Za-z0-9_]+);", "&\g<1>", output)
+                context[element] = output
+                context['links'][-1]['href'] = output
+            elif self.incontent:
+                contentparams = copy.deepcopy(self.contentparams)
+                contentparams['value'] = output
+                context[element + '_detail'] = contentparams
+        return output
+
+    def pushContent(self, tag, attrsD, defaultContentType, expectingText):
+        """Open a content-bearing element and record its type/language/base."""
+        self.incontent += 1
+        if self.lang:
+            self.lang=self.lang.replace('_','-')
+        self.contentparams = FeedParserDict({
+            'type': self.mapContentType(attrsD.get('type', defaultContentType)),
+            'language': self.lang,
+            'base': self.baseuri})
+        self.contentparams['base64'] = self._isBase64(attrsD, self.contentparams)
+        self.push(tag, expectingText)
+
+    def popContent(self, tag):
+        """Close a content-bearing element; returns its processed value."""
+        value = self.pop(tag)
+        self.incontent -= 1
+        self.contentparams.clear()
+        return value
+
+    # a number of elements in a number of RSS variants are nominally plain
+    # text, but this is routinely ignored. This is an attempt to detect
+    # the most common cases. As false positives often result in silent
+    # data loss, this function errs on the conservative side.
+    @staticmethod
+    def lookslikehtml(s):
+        """Return 1 if *s* is probably HTML mislabeled as plain text.
+
+        NOTE: relies on Python 2 `filter` returning a (possibly empty) list;
+        its truthiness drives the checks below.
+        """
+        # must have a close tag or a entity reference to qualify
+        if not (re.search(r'</(\w+)>',s) or re.search("&#?\w+;",s)):
+            return
+
+        # all tags must be in a restricted subset of valid HTML tags
+        if filter(lambda t: t.lower() not in _HTMLSanitizer.acceptable_elements,
+                  re.findall(r'</?(\w+)',s)):
+            return
+
+        # all entities must have been defined as valid HTML entities
+        if filter(lambda e: e not in entitydefs.keys(), re.findall(r'&(\w+);', s)):
+            return
+
+        return 1
+
+    def _mapToStandardPrefix(self, name):
+        """Rewrite 'prefix:suffix' to use the canonical namespace prefix."""
+        colonpos = name.find(':')
+        if colonpos <> -1:
+            prefix = name[:colonpos]
+            suffix = name[colonpos+1:]
+            prefix = self.namespacemap.get(prefix, prefix)
+            name = prefix + ':' + suffix
+        return name
+
+    def _getAttribute(self, attrsD, name):
+        """Look up *name* in *attrsD* after normalizing its namespace prefix."""
+        return attrsD.get(self._mapToStandardPrefix(name))
+
+ def _isBase64(self, attrsD, contentparams):
+ if attrsD.get('mode', '') == 'base64':
+ return 1
+ if self.contentparams['type'].startswith(u'text/'):
+ return 0
+ if self.contentparams['type'].endswith(u'+xml'):
+ return 0
+ if self.contentparams['type'].endswith(u'/xml'):
+ return 0
+ return 1
+
+ def _itsAnHrefDamnIt(self, attrsD):
+ href = attrsD.get('url', attrsD.get('uri', attrsD.get('href', None)))
+ if href:
+ try:
+ del attrsD['url']
+ except KeyError:
+ pass
+ try:
+ del attrsD['uri']
+ except KeyError:
+ pass
+ attrsD['href'] = href
+ return attrsD
+
+    def _save(self, key, value, overwrite=False):
+        """Store *value* under *key* in the current context.
+
+        By default the first value seen wins; pass overwrite=True to replace.
+        """
+        context = self._getContext()
+        if overwrite:
+            context[key] = value
+        else:
+            context.setdefault(key, value)
+
+    def _start_rss(self, attrsD):
+        """Root <rss> element: pin down the RSS version string."""
+        versionmap = {'0.91': u'rss091u',
+                      '0.92': u'rss092',
+                      '0.93': u'rss093',
+                      '0.94': u'rss094'}
+        #If we're here then this is an RSS feed.
+        #If we don't have a version or have a version that starts with something
+        #other than RSS then there's been a mistake. Correct it.
+        if not self.version or not self.version.startswith(u'rss'):
+            attr_version = attrsD.get('version', '')
+            version = versionmap.get(attr_version)
+            if version:
+                self.version = version
+            elif attr_version.startswith('2.'):
+                self.version = u'rss20'
+            else:
+                self.version = u'rss'
+
+    def _start_channel(self, attrsD):
+        """RSS <channel>: enter feed context."""
+        self.infeed = 1
+        self._cdf_common(attrsD)
+
+    def _cdf_common(self, attrsD):
+        """Handle CDF attributes shared by <channel>/<item> by simulating
+        the equivalent child elements."""
+        if attrsD.has_key('lastmod'):
+            self._start_modified({})
+            self.elementstack[-1][-1] = attrsD['lastmod']
+            self._end_modified()
+        if attrsD.has_key('href'):
+            self._start_link({})
+            self.elementstack[-1][-1] = attrsD['href']
+            self._end_link()
+
+    def _start_feed(self, attrsD):
+        """Atom <feed>: enter feed context and pin down the Atom version."""
+        self.infeed = 1
+        versionmap = {'0.1': u'atom01',
+                      '0.2': u'atom02',
+                      '0.3': u'atom03'}
+        if not self.version:
+            attr_version = attrsD.get('version')
+            version = versionmap.get(attr_version)
+            if version:
+                self.version = version
+            else:
+                self.version = u'atom'
+
+    def _end_channel(self):
+        """Leave feed context (both </channel> and </feed>)."""
+        self.infeed = 0
+    _end_feed = _end_channel
+
+    def _start_image(self, attrsD):
+        """<image>: start collecting image sub-elements."""
+        context = self._getContext()
+        if not self.inentry:
+            context.setdefault('image', FeedParserDict())
+        self.inimage = 1
+        self.hasTitle = 0
+        self.push('image', 0)
+
+    def _end_image(self):
+        self.pop('image')
+        self.inimage = 0
+
+    def _start_textinput(self, attrsD):
+        """<textinput>: start collecting textinput sub-elements."""
+        context = self._getContext()
+        context.setdefault('textinput', FeedParserDict())
+        self.intextinput = 1
+        self.hasTitle = 0
+        self.push('textinput', 0)
+    _start_textInput = _start_textinput
+
+    def _end_textinput(self):
+        self.pop('textinput')
+        self.intextinput = 0
+    _end_textInput = _end_textinput
+
+    def _start_author(self, attrsD):
+        """Author-ish elements: enter author context."""
+        self.inauthor = 1
+        self.push('author', 1)
+        # Append a new FeedParserDict when expecting an author
+        context = self._getContext()
+        context.setdefault('authors', [])
+        context['authors'].append(FeedParserDict())
+    _start_managingeditor = _start_author
+    _start_dc_author = _start_author
+    _start_dc_creator = _start_author
+    _start_itunes_author = _start_author
+
+    def _end_author(self):
+        self.pop('author')
+        self.inauthor = 0
+        self._sync_author_detail()
+    _end_managingeditor = _end_author
+    _end_dc_author = _end_author
+    _end_dc_creator = _end_author
+    _end_itunes_author = _end_author
+
+    def _start_itunes_owner(self, attrsD):
+        """<itunes:owner>: enter publisher context."""
+        self.inpublisher = 1
+        self.push('publisher', 0)
+
+    def _end_itunes_owner(self):
+        self.pop('publisher')
+        self.inpublisher = 0
+        self._sync_author_detail('publisher')
+
+    def _start_contributor(self, attrsD):
+        """Atom <contributor>: enter contributor context."""
+        self.incontributor = 1
+        context = self._getContext()
+        context.setdefault('contributors', [])
+        context['contributors'].append(FeedParserDict())
+        self.push('contributor', 0)
+
+    def _end_contributor(self):
+        self.pop('contributor')
+        self.incontributor = 0
+
+    def _start_dc_contributor(self, attrsD):
+        """<dc:contributor>: text content is the contributor's name."""
+        self.incontributor = 1
+        context = self._getContext()
+        context.setdefault('contributors', [])
+        context['contributors'].append(FeedParserDict())
+        self.push('name', 0)
+
+    def _end_dc_contributor(self):
+        self._end_name()
+        self.incontributor = 0
+
+    def _start_name(self, attrsD):
+        self.push('name', 0)
+    _start_itunes_name = _start_name
+
+    def _end_name(self):
+        # route the name to whichever person context is currently open
+        value = self.pop('name')
+        if self.inpublisher:
+            self._save_author('name', value, 'publisher')
+        elif self.inauthor:
+            self._save_author('name', value)
+        elif self.incontributor:
+            self._save_contributor('name', value)
+        elif self.intextinput:
+            context = self._getContext()
+            context['name'] = value
+    _end_itunes_name = _end_name
+
+    def _start_width(self, attrsD):
+        self.push('width', 0)
+
+    def _end_width(self):
+        # image width; non-numeric values degrade to 0
+        value = self.pop('width')
+        try:
+            value = int(value)
+        except ValueError:
+            value = 0
+        if self.inimage:
+            context = self._getContext()
+            context['width'] = value
+
+    def _start_height(self, attrsD):
+        self.push('height', 0)
+
+    def _end_height(self):
+        # image height; non-numeric values degrade to 0
+        value = self.pop('height')
+        try:
+            value = int(value)
+        except ValueError:
+            value = 0
+        if self.inimage:
+            context = self._getContext()
+            context['height'] = value
+
+    def _start_url(self, attrsD):
+        # url/homepage/uri all map onto 'href'
+        self.push('href', 1)
+    _start_homepage = _start_url
+    _start_uri = _start_url
+
+    def _end_url(self):
+        value = self.pop('href')
+        if self.inauthor:
+            self._save_author('href', value)
+        elif self.incontributor:
+            self._save_contributor('href', value)
+    _end_homepage = _end_url
+    _end_uri = _end_url
+
+    def _start_email(self, attrsD):
+        self.push('email', 0)
+    _start_itunes_email = _start_email
+
+    def _end_email(self):
+        # route the email to whichever person context is currently open
+        value = self.pop('email')
+        if self.inpublisher:
+            self._save_author('email', value, 'publisher')
+        elif self.inauthor:
+            self._save_author('email', value)
+        elif self.incontributor:
+            self._save_contributor('email', value)
+    _end_itunes_email = _end_email
+
+ def _getContext(self):
+ if self.insource:
+ context = self.sourcedata
+ elif self.inimage and self.feeddata.has_key('image'):
+ context = self.feeddata['image']
+ elif self.intextinput:
+ context = self.feeddata['textinput']
+ elif self.inentry:
+ context = self.entries[-1]
+ else:
+ context = self.feeddata
+ return context
+
+    def _save_author(self, key, value, prefix='author'):
+        """Store one author/publisher field in both *_detail and authors[]."""
+        context = self._getContext()
+        context.setdefault(prefix + '_detail', FeedParserDict())
+        context[prefix + '_detail'][key] = value
+        self._sync_author_detail()
+        context.setdefault('authors', [FeedParserDict()])
+        context['authors'][-1][key] = value
+
+    def _save_contributor(self, key, value):
+        """Store one field on the most recently opened contributor."""
+        context = self._getContext()
+        context.setdefault('contributors', [FeedParserDict()])
+        context['contributors'][-1][key] = value
+
+ def _sync_author_detail(self, key='author'):
+ context = self._getContext()
+ detail = context.get('%s_detail' % key)
+ if detail:
+ name = detail.get('name')
+ email = detail.get('email')
+ if name and email:
+ context[key] = u'%s (%s)' % (name, email)
+ elif name:
+ context[key] = name
+ elif email:
+ context[key] = email
+ else:
+ author, email = context.get(key), None
+ if not author:
+ return
+ emailmatch = re.search(ur'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))(\?subject=\S+)?''', author)
+ if emailmatch:
+ email = emailmatch.group(0)
+ # probably a better way to do the following, but it passes all the tests
+ author = author.replace(email, u'')
+ author = author.replace(u'()', u'')
+ author = author.replace(u'<>', u'')
+ author = author.replace(u'<>', u'')
+ author = author.strip()
+ if author and (author[0] == u'('):
+ author = author[1:]
+ if author and (author[-1] == u')'):
+ author = author[:-1]
+ author = author.strip()
+ if author or email:
+ context.setdefault('%s_detail' % key, FeedParserDict())
+ if author:
+ context['%s_detail' % key]['name'] = author
+ if email:
+ context['%s_detail' % key]['email'] = email
+
+    def _start_subtitle(self, attrsD):
+        """Feed subtitle/tagline: plain-text content element."""
+        self.pushContent('subtitle', attrsD, u'text/plain', 1)
+    _start_tagline = _start_subtitle
+    _start_itunes_subtitle = _start_subtitle
+
+    def _end_subtitle(self):
+        self.popContent('subtitle')
+    _end_tagline = _end_subtitle
+    _end_itunes_subtitle = _end_subtitle
+
+    def _start_rights(self, attrsD):
+        """Rights/copyright: plain-text content element."""
+        self.pushContent('rights', attrsD, u'text/plain', 1)
+    _start_dc_rights = _start_rights
+    _start_copyright = _start_rights
+
+    def _end_rights(self):
+        self.popContent('rights')
+    _end_dc_rights = _end_rights
+    _end_copyright = _end_rights
+
+    def _start_item(self, attrsD):
+        """RSS <item> / Atom <entry>: open a new entry."""
+        self.entries.append(FeedParserDict())
+        self.push('item', 0)
+        self.inentry = 1
+        self.guidislink = 0
+        self.hasTitle = 0
+        # RSS 1.0 carries the entry id in rdf:about
+        id = self._getAttribute(attrsD, 'rdf:about')
+        if id:
+            context = self._getContext()
+            context['id'] = id
+        self._cdf_common(attrsD)
+    _start_entry = _start_item
+
+    def _end_item(self):
+        self.pop('item')
+        self.inentry = 0
+    _end_entry = _end_item
+
+    def _start_dc_language(self, attrsD):
+        self.push('language', 1)
+    _start_language = _start_dc_language
+
+    def _end_dc_language(self):
+        # the declared language also becomes the current default lang
+        self.lang = self.pop('language')
+    _end_language = _end_dc_language
+
+    def _start_dc_publisher(self, attrsD):
+        self.push('publisher', 1)
+    _start_webmaster = _start_dc_publisher
+
+    def _end_dc_publisher(self):
+        self.pop('publisher')
+        self._sync_author_detail('publisher')
+    _end_webmaster = _end_dc_publisher
+
+    def _start_published(self, attrsD):
+        self.push('published', 1)
+    _start_dcterms_issued = _start_published
+    _start_issued = _start_published
+
+    def _end_published(self):
+        # store both the raw string and the parsed struct_time
+        value = self.pop('published')
+        self._save('published_parsed', _parse_date(value), overwrite=True)
+    _end_dcterms_issued = _end_published
+    _end_issued = _end_published
+
+    def _start_updated(self, attrsD):
+        self.push('updated', 1)
+    _start_modified = _start_updated
+    _start_dcterms_modified = _start_updated
+    _start_pubdate = _start_updated
+    _start_dc_date = _start_updated
+    _start_lastbuilddate = _start_updated
+
+    def _end_updated(self):
+        # store both the raw string and the parsed struct_time
+        value = self.pop('updated')
+        parsed_value = _parse_date(value)
+        self._save('updated_parsed', parsed_value, overwrite=True)
+    _end_modified = _end_updated
+    _end_dcterms_modified = _end_updated
+    _end_pubdate = _end_updated
+    _end_dc_date = _end_updated
+    _end_lastbuilddate = _end_updated
+
+    def _start_created(self, attrsD):
+        self.push('created', 1)
+    _start_dcterms_created = _start_created
+
+    def _end_created(self):
+        value = self.pop('created')
+        self._save('created_parsed', _parse_date(value), overwrite=True)
+    _end_dcterms_created = _end_created
+
+    def _start_expirationdate(self, attrsD):
+        self.push('expired', 1)
+
+    def _end_expirationdate(self):
+        self._save('expired_parsed', _parse_date(self.pop('expired')), overwrite=True)
+
+    def _start_cc_license(self, attrsD):
+        """<cc:license>: record the license URI as a rel='license' link."""
+        context = self._getContext()
+        value = self._getAttribute(attrsD, 'rdf:resource')
+        attrsD = FeedParserDict()
+        attrsD['rel'] = u'license'
+        if value:
+            attrsD['href']=value
+        context.setdefault('links', []).append(attrsD)
+
+    def _start_creativecommons_license(self, attrsD):
+        self.push('license', 1)
+    _start_creativeCommons_license = _start_creativecommons_license
+
+    def _end_creativecommons_license(self):
+        """License URI arrives as element text; convert it to a link."""
+        value = self.pop('license')
+        context = self._getContext()
+        attrsD = FeedParserDict()
+        attrsD['rel'] = u'license'
+        if value:
+            attrsD['href'] = value
+        context.setdefault('links', []).append(attrsD)
+        # the raw 'license' key is only a staging area; remove it
+        del context['license']
+    _end_creativeCommons_license = _end_creativecommons_license
+
+ def _addXFN(self, relationships, href, name):
+ context = self._getContext()
+ xfn = context.setdefault('xfn', [])
+ value = FeedParserDict({'relationships': relationships, 'href': href, 'name': name})
+ if value not in xfn:
+ xfn.append(value)
+
+ def _addTag(self, term, scheme, label):
+ context = self._getContext()
+ tags = context.setdefault('tags', [])
+ if (not term) and (not scheme) and (not label):
+ return
+ value = FeedParserDict({'term': term, 'scheme': scheme, 'label': label})
+ if value not in tags:
+ tags.append(value)
+
+ def _start_category(self, attrsD):
+ term = attrsD.get('term')
+ scheme = attrsD.get('scheme', attrsD.get('domain'))
+ label = attrsD.get('label')
+ self._addTag(term, scheme, label)
+ self.push('category', 1)
+ _start_dc_subject = _start_category
+ _start_keywords = _start_category
+
+ def _start_media_category(self, attrsD):
+ attrsD.setdefault('scheme', u'http://search.yahoo.com/mrss/category_schema')
+ self._start_category(attrsD)
+
+ def _end_itunes_keywords(self):
+ for term in self.pop('itunes_keywords').split():
+ self._addTag(term, u'http://www.itunes.com/', None)
+
+ def _start_itunes_category(self, attrsD):
+ self._addTag(attrsD.get('text'), u'http://www.itunes.com/', None)
+ self.push('category', 1)
+
+ def _end_category(self):
+ value = self.pop('category')
+ if not value:
+ return
+ context = self._getContext()
+ tags = context['tags']
+ if value and len(tags) and not tags[-1]['term']:
+ tags[-1]['term'] = value
+ else:
+ self._addTag(value, None, None)
+ _end_dc_subject = _end_category
+ _end_keywords = _end_category
+ _end_itunes_category = _end_category
+ _end_media_category = _end_category
+
+ def _start_cloud(self, attrsD):
+ self._getContext()['cloud'] = FeedParserDict(attrsD)
+
+ def _start_link(self, attrsD):
+ attrsD.setdefault('rel', u'alternate')
+ if attrsD['rel'] == u'self':
+ attrsD.setdefault('type', u'application/atom+xml')
+ else:
+ attrsD.setdefault('type', u'text/html')
+ context = self._getContext()
+ attrsD = self._itsAnHrefDamnIt(attrsD)
+ if attrsD.has_key('href'):
+ attrsD['href'] = self.resolveURI(attrsD['href'])
+ expectingText = self.infeed or self.inentry or self.insource
+ context.setdefault('links', [])
+ if not (self.inentry and self.inimage):
+ context['links'].append(FeedParserDict(attrsD))
+ if attrsD.has_key('href'):
+ expectingText = 0
+ if (attrsD.get('rel') == u'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types):
+ context['link'] = attrsD['href']
+ else:
+ self.push('link', expectingText)
+
+ def _end_link(self):
+ value = self.pop('link')
+ context = self._getContext()
+
+ def _start_guid(self, attrsD):
+ self.guidislink = (attrsD.get('ispermalink', 'true') == 'true')
+ self.push('id', 1)
+
+ def _end_guid(self):
+ value = self.pop('id')
+ self._save('guidislink', self.guidislink and not self._getContext().has_key('link'))
+ if self.guidislink:
+ # guid acts as link, but only if 'ispermalink' is not present or is 'true',
+ # and only if the item doesn't already have a link element
+ self._save('link', value)
+
+ def _start_title(self, attrsD):
+ if self.svgOK:
+ return self.unknown_starttag('title', attrsD.items())
+ self.pushContent('title', attrsD, u'text/plain', self.infeed or self.inentry or self.insource)
+ _start_dc_title = _start_title
+ _start_media_title = _start_title
+
+ def _end_title(self):
+ if self.svgOK:
+ return
+ value = self.popContent('title')
+ if not value:
+ return
+ context = self._getContext()
+ self.hasTitle = 1
+ _end_dc_title = _end_title
+
+ def _end_media_title(self):
+ hasTitle = self.hasTitle
+ self._end_title()
+ self.hasTitle = hasTitle
+
+ def _start_description(self, attrsD):
+ context = self._getContext()
+ if context.has_key('summary'):
+ self._summaryKey = 'content'
+ self._start_content(attrsD)
+ else:
+ self.pushContent('description', attrsD, u'text/html', self.infeed or self.inentry or self.insource)
+ _start_dc_description = _start_description
+
+ def _start_abstract(self, attrsD):
+ self.pushContent('description', attrsD, u'text/plain', self.infeed or self.inentry or self.insource)
+
+ def _end_description(self):
+ if self._summaryKey == 'content':
+ self._end_content()
+ else:
+ value = self.popContent('description')
+ self._summaryKey = None
+ _end_abstract = _end_description
+ _end_dc_description = _end_description
+
+ def _start_info(self, attrsD):
+ self.pushContent('info', attrsD, u'text/plain', 1)
+ _start_feedburner_browserfriendly = _start_info
+
+ def _end_info(self):
+ self.popContent('info')
+ _end_feedburner_browserfriendly = _end_info
+
+ def _start_generator(self, attrsD):
+ if attrsD:
+ attrsD = self._itsAnHrefDamnIt(attrsD)
+ if attrsD.has_key('href'):
+ attrsD['href'] = self.resolveURI(attrsD['href'])
+ self._getContext()['generator_detail'] = FeedParserDict(attrsD)
+ self.push('generator', 1)
+
+ def _end_generator(self):
+ value = self.pop('generator')
+ context = self._getContext()
+ if context.has_key('generator_detail'):
+ context['generator_detail']['name'] = value
+
+ def _start_admin_generatoragent(self, attrsD):
+ self.push('generator', 1)
+ value = self._getAttribute(attrsD, 'rdf:resource')
+ if value:
+ self.elementstack[-1][2].append(value)
+ self.pop('generator')
+ self._getContext()['generator_detail'] = FeedParserDict({'href': value})
+
+ def _start_admin_errorreportsto(self, attrsD):
+ self.push('errorreportsto', 1)
+ value = self._getAttribute(attrsD, 'rdf:resource')
+ if value:
+ self.elementstack[-1][2].append(value)
+ self.pop('errorreportsto')
+
+ def _start_summary(self, attrsD):
+ context = self._getContext()
+ if context.has_key('summary'):
+ self._summaryKey = 'content'
+ self._start_content(attrsD)
+ else:
+ self._summaryKey = 'summary'
+ self.pushContent(self._summaryKey, attrsD, u'text/plain', 1)
+ _start_itunes_summary = _start_summary
+
+ def _end_summary(self):
+ if self._summaryKey == 'content':
+ self._end_content()
+ else:
+ self.popContent(self._summaryKey or 'summary')
+ self._summaryKey = None
+ _end_itunes_summary = _end_summary
+
+ def _start_enclosure(self, attrsD):
+ attrsD = self._itsAnHrefDamnIt(attrsD)
+ context = self._getContext()
+ attrsD['rel'] = u'enclosure'
+ context.setdefault('links', []).append(FeedParserDict(attrsD))
+
+ def _start_source(self, attrsD):
+ if 'url' in attrsD:
+ # This means that we're processing a source element from an RSS 2.0 feed
+ self.sourcedata['href'] = attrsD[u'url']
+ self.push('source', 1)
+ self.insource = 1
+ self.hasTitle = 0
+
+ def _end_source(self):
+ self.insource = 0
+ value = self.pop('source')
+ if value:
+ self.sourcedata['title'] = value
+ self._getContext()['source'] = copy.deepcopy(self.sourcedata)
+ self.sourcedata.clear()
+
+ def _start_content(self, attrsD):
+ self.pushContent('content', attrsD, u'text/plain', 1)
+ src = attrsD.get('src')
+ if src:
+ self.contentparams['src'] = src
+ self.push('content', 1)
+
+ def _start_body(self, attrsD):
+ self.pushContent('content', attrsD, u'application/xhtml+xml', 1)
+ _start_xhtml_body = _start_body
+
+ def _start_content_encoded(self, attrsD):
+ self.pushContent('content', attrsD, u'text/html', 1)
+ _start_fullitem = _start_content_encoded
+
+ def _end_content(self):
+ copyToSummary = self.mapContentType(self.contentparams.get('type')) in ([u'text/plain'] + self.html_types)
+ value = self.popContent('content')
+ if copyToSummary:
+ self._save('summary', value)
+
+ _end_body = _end_content
+ _end_xhtml_body = _end_content
+ _end_content_encoded = _end_content
+ _end_fullitem = _end_content
+
+ def _start_itunes_image(self, attrsD):
+ self.push('itunes_image', 0)
+ if attrsD.get('href'):
+ self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')})
+ _start_itunes_link = _start_itunes_image
+
+ def _end_itunes_block(self):
+ value = self.pop('itunes_block', 0)
+ self._getContext()['itunes_block'] = (value == 'yes') and 1 or 0
+
+ def _end_itunes_explicit(self):
+ value = self.pop('itunes_explicit', 0)
+ # Convert 'yes' -> True, 'clean' to False, and any other value to None
+ # False and None both evaluate as False, so the difference can be ignored
+ # by applications that only need to know if the content is explicit.
+ self._getContext()['itunes_explicit'] = (None, False, True)[(value == 'yes' and 2) or value == 'clean' or 0]
+
+ def _start_media_content(self, attrsD):
+ context = self._getContext()
+ context.setdefault('media_content', [])
+ context['media_content'].append(attrsD)
+
+ def _start_media_thumbnail(self, attrsD):
+ context = self._getContext()
+ context.setdefault('media_thumbnail', [])
+ self.push('url', 1) # new
+ context['media_thumbnail'].append(attrsD)
+
+ def _end_media_thumbnail(self):
+ url = self.pop('url')
+ context = self._getContext()
+ if url != None and len(url.strip()) != 0:
+ if not context['media_thumbnail'][-1].has_key('url'):
+ context['media_thumbnail'][-1]['url'] = url
+
+ def _start_media_player(self, attrsD):
+ self.push('media_player', 0)
+ self._getContext()['media_player'] = FeedParserDict(attrsD)
+
+ def _end_media_player(self):
+ value = self.pop('media_player')
+ context = self._getContext()
+ context['media_player']['content'] = value
+
+ def _start_newlocation(self, attrsD):
+ self.push('newlocation', 1)
+
+ def _end_newlocation(self):
+ url = self.pop('newlocation')
+ context = self._getContext()
+ # don't set newlocation if the context isn't right
+ if context is not self.feeddata:
+ return
+ context['newlocation'] = _makeSafeAbsoluteURI(self.baseuri, url.strip())
+
+if _XML_AVAILABLE:
+ class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):
+ def __init__(self, baseuri, baselang, encoding):
+ xml.sax.handler.ContentHandler.__init__(self)
+ _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
+ self.bozo = 0
+ self.exc = None
+ self.decls = {}
+
+ def startPrefixMapping(self, prefix, uri):
+ if not uri:
+ return
+ # Jython uses '' instead of None; standardize on None
+ prefix = prefix or None
+ self.trackNamespace(prefix, uri)
+ if prefix and uri == 'http://www.w3.org/1999/xlink':
+ self.decls['xmlns:' + prefix] = uri
+
+ def startElementNS(self, name, qname, attrs):
+ namespace, localname = name
+ lowernamespace = str(namespace or '').lower()
+ if lowernamespace.find(u'backend.userland.com/rss') <> -1:
+ # match any backend.userland.com namespace
+ namespace = u'http://backend.userland.com/rss'
+ lowernamespace = namespace
+ if qname and qname.find(':') > 0:
+ givenprefix = qname.split(':')[0]
+ else:
+ givenprefix = None
+ prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
+ if givenprefix and (prefix == None or (prefix == '' and lowernamespace == '')) and not self.namespacesInUse.has_key(givenprefix):
+ raise UndeclaredNamespace, "'%s' is not associated with a namespace" % givenprefix
+ localname = str(localname).lower()
+
+ # qname implementation is horribly broken in Python 2.1 (it
+ # doesn't report any), and slightly broken in Python 2.2 (it
+ # doesn't report the xml: namespace). So we match up namespaces
+ # with a known list first, and then possibly override them with
+ # the qnames the SAX parser gives us (if indeed it gives us any
+ # at all). Thanks to MatejC for helping me test this and
+ # tirelessly telling me that it didn't work yet.
+ attrsD, self.decls = self.decls, {}
+ if localname=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
+ attrsD['xmlns']=namespace
+ if localname=='svg' and namespace=='http://www.w3.org/2000/svg':
+ attrsD['xmlns']=namespace
+
+ if prefix:
+ localname = prefix.lower() + ':' + localname
+ elif namespace and not qname: #Expat
+ for name,value in self.namespacesInUse.items():
+ if name and value == namespace:
+ localname = name + ':' + localname
+ break
+
+ for (namespace, attrlocalname), attrvalue in attrs.items():
+ lowernamespace = (namespace or '').lower()
+ prefix = self._matchnamespaces.get(lowernamespace, '')
+ if prefix:
+ attrlocalname = prefix + ':' + attrlocalname
+ attrsD[str(attrlocalname).lower()] = attrvalue
+ for qname in attrs.getQNames():
+ attrsD[str(qname).lower()] = attrs.getValueByQName(qname)
+ self.unknown_starttag(localname, attrsD.items())
+
+ def characters(self, text):
+ self.handle_data(text)
+
+ def endElementNS(self, name, qname):
+ namespace, localname = name
+ lowernamespace = str(namespace or '').lower()
+ if qname and qname.find(':') > 0:
+ givenprefix = qname.split(':')[0]
+ else:
+ givenprefix = ''
+ prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
+ if prefix:
+ localname = prefix + ':' + localname
+ elif namespace and not qname: #Expat
+ for name,value in self.namespacesInUse.items():
+ if name and value == namespace:
+ localname = name + ':' + localname
+ break
+ localname = str(localname).lower()
+ self.unknown_endtag(localname)
+
+ def error(self, exc):
+ self.bozo = 1
+ self.exc = exc
+
+ # drv_libxml2 calls warning() in some cases
+ warning = error
+
+ def fatalError(self, exc):
+ self.error(exc)
+ raise exc
+
+class _BaseHTMLProcessor(sgmllib.SGMLParser):
+ special = re.compile('''[<>'"]''')
+ bare_ampersand = re.compile("&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")
+ elements_no_end_tag = [
+ 'area', 'base', 'basefont', 'br', 'col', 'command', 'embed', 'frame',
+ 'hr', 'img', 'input', 'isindex', 'keygen', 'link', 'meta', 'param',
+ 'source', 'track', 'wbr'
+ ]
+
+ def __init__(self, encoding, _type):
+ self.encoding = encoding
+ self._type = _type
+ sgmllib.SGMLParser.__init__(self)
+
+ def reset(self):
+ self.pieces = []
+ sgmllib.SGMLParser.reset(self)
+
+ def _shorttag_replace(self, match):
+ tag = match.group(1)
+ if tag in self.elements_no_end_tag:
+ return '<' + tag + ' />'
+ else:
+ return '<' + tag + '></' + tag + '>'
+
+ # By declaring these methods and overriding their compiled code
+ # with the code from sgmllib, the original code will execute in
+ # feedparser's scope instead of sgmllib's. This means that the
+ # `tagfind` and `charref` regular expressions will be found as
+ # they're declared above, not as they're declared in sgmllib.
+ def goahead(self, i):
+ pass
+ goahead.func_code = sgmllib.SGMLParser.goahead.func_code
+
+ def __parse_starttag(self, i):
+ pass
+ __parse_starttag.func_code = sgmllib.SGMLParser.parse_starttag.func_code
+
+ def parse_starttag(self,i):
+ j = self.__parse_starttag(i)
+ if self._type == 'application/xhtml+xml':
+ if j>2 and self.rawdata[j-2:j]=='/>':
+ self.unknown_endtag(self.lasttag)
+ return j
+
+ def feed(self, data):
+ data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'<!\1', data)
+ #data = re.sub(r'<(\S+?)\s*?/>', self._shorttag_replace, data) # bug [ 1399464 ] Bad regexp for _shorttag_replace
+ data = re.sub(r'<([^<>\s]+?)\s*/>', self._shorttag_replace, data)
+ data = data.replace(''', "'")
+ data = data.replace('"', '"')
+ try:
+ bytes
+ if bytes is str:
+ raise NameError
+ self.encoding = self.encoding + u'_INVALID_PYTHON_3'
+ except NameError:
+ if self.encoding and isinstance(data, unicode):
+ data = data.encode(self.encoding)
+ sgmllib.SGMLParser.feed(self, data)
+ sgmllib.SGMLParser.close(self)
+
+ def normalize_attrs(self, attrs):
+ if not attrs:
+ return attrs
+ # utility method to be called by descendants
+ attrs = dict([(k.lower(), v) for k, v in attrs]).items()
+ attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
+ attrs.sort()
+ return attrs
+
+ def unknown_starttag(self, tag, attrs):
+ # called for each start tag
+ # attrs is a list of (attr, value) tuples
+ # e.g. for <pre class='screen'>, tag='pre', attrs=[('class', 'screen')]
+ uattrs = []
+ strattrs=''
+ if attrs:
+ for key, value in attrs:
+ value=value.replace('>','>').replace('<','<').replace('"','"')
+ value = self.bare_ampersand.sub("&", value)
+ # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
+ if not isinstance(value, unicode):
+ value = value.decode(self.encoding, 'ignore')
+ try:
+ # Currently, in Python 3 the key is already a str, and cannot be decoded again
+ uattrs.append((unicode(key, self.encoding), value))
+ except TypeError:
+ uattrs.append((key, value))
+ strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs])
+ if self.encoding:
+ try:
+ strattrs = strattrs.encode(self.encoding)
+ except (UnicodeEncodeError, LookupError):
+ pass
+ if tag in self.elements_no_end_tag:
+ self.pieces.append('<%(tag)s%(strattrs)s />' % locals())
+ else:
+ self.pieces.append('<%(tag)s%(strattrs)s>' % locals())
+
+ def unknown_endtag(self, tag):
+ # called for each end tag, e.g. for </pre>, tag will be 'pre'
+ # Reconstruct the original end tag.
+ if tag not in self.elements_no_end_tag:
+ self.pieces.append("</%(tag)s>" % locals())
+
+ def handle_charref(self, ref):
+ # called for each character reference, e.g. for ' ', ref will be '160'
+ # Reconstruct the original character reference.
+ if ref.startswith('x'):
+ value = unichr(int(ref[1:],16))
+ else:
+ value = unichr(int(ref))
+
+ if value in _cp1252.keys():
+ self.pieces.append('&#%s;' % hex(ord(_cp1252[value]))[1:])
+ else:
+ self.pieces.append('&#%(ref)s;' % locals())
+
+ def handle_entityref(self, ref):
+ # called for each entity reference, e.g. for '©', ref will be 'copy'
+ # Reconstruct the original entity reference.
+ if name2codepoint.has_key(ref):
+ self.pieces.append('&%(ref)s;' % locals())
+ else:
+ self.pieces.append('&%(ref)s' % locals())
+
+ def handle_data(self, text):
+ # called for each block of plain text, i.e. outside of any tag and
+ # not containing any character or entity references
+ # Store the original text verbatim.
+ self.pieces.append(text)
+
+ def handle_comment(self, text):
+ # called for each HTML comment, e.g. <!-- insert Javascript code here -->
+ # Reconstruct the original comment.
+ self.pieces.append('<!--%(text)s-->' % locals())
+
+ def handle_pi(self, text):
+ # called for each processing instruction, e.g. <?instruction>
+ # Reconstruct original processing instruction.
+ self.pieces.append('<?%(text)s>' % locals())
+
+ def handle_decl(self, text):
+ # called for the DOCTYPE, if present, e.g.
+ # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
+ # "http://www.w3.org/TR/html4/loose.dtd">
+ # Reconstruct original DOCTYPE
+ self.pieces.append('<!%(text)s>' % locals())
+
+ _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
+ def _scan_name(self, i, declstartpos):
+ rawdata = self.rawdata
+ n = len(rawdata)
+ if i == n:
+ return None, -1
+ m = self._new_declname_match(rawdata, i)
+ if m:
+ s = m.group()
+ name = s.strip()
+ if (i + len(s)) == n:
+ return None, -1 # end of buffer
+ return name.lower(), m.end()
+ else:
+ self.handle_data(rawdata)
+# self.updatepos(declstartpos, i)
+ return None, -1
+
+ def convert_charref(self, name):
+ return '&#%s;' % name
+
+ def convert_entityref(self, name):
+ return '&%s;' % name
+
+ def output(self):
+ '''Return processed HTML as a single string'''
+ return ''.join([str(p) for p in self.pieces])
+
+ def parse_declaration(self, i):
+ try:
+ return sgmllib.SGMLParser.parse_declaration(self, i)
+ except sgmllib.SGMLParseError:
+ # escape the doctype declaration and continue parsing
+ self.handle_data('<')
+ return i+1
+
+class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):
+ def __init__(self, baseuri, baselang, encoding, entities):
+ sgmllib.SGMLParser.__init__(self)
+ _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
+ _BaseHTMLProcessor.__init__(self, encoding, 'application/xhtml+xml')
+ self.entities=entities
+
+ def decodeEntities(self, element, data):
+ data = data.replace('<', '<')
+ data = data.replace('<', '<')
+ data = data.replace('<', '<')
+ data = data.replace('>', '>')
+ data = data.replace('>', '>')
+ data = data.replace('>', '>')
+ data = data.replace('&', '&')
+ data = data.replace('&', '&')
+ data = data.replace('"', '"')
+ data = data.replace('"', '"')
+ data = data.replace(''', ''')
+ data = data.replace(''', ''')
+ if self.contentparams.has_key('type') and not self.contentparams.get('type', u'xml').endswith(u'xml'):
+ data = data.replace('<', '<')
+ data = data.replace('>', '>')
+ data = data.replace('&', '&')
+ data = data.replace('"', '"')
+ data = data.replace(''', "'")
+ return data
+
+ def strattrs(self, attrs):
+ return ''.join([' %s="%s"' % (n,v.replace('"','"')) for n,v in attrs])
+
+class _MicroformatsParser:
+ STRING = 1
+ DATE = 2
+ URI = 3
+ NODE = 4
+ EMAIL = 5
+
+ known_xfn_relationships = ['contact', 'acquaintance', 'friend', 'met', 'co-worker', 'coworker', 'colleague', 'co-resident', 'coresident', 'neighbor', 'child', 'parent', 'sibling', 'brother', 'sister', 'spouse', 'wife', 'husband', 'kin', 'relative', 'muse', 'crush', 'date', 'sweetheart', 'me']
+ known_binary_extensions = ['zip','rar','exe','gz','tar','tgz','tbz2','bz2','z','7z','dmg','img','sit','sitx','hqx','deb','rpm','bz2','jar','rar','iso','bin','msi','mp2','mp3','ogg','ogm','mp4','m4v','m4a','avi','wma','wmv']
+
+ def __init__(self, data, baseuri, encoding):
+ self.document = BeautifulSoup.BeautifulSoup(data)
+ self.baseuri = baseuri
+ self.encoding = encoding
+ if isinstance(data, unicode):
+ data = data.encode(encoding)
+ self.tags = []
+ self.enclosures = []
+ self.xfn = []
+ self.vcard = None
+
+ def vcardEscape(self, s):
+ if isinstance(s, basestring):
+ s = s.replace(',', '\\,').replace(';', '\\;').replace('\n', '\\n')
+ return s
+
+ def vcardFold(self, s):
+ s = re.sub(';+$', '', s)
+ sFolded = ''
+ iMax = 75
+ sPrefix = ''
+ while len(s) > iMax:
+ sFolded += sPrefix + s[:iMax] + '\n'
+ s = s[iMax:]
+ sPrefix = ' '
+ iMax = 74
+ sFolded += sPrefix + s
+ return sFolded
+
+ def normalize(self, s):
+ return re.sub(r'\s+', ' ', s).strip()
+
+ def unique(self, aList):
+ results = []
+ for element in aList:
+ if element not in results:
+ results.append(element)
+ return results
+
+ def toISO8601(self, dt):
+ return time.strftime('%Y-%m-%dT%H:%M:%SZ', dt)
+
+ def getPropertyValue(self, elmRoot, sProperty, iPropertyType=4, bAllowMultiple=0, bAutoEscape=0):
+ all = lambda x: 1
+ sProperty = sProperty.lower()
+ bFound = 0
+ bNormalize = 1
+ propertyMatch = {'class': re.compile(r'\b%s\b' % sProperty)}
+ if bAllowMultiple and (iPropertyType != self.NODE):
+ snapResults = []
+ containers = elmRoot(['ul', 'ol'], propertyMatch)
+ for container in containers:
+ snapResults.extend(container('li'))
+ bFound = (len(snapResults) != 0)
+ if not bFound:
+ snapResults = elmRoot(all, propertyMatch)
+ bFound = (len(snapResults) != 0)
+ if (not bFound) and (sProperty == 'value'):
+ snapResults = elmRoot('pre')
+ bFound = (len(snapResults) != 0)
+ bNormalize = not bFound
+ if not bFound:
+ snapResults = [elmRoot]
+ bFound = (len(snapResults) != 0)
+ arFilter = []
+ if sProperty == 'vcard':
+ snapFilter = elmRoot(all, propertyMatch)
+ for node in snapFilter:
+ if node.findParent(all, propertyMatch):
+ arFilter.append(node)
+ arResults = []
+ for node in snapResults:
+ if node not in arFilter:
+ arResults.append(node)
+ bFound = (len(arResults) != 0)
+ if not bFound:
+ if bAllowMultiple:
+ return []
+ elif iPropertyType == self.STRING:
+ return ''
+ elif iPropertyType == self.DATE:
+ return None
+ elif iPropertyType == self.URI:
+ return ''
+ elif iPropertyType == self.NODE:
+ return None
+ else:
+ return None
+ arValues = []
+ for elmResult in arResults:
+ sValue = None
+ if iPropertyType == self.NODE:
+ if bAllowMultiple:
+ arValues.append(elmResult)
+ continue
+ else:
+ return elmResult
+ sNodeName = elmResult.name.lower()
+ if (iPropertyType == self.EMAIL) and (sNodeName == 'a'):
+ sValue = (elmResult.get('href') or '').split('mailto:').pop().split('?')[0]
+ if sValue:
+ sValue = bNormalize and self.normalize(sValue) or sValue.strip()
+ if (not sValue) and (sNodeName == 'abbr'):
+ sValue = elmResult.get('title')
+ if sValue:
+ sValue = bNormalize and self.normalize(sValue) or sValue.strip()
+ if (not sValue) and (iPropertyType == self.URI):
+ if sNodeName == 'a':
+ sValue = elmResult.get('href')
+ elif sNodeName == 'img':
+ sValue = elmResult.get('src')
+ elif sNodeName == 'object':
+ sValue = elmResult.get('data')
+ if sValue:
+ sValue = bNormalize and self.normalize(sValue) or sValue.strip()
+ if (not sValue) and (sNodeName == 'img'):
+ sValue = elmResult.get('alt')
+ if sValue:
+ sValue = bNormalize and self.normalize(sValue) or sValue.strip()
+ if not sValue:
+ sValue = elmResult.renderContents()
+ sValue = re.sub(r'<\S[^>]*>', '', sValue)
+ sValue = sValue.replace('\r\n', '\n')
+ sValue = sValue.replace('\r', '\n')
+ if sValue:
+ sValue = bNormalize and self.normalize(sValue) or sValue.strip()
+ if not sValue:
+ continue
+ if iPropertyType == self.DATE:
+ sValue = _parse_date_iso8601(sValue)
+ if bAllowMultiple:
+ arValues.append(bAutoEscape and self.vcardEscape(sValue) or sValue)
+ else:
+ return bAutoEscape and self.vcardEscape(sValue) or sValue
+ return arValues
+
+ def findVCards(self, elmRoot, bAgentParsing=0):
+ sVCards = ''
+
+ if not bAgentParsing:
+ arCards = self.getPropertyValue(elmRoot, 'vcard', bAllowMultiple=1)
+ else:
+ arCards = [elmRoot]
+
+ for elmCard in arCards:
+ arLines = []
+
+ def processSingleString(sProperty):
+ sValue = self.getPropertyValue(elmCard, sProperty, self.STRING, bAutoEscape=1).decode(self.encoding)
+ if sValue:
+ arLines.append(self.vcardFold(sProperty.upper() + ':' + sValue))
+ return sValue or u''
+
+ def processSingleURI(sProperty):
+ sValue = self.getPropertyValue(elmCard, sProperty, self.URI)
+ if sValue:
+ sContentType = ''
+ sEncoding = ''
+ sValueKey = ''
+ if sValue.startswith('data:'):
+ sEncoding = ';ENCODING=b'
+ sContentType = sValue.split(';')[0].split('/').pop()
+ sValue = sValue.split(',', 1).pop()
+ else:
+ elmValue = self.getPropertyValue(elmCard, sProperty)
+ if elmValue:
+ if sProperty != 'url':
+ sValueKey = ';VALUE=uri'
+ sContentType = elmValue.get('type', '').strip().split('/').pop().strip()
+ sContentType = sContentType.upper()
+ if sContentType == 'OCTET-STREAM':
+ sContentType = ''
+ if sContentType:
+ sContentType = ';TYPE=' + sContentType.upper()
+ arLines.append(self.vcardFold(sProperty.upper() + sEncoding + sContentType + sValueKey + ':' + sValue))
+
+ def processTypeValue(sProperty, arDefaultType, arForceType=None):
+ arResults = self.getPropertyValue(elmCard, sProperty, bAllowMultiple=1)
+ for elmResult in arResults:
+ arType = self.getPropertyValue(elmResult, 'type', self.STRING, 1, 1)
+ if arForceType:
+ arType = self.unique(arForceType + arType)
+ if not arType:
+ arType = arDefaultType
+ sValue = self.getPropertyValue(elmResult, 'value', self.EMAIL, 0)
+ if sValue:
+ arLines.append(self.vcardFold(sProperty.upper() + ';TYPE=' + ','.join(arType) + ':' + sValue))
+
+ # AGENT
+ # must do this before all other properties because it is destructive
+ # (removes nested class="vcard" nodes so they don't interfere with
+ # this vcard's other properties)
+ arAgent = self.getPropertyValue(elmCard, 'agent', bAllowMultiple=1)
+ for elmAgent in arAgent:
+ if re.compile(r'\bvcard\b').search(elmAgent.get('class')):
+ sAgentValue = self.findVCards(elmAgent, 1) + '\n'
+ sAgentValue = sAgentValue.replace('\n', '\\n')
+ sAgentValue = sAgentValue.replace(';', '\\;')
+ if sAgentValue:
+ arLines.append(self.vcardFold('AGENT:' + sAgentValue))
+ # Completely remove the agent element from the parse tree
+ elmAgent.extract()
+ else:
+ sAgentValue = self.getPropertyValue(elmAgent, 'value', self.URI, bAutoEscape=1);
+ if sAgentValue:
+ arLines.append(self.vcardFold('AGENT;VALUE=uri:' + sAgentValue))
+
+ # FN (full name)
+ sFN = processSingleString('fn')
+
+ # N (name)
+ elmName = self.getPropertyValue(elmCard, 'n')
+ if elmName:
+ sFamilyName = self.getPropertyValue(elmName, 'family-name', self.STRING, bAutoEscape=1)
+ sGivenName = self.getPropertyValue(elmName, 'given-name', self.STRING, bAutoEscape=1)
+ arAdditionalNames = self.getPropertyValue(elmName, 'additional-name', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'additional-names', self.STRING, 1, 1)
+ arHonorificPrefixes = self.getPropertyValue(elmName, 'honorific-prefix', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'honorific-prefixes', self.STRING, 1, 1)
+ arHonorificSuffixes = self.getPropertyValue(elmName, 'honorific-suffix', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'honorific-suffixes', self.STRING, 1, 1)
+ arLines.append(self.vcardFold('N:' + sFamilyName + ';' +
+ sGivenName + ';' +
+ ','.join(arAdditionalNames) + ';' +
+ ','.join(arHonorificPrefixes) + ';' +
+ ','.join(arHonorificSuffixes)))
+ elif sFN:
+ # implied "N" optimization
+ # http://microformats.org/wiki/hcard#Implied_.22N.22_Optimization
+ arNames = self.normalize(sFN).split()
+ if len(arNames) == 2:
+ bFamilyNameFirst = (arNames[0].endswith(',') or
+ len(arNames[1]) == 1 or
+ ((len(arNames[1]) == 2) and (arNames[1].endswith('.'))))
+ if bFamilyNameFirst:
+ arLines.append(self.vcardFold('N:' + arNames[0] + ';' + arNames[1]))
+ else:
+ arLines.append(self.vcardFold('N:' + arNames[1] + ';' + arNames[0]))
+
+ # SORT-STRING
+ sSortString = self.getPropertyValue(elmCard, 'sort-string', self.STRING, bAutoEscape=1)
+ if sSortString:
+ arLines.append(self.vcardFold('SORT-STRING:' + sSortString))
+
+ # NICKNAME
+ arNickname = self.getPropertyValue(elmCard, 'nickname', self.STRING, 1, 1)
+ if arNickname:
+ arLines.append(self.vcardFold('NICKNAME:' + ','.join(arNickname)))
+
+ # PHOTO
+ processSingleURI('photo')
+
+ # BDAY
+ dtBday = self.getPropertyValue(elmCard, 'bday', self.DATE)
+ if dtBday:
+ arLines.append(self.vcardFold('BDAY:' + self.toISO8601(dtBday)))
+
+ # ADR (address)
+ arAdr = self.getPropertyValue(elmCard, 'adr', bAllowMultiple=1)
+ for elmAdr in arAdr:
+ arType = self.getPropertyValue(elmAdr, 'type', self.STRING, 1, 1)
+ if not arType:
+ arType = ['intl','postal','parcel','work'] # default adr types, see RFC 2426 section 3.2.1
+ sPostOfficeBox = self.getPropertyValue(elmAdr, 'post-office-box', self.STRING, 0, 1)
+ sExtendedAddress = self.getPropertyValue(elmAdr, 'extended-address', self.STRING, 0, 1)
+ sStreetAddress = self.getPropertyValue(elmAdr, 'street-address', self.STRING, 0, 1)
+ sLocality = self.getPropertyValue(elmAdr, 'locality', self.STRING, 0, 1)
+ sRegion = self.getPropertyValue(elmAdr, 'region', self.STRING, 0, 1)
+ sPostalCode = self.getPropertyValue(elmAdr, 'postal-code', self.STRING, 0, 1)
+ sCountryName = self.getPropertyValue(elmAdr, 'country-name', self.STRING, 0, 1)
+ arLines.append(self.vcardFold('ADR;TYPE=' + ','.join(arType) + ':' +
+ sPostOfficeBox + ';' +
+ sExtendedAddress + ';' +
+ sStreetAddress + ';' +
+ sLocality + ';' +
+ sRegion + ';' +
+ sPostalCode + ';' +
+ sCountryName))
+
+ # LABEL
+ processTypeValue('label', ['intl','postal','parcel','work'])
+
+ # TEL (phone number)
+ processTypeValue('tel', ['voice'])
+
+ # EMAIL
+ processTypeValue('email', ['internet'], ['internet'])
+
+ # MAILER
+ processSingleString('mailer')
+
+ # TZ (timezone)
+ processSingleString('tz')
+
+ # GEO (geographical information)
+ elmGeo = self.getPropertyValue(elmCard, 'geo')
+ if elmGeo:
+ sLatitude = self.getPropertyValue(elmGeo, 'latitude', self.STRING, 0, 1)
+ sLongitude = self.getPropertyValue(elmGeo, 'longitude', self.STRING, 0, 1)
+ arLines.append(self.vcardFold('GEO:' + sLatitude + ';' + sLongitude))
+
+ # TITLE
+ processSingleString('title')
+
+ # ROLE
+ processSingleString('role')
+
+ # LOGO
+ processSingleURI('logo')
+
+ # ORG (organization)
+ elmOrg = self.getPropertyValue(elmCard, 'org')
+ if elmOrg:
+ sOrganizationName = self.getPropertyValue(elmOrg, 'organization-name', self.STRING, 0, 1)
+ if not sOrganizationName:
+ # implied "organization-name" optimization
+ # http://microformats.org/wiki/hcard#Implied_.22organization-name.22_Optimization
+ sOrganizationName = self.getPropertyValue(elmCard, 'org', self.STRING, 0, 1)
+ if sOrganizationName:
+ arLines.append(self.vcardFold('ORG:' + sOrganizationName))
+ else:
+ arOrganizationUnit = self.getPropertyValue(elmOrg, 'organization-unit', self.STRING, 1, 1)
+ arLines.append(self.vcardFold('ORG:' + sOrganizationName + ';' + ';'.join(arOrganizationUnit)))
+
+ # CATEGORY
+ arCategory = self.getPropertyValue(elmCard, 'category', self.STRING, 1, 1) + self.getPropertyValue(elmCard, 'categories', self.STRING, 1, 1)
+ if arCategory:
+ arLines.append(self.vcardFold('CATEGORIES:' + ','.join(arCategory)))
+
+ # NOTE
+ processSingleString('note')
+
+ # REV
+ processSingleString('rev')
+
+ # SOUND
+ processSingleURI('sound')
+
+ # UID
+ processSingleString('uid')
+
+ # URL
+ processSingleURI('url')
+
+ # CLASS
+ processSingleString('class')
+
+ # KEY
+ processSingleURI('key')
+
+ if arLines:
+ arLines = [u'BEGIN:vCard',u'VERSION:3.0'] + arLines + [u'END:vCard']
+ # XXX - this is super ugly; properly fix this with issue 148
+ for i, s in enumerate(arLines):
+ if not isinstance(s, unicode):
+ arLines[i] = s.decode('utf-8', 'ignore')
+ sVCards += u'\n'.join(arLines) + u'\n'
+
+ return sVCards.strip()
+
+ def isProbablyDownloadable(self, elm):
+ attrsD = elm.attrMap
+ if not attrsD.has_key('href'):
+ return 0
+ linktype = attrsD.get('type', '').strip()
+ if linktype.startswith('audio/') or \
+ linktype.startswith('video/') or \
+ (linktype.startswith('application/') and not linktype.endswith('xml')):
+ return 1
+ path = urlparse.urlparse(attrsD['href'])[2]
+ if path.find('.') == -1:
+ return 0
+ fileext = path.split('.').pop().lower()
+ return fileext in self.known_binary_extensions
+
+ def findTags(self):
+ all = lambda x: 1
+ for elm in self.document(all, {'rel': re.compile(r'\btag\b')}):
+ href = elm.get('href')
+ if not href:
+ continue
+ urlscheme, domain, path, params, query, fragment = \
+ urlparse.urlparse(_urljoin(self.baseuri, href))
+ segments = path.split('/')
+ tag = segments.pop()
+ if not tag:
+ if segments:
+ tag = segments.pop()
+ else:
+ # there are no tags
+ continue
+ tagscheme = urlparse.urlunparse((urlscheme, domain, '/'.join(segments), '', '', ''))
+ if not tagscheme.endswith('/'):
+ tagscheme += '/'
+ self.tags.append(FeedParserDict({"term": tag, "scheme": tagscheme, "label": elm.string or ''}))
+
+ def findEnclosures(self):
+ all = lambda x: 1
+ enclosure_match = re.compile(r'\benclosure\b')
+ for elm in self.document(all, {'href': re.compile(r'.+')}):
+ if not enclosure_match.search(elm.get('rel', u'')) and not self.isProbablyDownloadable(elm):
+ continue
+ if elm.attrMap not in self.enclosures:
+ self.enclosures.append(elm.attrMap)
+ if elm.string and not elm.get('title'):
+ self.enclosures[-1]['title'] = elm.string
+
+ def findXFN(self):
+ all = lambda x: 1
+ for elm in self.document(all, {'rel': re.compile('.+'), 'href': re.compile('.+')}):
+ rels = elm.get('rel', u'').split()
+ xfn_rels = []
+ for rel in rels:
+ if rel in self.known_xfn_relationships:
+ xfn_rels.append(rel)
+ if xfn_rels:
+ self.xfn.append({"relationships": xfn_rels, "href": elm.get('href', ''), "name": elm.string})
+
def _parseMicroformats(htmlSource, baseURI, encoding):
    """Extract microformats (tags, enclosures, XFN, hCard) from HTML.

    Returns a dict with keys 'tags', 'enclosures', 'xfn', and 'vcard',
    or None when BeautifulSoup is unavailable or the source can't be
    parsed.
    """
    if not BeautifulSoup:
        return
    try:
        parser = _MicroformatsParser(htmlSource, baseURI, encoding)
    except UnicodeEncodeError:
        # sgmllib throws this exception when performing lookups of tags
        # with non-ASCII characters in them.
        return
    parser.vcard = parser.findVCards(parser.document)
    parser.findTags()
    parser.findEnclosures()
    parser.findXFN()
    return {"tags": parser.tags, "enclosures": parser.enclosures, "xfn": parser.xfn, "vcard": parser.vcard}
+
class _RelativeURIResolver(_BaseHTMLProcessor):
    """HTML rewriter that resolves relative URIs against a base URI.

    For each (tag, attribute) pair listed in `relative_uris`, the
    attribute value is joined with the base URI and filtered through
    _makeSafeAbsoluteURI, which may return u'' for unacceptable schemes.
    """

    # (tag, attribute) pairs whose values are URIs and must be resolved.
    relative_uris = [('a', 'href'),
                     ('applet', 'codebase'),
                     ('area', 'href'),
                     ('blockquote', 'cite'),
                     ('body', 'background'),
                     ('del', 'cite'),
                     ('form', 'action'),
                     ('frame', 'longdesc'),
                     ('frame', 'src'),
                     ('iframe', 'longdesc'),
                     ('iframe', 'src'),
                     ('head', 'profile'),
                     ('img', 'longdesc'),
                     ('img', 'src'),
                     ('img', 'usemap'),
                     ('input', 'src'),
                     ('input', 'usemap'),
                     ('ins', 'cite'),
                     ('link', 'href'),
                     ('object', 'classid'),
                     ('object', 'codebase'),
                     ('object', 'data'),
                     ('object', 'usemap'),
                     ('q', 'cite'),
                     ('script', 'src')]

    def __init__(self, baseuri, encoding, _type):
        _BaseHTMLProcessor.__init__(self, encoding, _type)
        self.baseuri = baseuri

    def resolveURI(self, uri):
        # May return u'' when the resolved URI uses an unacceptable scheme.
        return _makeSafeAbsoluteURI(_urljoin(self.baseuri, uri.strip()))

    def unknown_starttag(self, tag, attrs):
        attrs = self.normalize_attrs(attrs)
        # BUGFIX: the previous `cond and resolveURI(value) or value` idiom
        # fell back to the ORIGINAL, unfiltered value whenever resolveURI()
        # returned the falsy u'' (i.e. exactly when the scheme was rejected),
        # defeating the URI-scheme filter.  A real conditional keeps u''.
        attrs = [(key, self.resolveURI(value) if (tag, key) in self.relative_uris else value)
                 for key, value in attrs]
        _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
+
def _resolveRelativeURIs(htmlSource, baseURI, encoding, _type):
    """Return htmlSource with its relative URIs resolved against baseURI.

    Falls back to returning the source unchanged when no SGML parser
    is available.
    """
    if not _SGML_AVAILABLE:
        return htmlSource

    resolver = _RelativeURIResolver(baseURI, encoding, _type)
    resolver.feed(htmlSource)
    return resolver.output()
+
def _makeSafeAbsoluteURI(base, rel=None):
    """Join `rel` onto `base` and return the result only if its scheme is
    in ACCEPTABLE_URI_SCHEMES; otherwise return u''.

    With an empty ACCEPTABLE_URI_SCHEMES list, no filtering is applied.
    """
    # bail if ACCEPTABLE_URI_SCHEMES is empty
    if not ACCEPTABLE_URI_SCHEMES:
        return _urljoin(base, rel or u'')
    if not base:
        return rel or u''
    if not rel:
        # no relative part: vet the base URI's scheme directly
        scheme = urlparse.urlparse(base)[0]
        if not scheme or scheme in ACCEPTABLE_URI_SCHEMES:
            return base
        return u''
    uri = _urljoin(base, rel)
    scheme = uri.strip().split(':', 1)[0]
    return uri if scheme in ACCEPTABLE_URI_SCHEMES else u''
+
class _HTMLSanitizer(_BaseHTMLProcessor):
    """Whitelist-based HTML sanitizer.

    Every element, attribute, CSS property, and value not explicitly
    listed below is dropped.  Content inside blacklisted containers
    (script/applet/style) is suppressed entirely.  Inline MathML and SVG
    islands are tracked with the mathmlOK/svgOK nesting counters set up
    in reset().
    """

    # HTML elements allowed to pass through the sanitizer.
    acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area',
        'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button',
        'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
        'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',
        'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',
        'figcaption', 'figure', 'footer', 'font', 'form', 'header', 'h1',
        'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins',
        'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter',
        'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option',
        'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
        'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',
        'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',
        'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video', 'noscript']

    # HTML attributes allowed on acceptable elements.
    acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
        'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis',
        'background', 'balance', 'bgcolor', 'bgproperties', 'border',
        'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding',
        'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff',
        'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color', 'cols',
        'colspan', 'compact', 'contenteditable', 'controls', 'coords', 'data',
        'datafld', 'datapagesize', 'datasrc', 'datetime', 'default', 'delay',
        'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end', 'face', 'for',
        'form', 'frame', 'galleryimg', 'gutter', 'headers', 'height', 'hidefocus',
        'hidden', 'high', 'href', 'hreflang', 'hspace', 'icon', 'id', 'inputmode',
        'ismap', 'keytype', 'label', 'leftspacing', 'lang', 'list', 'longdesc',
        'loop', 'loopcount', 'loopend', 'loopstart', 'low', 'lowsrc', 'max',
        'maxlength', 'media', 'method', 'min', 'multiple', 'name', 'nohref',
        'noshade', 'nowrap', 'open', 'optimum', 'pattern', 'ping', 'point-size',
        'prompt', 'pqg', 'radiogroup', 'readonly', 'rel', 'repeat-max',
        'repeat-min', 'replace', 'required', 'rev', 'rightspacing', 'rows',
        'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', 'span', 'src',
        'start', 'step', 'summary', 'suppress', 'tabindex', 'target', 'template',
        'title', 'toppadding', 'type', 'unselectable', 'usemap', 'urn', 'valign',
        'value', 'variable', 'volume', 'vspace', 'vrml', 'width', 'wrap',
        'xml:lang']

    # Elements whose entire content is suppressed (tracked via
    # self.unacceptablestack in handle_data()).
    unacceptable_elements_with_end_tag = ['script', 'applet', 'style']

    # CSS properties allowed through sanitize_style().
    acceptable_css_properties = ['azimuth', 'background-color',
        'border-bottom-color', 'border-collapse', 'border-color',
        'border-left-color', 'border-right-color', 'border-top-color', 'clear',
        'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
        'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
        'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
        'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
        'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
        'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
        'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
        'white-space', 'width']

    # survey of common keywords found in feeds
    acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue',
        'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
        'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
        'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
        'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
        'transparent', 'underline', 'white', 'yellow']

    # Matches simple CSS values: hex colors, rgb() triplets, and numbers
    # with an optional unit suffix.
    valid_css_values = re.compile('^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|' +
        '\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$')

    # MathML elements allowed inside an inline <math> island.
    mathml_elements = ['annotation', 'annotation-xml', 'maction', 'math',
        'merror', 'mfenced', 'mfrac', 'mi', 'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded',
        'mphantom', 'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle',
        'msub', 'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
        'munderover', 'none', 'semantics']

    # MathML attributes allowed on mathml_elements.
    mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
        'columnalign', 'close', 'columnlines', 'columnspacing', 'columnspan', 'depth',
        'display', 'displaystyle', 'encoding', 'equalcolumns', 'equalrows',
        'fence', 'fontstyle', 'fontweight', 'frame', 'height', 'linethickness',
        'lspace', 'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant',
        'maxsize', 'minsize', 'open', 'other', 'rowalign', 'rowalign', 'rowalign',
        'rowlines', 'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
        'separator', 'separators', 'stretchy', 'width', 'width', 'xlink:href',
        'xlink:show', 'xlink:type', 'xmlns', 'xmlns:xlink']

    # svgtiny - foreignObject + linearGradient + radialGradient + stop
    svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
        'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'foreignObject',
        'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
        'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph', 'mpath',
        'path', 'polygon', 'polyline', 'radialGradient', 'rect', 'set', 'stop',
        'svg', 'switch', 'text', 'title', 'tspan', 'use']

    # svgtiny + class + opacity + offset + xmlns + xmlns:xlink
    svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
        'arabic-form', 'ascent', 'attributeName', 'attributeType',
        'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
        'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx',
        'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-opacity',
        'fill-rule', 'font-family', 'font-size', 'font-stretch', 'font-style',
        'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2',
        'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x',
        'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints', 'keySplines',
        'keyTimes', 'lang', 'mathematical', 'marker-end', 'marker-mid',
        'marker-start', 'markerHeight', 'markerUnits', 'markerWidth', 'max',
        'min', 'name', 'offset', 'opacity', 'orient', 'origin',
        'overline-position', 'overline-thickness', 'panose-1', 'path',
        'pathLength', 'points', 'preserveAspectRatio', 'r', 'refX', 'refY',
        'repeatCount', 'repeatDur', 'requiredExtensions', 'requiredFeatures',
        'restart', 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv',
        'stop-color', 'stop-opacity', 'strikethrough-position',
        'strikethrough-thickness', 'stroke', 'stroke-dasharray',
        'stroke-dashoffset', 'stroke-linecap', 'stroke-linejoin',
        'stroke-miterlimit', 'stroke-opacity', 'stroke-width', 'systemLanguage',
        'target', 'text-anchor', 'to', 'transform', 'type', 'u1', 'u2',
        'underline-position', 'underline-thickness', 'unicode', 'unicode-range',
        'units-per-em', 'values', 'version', 'viewBox', 'visibility', 'width',
        'widths', 'x', 'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
        'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type',
        'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1',
        'y2', 'zoomAndPan']

    # Lazily-built maps from lowercased SVG names back to their camelCase
    # originals (filled in by unknown_starttag on first SVG element).
    svg_attr_map = None
    svg_elem_map = None

    # CSS properties allowed inside an SVG island's style attribute.
    acceptable_svg_properties = [ 'fill', 'fill-opacity', 'fill-rule',
        'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
        'stroke-opacity']

    def reset(self):
        """Reset parser state, including the suppression and MathML/SVG
        nesting counters."""
        _BaseHTMLProcessor.reset(self)
        self.unacceptablestack = 0   # depth inside script/applet/style
        self.mathmlOK = 0            # depth inside an inline MathML island
        self.svgOK = 0               # depth inside an inline SVG island

    def unknown_starttag(self, tag, attrs):
        """Emit the tag only if whitelisted (HTML, MathML, or SVG), with
        its attributes filtered against the matching whitelist."""
        acceptable_attributes = self.acceptable_attributes
        keymap = {}
        # NOTE: `not tag in X or svgOK` parses as `(not tag in X) or svgOK`
        # -- SVG content takes this branch even for acceptable HTML names.
        if not tag in self.acceptable_elements or self.svgOK:
            if tag in self.unacceptable_elements_with_end_tag:
                self.unacceptablestack += 1

            # add implicit namespaces to html5 inline svg/mathml
            if self._type.endswith('html'):
                if not dict(attrs).get('xmlns'):
                    if tag=='svg':
                        attrs.append( ('xmlns','http://www.w3.org/2000/svg') )
                    if tag=='math':
                        attrs.append( ('xmlns','http://www.w3.org/1998/Math/MathML') )

            # not otherwise acceptable, perhaps it is MathML or SVG?
            if tag=='math' and ('xmlns','http://www.w3.org/1998/Math/MathML') in attrs:
                self.mathmlOK += 1
            if tag=='svg' and ('xmlns','http://www.w3.org/2000/svg') in attrs:
                self.svgOK += 1

            # chose acceptable attributes based on tag class, else bail
            if self.mathmlOK and tag in self.mathml_elements:
                acceptable_attributes = self.mathml_attributes
            elif self.svgOK and tag in self.svg_elements:
                # for most vocabularies, lowercasing is a good idea. Many
                # svg elements, however, are camel case
                if not self.svg_attr_map:
                    lower=[attr.lower() for attr in self.svg_attributes]
                    mix=[a for a in self.svg_attributes if a not in lower]
                    self.svg_attributes = lower
                    self.svg_attr_map = dict([(a.lower(),a) for a in mix])

                    lower=[attr.lower() for attr in self.svg_elements]
                    mix=[a for a in self.svg_elements if a not in lower]
                    self.svg_elements = lower
                    self.svg_elem_map = dict([(a.lower(),a) for a in mix])
                acceptable_attributes = self.svg_attributes
                tag = self.svg_elem_map.get(tag,tag)
                keymap = self.svg_attr_map
            elif not tag in self.acceptable_elements:
                # not HTML, MathML, or SVG: drop the tag entirely
                return

        # declare xlink namespace, if needed
        if self.mathmlOK or self.svgOK:
            if filter(lambda (n,v): n.startswith('xlink:'),attrs):
                if not ('xmlns:xlink','http://www.w3.org/1999/xlink') in attrs:
                    attrs.append(('xmlns:xlink','http://www.w3.org/1999/xlink'))

        clean_attrs = []
        for key, value in self.normalize_attrs(attrs):
            if key in acceptable_attributes:
                key=keymap.get(key,key)
                # make sure the uri uses an acceptable uri scheme
                if key == u'href':
                    value = _makeSafeAbsoluteURI(value)
                clean_attrs.append((key,value))
            elif key=='style':
                # style survives only after CSS sanitization
                clean_value = self.sanitize_style(value)
                if clean_value:
                    clean_attrs.append((key,clean_value))
        _BaseHTMLProcessor.unknown_starttag(self, tag, clean_attrs)

    def unknown_endtag(self, tag):
        """Emit the end tag only for whitelisted elements, maintaining the
        suppression and MathML/SVG nesting counters."""
        if not tag in self.acceptable_elements:
            if tag in self.unacceptable_elements_with_end_tag:
                self.unacceptablestack -= 1
            if self.mathmlOK and tag in self.mathml_elements:
                if tag == 'math' and self.mathmlOK:
                    self.mathmlOK -= 1
            elif self.svgOK and tag in self.svg_elements:
                tag = self.svg_elem_map.get(tag,tag)
                if tag == 'svg' and self.svgOK:
                    self.svgOK -= 1
            else:
                return
        _BaseHTMLProcessor.unknown_endtag(self, tag)

    def handle_pi(self, text):
        # processing instructions are always dropped
        pass

    def handle_decl(self, text):
        # declarations (doctypes, etc.) are always dropped
        pass

    def handle_data(self, text):
        # suppress text that lives inside script/applet/style
        if not self.unacceptablestack:
            _BaseHTMLProcessor.handle_data(self, text)

    def sanitize_style(self, style):
        """Return a cleaned-up CSS declaration string containing only
        whitelisted properties/values, or '' if the style is suspect."""
        # disallow urls
        style=re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ',style)

        # gauntlet
        if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
            return ''
        # This replaced a regexp that used re.match and was prone to pathological back-tracking.
        if re.sub("\s*[-\w]+\s*:\s*[^:;]*;?", '', style).strip():
            return ''

        clean = []
        for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style):
            if not value:
                continue
            if prop.lower() in self.acceptable_css_properties:
                clean.append(prop + ': ' + value + ';')
            elif prop.split('-')[0].lower() in ['background','border','margin','padding']:
                # shorthand properties pass only if every keyword is safe
                for keyword in value.split():
                    if not keyword in self.acceptable_css_keywords and \
                       not self.valid_css_values.match(keyword):
                        break
                else:
                    clean.append(prop + ': ' + value + ';')
            elif self.svgOK and prop.lower() in self.acceptable_svg_properties:
                clean.append(prop + ': ' + value + ';')

        return ' '.join(clean)

    def parse_comment(self, i, report=1):
        """Handle comments, refusing to emit data from malformed or
        unclosed ones."""
        ret = _BaseHTMLProcessor.parse_comment(self, i, report)
        if ret >= 0:
            return ret
        # if ret == -1, this may be a malicious attempt to circumvent
        # sanitization, or a page-destroying unclosed comment
        match = re.compile(r'--[^>]*>').search(self.rawdata, i+4)
        if match:
            return match.end()
        # unclosed comment; deliberately fail to handle_data()
        return len(self.rawdata)
+
+
def _sanitizeHTML(htmlSource, encoding, _type):
    """Sanitize feed-supplied HTML with _HTMLSanitizer and, when
    TIDY_MARKUP is enabled and a Tidy implementation is installed, clean
    up the result with Tidy.  Returns the source unchanged when no SGML
    parser is available."""
    if not _SGML_AVAILABLE:
        return htmlSource
    p = _HTMLSanitizer(encoding, _type)
    # BUGFIX: escape CDATA marked sections so sgmllib can't pass their
    # contents through unsanitized.  The previous code replaced the
    # string with itself, which was a no-op.
    htmlSource = htmlSource.replace('<![CDATA[', '&lt;![CDATA[')
    p.feed(htmlSource)
    data = p.output()
    if TIDY_MARKUP:
        # loop through list of preferred Tidy interfaces looking for one that's installed,
        # then set up a common _tidy function to wrap the interface-specific API.
        _tidy = None
        for tidy_interface in PREFERRED_TIDY_INTERFACES:
            try:
                if tidy_interface == "uTidy":
                    from tidy import parseString as _utidy
                    def _tidy(data, **kwargs):
                        return str(_utidy(data, **kwargs))
                    break
                elif tidy_interface == "mxTidy":
                    from mx.Tidy import Tidy as _mxtidy
                    def _tidy(data, **kwargs):
                        nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, **kwargs)
                        return data
                    break
            except:
                # best-effort: this Tidy interface isn't usable, try the next
                pass
        if _tidy:
            utf8 = isinstance(data, unicode)
            if utf8:
                data = data.encode('utf-8')
            data = _tidy(data, output_xhtml=1, numeric_entities=1, wrap=0, char_encoding="utf8")
            if utf8:
                data = unicode(data, 'utf-8')
            # Tidy emits a full document; keep only the <body> contents.
            if data.count('<body'):
                data = data.split('<body', 1)[1]
                if data.count('>'):
                    data = data.split('>', 1)[1]
            if data.count('</body'):
                data = data.split('</body', 1)[0]
    data = data.strip().replace('\r\n', '\n')
    return data
+
class _FeedURLHandler(urllib2.HTTPDigestAuthHandler, urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler):
    """urllib2 opener component used when fetching feeds.

    Records the HTTP status code on the response object instead of
    raising HTTPError, tracks the final URL across redirects, and
    upgrades basic auth to digest auth on a 401 challenge.
    """
    def http_error_default(self, req, fp, code, msg, headers):
        # The default implementation just raises HTTPError.
        # Forget that.
        fp.status = code
        return fp

    def http_error_301(self, req, fp, code, msg, hdrs):
        # Follow the redirect, then record the status and final URL so the
        # caller can see that a redirect happened.
        result = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp,
                                                            code, msg, hdrs)
        result.status = code
        result.newurl = result.geturl()
        return result
    # The default implementations in urllib2.HTTPRedirectHandler
    # are identical, so hardcoding a http_error_301 call above
    # won't affect anything
    http_error_300 = http_error_301
    http_error_302 = http_error_301
    http_error_303 = http_error_301
    http_error_307 = http_error_301

    def http_error_401(self, req, fp, code, msg, headers):
        # Check if
        # - server requires digest auth, AND
        # - we tried (unsuccessfully) with basic auth, AND
        # If all conditions hold, parse authentication information
        # out of the Authorization header we sent the first time
        # (for the username and password) and the WWW-Authenticate
        # header the server sent back (for the realm) and retry
        # the request with the appropriate digest auth headers instead.
        # This evil genius hack has been brought to you by Aaron Swartz.
        host = urlparse.urlparse(req.get_full_url())[1]
        if base64 is None or 'Authorization' not in req.headers \
           or 'WWW-Authenticate' not in headers:
            return self.http_error_default(req, fp, code, msg, headers)
        auth = _base64decode(req.headers['Authorization'].split(' ')[1])
        user, passw = auth.split(':')
        realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0]
        self.add_password(realm, host, user, passw)
        retry = self.http_error_auth_reqed('www-authenticate', host, req, headers)
        self.reset_retry_count()
        return retry
+
def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers):
    """URL, filename, or string --> stream

    This function lets you define parsers that take any input source
    (URL, pathname to local or network file, or actual data as a string)
    and deal with it in a uniform manner. Returned object is guaranteed
    to have all the basic stdio read methods (read, readline, readlines).
    Just .close() the object when you're done with it.

    If the etag argument is supplied, it will be used as the value of an
    If-None-Match request header.

    If the modified argument is supplied, it can be a tuple of 9 integers
    (as returned by gmtime() in the standard Python time module) or a date
    string in any format supported by feedparser. Regardless, it MUST
    be in GMT (Greenwich Mean Time). It will be reformatted into an
    RFC 1123-compliant date and used as the value of an If-Modified-Since
    request header.

    If the agent argument is supplied, it will be used as the value of a
    User-Agent request header.

    If the referrer argument is supplied, it will be used as the value of a
    Referer[sic] request header.

    If handlers is supplied, it is a list of handlers used to build a
    urllib2 opener.

    if request_headers is supplied it is a dictionary of HTTP request headers
    that will override the values generated by FeedParser.
    """

    # already a file-like object: hand it back untouched
    if hasattr(url_file_stream_or_string, 'read'):
        return url_file_stream_or_string

    if url_file_stream_or_string == '-':
        return sys.stdin

    if isinstance(url_file_stream_or_string, basestring) \
       and urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp', 'file', 'feed'):
        # Deal with the feed URI scheme
        if url_file_stream_or_string.startswith('feed:http'):
            url_file_stream_or_string = url_file_stream_or_string[5:]
        elif url_file_stream_or_string.startswith('feed:'):
            url_file_stream_or_string = 'http:' + url_file_stream_or_string[5:]
        if not agent:
            agent = USER_AGENT
        # test for inline user:password for basic auth
        auth = None
        if base64:
            urltype, rest = urllib.splittype(url_file_stream_or_string)
            realhost, rest = urllib.splithost(rest)
            if realhost:
                user_passwd, realhost = urllib.splituser(realhost)
                if user_passwd:
                    # strip the credentials out of the URL and carry them
                    # as a base64 Authorization header instead
                    url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest)
                    auth = base64.standard_b64encode(user_passwd).strip()

        # iri support
        if isinstance(url_file_stream_or_string, unicode):
            url_file_stream_or_string = _convert_to_idn(url_file_stream_or_string)

        # try to open with urllib2 (to use optional headers)
        request = _build_urllib2_request(url_file_stream_or_string, agent, etag, modified, referrer, auth, request_headers)
        opener = apply(urllib2.build_opener, tuple(handlers + [_FeedURLHandler()]))
        opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
        try:
            return opener.open(request)
        finally:
            opener.close() # JohnD

    # try to open with native open function (if url_file_stream_or_string is a filename)
    try:
        return open(url_file_stream_or_string, 'rb')
    except IOError:
        pass

    # treat url_file_stream_or_string as string
    if isinstance(url_file_stream_or_string, unicode):
        return _StringIO(url_file_stream_or_string.encode('utf-8'))
    return _StringIO(url_file_stream_or_string)
+
+def _convert_to_idn(url):
+ """Convert a URL to IDN notation"""
+ # this function should only be called with a unicode string
+ # strategy: if the host cannot be encoded in ascii, then
+ # it'll be necessary to encode it in idn form
+ parts = list(urlparse.urlsplit(url))
+ try:
+ parts[1].encode('ascii')
+ except UnicodeEncodeError:
+ # the url needs to be converted to idn notation
+ host = parts[1].rsplit(':', 1)
+ newhost = []
+ port = u''
+ if len(host) == 2:
+ port = host.pop()
+ for h in host[0].split('.'):
+ newhost.append(h.encode('idna').decode('utf-8'))
+ parts[1] = '.'.join(newhost)
+ if port:
+ parts[1] += ':' + port
+ return urlparse.urlunsplit(parts)
+ else:
+ return url
+
def _build_urllib2_request(url, agent, etag, modified, referrer, auth, request_headers):
    """Build a urllib2.Request carrying the conditional-GET, identity,
    compression, and auth headers used when fetching a feed."""
    request = urllib2.Request(url)
    request.add_header('User-Agent', agent)
    if etag:
        request.add_header('If-None-Match', etag)
    if isinstance(modified, basestring):
        modified = _parse_date(modified)
    elif isinstance(modified, datetime.datetime):
        modified = modified.utctimetuple()
    if modified:
        # format into an RFC 1123-compliant timestamp. We can't use
        # time.strftime() since the %a and %b directives can be affected
        # by the current locale, but RFC 2616 states that dates must be
        # in English.
        short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
        months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
        request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))
    if referrer:
        request.add_header('Referer', referrer)
    # advertise whichever decompression schemes this interpreter supports
    if gzip and zlib:
        accept_encoding = 'gzip, deflate'
    elif gzip:
        accept_encoding = 'gzip'
    elif zlib:
        accept_encoding = 'deflate'
    else:
        accept_encoding = ''
    request.add_header('Accept-encoding', accept_encoding)
    if auth:
        request.add_header('Authorization', 'Basic %s' % auth)
    if ACCEPT_HEADER:
        request.add_header('Accept', ACCEPT_HEADER)
    # use this for whatever -- cookies, special headers, etc
    # [('Cookie','Something'),('x-special-header','Another Value')]
    for header_name, header_value in request_headers.items():
        request.add_header(header_name, header_value)
    request.add_header('A-IM', 'feed') # RFC 3229 support
    return request
+
+_date_handlers = []
+def registerDateHandler(func):
+ '''Register a date handler function (takes string, returns 9-tuple date in GMT)'''
+ _date_handlers.insert(0, func)
+
# ISO-8601 date parsing routines written by Fazal Majid.
# The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
# parser is beyond the scope of feedparser and would be a worthwhile addition
# to the Python library.
# A single regular expression cannot parse ISO 8601 date formats into groups
# as the standard is highly irregular (for instance is 030104 2003-01-04 or
# 0301-04-01), so we use templates instead.
# Please note the order in templates is significant because we need a
# greedy match.
_iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-0MM?-?DD', 'YYYY-MM', 'YYYY-?OOO',
                 'YY-?MM-?DD', 'YY-?OOO', 'YYYY',
                 '-YY-?MM', '-OOO', '-YY',
                 '--MM-?DD', '--MM',
                 '---DD',
                 'CC', '']
# Expand each template's date placeholders into named regex groups, then
# append an optional time-of-day + timezone suffix.
_iso8601_re = [
    tmpl.replace(
    'YYYY', r'(?P<year>\d{4})').replace(
    'YY', r'(?P<year>\d\d)').replace(
    'MM', r'(?P<month>[01]\d)').replace(
    'DD', r'(?P<day>[0123]\d)').replace(
    'OOO', r'(?P<ordinal>[0123]\d\d)').replace(
    'CC', r'(?P<century>\d\d$)')
    + r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})'
    + r'(:(?P<second>\d{2}))?'
    + r'(\.(?P<fracsecond>\d+))?'
    + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'
    for tmpl in _iso8601_tmpl]
# On Python 2 the comprehension variables leak into module scope; delete
# them defensively (guarded, since some interpreters don't leak them).
try:
    del tmpl
except NameError:
    pass
# Pre-bind the match methods so _parse_date_iso8601 can call them directly.
_iso8601_matches = [re.compile(regex).match for regex in _iso8601_re]
try:
    del regex
except NameError:
    pass
def _parse_date_iso8601(dateString):
    '''Parse a variety of ISO-8601-compatible formats like 20040105'''
    # Try each precompiled template regex in order (the template list is
    # ordered most-specific-first) and use the first match.
    m = None
    for _iso8601_match in _iso8601_matches:
        m = _iso8601_match(dateString)
        if m:
            break
    if not m:
        return
    if m.span() == (0, 0):
        # the empty-string template matched nothing useful
        return
    params = m.groupdict()
    ordinal = params.get('ordinal', 0)
    if ordinal:
        ordinal = int(ordinal)
    else:
        ordinal = 0
    # Missing date components default to "now" (GMT).
    year = params.get('year', '--')
    if not year or year == '--':
        year = time.gmtime()[0]
    elif len(year) == 2:
        # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993
        year = 100 * int(time.gmtime()[0] / 100) + int(year)
    else:
        year = int(year)
    month = params.get('month', '-')
    if not month or month == '-':
        # ordinals are NOT normalized by mktime, we simulate them
        # by setting month=1, day=ordinal
        if ordinal:
            month = 1
        else:
            month = time.gmtime()[1]
    month = int(month)
    day = params.get('day', 0)
    if not day:
        # see above
        if ordinal:
            day = ordinal
        elif params.get('century', 0) or \
             params.get('year', 0) or params.get('month', 0):
            day = 1
        else:
            day = time.gmtime()[2]
    else:
        day = int(day)
    # special case of the century - is the first year of the 21st century
    # 2000 or 2001 ? The debate goes on...
    if 'century' in params.keys():
        year = (int(params['century']) - 1) * 100 + 1
    # in ISO 8601 most fields are optional
    for field in ['hour', 'minute', 'second', 'tzhour', 'tzmin']:
        if not params.get(field, None):
            params[field] = 0
    hour = int(params.get('hour', 0))
    minute = int(params.get('minute', 0))
    second = int(float(params.get('second', 0)))
    # weekday is normalized by mktime(), we can ignore it
    weekday = 0
    daylight_savings_flag = -1
    # tm follows the time.struct_time field order; the ordinal goes in
    # the yday slot so mktime() can normalize it.
    tm = [year, month, day, hour, minute, second, weekday,
          ordinal, daylight_savings_flag]
    # ISO 8601 time zone adjustments
    tz = params.get('tz')
    if tz and tz != 'Z':
        if tz[0] == '-':
            tm[3] += int(params.get('tzhour', 0))
            tm[4] += int(params.get('tzmin', 0))
        elif tz[0] == '+':
            tm[3] -= int(params.get('tzhour', 0))
            tm[4] -= int(params.get('tzmin', 0))
        else:
            return None
    # Python's time.mktime() is a wrapper around the ANSI C mktime(3c)
    # which is guaranteed to normalize d/m/y/h/m/s.
    # Many implementations have bugs, but we'll pretend they don't.
    return time.localtime(time.mktime(tuple(tm)))
registerDateHandler(_parse_date_iso8601)
+
# 8-bit date handling routines written by ytrewq1.
# Hangul tokens that appear in Korean OnBlog/Nate timestamps.
_korean_year  = u'\ub144' # b3e2 in euc-kr
_korean_month = u'\uc6d4' # bff9 in euc-kr
_korean_day   = u'\uc77c' # c0cf in euc-kr
_korean_am    = u'\uc624\uc804' # bfc0 c0fc in euc-kr
_korean_pm    = u'\uc624\ud6c4' # bfc0 c8c4 in euc-kr

# e.g. "2004년 05월 16일 19:20:18" (OnBlog)
_korean_onblog_date_re = \
    re.compile('(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % \
               (_korean_year, _korean_month, _korean_day))
# e.g. "2004-05-25 오후 11:23:17" (Nate); group 4 is the AM/PM marker
_korean_nate_date_re = \
    re.compile(u'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \
               (_korean_am, _korean_pm))
def _parse_date_onblog(dateString):
    '''Parse a string according to the OnBlog 8-bit date format'''
    m = _korean_onblog_date_re.match(dateString)
    if not m:
        return
    # Rebuild the timestamp as W3C-DTF and delegate; OnBlog dates are
    # local Korean time, hence the fixed +09:00 offset.
    year, month, day, hour, minute, second = m.groups()
    w3dtfdate = '%s-%s-%sT%s:%s:%s+09:00' % (year, month, day,
                                             hour, minute, second)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_onblog)
+
def _parse_date_nate(dateString):
    '''Parse a string according to the Nate 8-bit date format'''
    m = _korean_nate_date_re.match(dateString)
    if not m:
        return
    # Convert the Korean 12-hour AM/PM clock to a zero-padded 24-hour hour.
    hour = int(m.group(5))
    if m.group(4) == _korean_pm:
        hour += 12
    # Rebuild as W3C-DTF and delegate; Nate dates are KST (+09:00).
    w3dtfdate = '%s-%s-%sT%02d:%s:%s+09:00' % (m.group(1), m.group(2), m.group(3),
                                               hour, m.group(6), m.group(7))
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_nate)
+
# MS SQL style timestamps: 'YYYY-MM-DD hh:mm:ss' with an optional
# fractional-seconds suffix (which is ignored).
_mssql_date_re = \
    re.compile('(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?')
def _parse_date_mssql(dateString):
    '''Parse a string according to the MS SQL date format'''
    match = _mssql_date_re.match(dateString)
    if match is None:
        return
    year, month, day, hour, minute, second = match.groups()[:6]
    # Assume KST (UTC+9), matching the other 8-bit handlers above.
    w3dtfdate = '%s-%s-%sT%s:%s:%s+09:00' % (year, month, day,
                                             hour, minute, second)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_mssql)
+
# Unicode strings for Greek date strings.
# Maps abbreviated Greek month names (several spelling variants) to the
# English abbreviations understood by _parse_date_rfc822.
_greek_months = \
  { \
   u'\u0399\u03b1\u03bd': u'Jan',       # c9e1ed in iso-8859-7
   u'\u03a6\u03b5\u03b2': u'Feb',       # d6e5e2 in iso-8859-7
   u'\u039c\u03ac\u03ce': u'Mar',       # ccdcfe in iso-8859-7
   u'\u039c\u03b1\u03ce': u'Mar',       # cce1fe in iso-8859-7
   u'\u0391\u03c0\u03c1': u'Apr',       # c1f0f1 in iso-8859-7
   u'\u039c\u03ac\u03b9': u'May',       # ccdce9 in iso-8859-7
   u'\u039c\u03b1\u03ca': u'May',       # cce1fa in iso-8859-7
   u'\u039c\u03b1\u03b9': u'May',       # cce1e9 in iso-8859-7
   u'\u0399\u03bf\u03cd\u03bd': u'Jun', # c9effded in iso-8859-7
   u'\u0399\u03bf\u03bd': u'Jun',       # c9efed in iso-8859-7
   u'\u0399\u03bf\u03cd\u03bb': u'Jul', # c9effdeb in iso-8859-7
   u'\u0399\u03bf\u03bb': u'Jul',       # c9efeb in iso-8859-7
   u'\u0391\u03cd\u03b3': u'Aug',       # c1fde3 in iso-8859-7
   u'\u0391\u03c5\u03b3': u'Aug',       # c1f5e3 in iso-8859-7
   u'\u03a3\u03b5\u03c0': u'Sep',       # d3e5f0 in iso-8859-7
   u'\u039f\u03ba\u03c4': u'Oct',       # cfeaf4 in iso-8859-7
   u'\u039d\u03bf\u03ad': u'Nov',       # cdefdd in iso-8859-7
   u'\u039d\u03bf\u03b5': u'Nov',       # cdefe5 in iso-8859-7
   u'\u0394\u03b5\u03ba': u'Dec',       # c4e5ea in iso-8859-7
  }

# Maps abbreviated Greek weekday names to English abbreviations.
_greek_wdays = \
  { \
   u'\u039a\u03c5\u03c1': u'Sun', # caf5f1 in iso-8859-7
   u'\u0394\u03b5\u03c5': u'Mon', # c4e5f5 in iso-8859-7
   u'\u03a4\u03c1\u03b9': u'Tue', # d4f1e9 in iso-8859-7
   u'\u03a4\u03b5\u03c4': u'Wed', # d4e5f4 in iso-8859-7
   u'\u03a0\u03b5\u03bc': u'Thu', # d0e5ec in iso-8859-7
   u'\u03a0\u03b1\u03c1': u'Fri', # d0e1f1 in iso-8859-7
   u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7
  }

# RFC822-shaped date with Greek weekday/month names:
# '<wday>, DD <month> YYYY hh:mm:ss <zone>'.
_greek_date_format_re = \
    re.compile(u'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)')
+
def _parse_date_greek(dateString):
    '''Parse a string according to a Greek 8-bit date format.'''
    match = _greek_date_format_re.match(dateString)
    if match is None:
        return
    # An unknown weekday or month name raises KeyError, which the
    # _parse_date driver catches and treats as "handler failed".
    wday = _greek_wdays[match.group(1)]
    month = _greek_months[match.group(3)]
    # Re-assemble as an English RFC822 date and delegate.
    rfc822date = '%s, %s %s %s %s:%s:%s %s' % (
        wday, match.group(2), month, match.group(4),
        match.group(5), match.group(6), match.group(7), match.group(8))
    return _parse_date_rfc822(rfc822date)
registerDateHandler(_parse_date_greek)
+
+# Unicode strings for Hungarian date strings
+_hungarian_months = \
+ { \
+ u'janu\u00e1r': u'01', # e1 in iso-8859-2
+ u'febru\u00e1ri': u'02', # e1 in iso-8859-2
+ u'm\u00e1rcius': u'03', # e1 in iso-8859-2
+ u'\u00e1prilis': u'04', # e1 in iso-8859-2
+ u'm\u00e1ujus': u'05', # e1 in iso-8859-2
+ u'j\u00fanius': u'06', # fa in iso-8859-2
+ u'j\u00falius': u'07', # fa in iso-8859-2
+ u'augusztus': u'08',
+ u'szeptember': u'09',
+ u'okt\u00f3ber': u'10', # f3 in iso-8859-2
+ u'november': u'11',
+ u'december': u'12',
+ }
+
+_hungarian_date_format_re = \
+ re.compile(u'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))')
+
def _parse_date_hungarian(dateString):
    '''Parse a string according to a Hungarian 8-bit date format.'''
    match = _hungarian_date_format_re.match(dateString)
    if match is None or match.group(2) not in _hungarian_months:
        return None
    month = _hungarian_months[match.group(2)]
    # Zero-pad single-digit day and hour so the result is valid W3DTF.
    day = match.group(3)
    if len(day) == 1:
        day = '0' + day
    hour = match.group(4)
    if len(hour) == 1:
        hour = '0' + hour
    w3dtfdate = '%s-%s-%sT%s:%s%s' % (
        match.group(1), month, day, hour, match.group(5), match.group(6))
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_hungarian)
+
# W3DTF-style date parsing adapted from PyXML xml.utils.iso8601, written by
# Drake and licensed under the Python license.  Removed all range checking
# for month, day, hour, minute, and second, since mktime will normalize
# these later
def _parse_date_w3dtf(dateString):
    '''Parse a W3DTF/ISO-8601-style date string.

    Returns a UTC time.struct_time (via time.gmtime), or None when the
    string does not match the W3DTF grammar or the year is unusable.
    '''
    def __extract_date(m):
        # Returns (year, month, day); (0, 0, 0) signals "unusable".
        year = int(m.group('year'))
        if year < 100:
            # Two-digit year: assume the current century.
            year = 100 * int(time.gmtime()[0] / 100) + int(year)
        if year < 1000:
            return 0, 0, 0
        julian = m.group('julian')
        if julian:
            # Ordinal (day-of-year) date: start from a rough month/day
            # guess and iterate with mktime/gmtime until the day-of-year
            # round-trips.  NOTE: integer division under Python 2; this
            # would need '//' on Python 3.
            julian = int(julian)
            month = julian / 30 + 1
            day = julian % 30 + 1
            jday = None
            while jday != julian:
                t = time.mktime((year, month, day, 0, 0, 0, 0, 0, 0))
                jday = time.gmtime(t)[-2]
                diff = abs(jday - julian)
                if jday > julian:
                    if diff < day:
                        day = day - diff
                    else:
                        month = month - 1
                        day = 31
                elif jday < julian:
                    if day + diff < 28:
                        day = day + diff
                    else:
                        month = month + 1
            return year, month, day
        # Calendar date: month and day are optional and default to 1.
        month = m.group('month')
        day = 1
        if month is None:
            month = 1
        else:
            month = int(month)
            day = m.group('day')
            if day:
                day = int(day)
            else:
                day = 1
        return year, month, day

    def __extract_time(m):
        # Returns (hours, minutes, seconds); midnight when absent.
        if not m:
            return 0, 0, 0
        hours = m.group('hours')
        if not hours:
            return 0, 0, 0
        hours = int(hours)
        minutes = int(m.group('minutes'))
        seconds = m.group('seconds')
        if seconds:
            seconds = int(seconds)
        else:
            seconds = 0
        return hours, minutes, seconds

    def __extract_tzd(m):
        '''Return the Time Zone Designator as an offset in seconds from UTC.'''
        if not m:
            return 0
        tzd = m.group('tzd')
        if not tzd:
            return 0
        if tzd == 'Z':
            return 0
        hours = int(m.group('tzdhours'))
        minutes = m.group('tzdminutes')
        if minutes:
            minutes = int(minutes)
        else:
            minutes = 0
        # Negated because the offset is later *added* to the epoch value
        # to shift local-with-offset time back to UTC.
        offset = (hours*60 + minutes) * 60
        if tzd[0] == '+':
            return -offset
        return offset

    # Date part: YYYY, YYYY-MM, YYYY-MM-DD or ordinal YYYY-DDD; the
    # separator may also be absent (basic format).
    __date_re = ('(?P<year>\d\d\d\d)'
                 '(?:(?P<dsep>-|)'
                 '(?:(?P<month>\d\d)(?:(?P=dsep)(?P<day>\d\d))?'
                 '|(?P<julian>\d\d\d)))?')
    __tzd_re = '(?P<tzd>[-+](?P<tzdhours>\d\d)(?::?(?P<tzdminutes>\d\d))|Z)'
    __tzd_rx = re.compile(__tzd_re)
    # Time part: hh:mm with optional seconds and fractional seconds,
    # always followed by a timezone designator.
    __time_re = ('(?P<hours>\d\d)(?P<tsep>:|)(?P<minutes>\d\d)'
                 '(?:(?P=tsep)(?P<seconds>\d\d)(?:[.,]\d+)?)?'
                 + __tzd_re)
    __datetime_re = '%s(?:T%s)?' % (__date_re, __time_re)
    __datetime_rx = re.compile(__datetime_re)
    m = __datetime_rx.match(dateString)
    # The whole string must be consumed, not just a prefix.
    if (m is None) or (m.group() != dateString):
        return
    gmt = __extract_date(m) + __extract_time(m) + (0, 0, 0)
    if gmt[0] == 0:
        return
    # mktime() interprets the tuple as local time, so compensate with
    # time.timezone before applying the parsed zone offset.
    return time.gmtime(time.mktime(gmt) + __extract_tzd(m) - time.timezone)
registerDateHandler(_parse_date_w3dtf)
+
def _parse_date_rfc822(dateString):
    '''Parse an RFC822, RFC1123, RFC2822, or asctime-style date.

    Returns a UTC time.struct_time, or None when rfc822.parsedate_tz
    cannot make sense of the (normalized) string.
    '''
    data = dateString.split()
    if not data:
        return None
    # Drop a leading day-of-week token ('Mon,', 'Mon.' or bare 'mon').
    if data[0][-1] in (',', '.') or data[0].lower() in rfc822._daynames:
        del data[0]
    if len(data) == 4:
        s = data[3]
        i = s.find('+')
        if i > 0:
            # Split a fused 'hh:mm:ss+zzzz' token into time and zone.
            data[3:] = [s[:i], s[i+1:]]
        else:
            data.append('')
        dateString = " ".join(data)
    # Account for the Etc/GMT timezone by stripping 'Etc/'
    elif len(data) == 5 and data[4].lower().startswith('etc/'):
        data[4] = data[4][4:]
        dateString = " ".join(data)
    if len(data) < 5:
        # Date-only input: assume midnight GMT.
        dateString += ' 00:00:00 GMT'
    tm = rfc822.parsedate_tz(dateString)
    if tm:
        # Jython doesn't adjust for 2-digit years like CPython does,
        # so account for it by shifting the year so that it's in the
        # range 1970-2069 (1970 being the year of the Unix epoch).
        if tm[0] < 100:
            tm = (tm[0] + (1900, 2000)[tm[0] < 70],) + tm[1:]
        return time.gmtime(rfc822.mktime_tz(tm))
# rfc822.py defines several time zones, but we define some extra ones.
# 'ET' is equivalent to 'EST', etc.
_additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -800}
rfc822._timezones.update(_additional_timezones)
registerDateHandler(_parse_date_rfc822)
+
def _parse_date_perforce(aDateString):
    """parse a date in yyyy/mm/dd hh:mm:ss TTT format"""
    # Example input: 'Fri, 2006/09/15 08:19:53 EDT'
    pattern = re.compile(
        r'(\w{,3}), (\d{,4})/(\d{,2})/(\d{2}) (\d{,2}):(\d{2}):(\d{2}) (\w{,3})')
    match = pattern.search(aDateString)
    if match is None:
        return None
    dow, year, month, day, hour, minute, second, tz = match.groups()
    month_abbrs = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                   'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')
    # Rewrite as an RFC822 date and let rfc822 do the heavy lifting.
    rfc822date = "%s, %s %s %s %s:%s:%s %s" % (
        dow, day, month_abbrs[int(month) - 1], year, hour, minute, second, tz)
    parsed = rfc822.parsedate_tz(rfc822date)
    if parsed:
        return time.gmtime(rfc822.mktime_tz(parsed))
registerDateHandler(_parse_date_perforce)
+
+def _parse_date(dateString):
+ '''Parses a variety of date formats into a 9-tuple in GMT'''
+ if not dateString:
+ return None
+ for handler in _date_handlers:
+ try:
+ date9tuple = handler(dateString)
+ except (KeyError, OverflowError, ValueError):
+ continue
+ if not date9tuple:
+ continue
+ if len(date9tuple) != 9:
+ continue
+ return date9tuple
+ return None
+
def _getCharacterEncoding(http_headers, xml_data):
    '''Get the character encoding of the XML document

    http_headers is a dictionary
    xml_data is a raw string (not Unicode)

    This is so much trickier than it sounds, it's not even funny.
    According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
    is application/xml, application/*+xml,
    application/xml-external-parsed-entity, or application/xml-dtd,
    the encoding given in the charset parameter of the HTTP Content-Type
    takes precedence over the encoding given in the XML prefix within the
    document, and defaults to 'utf-8' if neither are specified. But, if
    the HTTP Content-Type is text/xml, text/*+xml, or
    text/xml-external-parsed-entity, the encoding given in the XML prefix
    within the document is ALWAYS IGNORED and only the encoding given in
    the charset parameter of the HTTP Content-Type header should be
    respected, and it defaults to 'us-ascii' if not specified.

    Furthermore, discussion on the atom-syntax mailing list with the
    author of RFC 3023 leads me to the conclusion that any document
    served with a Content-Type of text/* and no charset parameter
    must be treated as us-ascii. (We now do this.) And also that it
    must always be flagged as non-well-formed. (We now do this too.)

    If Content-Type is unspecified (input was local file or non-HTTP source)
    or unrecognized (server just got it totally wrong), then go by the
    encoding given in the XML prefix of the document and default to
    'iso-8859-1' as per the HTTP specification (RFC 2616).

    Then, assuming we didn't find a character encoding in the HTTP headers
    (and the HTTP Content-type allowed us to look in the body), we need
    to sniff the first few bytes of the XML data and try to determine
    whether the encoding is ASCII-compatible. Section F of the XML
    specification shows the way here:
    http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info

    If the sniffed encoding is not ASCII-compatible, we need to make it
    ASCII compatible so that we can sniff further into the XML declaration
    to find the encoding attribute, which will tell us the true encoding.

    Of course, none of this guarantees that we will be able to parse the
    feed in the declared character encoding (assuming it was declared
    correctly, which many are not). CJKCodecs and iconv_codec help a lot;
    you should definitely install them if you can.
    http://cjkpython.i18n.org/

    Returns a 5-tuple: (true encoding, HTTP encoding, XML declaration
    encoding, sniffed encoding, acceptable-Content-Type flag).
    '''

    def _parseHTTPContentType(content_type):
        '''takes HTTP Content-Type header and returns (content type, charset)

        If no charset is specified, returns (content type, '')
        If no content type is specified, returns ('', '')
        Both return parameters are guaranteed to be lowercase strings
        '''
        content_type = content_type or ''
        content_type, params = cgi.parse_header(content_type)
        charset = params.get('charset', '').replace("'", "")
        if not isinstance(charset, unicode):
            charset = charset.decode('utf-8', 'ignore')
        return content_type, charset

    sniffed_xml_encoding = u''
    xml_encoding = u''
    true_encoding = u''
    http_content_type, http_encoding = _parseHTTPContentType(http_headers.get('content-type', http_headers.get('Content-type')))
    # Must sniff for non-ASCII-compatible character encodings before
    # searching for XML declaration. This heuristic is defined in
    # section F of the XML specification:
    # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
    try:
        if xml_data[:4] == _l2bytes([0x4c, 0x6f, 0xa7, 0x94]):
            # EBCDIC
            xml_data = _ebcdic_to_ascii(xml_data)
        elif xml_data[:4] == _l2bytes([0x00, 0x3c, 0x00, 0x3f]):
            # UTF-16BE
            sniffed_xml_encoding = u'utf-16be'
            xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
        elif (len(xml_data) >= 4) and (xml_data[:2] == _l2bytes([0xfe, 0xff])) and (xml_data[2:4] != _l2bytes([0x00, 0x00])):
            # UTF-16BE with BOM (the extra check rules out a UTF-32 BOM)
            sniffed_xml_encoding = u'utf-16be'
            xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
        elif xml_data[:4] == _l2bytes([0x3c, 0x00, 0x3f, 0x00]):
            # UTF-16LE
            sniffed_xml_encoding = u'utf-16le'
            xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
        elif (len(xml_data) >= 4) and (xml_data[:2] == _l2bytes([0xff, 0xfe])) and (xml_data[2:4] != _l2bytes([0x00, 0x00])):
            # UTF-16LE with BOM (the extra check rules out a UTF-32 BOM)
            sniffed_xml_encoding = u'utf-16le'
            xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
        elif xml_data[:4] == _l2bytes([0x00, 0x00, 0x00, 0x3c]):
            # UTF-32BE
            sniffed_xml_encoding = u'utf-32be'
            xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
        elif xml_data[:4] == _l2bytes([0x3c, 0x00, 0x00, 0x00]):
            # UTF-32LE
            sniffed_xml_encoding = u'utf-32le'
            xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
        elif xml_data[:4] == _l2bytes([0x00, 0x00, 0xfe, 0xff]):
            # UTF-32BE with BOM
            sniffed_xml_encoding = u'utf-32be'
            xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
        elif xml_data[:4] == _l2bytes([0xff, 0xfe, 0x00, 0x00]):
            # UTF-32LE with BOM
            sniffed_xml_encoding = u'utf-32le'
            xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
        elif xml_data[:3] == _l2bytes([0xef, 0xbb, 0xbf]):
            # UTF-8 with BOM
            sniffed_xml_encoding = u'utf-8'
            xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
        else:
            # ASCII-compatible
            pass
        # By now xml_data is ASCII-compatible, so the declared encoding
        # can be read straight out of the XML declaration.
        xml_encoding_match = re.compile(_s2bytes('^<\?.*encoding=[\'"](.*?)[\'"].*\?>')).match(xml_data)
    except UnicodeDecodeError:
        xml_encoding_match = None
    if xml_encoding_match:
        xml_encoding = xml_encoding_match.groups()[0].decode('utf-8').lower()
        # A concrete sniffed encoding beats a generic multi-byte family
        # name declared in the document.
        if sniffed_xml_encoding and (xml_encoding in (u'iso-10646-ucs-2', u'ucs-2', u'csunicode', u'iso-10646-ucs-4', u'ucs-4', u'csucs4', u'utf-16', u'utf-32', u'utf_16', u'utf_32', u'utf16', u'u16')):
            xml_encoding = sniffed_xml_encoding
    acceptable_content_type = 0
    application_content_types = (u'application/xml', u'application/xml-dtd', u'application/xml-external-parsed-entity')
    text_content_types = (u'text/xml', u'text/xml-external-parsed-entity')
    # Apply the RFC 3023 precedence rules described in the docstring.
    if (http_content_type in application_content_types) or \
       (http_content_type.startswith(u'application/') and http_content_type.endswith(u'+xml')):
        acceptable_content_type = 1
        true_encoding = http_encoding or xml_encoding or u'utf-8'
    elif (http_content_type in text_content_types) or \
         (http_content_type.startswith(u'text/')) and http_content_type.endswith(u'+xml'):
        acceptable_content_type = 1
        true_encoding = http_encoding or u'us-ascii'
    elif http_content_type.startswith(u'text/'):
        true_encoding = http_encoding or u'us-ascii'
    elif http_headers and (not (http_headers.has_key('content-type') or http_headers.has_key('Content-type'))):
        true_encoding = xml_encoding or u'iso-8859-1'
    else:
        true_encoding = xml_encoding or u'utf-8'
    # some feeds claim to be gb2312 but are actually gb18030.
    # apparently MSIE and Firefox both do the following switch:
    if true_encoding.lower() == u'gb2312':
        true_encoding = u'gb18030'
    return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
+
def _toUTF8(data, encoding):
    '''Changes an XML data stream on the fly to specify a new encoding

    data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
    encoding is a string recognized by encodings.aliases
    '''
    # Strip any recognized Byte Order Mark and let it override the
    # caller-supplied encoding.  The two-byte UTF-16 marks are honoured
    # only when they are not the start of a four-byte UTF-32 mark.
    if (len(data) >= 4) and (data[:2] == _l2bytes([0xfe, 0xff])) and (data[2:4] != _l2bytes([0x00, 0x00])):
        encoding, data = 'utf-16be', data[2:]
    elif (len(data) >= 4) and (data[:2] == _l2bytes([0xff, 0xfe])) and (data[2:4] != _l2bytes([0x00, 0x00])):
        encoding, data = 'utf-16le', data[2:]
    elif data[:3] == _l2bytes([0xef, 0xbb, 0xbf]):
        encoding, data = 'utf-8', data[3:]
    elif data[:4] == _l2bytes([0x00, 0x00, 0xfe, 0xff]):
        encoding, data = 'utf-32be', data[4:]
    elif data[:4] == _l2bytes([0xff, 0xfe, 0x00, 0x00]):
        encoding, data = 'utf-32le', data[4:]
    # Decode, rewrite (or prepend) the XML declaration so the stream
    # advertises utf-8, and re-encode as utf-8.
    decoded = unicode(data, encoding)
    decl_re = re.compile('^<\?xml[^>]*?>')
    utf8_decl = '''<?xml version='1.0' encoding='utf-8'?>'''
    if decl_re.search(decoded):
        decoded = decl_re.sub(utf8_decl, decoded)
    else:
        decoded = utf8_decl + u'\n' + decoded
    return decoded.encode('utf-8')
+
def _stripDoctype(data):
    '''Strips DOCTYPE from XML document, returns (rss_version, stripped_data)

    rss_version may be 'rss091n' or None
    stripped_data is the same XML document, minus the DOCTYPE

    Also returns a third element: a dict mapping the names of any 'safe'
    inline entity definitions to their replacement text (empty otherwise).
    '''
    # Split at the first element start tag: everything before it ('head')
    # may contain the DOCTYPE and inline ENTITY declarations.
    start = re.search(_s2bytes('<\w'), data)
    start = start and start.start() or -1
    head,data = data[:start+1], data[start+1:]

    entity_pattern = re.compile(_s2bytes(r'^\s*<!ENTITY([^>]*?)>'), re.MULTILINE)
    entity_results=entity_pattern.findall(head)
    head = entity_pattern.sub(_s2bytes(''), head)
    doctype_pattern = re.compile(_s2bytes(r'^\s*<!DOCTYPE([^>]*?)>'), re.MULTILINE)
    doctype_results = doctype_pattern.findall(head)
    doctype = doctype_results and doctype_results[0] or _s2bytes('')
    # A Netscape DOCTYPE identifies the nonstandard RSS 0.91 variant.
    if doctype.lower().count(_s2bytes('netscape')):
        version = u'rss091n'
    else:
        version = None

    # only allow in 'safe' inline entity definitions
    replacement=_s2bytes('')
    if len(doctype_results)==1 and entity_results:
        # 'Safe' means a simple name/quoted-value pair with no external
        # references or nested entity expansion.
        safe_pattern=re.compile(_s2bytes('\s+(\w+)\s+"(&#\w+;|[^&"]*)"'))
        safe_entities=filter(lambda e: safe_pattern.match(e),entity_results)
        if safe_entities:
            replacement=_s2bytes('<!DOCTYPE feed [\n <!ENTITY') + _s2bytes('>\n <!ENTITY ').join(safe_entities) + _s2bytes('>\n]>')
    data = doctype_pattern.sub(replacement, head) + data
    # safe_pattern is only referenced when replacement is non-empty, in
    # which case it is guaranteed to have been bound above.
    return version, data, dict(replacement and [(k.decode('utf-8'), v.decode('utf-8')) for k, v in safe_pattern.findall(replacement)])
+
def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, response_headers=None):
    '''Parse a feed from a URL, file, stream, or string.

    request_headers, if given, is a dict from http header name to value to add
    to the request; this overrides internally generated values.

    Returns a FeedParserDict with (at least) 'feed', 'entries' and 'bozo'
    keys; download or parse problems are reported via result['bozo'] and
    result['bozo_exception'] rather than raised.
    '''

    # Normalize the None defaults for the mutable arguments.
    if handlers is None:
        handlers = []
    if request_headers is None:
        request_headers = {}
    if response_headers is None:
        response_headers = {}

    result = FeedParserDict()
    result['feed'] = FeedParserDict()
    result['entries'] = []
    result['bozo'] = 0
    if not isinstance(handlers, list):
        handlers = [handlers]
    try:
        f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers)
        data = f.read()
    except Exception, e:
        # Any download failure is reported through bozo, not raised.
        result['bozo'] = 1
        result['bozo_exception'] = e
        data = None
        f = None

    if hasattr(f, 'headers'):
        result['headers'] = dict(f.headers)
    # overwrite existing headers using response_headers
    if 'headers' in result:
        result['headers'].update(response_headers)
    elif response_headers:
        result['headers'] = copy.deepcopy(response_headers)

    # if feed is gzip-compressed, decompress it
    if f and data and 'headers' in result:
        if gzip and 'gzip' in (result['headers'].get('content-encoding'), result['headers'].get('Content-Encoding')):
            try:
                data = gzip.GzipFile(fileobj=_StringIO(data)).read()
            except (IOError, struct.error), e:
                # IOError can occur if the gzip header is bad
                # struct.error can occur if the data is damaged
                # Some feeds claim to be gzipped but they're not, so
                # we get garbage. Ideally, we should re-request the
                # feed without the 'Accept-encoding: gzip' header,
                # but we don't.
                result['bozo'] = 1
                result['bozo_exception'] = e
                data = None
        elif zlib and 'deflate' in (result['headers'].get('content-encoding'), result['headers'].get('Content-Encoding')):
            try:
                data = zlib.decompress(data)
            except zlib.error, e:
                result['bozo'] = 1
                result['bozo_exception'] = e
                data = None

    # save HTTP headers
    if 'headers' in result:
        if 'etag' in result['headers'] or 'ETag' in result['headers']:
            etag = result['headers'].get('etag', result['headers'].get('ETag', u''))
            if not isinstance(etag, unicode):
                etag = etag.decode('utf-8', 'ignore')
            if etag:
                result['etag'] = etag
        if 'last-modified' in result['headers'] or 'Last-Modified' in result['headers']:
            modified = result['headers'].get('last-modified', result['headers'].get('Last-Modified'))
            if modified:
                result['modified'] = _parse_date(modified)
    if hasattr(f, 'url'):
        if not isinstance(f.url, unicode):
            result['href'] = f.url.decode('utf-8', 'ignore')
        else:
            result['href'] = f.url
        result['status'] = 200
    if hasattr(f, 'status'):
        result['status'] = f.status
    if hasattr(f, 'close'):
        f.close()

    if data is None:
        return result

    # there are four encodings to keep track of:
    # - http_encoding is the encoding declared in the Content-Type HTTP header
    # - xml_encoding is the encoding declared in the <?xml declaration
    # - sniffed_encoding is the encoding sniffed from the first 4 bytes of the XML data
    # - result['encoding'] is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
    http_headers = result.get('headers', {})
    result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type = \
        _getCharacterEncoding(http_headers, data)
    if http_headers and (not acceptable_content_type):
        if http_headers.has_key('content-type') or http_headers.has_key('Content-type'):
            bozo_message = '%s is not an XML media type' % http_headers.get('content-type', http_headers.get('Content-type'))
        else:
            bozo_message = 'no Content-type specified'
        result['bozo'] = 1
        result['bozo_exception'] = NonXMLContentType(bozo_message)

    if data is not None:
        result['version'], data, entities = _stripDoctype(data)

    # ensure that baseuri is an absolute uri using an acceptable URI scheme
    contentloc = http_headers.get('content-location', http_headers.get('Content-Location', u''))
    href = result.get('href', u'')
    baseuri = _makeSafeAbsoluteURI(href, contentloc) or _makeSafeAbsoluteURI(contentloc) or href

    baselang = http_headers.get('content-language', http_headers.get('Content-Language', None))
    if not isinstance(baselang, unicode) and baselang is not None:
        baselang = baselang.decode('utf-8', 'ignore')

    # if server sent 304, we're done
    if result.get('status', 0) == 304:
        result['version'] = u''
        result['debug_message'] = 'The feed has not changed since you last checked, ' + \
            'so the server sent no data. This is a feature, not a bug!'
        return result

    # if there was a problem downloading, we're done
    if data is None:
        return result

    # determine character encoding
    use_strict_parser = 0
    known_encoding = 0
    tried_encodings = []
    # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM
    for proposed_encoding in (result['encoding'], xml_encoding, sniffed_xml_encoding):
        if not proposed_encoding:
            continue
        if proposed_encoding in tried_encodings:
            continue
        tried_encodings.append(proposed_encoding)
        try:
            data = _toUTF8(data, proposed_encoding)
        except (UnicodeDecodeError, LookupError):
            pass
        else:
            known_encoding = use_strict_parser = 1
            break
    # if no luck and we have auto-detection library, try that
    if (not known_encoding) and chardet:
        proposed_encoding = chardet.detect(data)['encoding']
        if proposed_encoding and (proposed_encoding not in tried_encodings):
            tried_encodings.append(proposed_encoding)
            try:
                data = _toUTF8(data, proposed_encoding)
            except (UnicodeDecodeError, LookupError):
                pass
            else:
                known_encoding = use_strict_parser = 1
    # if still no luck and we haven't tried utf-8 yet, try that
    if (not known_encoding) and (u'utf-8' not in tried_encodings):
        proposed_encoding = u'utf-8'
        tried_encodings.append(proposed_encoding)
        try:
            data = _toUTF8(data, proposed_encoding)
        except UnicodeDecodeError:
            pass
        else:
            known_encoding = use_strict_parser = 1
    # if still no luck and we haven't tried windows-1252 yet, try that
    if (not known_encoding) and (u'windows-1252' not in tried_encodings):
        proposed_encoding = u'windows-1252'
        tried_encodings.append(proposed_encoding)
        try:
            data = _toUTF8(data, proposed_encoding)
        except UnicodeDecodeError:
            pass
        else:
            known_encoding = use_strict_parser = 1
    # if still no luck and we haven't tried iso-8859-2 yet, try that.
    if (not known_encoding) and (u'iso-8859-2' not in tried_encodings):
        proposed_encoding = u'iso-8859-2'
        tried_encodings.append(proposed_encoding)
        try:
            data = _toUTF8(data, proposed_encoding)
        except UnicodeDecodeError:
            pass
        else:
            known_encoding = use_strict_parser = 1
    # if still no luck, give up
    if not known_encoding:
        result['bozo'] = 1
        result['bozo_exception'] = CharacterEncodingUnknown( \
            'document encoding unknown, I tried ' + \
            '%s, %s, utf-8, windows-1252, and iso-8859-2 but nothing worked' % \
            (result['encoding'], xml_encoding))
        result['encoding'] = u''
    elif proposed_encoding != result['encoding']:
        # The document parsed, but under a different encoding than it
        # declared: flag it, but keep the working encoding.
        result['bozo'] = 1
        result['bozo_exception'] = CharacterEncodingOverride( \
            'document declared as %s, but parsed as %s' % \
            (result['encoding'], proposed_encoding))
        result['encoding'] = proposed_encoding

    if not _XML_AVAILABLE:
        use_strict_parser = 0
    if use_strict_parser:
        # initialize the SAX parser
        feedparser = _StrictFeedParser(baseuri, baselang, 'utf-8')
        saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)
        saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
        try:
            # disable downloading external doctype references, if possible
            saxparser.setFeature(xml.sax.handler.feature_external_ges, 0)
        except xml.sax.SAXNotSupportedException:
            pass
        saxparser.setContentHandler(feedparser)
        saxparser.setErrorHandler(feedparser)
        source = xml.sax.xmlreader.InputSource()
        source.setByteStream(_StringIO(data))
        if hasattr(saxparser, '_ns_stack'):
            # work around bug in built-in SAX parser (doesn't recognize xml: namespace)
            # PyXML doesn't have this problem, and it doesn't have _ns_stack either
            saxparser._ns_stack.append({'http://www.w3.org/XML/1998/namespace':'xml'})
        try:
            saxparser.parse(source)
        except xml.sax.SAXParseException, e:
            # Fall through to the loose parser below.
            result['bozo'] = 1
            result['bozo_exception'] = feedparser.exc or e
            use_strict_parser = 0
    if not use_strict_parser and _SGML_AVAILABLE:
        # Loose (SGML-based) fallback parser for ill-formed feeds.
        feedparser = _LooseFeedParser(baseuri, baselang, 'utf-8', entities)
        feedparser.feed(data.decode('utf-8', 'replace'))
    result['feed'] = feedparser.feeddata
    result['entries'] = feedparser.entries
    result['version'] = result['version'] or feedparser.version
    result['namespaces'] = feedparser.namespacesInUse
    return result
--- /dev/null
+#!/usr/bin/env python2.5
+
+# Copyright (c) 2011 Neal H. Walfield <neal@walfield.org>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import urllib2
+import httplib
+import time
+import logging
+logger = logging.getLogger(__name__)
+
class ProgressSocket(object):
    """
    Monitor what is being sent and received.

    Wraps a socket (or a file object produced by makefile) and forwards
    all attribute access to it, except that send/sendall and read are
    replaced by instrumented versions that update the 'sent'/'received'
    byte counters on the owning connection and on the connection's
    opener, and fire the connection's callback as data flows.
    """
    def __init__(self, socket, connection):
        # socket: the object being wrapped.
        # connection: the HTTPConnection that owns this socket; it
        # carries the stats dicts and the progress callback.
        self.socket = socket
        self.connection = connection

    def __getattribute__(self, attr):
        # logger.debug("%s.__getattribute__(%s)"
        # % (self.__class__.__name__, attr))

        def send(data):
            # 100k at a time, so the callback fires periodically during
            # large uploads.
            bs = 100 * 1024
            sent = 0
            while sent < len (data):
                remaining = len (data) - sent
                if remaining < bs:
                    amount = remaining
                else:
                    amount = bs

                self.socket.sendall(data[sent:sent+amount])
                sent += amount
                self.connection.stats['sent'] += amount
                self.connection.opener.stats['sent'] += amount

                if self.connection.callback is not None:
                    # NOTE(review): the callback is invoked with no
                    # arguments here, but the __main__ demo defines
                    # callback(connection) -- confirm which signature
                    # is intended.
                    self.connection.callback ()

        def read(*args, **kwargs):
            data = self.socket.read (*args, **kwargs)
            # print "GOT: %s" % (data[0:240],)
            self.connection.stats['received'] += len (data)
            self.connection.opener.stats['received'] += len (data)
            if self.connection.callback is not None:
                self.connection.callback ()
            return data

        # Intercept the data-carrying methods; everything else falls
        # through to this instance, then to the wrapped socket.
        if attr == 'send' or attr == 'sendall':
            return send
        if attr == 'read':
            return read

        try:
            return super (ProgressSocket, self).__getattribute__(attr)
        except AttributeError:
            socket = super (ProgressSocket, self).__getattribute__('socket')
            return socket.__getattribute__(attr)

    def makefile(self, mode, bufsize):
        # Wrap the returned file object as well so reads through it are
        # also counted.
        return ProgressSocket (socket=self.socket.makefile(mode, bufsize),
                               connection=self.connection)

    def close(self):
        return self.socket.close ()
+
def HTTPProgressConnectionBuilder(callback, opener):
    """
    Return an HTTPConnection subclass whose traffic is instrumented.

    callback: invoked by ProgressSocket whenever bytes are sent or
        received on a connection created from the returned class.
    opener: the object (e.g. an HTTPProgressHandler) whose aggregate
        'stats' dict is updated alongside the per-connection one.
    """
    class HTTPProgressConnection(httplib.HTTPConnection):
        def __init__(self, *args, **kwargs):
            # Per-connection byte counters.  (Previously 'stats' was a
            # class attribute assigned after class creation, so every
            # connection instantiated from the same builder class shared
            # a single dict; give each instance its own so the counters
            # really are per-connection.)
            self.stats = {'sent': 0, 'received': 0, 'started': time.time()}
            # Request details, filled in by putrequest for reporting.
            self.method = None
            self.url = None
            return httplib.HTTPConnection.__init__ (self, *args, **kwargs)

        def putrequest(self, method, url, *args, **kwargs):
            # Remember what is being requested so progress callbacks can
            # describe the transfer.
            self.method = method
            self.url = url
            return httplib.HTTPConnection.putrequest (
                self, method, url, *args, **kwargs)

        def connect(self):
            httplib.HTTPConnection.connect(self)
            # Wrap the socket so all traffic is counted.
            self.sock = ProgressSocket(socket=self.sock,
                                       connection=self)

    HTTPProgressConnection.callback = callback
    HTTPProgressConnection.opener = opener
    return HTTPProgressConnection
+
class HTTPProgressHandler(urllib2.HTTPHandler):
    """
    urllib2 HTTP handler that instruments connections for progress
    reporting.  Keeps aggregate byte counters across every connection it
    opens and arranges for `callback` to be fired as data is transferred.
    """
    def __init__(self, callback):
        self.callback = callback
        # Aggregate counters across all connections opened by this handler.
        self.stats = {'sent': 0, 'received': 0, 'started': time.time()}
        urllib2.HTTPHandler.__init__(self)

    def http_open(self, request):
        # Build a connection class bound to our callback and stats, and
        # let the base handler drive the request through it.
        connection_factory = HTTPProgressConnectionBuilder(self.callback, self)
        return self.do_open(connection_factory, request)
+
if __name__ == '__main__':
    # Manual smoke test: fetch a page through the progress handler and
    # print per-connection and aggregate transfer statistics.
    def callback(connection):
        # NOTE(review): ProgressSocket invokes connection.callback with
        # no arguments, so this one-argument signature looks
        # inconsistent -- confirm before relying on this demo.
        # Describe the in-flight request: "METHOD host:port/url".
        req = ""
        if connection.method:
            req += connection.method + " "
        req += connection.host + ':' + str (connection.port)
        if connection.url:
            req += connection.url

        cstats = connection.stats
        ostats = connection.opener.stats

        print(
            ("%s: connection: %d sent, %d received: %d kb/s; "
             + "opener: %d sent, %d received, %d kb/s")
            % (req,
               cstats['sent'], cstats['received'],
               ((cstats['sent'] + cstats['received'])
                / (time.time() - cstats['started']) / 1024),
               ostats['sent'], ostats['received'],
               ((ostats['sent'] + ostats['received'])
                / (time.time() - ostats['started']) / 1024)))

    opener = urllib2.build_opener(HTTPProgressHandler(callback))

    data = opener.open ('http://google.com')
    downloaded = 0
    for d in data:
        downloaded += len (d)
    print "Document is %d bytes in size" % (downloaded,)
--- /dev/null
+#!/usr/bin/env python2.5
+
+# Copyright (c) 2011 Neal H. Walfield <neal@walfield.org>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import threading
+import thread
+import traceback
+import heapq
+import sys
+import mainthread
+import logging
+logger = logging.getLogger(__name__)
+
def debug(*args):
    """
    Log ARGS (joined with spaces) at debug level.

    The very chatty job-manager tracing is compiled out by default;
    flip the guard below to re-enable it.
    """
    enabled = False
    if enabled:
        logger.debug(' '.join(args))
+
# The default priority used by _JobManager.execute().  Like nice(), a
# smaller numeric priority corresponds to a higher priority class.
default_priority = 0
+
class JobRunner(threading.Thread):
    """
    Worker thread that drains jobs from a _JobManager's priority queue.

    A runner exits as soon as the queue is empty, the manager is paused
    or quitting, or there are more runners than the manager wants.
    """

    def __init__(self, job_manager):
        threading.Thread.__init__(self)
        self.job_manager = job_manager

    def run (self):
        # Invariant: have_lock mirrors whether this thread currently
        # holds job_manager.lock, so the outer finally clause can
        # release it exactly once no matter where the loop is left.
        have_lock = True
        self.job_manager.lock.acquire ()
        try:
            while (self.job_manager.pause == 0
                   and not self.job_manager.do_quit
                   and (len (self.job_manager.threads)
                        <= self.job_manager.num_threads)):
                try:
                    # Queue entries are [[priority, serial], key, job].
                    _, key, job = heapq.heappop (self.job_manager.queue)
                except IndexError:
                    # Queue is empty: this runner is done.
                    return

                try:
                    self.job_manager.in_progress.append (key)
                    # Run the job without holding the manager's lock so
                    # producers and other runners can make progress.
                    self.job_manager.lock.release ()
                    have_lock = False

                    # Execute the job.
                    try:
                        job ()
                    except KeyboardInterrupt:
                        # This is handled below and doesn't require a
                        # traceback.
                        raise
                    except:
                        print ("Executing job %s (%s) from thread %s: %s"
                               % (str (key), str (job),
                                  threading.currentThread(),
                                  traceback.format_exc ()))

                    self.job_manager.lock.acquire ()
                    have_lock = True

                    assert key in self.job_manager.in_progress
                finally:
                    # Always drop the in-progress marker.  On the
                    # KeyboardInterrupt path the lock is not held here,
                    # so another thread may already have removed the
                    # key -- hence the ValueError guard.
                    try:
                        self.job_manager.in_progress.remove (key)
                    except ValueError:
                        pass

                debug("Finished executing job %s (%s)" % (key, job,))

                # Notify hooks that a job completed.
                self.job_manager._stats_hooks_run ({'job':job, 'key':key})
        except KeyboardInterrupt:
            # Forward the interrupt to the main thread rather than
            # letting it die silently in a worker.
            debug("%s: KeyboardInterrupt" % threading.currentThread())
            thread.interrupt_main()
            debug("%s: Forwarded KeyboardInterrupt to main thread"
                  % threading.currentThread())
        finally:
            if have_lock:
                self.job_manager.lock.release ()

            # Deregister this runner.  NOTE(review): on some exit paths
            # the lock is no longer held while threads is mutated --
            # presumably benign on CPython; confirm.
            assert self in self.job_manager.threads
            self.job_manager.threads.remove (self)

            debug ("Job runner %s (%d left) exiting."
                   % (threading.currentThread(),
                      len (self.job_manager.threads)))
+
# The singleton _JobManager instance, created lazily.
_jm = None

def JobManager(start=False):
    """
    Return the job manager instance.  The job manager will not start
    executing jobs until this is called with start set to True.  Note:
    you can still queue jobs before that.
    """
    global _jm
    if _jm is None:
        _jm = _JobManager()

    if not start or _jm.started:
        return _jm

    # First caller requesting a start: mark the manager started, let
    # the stats hooks know about any already-queued jobs, then spin up
    # runner threads.
    _jm.started = True
    if _jm.jobs > 0:
        _jm._stats_hooks_run()
    _jm.tickle()
    return _jm
+
class _JobManager(object):
    """
    Priority job queue executed by a pool of JobRunner threads.

    Use the JobManager() accessor to obtain the singleton instance.
    """

    def __init__(self, started=False, num_threads=4):
        """
        Initialize the job manager.

        If started is false, jobs may be queued, but jobs will not be
        started until start() is called.
        """
        # A reentrant lock so that a job runner can call stats without
        # dropping the lock.
        self.lock = threading.RLock()

        # If we can start executing jobs.
        self.started = started

        # The maximum number of threads to use for executing jobs.
        self._num_threads = num_threads

        # List of jobs ([priority, serial], key, job) queued for
        # execution, kept in heap order by heapq.
        self.queue = []
        # List of keys of the jobs that are being executed.
        self.in_progress = []
        # List of running JobRunner threads.
        self.threads = []

        # If 0, jobs may execute, otherwise, job execution is paused.
        # NOTE(review): this integer shadows the pause() method below,
        # making that method uncallable; see the note there.
        self.pause = 0

        # The total number of jobs that this manager ever accepted.
        self.jobs = 0

        # A list of status hooks to execute when the stats change.
        self._stats_hooks = []
        self._current_stats = self.stats()

        self.do_quit = False

    def _lock(f):
        # Decorator: execute F with self.lock held.
        def wrapper(*args, **kwargs):
            self = args[0]
            self.lock.acquire()
            try:
                return f(*args, **kwargs)
            finally:
                self.lock.release()
        return wrapper

    def get_num_threads(self):
        return self._num_threads

    def set_num_threads(self, value):
        # Growing the pool may allow more queued jobs to run: tickle.
        self._num_threads = value
        self.tickle()

    # Maximum number of concurrent worker threads.
    num_threads = property(get_num_threads, set_num_threads)

    @_lock
    def start(self):
        """
        Start executing jobs.
        """
        if self.started:
            return
        # BUGFIX: the original never set self.started here, so
        # execute() kept treating the manager as stopped and newly
        # queued jobs were never tickled into execution.
        self.started = True
        if self.jobs > 0:
            self._stats_hooks_run()
        self.tickle()

    @_lock
    def tickle(self):
        """
        Ensure that there are enough job runners for the number of
        pending jobs.
        """
        if self.do_quit:
            debug("%s.quit called, not creating new threads."
                  % self.__class__.__name__)
            return

        if self.pause > 0:
            # Job execution is paused.  Don't start any new threads.
            debug("%s.tickle(): Not doing anything: paused"
                  % (self.__class__.__name__))
            return

        debug("%s.tickle: Have %d threads (can start %d); %d jobs queued"
              % (self.__class__.__name__,
                 len(self.threads), self.num_threads, len(self.queue)))
        if len(self.threads) < self.num_threads:
            # One runner per queued job, up to the thread limit.
            # (Local renamed from `thread`, which shadowed the module.)
            for _ in range(min(len(self.queue),
                               self.num_threads - len(self.threads))):
                runner = JobRunner(self)
                # Setting threads as daemons means faster shutdown
                # when the main thread exits, but it results in
                # exceptions and occasional segfaults.
                # runner.setDaemon(True)
                self.threads.append(runner)
                runner.start()
            debug("Now have %d threads" % len(self.threads))

    @_lock
    def execute(self, job, key=None, priority=default_priority):
        """
        Enqueue a job for execution. job is a function to execute.
        If key is not None, the job is only enqueued if there is no
        job that is in progress or enqueued with the same key.
        priority is the job's priority. Like nice(), a smaller
        numeric priority corresponds to a higher priority class. Jobs
        are executed highest priority first, in the order that they
        were added.
        """
        if self.do_quit:
            debug("%s.quit called, not enqueuing new jobs."
                  % self.__class__.__name__)
            # BUGFIX: the original fell through and enqueued anyway,
            # contradicting the message above.
            return

        if key is not None:
            if key in self.in_progress:
                return
            for item in self.queue:
                if item[1] == key:
                    # BUGFIX: the comparison was inverted (it demoted
                    # the job on a raise request), and heapify() works
                    # in place and returns None, so assigning its
                    # result destroyed the queue.
                    if priority < item[0][0]:
                        # Priority raised.
                        item[0][0] = priority
                        heapq.heapify(self.queue)
                    return

        # To ensure that jobs with the same priority are executed
        # in the order they are added, we set the priority to
        # [priority, next(monotonic counter)].
        self.jobs += 1
        heapq.heappush(self.queue, [[priority, self.jobs], key, job])

        if self.started:
            self._stats_hooks_run()
            self.tickle()
        else:
            debug("%s not initialized. delaying execution of %s (%s)"
                  % (self.__class__.__name__, key, str(job),))

    # NOTE(review): unreachable as a method -- __init__ rebinds
    # self.pause to the integer pause count, so inst.pause() raises
    # TypeError.  Renaming the counter would break JobRunner, which
    # reads job_manager.pause directly, so this is flagged rather than
    # changed here.
    @_lock
    def pause(self):
        """
        Increment the pause count. When the pause count is greater
        than 0, job execution is suspended.
        """
        self.pause += 1

        if self.pause == 1:
            self._stats_hooks_run()

    @_lock
    def resume(self):
        """
        Decrement the pause count. If the pause count is greater than
        0 and this decrement brings it to 0, enqueued jobs are
        resumed.
        """
        assert self.pause > 0
        self.pause -= 1
        if not self.paused():
            self._stats_hooks_run()
            self.tickle()

    @_lock
    def paused(self):
        """
        Returns whether job execution is paused.
        """
        return self.pause > 0

    @_lock
    def cancel(self):
        """
        Cancel any pending jobs.
        """
        self.queue = []
        self._stats_hooks_run()

    def quit(self):
        """Cancel pending jobs and refuse any new ones."""
        self.cancel()
        self.do_quit = True

    @_lock
    def stats(self):
        """
        Return a dictionary consisting of:

          - 'paused': whether execution is paused
          - 'jobs': the total number of jobs this manager has
            executed, is executing or are queued
          - 'jobs-completed': the number of jobs that have completed
          - 'jobs-in-progress': the number of jobs in progress
          - 'jobs-queued': the number of jobs currently queued
        """
        return {'paused': self.paused(),
                'jobs': self.jobs,
                'jobs-completed':
                    self.jobs - len(self.in_progress) - len(self.queue),
                'jobs-in-progress': len(self.in_progress),
                'jobs-queued': len(self.queue)
                }

    def stats_hook_register(self, func, *args, **kwargs):
        """
        Registers a function to be called when the job status changes.
        Passed the following parameters:

          - the JobManager instance.
          - the previous stats (as returned by stats)
          - the current stats
          - the job that was completed (or None)

        Pass run_in_main_thread=True to have the hook dispatched via
        mainthread.execute.  Note: otherwise the hook may not be run
        in the main thread!
        """
        # (Local renamed from `mainthread`, which shadowed the
        # imported mainthread module.)
        run_in_main_thread = kwargs.pop('run_in_main_thread', False)
        self._stats_hooks.append([func, run_in_main_thread, args, kwargs])

    def _stats_hooks_run(self, completed_job=None):
        """
        Recompute the stats and run the registered stats hooks.
        """
        self.lock.acquire()
        try:
            old_stats = self._current_stats
            self._current_stats = self.stats()
            current_stats = self._current_stats
        finally:
            self.lock.release()

        debug("%s -> %s" % (str(old_stats), str(current_stats)))

        for (f, run_in_main_thread, args, kwargs) in self._stats_hooks:
            if run_in_main_thread:
                debug("JobManager._stats_hooks_run: Running %s in main thread"
                      % f)
                # Pass the async flag through the kwargs dict ('async'
                # became a reserved word in later Pythons).
                hook_kwargs = dict(kwargs)
                hook_kwargs['async'] = True
                mainthread.execute(
                    f, self, old_stats, current_stats, completed_job,
                    *args, **hook_kwargs)
            else:
                debug("JobManager._stats_hooks_run: Running %s in any thread"
                      % f)
                f(self, old_stats, current_stats, completed_job,
                  *args, **kwargs)
--- /dev/null
+#!/usr/bin/env python2.5
+
+# Copyright (c) 2011 Neal H. Walfield <neal@walfield.org>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import threading
+import traceback
+import logging
+logger = logging.getLogger(__name__)
+
# Dispatcher used to run a callable on the main thread, set by init().
_run_in_main_thread = None
# The thread that called init(); treated as "the" main thread.
_main_thread = None

def init(run_in_main_thread=None):
    """
    run_in_main_thread is a function that takes a single argument, a
    callable and returns False. run_in_main_thread should run the
    function in the main thread.

    If you are using glib, gobject.idle_add (the default) is
    sufficient. (gobject.idle_add is thread-safe.)
    """
    global _run_in_main_thread, _main_thread

    if run_in_main_thread is None:
        # Default to glib's idle dispatcher.
        import gobject
        run_in_main_thread = gobject.idle_add

    # init() must only ever be called once.
    assert _run_in_main_thread is None
    _run_in_main_thread = run_in_main_thread

    # Whichever thread calls init() is considered the main thread.
    _main_thread = threading.currentThread()
+
def execute(func, *args, **kwargs):
    """
    Execute FUNC in the main thread.

    If kwargs['async'] exists and is True, the function is executed
    asynchronously (i.e., the thread does not wait for the function to
    return in which case the function's return value is discarded).
    Otherwise, this function waits until the function is executed and
    returns its return value.
    """
    # The flag lives under a plain local so 'async' is only ever a
    # dictionary key, never an identifier.
    run_async = kwargs.pop('async', False)

    if threading.currentThread() == _main_thread:
        # Already on the main thread: call FUNC directly.
        if run_async:
            try:
                func(*args, **kwargs)
            except:
                logger.debug("mainthread.execute: Executing %s: %s"
                             % (func, traceback.format_exc()))
            return
        return func(*args, **kwargs)

    assert _run_in_main_thread is not None, \
        "You can't call this function from a non-main thread until you've called init()"

    if not run_async:
        cond = threading.Condition()

    result = {'done': False}

    def make_thunk():
        def thunk():
            # Runs on the main thread via _run_in_main_thread.
            assert threading.currentThread() == _main_thread

            try:
                result['result'] = func(*args, **kwargs)
            except:
                logger.debug("mainthread.execute: Executing %s: %s"
                             % (func, traceback.format_exc()))

            if not run_async:
                cond.acquire()
            result['done'] = True
            if not run_async:
                cond.notify()
                cond.release()

            # Tell idle_add-style schedulers not to call again.
            return False
        return thunk

    if not run_async:
        cond.acquire()
    _run_in_main_thread(make_thunk())

    if run_async:
        # Don't wait for the method to complete execution.
        return

    # Wait for the result to become available.
    while not result['done']:
        cond.wait()

    return result.get('result', None)
+
if __name__ == "__main__":
    # Manual self-test: dispatch to the main thread from the main
    # thread itself and from a worker, in sync and async modes.
    import sys
    import gobject

    init()

    def in_main_thread(test_num):
        """Assert we are on the main thread and echo TEST_NUM back."""
        assert threading.currentThread() == _main_thread, \
            "Test %d failed" % (test_num,)
        return test_num

    mainloop = gobject.MainLoop()
    gobject.threads_init()

    # From the main thread.  (async passed via a kwargs dict so the
    # reserved word never appears as a keyword.)
    assert execute(in_main_thread, 1) == 1
    assert execute(in_main_thread, 2, **{'async': False}) == 2
    execute(in_main_thread, 3, **{'async': True})

    class T(threading.Thread):
        def __init__(self):
            threading.Thread.__init__(self)

        def run(self):
            # The same dispatches, but from a worker thread.
            assert threading.currentThread() != _main_thread

            assert execute(in_main_thread, 4) == 4
            assert execute(in_main_thread, 5, **{'async': False}) == 5
            execute(in_main_thread, 6, **{'async': True})
            execute(mainloop.quit, **{'async': False})

    def start_thread():
        T().start()
        return False

    gobject.idle_add(start_thread)
    mainloop.run()
+
def mainthread(f):
    """
    Decorator: make every call to F run on the main thread.

    Delegates to execute(), which handles both the already-on-main and
    the cross-thread cases.
    """
    def wrapper(*a, **kw):
        return execute(f, *a, **kw)
    return wrapper
--- /dev/null
+#!/usr/bin/env python2.5
+
+#
+# Copyright (c) 2007-2008 INdT.
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+# ============================================================================
+# Name : FeedingIt.py
+# Author : Yves Marcoz
+# Version : 0.2.2
+# Description : Simple RSS Reader
+# ============================================================================
+
+from xml.dom.minidom import parse, parseString
+import urllib2
+import gtk
+import hildon
+import gobject
+import time
+from os.path import isfile, dirname
+import gobject
+import logging
+logger = logging.getLogger(__name__)
+
class ExportOpmlData():
    """
    Prompt for a destination file and export the subscribed feeds
    (except "Archived Articles") as an OPML document.
    """

    def __init__(self, parent, listing):
        fs = hildon.FileSystemModel()
        dialog = hildon.FileChooserDialog(parent, gtk.FILE_CHOOSER_ACTION_SAVE, fs)
        #(gtk.STOCK_CANCEL, gtk.RESPONSE_CANCEL,
        #gtk.STOCK_SAVE, gtk.RESPONSE_OK))
        #)
        #dialog = gobject.new(hildon.FileChooserDialog, \
        #    action=gtk.FILE_CHOOSER_ACTION_SAVE)
        #dialog.set_default_response(gtk.RESPONSE_OK)
        #dialog.set_property('autonaming',False)
        #dialog.set_property('show-files',True)
        dialog.set_current_folder('/home/user/MyDocs/')
        dialog.set_current_name('feedingit-export')
        dialog.set_extension('opml')
        response = dialog.run()
        dialog.hide()
        if response == gtk.RESPONSE_OK:
            filename = dialog.get_filename()
            logger.debug("ExportOpmlData: %s" % filename)
            #try:

            # Refuse to clobber an existing file without confirmation.
            cont = True
            if isfile(filename):
                note = "File already exists. Aborted"
                confirm = hildon.Note ("confirmation", parent, "File already exists. Are you sure you want to overwrite it?", gtk.STOCK_DIALOG_WARNING )
                confirm.set_button_texts ("Yes", "Cancel")
                response = confirm.run()
                confirm.destroy()
                if response == gtk.RESPONSE_OK:
                    cont = True
                else:
                    note = "Operation cancelled."
                    cont = False
            if cont:
                file = open(filename, "w")
                file.write(self.getOpmlText(listing))
                file.close()
                note = "Feeds exported to %s" %filename
            #except:
            # NOTE(review): leftover from the commented-out try/except
            # above -- this unconditionally overwrites `note`, and
            # `note` itself is dead because the result dialog below is
            # commented out.
            note = "Failed to export feeds"

            #dialog.destroy()
            #dialog = hildon.Note ("information", parent, note , gtk.STOCK_DIALOG_INFO )
            #dialog.run()
            #dialog.destroy()
        elif response == gtk.RESPONSE_CANCEL:
            dialog.destroy()

    def getOpmlText(self, listing):
        """Return an OPML document (string) listing LISTING's feeds."""
        # NOTE(review): time_now is computed but never embedded in the
        # generated document.
        time_now = time.strftime('%a, %d %b %Y %H:%M:%S GMT', time.gmtime())
        opml_text = """<?xml version="1.0" encoding="UTF-8"?>
<opml version="1.0">
<head>
    <title>Feeding It Export</title>
</head>
<body>
"""
        for key in listing.getListOfFeeds():
            title = listing.getFeedTitle(key)
            url = listing.getFeedUrl(key)
            # The "Archived Articles" pseudo-feed is never exported.
            if not title == "Archived Articles":
                opml_text += """\n\t\t<outline type="rss" text="%s" title="%s" xmlUrl="%s"/>""" % (self.sanitize(title), self.sanitize(title), self.sanitize(url))
        opml_text += """\n</body>\n</opml>\n"""
        return opml_text

    def sanitize(self, text):
        """XML-escape TEXT and encode non-ASCII as character references."""
        from cgi import escape
        return escape(text).encode('ascii', 'xmlcharrefreplace')
+
+
+
class GetOpmlData():
    """
    Ask the user for an OPML document (local file or URL) and let them
    select which of the feeds it lists should be imported.
    """

    def __init__(self, parent):
        self.parent = parent
        # A two-button confirmation note doubles as the source chooser:
        # the affirmative button means "File", the negative one "URL".
        dialog = hildon.Note ("confirmation", parent, "What type of OPML?", gtk.STOCK_DIALOG_WARNING )
        dialog.set_button_texts ("File", "URL")
        response = dialog.run()
        dialog.destroy()

        if response == gtk.RESPONSE_OK:
            # Choose a file
            self.data = self.askForFile()
        else:
            # Download a URL
            self.data = self.downloadFile()

    def getData(self):
        """Return the (title, url) pairs the user selected (maybe [])."""
        if not self.data == None:
            dialog = OpmlDialog(self.parent, self.data)
            response = dialog.run()
            if response == gtk.RESPONSE_ACCEPT:
                items = dialog.getItems()
            else:
                items = []
            dialog.destroy()
            return items
        return []

    def downloadFile(self):
        """Prompt for a URL and return the downloaded OPML text, or None."""
        dlg = gtk.Dialog("Import OPML from web", self.parent, gtk.DIALOG_DESTROY_WITH_PARENT,
                         ('Import', gtk.RESPONSE_OK,
                          gtk.STOCK_CANCEL, gtk.RESPONSE_CANCEL))
        hb = gtk.HBox(False, 5)
        hb.pack_start(gtk.Label('URL:'), expand=False)
        entry = hildon.Entry(0)
        entry.set_text("http://")
        entry.select_region(-1, -1)
        hb.pack_start(entry, expand=True)
        hb.show_all()
        dlg.vbox.pack_start(hb, False)

        resp = dlg.run()
        url = entry.get_text()
        dlg.destroy()
        if resp == gtk.RESPONSE_CANCEL:
            return None
        try:
            f = urllib2.urlopen(url)
            data = f.read()
            f.close()
        except:
            #Show error note
            # NOTE(review): every download error is silently mapped to
            # None; no error note is actually displayed yet.
            return None
        return data

    def askForFile(self):
        """Prompt for a local OPML file and return its contents, or None."""
        #dialog = hildon.FileChooserDialog(self.parent,
        #    gtk.FILE_CHOOSER_ACTION_OPEN)
        #dialog = gobject.new(hildon.FileChooserDialog, \
        #    action=gtk.FILE_CHOOSER_ACTION_OPEN)
        #dialog.set_default_response(gtk.RESPONSE_OK)
        fs = hildon.FileSystemModel()
        dialog = hildon.FileChooserDialog(self.parent, gtk.FILE_CHOOSER_ACTION_OPEN, fs)

        # "All files" first, then an OPML-specific filter.
        # (The name `filter` shadows the builtin; kept as-is.)
        filter = gtk.FileFilter()
        filter.set_name("All files")
        filter.add_pattern("*")
        dialog.add_filter(filter)

        filter = gtk.FileFilter()
        filter.set_name("OPML")
        filter.add_pattern("*.xml")
        filter.add_pattern("*.opml")
        dialog.add_filter(filter)

        response = dialog.run()
        if response == gtk.RESPONSE_OK:
            file = open(dialog.get_filename())
            data = file.read()
            file.close()
            dialog.destroy()
            return data
        elif response == gtk.RESPONSE_CANCEL:
            dialog.destroy()
            return None
+
+
class OpmlDialog(gtk.Dialog):
    """
    Dialog listing the feeds found in an OPML document, with
    multi-select so the user can choose which ones to import.
    """

    def parse(self, opmlData):
        # Collect (title, url) pairs from every <outline> element,
        # preferring xmlUrl and falling back to htmlUrl.
        self.feeds = []
        dom1 = parseString(opmlData)

        outlines = dom1.getElementsByTagName('outline')
        for outline in outlines:
            title = outline.getAttribute('text')
            url = outline.getAttribute('xmlUrl')
            if url == "":
                url = outline.getAttribute('htmlUrl')
            if not url == "":
                self.feeds.append( (title, url) )

    def getFeedLinks(self):
        # All feeds parsed from the document (not just the selection).
        return self.feeds

    def __init__(self, parent, opmlData):
        self.parse(opmlData)
        gtk.Dialog.__init__(self, "Select OPML Feeds", parent, gtk.DIALOG_DESTROY_WITH_PARENT, (gtk.STOCK_OK, gtk.RESPONSE_ACCEPT))

        self.pannableArea = hildon.PannableArea()
        self.treestore = gtk.ListStore(gobject.TYPE_STRING, gobject.TYPE_STRING)
        self.treeview = gtk.TreeView(self.treestore)

        self.displayFeeds()

        self.set_default_size(-1, 600)
        self.vbox.pack_start(self.pannableArea)

        # Bulk-selection convenience buttons in the action area.
        button = hildon.GtkButton(gtk.HILDON_SIZE_AUTO)
        button.set_label("Select All")
        button.connect("clicked", self.button_select_all_clicked)
        self.action_area.pack_end(button)

        button = hildon.GtkButton(gtk.HILDON_SIZE_AUTO)
        button.set_label("Unselect All")
        button.connect("clicked", self.button_select_none_clicked)
        self.action_area.pack_end(button)

        self.show_all()

    def button_select_all_clicked(self, button):
        self.treeview.get_selection().select_all()

    def button_select_none_clicked(self, button):
        self.treeview.get_selection().unselect_all()

    def displayFeeds(self):
        # (Re)build the tree view in multi-select edit mode; everything
        # starts out selected.
        self.treeview.destroy()
        self.treestore = gtk.ListStore(gobject.TYPE_STRING, gobject.TYPE_STRING)
        self.treeview = gtk.TreeView()

        self.treeview.get_selection().set_mode(gtk.SELECTION_MULTIPLE)
        hildon.hildon_gtk_tree_view_set_ui_mode(self.treeview, gtk.HILDON_UI_MODE_EDIT)
        self.refreshList()
        self.treeview.append_column(gtk.TreeViewColumn('Feed Name', gtk.CellRendererText(), text = 0))

        self.pannableArea.add(self.treeview)
        self.pannableArea.show_all()
        self.treeview.get_selection().select_all()

    def refreshList(self, selected=None, offset=0):
        # NOTE(review): rect and y are computed but never used --
        # presumably intended to restore the scroll position; confirm.
        rect = self.treeview.get_visible_rect()
        y = rect.y+rect.height
        self.treestore = gtk.ListStore(gobject.TYPE_STRING, gobject.TYPE_STRING)
        self.treeview.set_model(self.treestore)
        for (title, url) in self.feeds:
            item = self.treestore.append([title, url])
            self.treeview.get_selection().select_iter(item)
        #self.treeview.get_selection().select_all()
        self.pannableArea.show_all()

    def getItems(self):
        # Return the selected rows as (title, url) pairs.
        # (The name `list` shadows the builtin; kept as-is.)
        list = []
        treeselection = self.treeview.get_selection()
        (model, pathlist) = treeselection.get_selected_rows()
        for path in pathlist:
            list.append( (model.get_value(model.get_iter(path),0), model.get_value(model.get_iter(path),1)) )
        return list
+
def showOpmlData(widget, parent, button):
    """Signal handler: run the OPML import flow and log the result."""
    importer = GetOpmlData(parent)
    logger.debug("showOpmlData: %s" % importer.getData())
+
if __name__ == "__main__":
    # Minimal manual test: a single-button window that launches the
    # OPML import flow.
    win = hildon.Window()
    win.set_title("Test App")

    btn = gtk.Button("Click to confirm.")
    win.add(btn)
    btn.connect("clicked", showOpmlData, win, btn)
    win.connect("destroy", gtk.main_quit)
    win.show_all()

    gtk.main()
    win.destroy()
--- /dev/null
+#!/usr/bin/env python2.5
+
+#
+# Copyright (c) 2007-2008 INdT.
+# Copyright (c) 2011 Neal H. Walfield
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+# ============================================================================
+# Name : FeedingIt.py
+# Author : Yves Marcoz
+# Version : 0.5.4
+# Description : Simple RSS Reader
+# ============================================================================
+
+from __future__ import with_statement
+
import sqlite3
from os.path import isfile, isdir
from shutil import rmtree
from os import mkdir, remove, utime
import os
import md5
import hashlib
import feedparser
import time
import urllib2
from BeautifulSoup import BeautifulSoup
from urlparse import urljoin
from calendar import timegm
import threading
import traceback
from wc import wc, wc_init, woodchuck
import subprocess
import dbus
from updatedbus import update_server_object
+
+from jobmanager import JobManager
+import mainthread
+from httpprogresshandler import HTTPProgressHandler
+import random
+import sys
+import logging
+logger = logging.getLogger(__name__)
+
def getId(string):
    """
    Return a stable identifier for STRING: its hex MD5 digest.

    Used to derive cache filenames from URLs.  hashlib replaces the
    long-deprecated md5 module (removed in newer Pythons) and produces
    identical output.
    """
    return hashlib.md5(string).hexdigest()
+
def download_callback(connection):
    """Abort the transfer in progress once the job manager is quitting."""
    manager = JobManager()
    if manager.do_quit:
        raise KeyboardInterrupt
+
def downloader(progress_handler=None, proxy=None):
    """
    Build a urllib2 opener for feed downloads.

    Uses PROGRESS_HANDLER if given, otherwise a fresh
    HTTPProgressHandler wired to download_callback; PROXY, when set,
    is added as an additional handler.
    """
    if progress_handler is None:
        progress_handler = HTTPProgressHandler(download_callback)

    handlers = [progress_handler]
    if proxy:
        handlers.append(proxy)

    return urllib2.build_opener(*handlers)
+
def transfer_stats(sent, received, **kwargs):
    """
    Snapshot the transfer counters and return a continuation.

    SENT and RECEIVED are the byte counts so far.  The returned
    function takes the same two arguments and yields a tuple of (bytes
    sent since the snapshot, bytes received since the snapshot, seconds
    elapsed since the snapshot).
    """
    origin_sent = sent
    origin_received = received
    origin_time = time.time()

    def since_start(sent, received, **kwargs):
        elapsed = time.time() - origin_time
        return (sent - origin_sent, received - origin_received, elapsed)

    return since_start
+
# If not None, a subprocess.Popen object corresponding to a
# update_feeds.py process.
update_feed_process = None

# Cached dbus proxy for the update daemon; reset to None to force a
# reconnect after an error or a daemon restart.
update_feeds_iface = None

# presumably the job count observed at startup -- used in a part of
# this module not shown here; TODO confirm.
jobs_at_start = 0
+
class BaseObject(object):
    """
    Mixin providing a small time-limited cache in front of self.db.

    Subclasses must provide a `db` attribute (anything with an
    execute() method returning rows) and may list the columns worth
    caching in `cached_columns`.
    """

    # Columns to cache. Classes that inherit from this and use the
    # cache mechanism should set this to a list of tuples, each of
    # which contains two entries: the table and the column. Note that
    # both are case sensitive.
    cached_columns = ()

    def cache_invalidate(self, table=None):
        """
        Invalidate the cache.

        If table is not None, invalidate only the specified table.
        Otherwise, drop the whole cache.
        """
        if not hasattr(self, 'cache'):
            return

        if table is None:
            del self.cache
        else:
            if table in self.cache:
                del self.cache[table]

    def lookup(self, table, column, id=None):
        """
        Look up a column or value.

        If id is None, return the column's values (unsorted);
        otherwise return the value for the row with that id.  Columns
        listed in cached_columns are served from a cache that expires
        after 60 seconds.
        """
        if not hasattr(self, 'cache'):
            self.cache = {}

        # Cache data for at most 60 seconds.
        now = time.time()
        try:
            cache = self.cache[table]

            if time.time() - cache[None] > 60:
                # Cache too old: clear it and rebuild below.
                del self.cache[table]
                cache = None
        except KeyError:
            cache = None

        if (cache is None
            or (table, column) not in self.cached_columns):
            # The cache is empty or the caller wants a column that we
            # don't cache.
            if (table, column) in self.cached_columns:
                # Rebuild the cache: fetch every cached column for
                # this table in one query; 'id' is the row key.
                do_cache = True

                self.cache[table] = cache = {}
                columns = []
                for t, c in self.cached_columns:
                    if table == t:
                        cache[c] = {}
                        columns.append(c)

                columns.append('id')
                where = ""
            else:
                do_cache = False

                # BUGFIX: was `(colums,)` -- a NameError that broke
                # every lookup of a non-cached column.
                columns = (column,)
                if id is not None:
                    # NOTE(review): id is interpolated directly into
                    # the SQL; safe only while ids are internally
                    # generated -- confirm before exposing.
                    where = "where id = '%s'" % id
                else:
                    where = ""

            results = self.db.execute(
                "SELECT %s FROM %s %s" % (','.join(columns), table, where))

            if do_cache:
                for r in results:
                    values = list(r)
                    i = values.pop()
                    for index, value in enumerate(values):
                        cache[columns[index]][i] = value

                cache[None] = now
                # Fall through to the cached read below.
            else:
                # BUGFIX: the original clobbered `results` with [] and
                # then iterated the empty list (reading an undefined
                # `values`), so uncached lookups never returned data.
                rows = []
                for r in results:
                    if id is not None:
                        return r[0]

                    rows.append(r[0])

                return rows

        cache = self.cache[table]

        try:
            if id is not None:
                value = cache[column][id]
                return value
            else:
                return cache[column].values()
        except KeyError:
            # Row not present in the cached table.
            return None
+
+class Feed(BaseObject):
+ # Columns to cache.
+ cached_columns = (('feed', 'read'),
+ ('feed', 'title'))
+
+ serial_execution_lock = threading.Lock()
+
+ def _getdb(self):
+ try:
+ db = self.tls.db
+ except AttributeError:
+ db = sqlite3.connect("%s/%s.db" % (self.dir, self.key), timeout=120)
+ self.tls.db = db
+ return db
+ db = property(_getdb)
+
    def __init__(self, configdir, key):
        # key identifies this feed; all of its state lives under
        # <configdir>/<key>.d/ (the same path format is used by
        # _getdb, so the strings must stay in sync).
        self.key = key
        self.configdir = configdir
        self.dir = "%s/%s.d" %(self.configdir, self.key)
        # Per-thread storage for sqlite connections (see _getdb).
        self.tls = threading.local ()

        if not isdir(self.dir):
            mkdir(self.dir)
        if not isfile("%s/%s.db" %(self.dir, self.key)):
            # First use of this feed: create the article and image
            # tables.
            self.db.execute("CREATE TABLE feed (id text, title text, contentLink text, date float, updated float, link text, read int);")
            self.db.execute("CREATE TABLE images (id text, imagePath text);")
            self.db.commit()
+
+ def addImage(self, configdir, key, baseurl, url, proxy=None, opener=None):
+ filename = configdir+key+".d/"+getId(url)
+ if not isfile(filename):
+ try:
+ if not opener:
+ opener = downloader(proxy=proxy)
+
+ abs_url = urljoin(baseurl,url)
+ f = opener.open(abs_url)
+ try:
+ with open(filename, "w") as outf:
+ for data in f:
+ outf.write(data)
+ finally:
+ f.close()
+ except (urllib2.HTTPError, urllib2.URLError, IOError), exception:
+ logger.info("Could not download image %s: %s"
+ % (abs_url, str (exception)))
+ return None
+ except:
+ exception = sys.exc_info()[0]
+
+ logger.info("Downloading image %s: %s" %
+ (abs_url, traceback.format_exc()))
+ try:
+ remove(filename)
+ except OSError:
+ pass
+
+ return None
+ else:
+ #open(filename,"a").close() # "Touch" the file
+ file = open(filename,"a")
+ utime(filename, None)
+ file.close()
+ return filename
+
+ def updateFeed(self, configdir, url, etag, modified, expiryTime=24, proxy=None, imageCache=False, priority=0, postFeedUpdateFunc=None, *postFeedUpdateFuncArgs):
+ if (os.path.basename(sys.argv[0]) == 'update_feeds.py'):
+ def doit():
+ def it():
+ self._updateFeed(configdir, url, etag, modified, expiryTime, proxy, imageCache, postFeedUpdateFunc, *postFeedUpdateFuncArgs)
+ return it
+ JobManager().execute(doit(), self.key, priority=priority)
+ else:
+ def send_update_request():
+ global update_feeds_iface
+ if update_feeds_iface is None:
+ bus=dbus.SessionBus()
+ remote_object = bus.get_object(
+ "org.marcoz.feedingit", # Connection name
+ "/org/marcoz/feedingit/update" # Object's path
+ )
+ update_feeds_iface = dbus.Interface(
+ remote_object, 'org.marcoz.feedingit')
+
+ try:
+ update_feeds_iface.Update(self.key)
+ except Exception, e:
+ logger.error("Invoking org.marcoz.feedingit.Update: %s"
+ % str(e))
+ update_feeds_iface = None
+ else:
+ return True
+
+ if send_update_request():
+ # Success! It seems we were able to start the update
+ # daemon via dbus (or, it was already running).
+ return
+
+ global update_feed_process
+ if (update_feed_process is None
+ or update_feed_process.poll() is not None):
+ # The update_feeds process is not running. Start it.
+ update_feeds = os.path.join(os.path.dirname(__file__),
+ 'update_feeds.py')
+ argv = ['/usr/bin/env', 'python', update_feeds, '--daemon' ]
+ logger.debug("Starting update_feeds: running %s"
+ % (str(argv),))
+ update_feed_process = subprocess.Popen(argv)
+ # Make sure the dbus calls go to the right process:
+ # rebind.
+ update_feeds_iface = None
+
+ for _ in xrange(5):
+ if send_update_request():
+ break
+ time.sleep(1)
+
    def _updateFeed(self, configdir, url, etag, modified, expiryTime=24, proxy=None, imageCache=False, postFeedUpdateFunc=None, *postFeedUpdateFuncArgs):
        """Download and process feed *url*, storing new and updated articles.

        Runs on a worker thread.  *etag* and *modified* are the HTTP cache
        validators saved from the previous fetch.  *expiryTime* is in
        hours; *imageCache* enables downloading of images referenced by
        articles.  *postFeedUpdateFunc*, if given, is always invoked --
        on success and on failure (guaranteed by the try/finally) -- with
        (key, updateTime, etag, modified, title, *postFeedUpdateFuncArgs).
        """
        logger.debug("Updating %s" % url)

        success = False
        have_serial_execution_lock = False
        try:
            update_start = time.time ()

            progress_handler = HTTPProgressHandler(download_callback)

            openers = [progress_handler]
            if proxy:
                openers.append (proxy)
            kwargs = {'handlers':openers}

            feed_transfer_stats = transfer_stats(0, 0)

            tmp=feedparser.parse(url, etag=etag, modified=modified, **kwargs)
            download_duration = time.time () - update_start

            opener = downloader(progress_handler, proxy)

            if JobManager().do_quit:
                raise KeyboardInterrupt

            process_start = time.time()

            # Expiry time is in hours
            expiry = float(expiryTime) * 3600.

            currentTime = 0

            updated_objects = 0
            new_objects = 0

            def wc_success():
                # Report a successful stream update to Woodchuck.
                try:
                    wc().stream_register (self.key, "", 6 * 60 * 60)
                except woodchuck.ObjectExistsError:
                    pass
                try:
                    wc()[self.key].updated (
                        indicator=(woodchuck.Indicator.ApplicationVisual
                                   |woodchuck.Indicator.StreamWide),
                        transferred_down=progress_handler.stats['received'],
                        transferred_up=progress_handler.stats['sent'],
                        transfer_time=update_start,
                        transfer_duration=download_duration,
                        new_objects=new_objects,
                        updated_objects=updated_objects,
                        objects_inline=new_objects + updated_objects)
                except KeyError:
                    logger.warn(
                        "Failed to register update of %s with woodchuck!"
                        % (self.key))

            http_status = tmp.get ('status', 200)

            # Check if the parse was successful.  If the http status code
            # is 304, then the download was successful, but there is
            # nothing new.  Indeed, no content is returned.  This makes a
            # 304 look like an error because there are no entries and the
            # parse fails.  But really, everything went great!  Check for
            # this first.
            if http_status == 304:
                logger.debug("%s: No changes to feed." % (self.key,))
                mainthread.execute(wc_success, async=True)
                success = True
            elif len(tmp["entries"])==0 and not tmp.version:
                # An error occurred fetching or parsing the feed.  (Version
                # will be either None if e.g. the connection timed out or
                # '' if the data is not a proper feed)
                logger.error(
                    "Error fetching %s: version is: %s: error: %s"
                    % (url, str (tmp.version),
                       str (tmp.get ('bozo_exception', 'Unknown error'))))
                logger.debug(tmp)
                def register_stream_update_failed(http_status):
                    # Map the HTTP status to a Woodchuck error code and
                    # report the failed update (runs on the main thread).
                    def doit():
                        logger.debug("%s: stream update failed!" % self.key)

                        try:
                            # It's not easy to get the feed's title from here.
                            # At the latest, the next time the application is
                            # started, we'll fix up the human readable name.
                            wc().stream_register (self.key, "", 6 * 60 * 60)
                        except woodchuck.ObjectExistsError:
                            pass
                        ec = woodchuck.TransferStatus.TransientOther
                        if 300 <= http_status and http_status < 400:
                            ec = woodchuck.TransferStatus.TransientNetwork
                        if 400 <= http_status and http_status < 500:
                            ec = woodchuck.TransferStatus.FailureGone
                        if 500 <= http_status and http_status < 600:
                            ec = woodchuck.TransferStatus.TransientNetwork
                        wc()[self.key].update_failed(ec)
                    return doit
                # NOTE(review): wc().available is truth-tested here but
                # *called* -- wc().available() -- elsewhere in this file.
                # If it is a bound method, this test is always true;
                # confirm which form matches wc()'s API.
                if wc().available:
                    mainthread.execute(
                        register_stream_update_failed(
                            http_status=http_status),
                        async=True)
            else:
                currentTime = time.time()
                # The etag and modified value should only be updated if the content was not null
                try:
                    etag = tmp["etag"]
                except KeyError:
                    etag = None
                try:
                    modified = tmp["modified"]
                except KeyError:
                    modified = None
                # Best effort: fetch the site's favicon alongside the feed.
                try:
                    abs_url = urljoin(tmp["feed"]["link"],"/favicon.ico")
                    f = opener.open(abs_url)
                    data = f.read()
                    f.close()
                    outf = open(self.dir+"/favicon.ico", "w")
                    outf.write(data)
                    outf.close()
                    del data
                except (urllib2.HTTPError, urllib2.URLError), exception:
                    logger.debug("Could not download favicon %s: %s"
                                 % (abs_url, str (exception)))

                # Database work below must be serialized with other jobs.
                self.serial_execution_lock.acquire ()
                have_serial_execution_lock = True

                #reversedEntries = self.getEntries()
                #reversedEntries.reverse()

                ids = self.getIds()

                # Process oldest entries first.
                tmp["entries"].reverse()
                for entry in tmp["entries"]:
                    # Yield so as to make the main thread a bit more
                    # responsive.
                    time.sleep(0)

                    entry_transfer_stats = transfer_stats(
                        *feed_transfer_stats(**progress_handler.stats)[0:2])

                    if JobManager().do_quit:
                        raise KeyboardInterrupt

                    object_size = 0

                    date = self.extractDate(entry)
                    # Fill in defaults for fields a sloppy feed may omit.
                    try:
                        entry["title"]
                    except KeyError:
                        entry["title"] = "No Title"
                    try :
                        entry["link"]
                    except KeyError:
                        entry["link"] = ""
                    try:
                        entry["author"]
                    except KeyError:
                        entry["author"] = None
                    if(not(entry.has_key("id"))):
                        entry["id"] = None
                    content = self.extractContent(entry)
                    object_size = len (content)
                    tmpEntry = {"title":entry["title"], "content":content,
                                "date":date, "link":entry["link"], "author":entry["author"], "id":entry["id"]}
                    id = self.generateUniqueId(tmpEntry)

                    # Skip articles we already have at this version (same
                    # id and same date).
                    current_version \
                        = self.db.execute('select date from feed where id=?',
                                          (id,)).fetchone()
                    if (current_version is not None
                        and current_version[0] == date):
                        logger.debug("ALREADY DOWNLOADED %s (%s)"
                                     % (entry["title"], entry["link"]))
                        continue

                    if current_version is not None:
                        # The version was updated.  Mark it as unread.
                        logger.debug("UPDATED: %s (%s)"
                                     % (entry["title"], entry["link"]))
                        self.setEntryUnread(id)
                        updated_objects += 1
                    else:
                        logger.debug("NEW: %s (%s)"
                                     % (entry["title"], entry["link"]))
                        new_objects += 1

                    #articleTime = time.mktime(self.entries[id]["dateTuple"])
                    soup = BeautifulSoup(self.getArticle(tmpEntry)) #tmpEntry["content"])
                    images = soup('img')
                    baseurl = tmpEntry["link"]
                    #if not id in ids:
                    if imageCache and len(images) > 0:
                        # Image downloads don't touch the database; release
                        # the serialization lock while fetching them.
                        self.serial_execution_lock.release ()
                        have_serial_execution_lock = False
                        for img in images:
                            filename = self.addImage(
                                configdir, self.key, baseurl, img['src'],
                                opener=opener)
                            if filename:
                                img['src']="file://%s" %filename
                                count = self.db.execute("SELECT count(1) FROM images where id=? and imagePath=?;", (id, filename )).fetchone()[0]
                                if count == 0:
                                    self.db.execute("INSERT INTO images (id, imagePath) VALUES (?, ?);", (id, filename) )
                                    self.db.commit()

                                try:
                                    object_size += os.path.getsize (filename)
                                except os.error, exception:
                                    logger.error ("Error getting size of %s: %s"
                                                  % (filename, exception))
                        self.serial_execution_lock.acquire ()
                        have_serial_execution_lock = True

                    # Write the rendered article to its own html file.
                    tmpEntry["contentLink"] = configdir+self.key+".d/"+id+".html"
                    file = open(tmpEntry["contentLink"], "w")
                    file.write(soup.prettify())
                    file.close()
                    if id in ids:
                        self.db.execute("UPDATE feed SET updated=? WHERE id=?;", (currentTime, id) )
                        self.db.commit()
                    else:
                        values = (id, tmpEntry["title"], tmpEntry["contentLink"], tmpEntry["date"], currentTime, tmpEntry["link"], 0)
                        self.db.execute("INSERT INTO feed (id, title, contentLink, date, updated, link, read) VALUES (?, ?, ?, ?, ?, ?, ?);", values)
                        self.db.commit()
#                    else:
#                        try:
#                            self.db.execute("UPDATE feed SET updated=? WHERE id=?;", (currentTime, id) )
#                            self.db.commit()
#                            filename = configdir+self.key+".d/"+id+".html"
#                            file = open(filename,"a")
#                            utime(filename, None)
#                            file.close()
#                            images = self.db.execute("SELECT imagePath FROM images where id=?;", (id, )).fetchall()
#                            for image in images:
#                                 file = open(image[0],"a")
#                                 utime(image[0], None)
#                                 file.close()
#                        except:
#                            pass

                    # Register the object with Woodchuck and mark it as
                    # downloaded.
                    def register_object_transferred(
                            id, title, publication_time,
                            sent, received, object_size):
                        def doit():
                            logger.debug("Registering transfer of object %s"
                                         % title)
                            try:
                                obj = wc()[self.key].object_register(
                                    object_identifier=id,
                                    human_readable_name=title)
                            except woodchuck.ObjectExistsError:
                                obj = wc()[self.key][id]
                            else:
                                obj.publication_time = publication_time
                                obj.transferred(
                                    indicator=(
                                        woodchuck.Indicator.ApplicationVisual
                                        |woodchuck.Indicator.StreamWide),
                                    transferred_down=received,
                                    transferred_up=sent,
                                    object_size=object_size)
                        return doit
                    # NOTE(review): same available-vs-available() question
                    # as above.
                    if wc().available:
                        # If the entry does not contain a publication
                        # time, the attribute won't exist.
                        pubtime = entry.get('date_parsed', None)
                        if pubtime:
                            publication_time = time.mktime (pubtime)
                        else:
                            publication_time = None

                        sent, received, _ \
                            = entry_transfer_stats(**progress_handler.stats)
                        # sent and received are for objects (in
                        # particular, images) associated with this
                        # item.  We also want to attribute the data
                        # transferred for the item's content.  This is
                        # a good first approximation.
                        received += len(content)

                        mainthread.execute(
                            register_object_transferred(
                                id=id,
                                title=tmpEntry["title"],
                                publication_time=publication_time,
                                sent=sent, received=received,
                                object_size=object_size),
                            async=True)
                    self.db.commit()

                sent, received, _ \
                    = feed_transfer_stats(**progress_handler.stats)
                logger.debug (
                    "%s: Update successful: transferred: %d/%d; objects: %d)"
                    % (url, sent, received, len (tmp.entries)))
                mainthread.execute (wc_success, async=True)
                success = True

                # Expire old articles: unread ones after twice the expiry
                # time, read ones after the expiry time.
                rows = self.db.execute("SELECT id FROM feed WHERE (read=0 AND updated<?) OR (read=1 AND updated<?);", (currentTime-2*expiry, currentTime-expiry))
                for row in rows:
                    self.removeEntry(row[0])

                from glob import glob
                from os import stat
                # Garbage-collect stale files (cached images, old article
                # html) in this feed's cache directory.
                for file in glob(configdir+self.key+".d/*"):
                    #
                    stats = stat(file)
                    #
                    # put the two dates into matching format
                    #
                    lastmodDate = stats[8]
                    #
                    expDate = time.time()-expiry*3
                    # check if image-last-modified-date is outdated
                    #
                    if expDate > lastmodDate:
                        #
                        try:
                            #
                            #print 'Removing', file
                            #
                            # XXX: Tell woodchuck.
                            remove(file) # commented out for testing
                            #
                        except OSError, exception:
                            #
                            logger.error('Could not remove %s: %s'
                                         % (file, str (exception)))
            logger.debug("updated %s: %fs in download, %fs in processing"
                         % (self.key, download_duration,
                            time.time () - process_start))
        except:
            logger.error("Updating %s: %s" % (self.key, traceback.format_exc()))
        finally:
            self.db.commit ()

            if have_serial_execution_lock:
                self.serial_execution_lock.release ()

            updateTime = 0
            try:
                rows = self.db.execute("SELECT MAX(date) FROM feed;")
                for row in rows:
                    updateTime=row[0]
            except Exception, e:
                logger.error("Fetching update time: %s: %s"
                             % (str(e), traceback.format_exc()))
            finally:
                # On failure, drop the cache validators so the next
                # update refetches from scratch.
                if not success:
                    etag = None
                    modified = None
                title = None
                try:
                    title = tmp.feed.title
                except (AttributeError, UnboundLocalError), exception:
                    # tmp is unbound when feedparser.parse itself raised.
                    pass
                if postFeedUpdateFunc is not None:
                    postFeedUpdateFunc (self.key, updateTime, etag, modified,
                                        title, *postFeedUpdateFuncArgs)

        self.cache_invalidate()
+
    def setEntryRead(self, id):
        """Mark article *id* as read and report the use to Woodchuck."""
        self.db.execute("UPDATE feed SET read=1 WHERE id=?;", (id,) )
        self.db.commit()

        def doit():
            # Best effort; the object may never have been registered.
            try:
                wc()[self.key][id].used()
            except KeyError:
                pass
        # NOTE(review): available is *called* here, but only
        # truth-tested (wc().available) in _updateFeed -- one of the
        # two forms must be wrong; confirm wc()'s API.
        if wc().available():
            mainthread.execute(doit, async=True)
        self.cache_invalidate('feed')
+
    def setEntryUnread(self, id):
        """Mark article *id* as unread and drop the cached read state."""
        self.db.execute("UPDATE feed SET read=0 WHERE id=?;", (id,) )
        self.db.commit()
        self.cache_invalidate('feed')
+
    def markAllAsRead(self):
        """Mark every unread article in this feed as read."""
        self.db.execute("UPDATE feed SET read=1 WHERE read=0;")
        self.db.commit()
        self.cache_invalidate('feed')
+
    def isEntryRead(self, id):
        """Return True if article *id* has been marked read."""
        return self.lookup('feed', 'read', id) == 1
+
    def getTitle(self, id):
        """Return the (cached) title of article *id*."""
        return self.lookup('feed', 'title', id)
+
    def getContentLink(self, id):
        """Return the path of the html file holding article *id*'s body."""
        return self.db.execute("SELECT contentLink FROM feed WHERE id=?;", (id,) ).fetchone()[0]
+
    def getExternalLink(self, id):
        """Return the article's original URL on the web."""
        return self.db.execute("SELECT link FROM feed WHERE id=?;", (id,) ).fetchone()[0]
+
+ def getDate(self, id):
+ dateStamp = self.db.execute("SELECT date FROM feed WHERE id=?;", (id,) ).fetchone()[0]
+ return time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime(dateStamp))
+
    def getDateTuple(self, id):
        """Return article *id*'s date as a local-time struct_time."""
        dateStamp = self.db.execute("SELECT date FROM feed WHERE id=?;", (id,) ).fetchone()[0]
        return time.localtime(dateStamp)
+
    def getDateStamp(self, id):
        """Return article *id*'s raw timestamp (seconds since the epoch)."""
        return self.db.execute("SELECT date FROM feed WHERE id=?;", (id,) ).fetchone()[0]
+
+ def generateUniqueId(self, entry):
+ """
+ Generate a stable identifier for the article. For the same
+ entry, this should result in the same identifier. If
+ possible, the identifier should remain the same even if the
+ article is updated.
+ """
+ # Prefer the entry's id, which is supposed to be globally
+ # unique.
+ key = entry.get('id', None)
+ if not key:
+ # Next, try the link to the content.
+ key = entry.get('link', None)
+ if not key:
+ # Ok, the title and the date concatenated are likely to be
+ # relatively stable.
+ key = entry.get('title', None) + entry.get('date', None)
+ if not key:
+ # Hmm, the article's content will at least guarantee no
+ # false negatives (i.e., missing articles)
+ key = entry.get('content', None)
+ if not key:
+ # If all else fails, just use a random number.
+ key = str (random.random ())
+ return getId (key)
+
+ def getIds(self, onlyUnread=False):
+ if onlyUnread:
+ rows = self.db.execute("SELECT id FROM feed where read=0 ORDER BY date DESC;").fetchall()
+ else:
+ rows = self.db.execute("SELECT id FROM feed ORDER BY date DESC;").fetchall()
+ ids = []
+ for row in rows:
+ ids.append(row[0])
+ #ids.reverse()
+ return ids
+
+ def getNextId(self, id, forward=True):
+ if forward:
+ delta = 1
+ else:
+ delta = -1
+ ids = self.getIds()
+ index = ids.index(id)
+ return ids[(index + delta) % len(ids)]
+
    def getPreviousId(self, id):
        """Return the id preceding *id* in date order (wraps around)."""
        return self.getNextId(id, forward=False)
+
    def getNumberOfUnreadItems(self):
        """Return the count of unread articles in this feed."""
        return self.db.execute("SELECT count(*) FROM feed WHERE read=0;").fetchone()[0]
+
    def getNumberOfEntries(self):
        """Return the total number of articles stored for this feed."""
        return self.db.execute("SELECT count(*) FROM feed;").fetchone()[0]
+
+ def getArticle(self, entry):
+ #self.setEntryRead(id)
+ #entry = self.entries[id]
+ title = entry['title']
+ #content = entry.get('content', entry.get('summary_detail', {}))
+ content = entry["content"]
+
+ link = entry['link']
+ author = entry['author']
+ date = time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime(entry["date"]) )
+
+ #text = '''<div style="color: black; background-color: white;">'''
+ text = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">'
+ text += "<html><head><title>" + title + "</title>"
+ text += '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>\n'
+ #text += '<style> body {-webkit-user-select: none;} </style>'
+ text += '</head><body bgcolor=\"#ffffff\"><div><a href=\"' + link + '\">' + title + "</a>"
+ if author != None:
+ text += "<BR /><small><i>Author: " + author + "</i></small>"
+ text += "<BR /><small><i>Date: " + date + "</i></small></div>"
+ text += "<BR /><BR />"
+ text += content
+ text += "</body></html>"
+ return text
+
+ def getContent(self, id):
+ contentLink = self.db.execute("SELECT contentLink FROM feed WHERE id=?;", (id,)).fetchone()[0]
+ try:
+ file = open(self.entries[id]["contentLink"])
+ content = file.read()
+ file.close()
+ except:
+ content = "Content unavailable"
+ return content
+
+ def extractDate(self, entry):
+ if entry.has_key("updated_parsed"):
+ return timegm(entry["updated_parsed"])
+ elif entry.has_key("published_parsed"):
+ return timegm(entry["published_parsed"])
+ else:
+ return time.time()
+
+ def extractContent(self, entry):
+ content = ""
+ if entry.has_key('summary'):
+ content = entry.get('summary', '')
+ if entry.has_key('content'):
+ if len(entry.content[0].value) > len(content):
+ content = entry.content[0].value
+ if content == "":
+ content = entry.get('description', '')
+ return content
+
    def removeEntry(self, id):
        """Delete article *id*: its html file, its feed/images rows, and
        its Woodchuck object registration.

        Note: image files listed in the images table are not removed
        here; ArchivedArticles.removeArticle handles those.
        """
        contentLink = self.db.execute("SELECT contentLink FROM feed WHERE id=?;", (id,)).fetchone()[0]
        if contentLink:
            try:
                remove(contentLink)
            except OSError, exception:
                logger.error("Deleting %s: %s" % (contentLink, str (exception)))
        self.db.execute("DELETE FROM feed WHERE id=?;", (id,) )
        self.db.execute("DELETE FROM images WHERE id=?;", (id,) )
        self.db.commit()

        def doit():
            # Tell Woodchuck the files are gone, then drop the object.
            try:
                wc()[self.key][id].files_deleted (
                    woodchuck.DeletionResponse.Deleted)
                del wc()[self.key][id]
            except KeyError:
                pass
        if wc().available():
            mainthread.execute (doit, async=True)
+
class ArchivedArticles(Feed):
    """Pseudo-feed holding articles the user explicitly archived.

    Articles are inserted with updated=0 to mark them as not yet
    downloaded; updateFeed() then fetches their pages and images.
    """

    def addArchivedArticle(self, title, link, date, configdir):
        """Store a new archived article (body downloaded later)."""
        id = self.generateUniqueId({"date":date, "title":title})
        values = (id, title, link, date, 0, link, 0)
        self.db.execute("INSERT INTO feed (id, title, contentLink, date, updated, link, read) VALUES (?, ?, ?, ?, ?, ?, ?);", values)
        self.db.commit()

    def updateFeed(self, configdir, url, etag, modified, expiryTime=24, proxy=None, imageCache=False):
        """Download the pages of all not-yet-fetched archived articles.

        Returns (updateTime, None, None) to match Feed.updateFeed's
        post-update contract (no etag/modified for archived pages).
        """
        currentTime = 0
        # fetchall() so we do not INSERT/UPDATE the same tables while a
        # live cursor is still iterating over them.
        rows = self.db.execute("SELECT id, link FROM feed WHERE updated=0;").fetchall()
        for row in rows:
            currentTime = time.time()
            id = row[0]
            link = row[1]
            f = urllib2.urlopen(link)
            html = f.read()
            f.close()
            soup = BeautifulSoup(html)
            images = soup('img')
            baseurl = link
            for img in images:
                filename = self.addImage(configdir, self.key, baseurl, img['src'], proxy=proxy)
                img['src']=filename
                self.db.execute("INSERT INTO images (id, imagePath) VALUES (?, ?);", (id, filename) )
                self.db.commit()
            # Write the fetched page to this feed's cache directory.
            contentLink = configdir+self.key+".d/"+id+".html"
            file = open(contentLink, "w")
            file.write(soup.prettify())
            file.close()

            self.db.execute("UPDATE feed SET read=0, contentLink=?, updated=? WHERE id=?;", (contentLink, time.time(), id) )
            self.db.commit()
        return (currentTime, None, None)

    def purgeReadArticles(self):
        """Remove every archived article that has been read."""
        # fetchall(): removeArticle mutates the table we are scanning.
        rows = self.db.execute("SELECT id FROM feed WHERE read=1;").fetchall()
        for row in rows:
            self.removeArticle(row[0])

    def removeArticle(self, id):
        """Remove article *id* and any image files only it references."""
        rows = self.db.execute("SELECT imagePath FROM images WHERE id=?;", (id,) ).fetchall()
        for row in rows:
            try:
                # Only delete the image file if no other article uses it.
                count = self.db.execute("SELECT count(*) FROM images WHERE id!=? and imagePath=?;", (id,row[0]) ).fetchone()[0]
                if count == 0:
                    os.remove(row[0])
            except:
                pass
        self.removeEntry(id)
+
+class Listing(BaseObject):
+ # Columns to cache.
+ cached_columns = (('feeds', 'updateTime'),
+ ('feeds', 'unread'),
+ ('feeds', 'title'),
+ ('categories', 'title'))
+
+ def _getdb(self):
+ try:
+ db = self.tls.db
+ except AttributeError:
+ db = sqlite3.connect("%s/feeds.db" % self.configdir, timeout=120)
+ self.tls.db = db
+ return db
+ db = property(_getdb)
+
+ # Lists all the feeds in a dictionary, and expose the data
+ def __init__(self, config, configdir):
+ self.config = config
+ self.configdir = configdir
+
+ self.tls = threading.local ()
+
+ try:
+ table = self.db.execute("SELECT sql FROM sqlite_master").fetchone()
+ if table == None:
+ self.db.execute("CREATE TABLE feeds(id text, url text, title text, unread int, updateTime float, rank int, etag text, modified text, widget int, category int);")
+ self.db.execute("CREATE TABLE categories(id text, title text, unread int, rank int);")
+ self.addCategory("Default Category")
+ if isfile(self.configdir+"feeds.pickle"):
+ self.importOldFormatFeeds()
+ else:
+ self.addFeed("Maemo News", "http://maemo.org/news/items.xml")
+ else:
+ from string import find, upper
+ if find(upper(table[0]), "WIDGET")<0:
+ self.db.execute("ALTER TABLE feeds ADD COLUMN widget int;")
+ self.db.execute("UPDATE feeds SET widget=1;")
+ self.db.commit()
+ if find(upper(table[0]), "CATEGORY")<0:
+ self.db.execute("CREATE TABLE categories(id text, title text, unread int, rank int);")
+ self.addCategory("Default Category")
+ self.db.execute("ALTER TABLE feeds ADD COLUMN category int;")
+ self.db.execute("UPDATE feeds SET category=1;")
+ self.db.commit()
+ except:
+ pass
+
+ # Check that Woodchuck's state is up to date with respect our
+ # state.
+ updater = os.path.basename(sys.argv[0]) == 'update_feeds.py'
+ wc_init (self, True if updater else False)
+ if wc().available() and updater:
+ # The list of known streams.
+ streams = wc().streams_list ()
+ stream_ids = [s.identifier for s in streams]
+
+ # Register any unknown streams. Remove known streams from
+ # STREAMS_IDS.
+ for key in self.getListOfFeeds():
+ title = self.getFeedTitle(key)
+ # XXX: We should also check whether the list of
+ # articles/objects in each feed/stream is up to date.
+ if key not in stream_ids:
+ logger.debug(
+ "Registering previously unknown channel: %s (%s)"
+ % (key, title,))
+ # Use a default refresh interval of 6 hours.
+ wc().stream_register (key, title, 6 * 60 * 60)
+ else:
+ # Make sure the human readable name is up to date.
+ if wc()[key].human_readable_name != title:
+ wc()[key].human_readable_name = title
+ stream_ids.remove (key)
+
+
+ # Unregister any streams that are no longer subscribed to.
+ for id in stream_ids:
+ logger.debug("Unregistering %s" % (id,))
+ w.stream_unregister (id)
+
+ def importOldFormatFeeds(self):
+ """This function loads feeds that are saved in an outdated format, and converts them to sqlite"""
+ import rss
+ listing = rss.Listing(self.configdir)
+ rank = 0
+ for id in listing.getListOfFeeds():
+ try:
+ rank += 1
+ values = (id, listing.getFeedTitle(id) , listing.getFeedUrl(id), 0, time.time(), rank, None, "None", 1)
+ self.db.execute("INSERT INTO feeds (id, title, url, unread, updateTime, rank, etag, modified, widget, category) VALUES (?, ?, ? ,? ,? ,?, ?, ?, ?, 1);", values)
+ self.db.commit()
+
+ feed = listing.getFeed(id)
+ new_feed = self.getFeed(id)
+
+ items = feed.getIds()[:]
+ items.reverse()
+ for item in items:
+ if feed.isEntryRead(item):
+ read_status = 1
+ else:
+ read_status = 0
+ date = timegm(feed.getDateTuple(item))
+ title = feed.getTitle(item)
+ newId = new_feed.generateUniqueId({"date":date, "title":title})
+ values = (newId, title , feed.getContentLink(item), date, tuple(time.time()), feed.getExternalLink(item), read_status)
+ new_feed.db.execute("INSERT INTO feed (id, title, contentLink, date, updated, link, read) VALUES (?, ?, ?, ?, ?, ?, ?);", values)
+ new_feed.db.commit()
+ try:
+ images = feed.getImages(item)
+ for image in images:
+ new_feed.db.execute("INSERT INTO images (id, imagePath) VALUES (?, ?);", (item, image) )
+ new_feed.db.commit()
+ except:
+ pass
+ self.updateUnread(id)
+ except:
+ logger.error("importOldFormatFeeds: %s"
+ % (traceback.format_exc(),))
+ remove(self.configdir+"feeds.pickle")
+
+
+ def addArchivedArticle(self, key, index):
+ feed = self.getFeed(key)
+ title = feed.getTitle(index)
+ link = feed.getExternalLink(index)
+ date = feed.getDate(index)
+ count = self.db.execute("SELECT count(*) FROM feeds where id=?;", ("ArchivedArticles",) ).fetchone()[0]
+ if count == 0:
+ self.addFeed("Archived Articles", "", id="ArchivedArticles")
+
+ archFeed = self.getFeed("ArchivedArticles")
+ archFeed.addArchivedArticle(title, link, date, self.configdir)
+ self.updateUnread("ArchivedArticles")
+
+ def updateFeed(self, key, expiryTime=None, proxy=None, imageCache=None,
+ priority=0):
+ if expiryTime is None:
+ expiryTime = self.config.getExpiry()
+ if not expiryTime:
+ # Default to 24 hours
+ expriyTime = 24
+ if proxy is None:
+ (use_proxy, proxy) = self.config.getProxy()
+ if not use_proxy:
+ proxy = None
+ if imageCache is None:
+ imageCache = self.config.getImageCache()
+
+ feed = self.getFeed(key)
+ (url, etag, modified) = self.db.execute("SELECT url, etag, modified FROM feeds WHERE id=?;", (key,) ).fetchone()
+ try:
+ modified = time.struct_time(eval(modified))
+ except:
+ modified = None
+ feed.updateFeed(
+ self.configdir, url, etag, modified, expiryTime, proxy, imageCache,
+ priority, postFeedUpdateFunc=self._queuePostFeedUpdate)
+
+ def _queuePostFeedUpdate(self, *args, **kwargs):
+ mainthread.execute (self._postFeedUpdate, async=True, *args, **kwargs)
+
+ def _postFeedUpdate(self, key, updateTime, etag, modified, title):
+ if modified==None:
+ modified="None"
+ else:
+ modified=str(tuple(modified))
+ if updateTime > 0:
+ self.db.execute("UPDATE feeds SET updateTime=?, etag=?, modified=? WHERE id=?;", (updateTime, etag, modified, key) )
+ else:
+ self.db.execute("UPDATE feeds SET etag=?, modified=? WHERE id=?;", (etag, modified, key) )
+
+ if title is not None:
+ self.db.execute("UPDATE feeds SET title=(case WHEN title=='' THEN ? ELSE title END) where id=?;",
+ (title, key))
+ self.db.commit()
+ self.cache_invalidate('feeds')
+ self.updateUnread(key)
+
+ update_server_object().ArticleCountUpdated()
+
+ stats = JobManager().stats()
+ global jobs_at_start
+ completed = stats['jobs-completed'] - jobs_at_start
+ in_progress = stats['jobs-in-progress']
+ queued = stats['jobs-queued']
+
+ try:
+ percent = (100 * ((completed + in_progress / 2.))
+ / (completed + in_progress + queued))
+ except ZeroDivisionError:
+ percent = 100
+
+ update_server_object().UpdateProgress(
+ percent, completed, in_progress, queued, 0, 0, 0, key)
+
+ if in_progress == 0 and queued == 0:
+ jobs_at_start = stats['jobs-completed']
+
+ def getFeed(self, key):
+ if key == "ArchivedArticles":
+ return ArchivedArticles(self.configdir, key)
+ return Feed(self.configdir, key)
+
+ def editFeed(self, key, title, url, category=None):
+ if category:
+ self.db.execute("UPDATE feeds SET title=?, url=?, category=? WHERE id=?;", (title, url, category, key))
+ else:
+ self.db.execute("UPDATE feeds SET title=?, url=? WHERE id=?;", (title, url, key))
+ self.db.commit()
+ self.cache_invalidate('feeds')
+
+ if wc().available():
+ try:
+ wc()[key].human_readable_name = title
+ except KeyError:
+ logger.debug("Feed %s (%s) unknown." % (key, title))
+
+ def getFeedUpdateTime(self, key):
+ update_time = self.lookup('feeds', 'updateTime', key)
+
+ if not update_time:
+ return "Never"
+
+ delta = time.time() - update_time
+
+ delta_hours = delta / (60. * 60.)
+ if delta_hours < .1:
+ return "A few minutes ago"
+ if delta_hours < .75:
+ return "Less than an hour ago"
+ if delta_hours < 1.5:
+ return "About an hour ago"
+ if delta_hours < 18:
+ return "About %d hours ago" % (int(delta_hours + 0.5),)
+
+ delta_days = delta_hours / 24.
+ if delta_days < 1.5:
+ return "About a day ago"
+ if delta_days < 18:
+ return "%d days ago" % (int(delta_days + 0.5),)
+
+ delta_weeks = delta_days / 7.
+ if delta_weeks <= 8:
+ return "%d weeks ago" % int(delta_weeks + 0.5)
+
+ delta_months = delta_days / 30.
+ if delta_months <= 30:
+ return "%d months ago" % int(delta_months + 0.5)
+
+ return time.strftime("%x", time.gmtime(update_time))
+
+ def getFeedNumberOfUnreadItems(self, key):
+ return self.lookup('feeds', 'unread', key)
+
+ def getFeedTitle(self, key):
+ title = self.lookup('feeds', 'title', key)
+ if title:
+ return title
+
+ return self.getFeedUrl(key)
+
+ def getFeedUrl(self, key):
+ return self.db.execute("SELECT url FROM feeds WHERE id=?;", (key,)).fetchone()[0]
+
+ def getFeedCategory(self, key):
+ return self.db.execute("SELECT category FROM feeds WHERE id=?;", (key,)).fetchone()[0]
+
+ def getListOfFeeds(self, category=None):
+ if category:
+ rows = self.db.execute("SELECT id FROM feeds WHERE category=? ORDER BY rank;", (category, ) )
+ else:
+ rows = self.db.execute("SELECT id FROM feeds ORDER BY rank;" )
+ keys = []
+ for row in rows:
+ if row[0]:
+ keys.append(row[0])
+ return keys
+
+ def getListOfCategories(self):
+ return list(row[0] for row in self.db.execute(
+ "SELECT id FROM categories ORDER BY rank;"))
+
+ def getCategoryTitle(self, id):
+ return self.lookup('categories', 'title', id)
+
+ def getSortedListOfKeys(self, order, onlyUnread=False, category=1):
+ if order == "Most unread":
+ tmp = "ORDER BY unread DESC"
+ #keyorder = sorted(feedInfo, key = lambda k: feedInfo[k][1], reverse=True)
+ elif order == "Least unread":
+ tmp = "ORDER BY unread"
+ #keyorder = sorted(feedInfo, key = lambda k: feedInfo[k][1])
+ elif order == "Most recent":
+ tmp = "ORDER BY updateTime DESC"
+ #keyorder = sorted(feedInfo, key = lambda k: feedInfo[k][2], reverse=True)
+ elif order == "Least recent":
+ tmp = "ORDER BY updateTime"
+ #keyorder = sorted(feedInfo, key = lambda k: feedInfo[k][2])
+ else: # order == "Manual" or invalid value...
+ tmp = "ORDER BY rank"
+ #keyorder = sorted(feedInfo, key = lambda k: feedInfo[k][0])
+ if onlyUnread:
+ sql = "SELECT id FROM feeds WHERE unread>0 AND category=%s " %category + tmp
+ else:
+ sql = "SELECT id FROM feeds WHERE category=%s " %category + tmp
+ rows = self.db.execute(sql)
+ keys = []
+ for row in rows:
+ if row[0]:
+ keys.append(row[0])
+ return keys
+
+ def getFavicon(self, key):
+ filename = "%s%s.d/favicon.ico" % (self.configdir, key)
+ if isfile(filename):
+ return filename
+ else:
+ return False
+
+ def updateUnread(self, key):
+ feed = self.getFeed(key)
+ self.db.execute("UPDATE feeds SET unread=? WHERE id=?;", (feed.getNumberOfUnreadItems(), key))
+ self.db.commit()
+ self.cache_invalidate('feeds')
+
+ def addFeed(self, title, url, id=None, category=1):
+ if not id:
+ id = getId(url)
+ count = self.db.execute("SELECT count(*) FROM feeds WHERE id=?;", (id,) ).fetchone()[0]
+ if count == 0:
+ max_rank = self.db.execute("SELECT MAX(rank) FROM feeds;").fetchone()[0]
+ if max_rank == None:
+ max_rank = 0
+ values = (id, title, url, 0, 0, max_rank+1, None, "None", 1, category)
+ self.db.execute("INSERT INTO feeds (id, title, url, unread, updateTime, rank, etag, modified, widget, category) VALUES (?, ?, ? ,? ,? ,?, ?, ?, ?,?);", values)
+ self.db.commit()
+ # Ask for the feed object, it will create the necessary tables
+ self.getFeed(id)
+
+ if wc().available():
+ # Register the stream with Woodchuck. Update approximately
+ # every 6 hours.
+ wc().stream_register(stream_identifier=id,
+ human_readable_name=title,
+ freshness=6*60*60)
+
+ return True
+ else:
+ return False
+
+ def addCategory(self, title):
+ rank = self.db.execute("SELECT MAX(rank)+1 FROM categories;").fetchone()[0]
+ if rank==None:
+ rank=1
+ id = self.db.execute("SELECT MAX(id)+1 FROM categories;").fetchone()[0]
+ if id==None:
+ id=1
+ self.db.execute("INSERT INTO categories (id, title, unread, rank) VALUES (?, ?, 0, ?)", (id, title, rank))
+ self.db.commit()
+
+ def removeFeed(self, key):
+ if wc().available ():
+ try:
+ del wc()[key]
+ except KeyError:
+ logger.debug("Removing unregistered feed %s failed" % (key,))
+
+ rank = self.db.execute("SELECT rank FROM feeds WHERE id=?;", (key,) ).fetchone()[0]
+ self.db.execute("DELETE FROM feeds WHERE id=?;", (key, ))
+ self.db.execute("UPDATE feeds SET rank=rank-1 WHERE rank>?;", (rank,) )
+ self.db.commit()
+
+ if isdir(self.configdir+key+".d/"):
+ rmtree(self.configdir+key+".d/")
+
+ def removeCategory(self, key):
+ if self.db.execute("SELECT count(*) FROM categories;").fetchone()[0] > 1:
+ rank = self.db.execute("SELECT rank FROM categories WHERE id=?;", (key,) ).fetchone()[0]
+ self.db.execute("DELETE FROM categories WHERE id=?;", (key, ))
+ self.db.execute("UPDATE categories SET rank=rank-1 WHERE rank>?;", (rank,) )
+ self.db.execute("UPDATE feeds SET category=1 WHERE category=?;", (key,) )
+ self.db.commit()
+
+ #def saveConfig(self):
+ # self.listOfFeeds["feedingit-order"] = self.sortedKeys
+ # file = open(self.configdir+"feeds.pickle", "w")
+ # pickle.dump(self.listOfFeeds, file)
+ # file.close()
+
+ def moveUp(self, key):
+ rank = self.db.execute("SELECT rank FROM feeds WHERE id=?;", (key,)).fetchone()[0]
+ if rank>0:
+ self.db.execute("UPDATE feeds SET rank=? WHERE rank=?;", (rank, rank-1) )
+ self.db.execute("UPDATE feeds SET rank=? WHERE id=?;", (rank-1, key) )
+ self.db.commit()
+
+ def moveCategoryUp(self, key):
+ rank = self.db.execute("SELECT rank FROM categories WHERE id=?;", (key,)).fetchone()[0]
+ if rank>0:
+ self.db.execute("UPDATE categories SET rank=? WHERE rank=?;", (rank, rank-1) )
+ self.db.execute("UPDATE categories SET rank=? WHERE id=?;", (rank-1, key) )
+ self.db.commit()
+
+ def moveDown(self, key):
+ rank = self.db.execute("SELECT rank FROM feeds WHERE id=?;", (key,)).fetchone()[0]
+ max_rank = self.db.execute("SELECT MAX(rank) FROM feeds;").fetchone()[0]
+ if rank<max_rank:
+ self.db.execute("UPDATE feeds SET rank=? WHERE rank=?;", (rank, rank+1) )
+ self.db.execute("UPDATE feeds SET rank=? WHERE id=?;", (rank+1, key) )
+ self.db.commit()
+
+ def moveCategoryDown(self, key):
+ rank = self.db.execute("SELECT rank FROM categories WHERE id=?;", (key,)).fetchone()[0]
+ max_rank = self.db.execute("SELECT MAX(rank) FROM categories;").fetchone()[0]
+ if rank<max_rank:
+ self.db.execute("UPDATE categories SET rank=? WHERE rank=?;", (rank, rank+1) )
+ self.db.execute("UPDATE categories SET rank=? WHERE id=?;", (rank+1, key) )
+ self.db.commit()
+
+
--- /dev/null
+#!/usr/bin/env python2.5
+
+#
+# Copyright (c) 2007-2008 INdT.
+# Copyright (c) 2011 Neal H. Walfield
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+# ============================================================================
+# Name : update_feeds.py
+# Author : Yves Marcoz
+# Version : 0.6.1
+# Description : Simple RSS Reader
+# ============================================================================
+
+from rss_sqlite import Listing
+from config import Config
+from updatedbus import UpdateServerObject
+
+import os
+import traceback
+import sys
+import dbus
+
+from jobmanager import JobManager
+import mainthread
+
+import gobject
+gobject.threads_init()
+
+import logging
+logger = logging.getLogger(__name__)
+import debugging
+debugging.init(dot_directory=".feedingit", program_name="update_feeds")
+
# Configuration lives under the user's home directory (falling back to
# /home/user, the Maemo default, when $HOME is unset).
#CONFIGDIR="/home/user/.feedingit/"
CONFIGDIR = os.environ.get("HOME", "/home/user") + "/.feedingit/"
#DESKTOP_FILE = "/usr/share/applications/hildon-status-menu/feedingit_status.desktop"

# Install a global socket timeout so that a stalled feed download cannot
# block an update worker indefinitely.
from socket import setdefaulttimeout
timeout = 5
setdefaulttimeout(timeout)
del timeout
+
class FeedUpdate(UpdateServerObject):
    """D-Bus service object that drives feed updates.

    In one-shot mode (the default) all feeds are queued for update as
    soon as the main loop starts and the process quits once the job
    manager drains.  With --daemon, the process instead waits for
    Update/UpdateAll requests over D-Bus and exits after a period of
    inactivity.
    """

    def __init__(self, bus_name):
        UpdateServerObject.__init__(self, bus_name)

        self.config = Config(self, CONFIGDIR+"config.ini")
        self.listing = Listing(self.config, CONFIGDIR)

        # Register a stats hook so job-manager progress is reported via
        # our D-Bus signals; deliveries happen on the main thread.
        jm = JobManager(True)
        jm.stats_hook_register (self.job_manager_update,
                                run_in_main_thread=True)

        # Whether or not an update is in progress.
        self.am_updating = False

        # After an update has finished, we start the inactivity timer.
        # If this fires before a new job arrives, we quit.
        self.inactivity_timer = 0

        # Whether we started in daemon mode, or not.
        self.daemon = '--daemon' in sys.argv

        if self.daemon:
            logger.debug("Running in daemon mode: waiting for commands.")
            self.inactivity_timer = gobject.timeout_add(
                5 * 60 * 1000, self.inactivity_cb)
        else:
            # Update all feeds.
            logger.debug("Not running in daemon mode: updating all feeds.")
            gobject.idle_add(self.UpdateAll)

#        # If the system becomes idle
#        bus = dbus.SystemBus()
#
#        mce_request_proxy = bus.get_object(
#            'com.nokia.mce', '/com/nokia/mce/request')
#        mce_request_iface = dbus.Interface(
#            mce_request_proxy, 'com.nokia.mce.request')
#        system_idle = mce_request_iface.get_inactivity_status()
#        # Force self.system_inactivity_ind to run: ensure that a state
#        # change occurs.
#        self.system_idle = not system_idle
#        self.system_inactivity_ind(system_idle)
#
#        mce_signal_proxy = bus.get_object(
#            'com.nokia.mce', '/com/nokia/mce/signal')
#        mce_signal_iface = dbus.Interface(
#            mce_signal_proxy, 'com.nokia.mce.signal')
#        mce_signal_iface.connect_to_signal(
#            'system_inactivity_ind', self.system_inactivity_ind)

    def increase_download_parallelism(self):
        # The system has been idle for a while.  Enable parallel
        # downloads.
        logger.debug("Increasing parallelism to 4 workers.")
        JobManager().num_threads = 4
        gobject.source_remove (self.increase_download_parallelism_id)
        del self.increase_download_parallelism_id
        return False

    def system_inactivity_ind(self, idle):
        """React to a change of the system's idle state.

        While the device is in active use downloads are throttled to a
        single worker; after a minute of idleness parallelism is raised.
        NOTE(review): only reachable via the commented-out MCE wiring in
        __init__, which is also the only code that initializes
        self.system_idle -- confirm before re-enabling.
        """
        if (self.system_idle and idle) or (not self.system_idle and not idle):
            # No change.
            return

        if not idle:
            if hasattr (self, 'increase_download_parallelism_id'):
                gobject.source_remove (self.increase_download_parallelism_id)
                del self.increase_download_parallelism_id
        else:
            self.increase_download_parallelism_id = \
                gobject.timeout_add_seconds(
                    60, self.increase_download_parallelism)

        if not idle:
            logger.debug("Reducing parallelism to 1 worker.")
            JobManager().num_threads = 1

        self.system_idle = idle

    def job_manager_update(self, jm, old_stats, new_stats, updated_feed):
        """Stats hook: translate job-queue transitions into D-Bus signals.

        Emits UpdateStarted on the idle->busy edge and UpdateFinished /
        ArticleCountUpdated on the busy->idle edge; (re)arms or cancels
        the inactivity timer accordingly.
        """
        queued = new_stats['jobs-queued']
        in_progress = new_stats['jobs-in-progress']

        if (queued or in_progress) and not self.am_updating:
            logger.debug("new update started")
            self.am_updating = True
            self.UpdateStarted()
            self.UpdateProgress(0, 0, in_progress, queued, 0, 0, 0, "")

        if not queued and not in_progress:
            logger.debug("update finished!")
            self.am_updating = False
            self.UpdateFinished()
            self.ArticleCountUpdated()

            if self.daemon:
                self.inactivity_timer = gobject.timeout_add(
                    60 * 1000, self.inactivity_cb)
            else:
                logger.debug("update finished, not running in daemon mode: "
                             "quitting")
                mainloop.quit()

        if (queued or in_progress) and self.inactivity_timer:
            gobject.source_remove(self.inactivity_timer)
            self.inactivity_timer = 0

    def inactivity_cb(self):
        """
        The updater has been inactive for a while.  Quit.
        """
        assert self.inactivity_timer
        self.inactivity_timer = 0

        if not self.am_updating:
            logger.info("Nothing to do for a while.  Quitting.")
            mainloop.quit()

    def StopUpdate(self):
        """
        Stop updating.
        """
        # Bug fix: the parent method is named StopUpdate, not stopUpdate;
        # the old call raised AttributeError whenever StopUpdate arrived.
        super(FeedUpdate, self).StopUpdate()

        JobManager().quit()

    def UpdateAll(self):
        """
        Update all feeds.
        """
        logger.info("starting update.")
        super(FeedUpdate, self).UpdateAll()

        feeds = self.listing.getListOfFeeds()
        for k in feeds:
            self.listing.updateFeed(k)
        logger.debug("Queued all feeds (%d) for update." % len(feeds))

    def Update(self, feed):
        """
        Update a particular feed.
        """
        super(FeedUpdate, self).Update(feed)

        # We got a request via dbus.  If we weren't in daemon mode
        # before, enter it now.
        self.daemon = True

        self.listing.updateFeed(feed)
+
+
import dbus.mainloop.glib
dbus.mainloop.glib.DBusGMainLoop(set_as_default=True)

mainloop = gobject.MainLoop()
mainthread.init()

# Acquire our name on the session bus.  If this doesn't work, most
# likely another update_feeds instance is already running.  In this
# case, just quit.
try:
    bus_name = dbus.service.BusName('org.marcoz.feedingit',
                                    bus=dbus.SessionBus(),
                                    do_not_queue=True)
except Exception:
    # We failed to acquire our bus name.  Die.
    try:
        # Best effort: report the pid of the process holding the name.
        dbus_proxy = dbus.SessionBus().get_object(
            'org.freedesktop.DBus', '/org/freedesktop/DBus')
        dbus_iface = dbus.Interface(dbus_proxy, 'org.freedesktop.DBus')
        pid = dbus_iface.GetConnectionUnixProcessID('org.marcoz.feedingit')
        logger.error("update_feeds already running: pid %d." % pid)
    except Exception, e:
        logger.error("Getting pid associated with org.marcoz.feedingit: %s"
                     % str(e))
        logger.error("update_feeds already running.")

    sys.exit(1)

# Run the updater.  Note: we run this until feed.am_updating is false.
# Only in this case have all worker threads exited.  If the main
# thread exits before all threads have exited and the process gets a
# signal, the Python interpreter is unable to handle the signal and it
# runs really slowly (rescheduling after every single instruction
# instead of every few thousand).
feed = FeedUpdate(bus_name)
while True:
    try:
        mainloop.run()
    except KeyboardInterrupt:
        logger.error("Interrupted.  Quitting.")
        JobManager().quit()

    if not feed.am_updating:
        break
+
--- /dev/null
+#!/usr/bin/env python2.5
+
+#
+# Copyright (c) 2007-2008 INdT.
+# Copyright (c) 2011 Neal H. Walfield
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+# ============================================================================
+# Name : FeedingIt.py
+# Author : Yves Marcoz
+# Version : 0.6.1
+# Description : Simple RSS Reader
+# ============================================================================
+
+import dbus.service
+import logging
+logger = logging.getLogger(__name__)
+
# The singleton UpdateServerObject; set by UpdateServerObject.__init__.
_update_server_object = None

def update_server_object():
    """Return the process-wide UpdateServerObject singleton.

    Raises AssertionError if no UpdateServerObject has been created yet.
    """
    obj = _update_server_object
    assert obj is not None, \
        "No UpdateServerObject instantiated!"
    return obj
+
class UpdateServerObject(dbus.service.Object):
    """D-Bus interface ('org.marcoz.feedingit') of the update daemon.

    Exposes methods for requesting updates and signals reporting their
    progress.  Exactly one instance may exist per process; it can be
    retrieved via update_server_object().
    """

    def __init__(self, bus_name):
        """
        Start listening for requests.
        """
        global _update_server_object
        assert _update_server_object is None, \
            "Attempt to instantiate multiple UpdateServerObject objects."
        _update_server_object = self

        dbus.service.Object.__init__(self, bus_name,
                                     '/org/marcoz/feedingit/update')

    @dbus.service.method('org.marcoz.feedingit')
    def StopUpdate(self):
        """Request that any update in progress be stopped (subclasses act)."""
        logger.debug("Stop update called.")

    @dbus.service.method('org.marcoz.feedingit')
    def UpdateAll(self):
        """Request an update of all feeds (subclasses do the work)."""
        logger.debug("UpdateAll called.")

    @dbus.service.method('org.marcoz.feedingit', in_signature='s')
    def Update(self, feed):
        """Request an update of the single feed identified by *feed*."""
        logger.debug("Update(%s) called." % feed)

    # A signal that will be exported to dbus
    @dbus.service.signal('org.marcoz.feedingit', signature='')
    def ArticleCountUpdated(self):
        """Emitted when the article counts may have changed."""
        pass

    # A signal that will be exported to dbus
    @dbus.service.signal('org.marcoz.feedingit', signature='uuuuttus')
    def UpdateProgress(self, percent_complete,
                       feeds_downloaded, feeds_downloading, feeds_pending,
                       bytes_downloaded, bytes_uploaded, bytes_per_second,
                       updated_feed):
        """Emitted periodically while an update runs."""
        pass

    # A signal that will be exported to dbus
    @dbus.service.signal('org.marcoz.feedingit', signature='')
    def UpdateStarted(self):
        """Emitted when an update begins."""
        pass

    # A signal that will be exported to dbus
    @dbus.service.signal('org.marcoz.feedingit', signature='')
    def UpdateFinished(self):
        """Emitted when all queued update jobs have completed."""
        pass
+
+
--- /dev/null
+# Copyright (c) 2011 Neal H. Walfield
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
import logging
logger = logging.getLogger(__name__)
import traceback

# Don't fail if the Woodchuck modules are not available.  Just disable
# Woodchuck's functionality.

# Whether we imported the woodchuck modules successfully.
woodchuck_imported = True
try:
    import pywoodchuck
    from pywoodchuck import PyWoodchuck
    from pywoodchuck import woodchuck
except ImportError, exception:
    logger.info(
        "Unable to load Woodchuck modules: disabling Woodchuck support: %s"
        % traceback.format_exc ())
    woodchuck_imported = False
    # Stub with the same name so mywoodchuck can still subclass it;
    # available() reports False, which disables all Woodchuck calls.
    class PyWoodchuck (object):
        def available(self):
            return False
    woodchuck = None

# The default channel refresh interval: 6 hours.
refresh_interval = 6 * 60 * 60
+
+class mywoodchuck (PyWoodchuck):
+ def __init__(self, listing, human_readable_name, identifier,
+ request_feedback):
+ try:
+ PyWoodchuck.__init__ (self, human_readable_name, identifier,
+ request_feedback)
+ except Exception, e:
+ logger.error(
+ "Failed to establish a connection to the Woodchuck server: %s"
+ % (str(e),))
+ self.available = self.not_available
+ return
+
+ self.listing = listing
+
+ def not_available(self):
+ return False
+
+ # Woodchuck upcalls.
+ def stream_update_cb(self, stream):
+ logger.debug("stream update called on %s (%s)"
+ % (stream.human_readable_name, stream.identifier,))
+
+ # Make sure no one else is concurrently updating this
+ # feed.
+ try:
+ self.listing.updateFeed(stream.identifier)
+ except:
+ logger.debug("Updating %s: %s"
+ % (stream.identifier, traceback.format_exc ()))
+
+ def object_transfer_cb(self, stream, object,
+ version, filename, quality):
+ log ("object transfer called on %s (%s) in stream %s (%s)"
+ % (object.human_readable_name, object.identifier,
+ stream.human_readable_name, stream.identifier))
+
# The Woodchuck connection singleton, created by wc_init().
_w = None

def wc_init(listing, request_feedback=False):
    """Connect to the Woodchuck server and initialize the singleton."""
    global _w
    assert _w is None

    _w = mywoodchuck(listing, "FeedingIt", "org.marcoz.feedingit",
                     request_feedback)

    if woodchuck_imported and _w.available():
        logger.debug("Woodchuck appears to be available.")
    else:
        logger.info("Unable to contact Woodchuck server.")

def wc():
    """Return the Woodchuck singleton (wc_init must have run first)."""
    assert _w is not None
    return _w
--- /dev/null
import Qt 4.7
import QtWebKit 1.0
import "common" as Common

// Article rendering pane: a WebView inside a vertically flickable area,
// with an optional zoom slider (exposed through zoomEnabled/value).
Rectangle {
    /*x: parent.width; height: parent.height;*/
    width: parent.width;
    height: parent.height
    property alias zoomEnabled: slider.visible;
    property alias value: slider.value;
    //anchors.top: parent.top; anchors.bottom: parent.bottom;
    color: "white";

    Flickable {
        id: flickable
        //anchors.fill: screen;
        height: parent.height;
        width: parent.width;
        // Content size tracks the scaled WebView so flicking covers the
        // whole zoomed page.
        contentWidth: webView.width*webView.scale; //Math.max(screen.width,webView.width*webView.scale)
        contentHeight: Math.max(articleViewer.height,webView.height*webView.scale)
        //contentWidth: childrenRect.width; contentHeight: childrenRect.height
        // NOTE(review): vertPanningEnabled and articleViewer are resolved in
        // the instantiating component (ArticleViewer) -- confirm.
        interactive: parent.vertPanningEnabled;

        flickDeceleration: 1500;
        flickableDirection: Flickable.VerticalFlick
        WebView {
            id: webView
            //url: flipItem.url;
            html: flipItem.html;
            preferredWidth: flickable.width
            preferredHeight: flickable.height
            //scale: 1.25;
            transformOrigin: Item.TopLeft
            scale: slider.value;
            settings.defaultFontSize: 24
        }

//        onFlickStarted: {
//            console.log("start contentx"+contentX)
//            console.log("start contenty"+contentY)
//        }
    }

    Common.Slider {
        id: slider; visible: false
        minimum: 0.2;
        maximum: 2;
        property real prevScale: 1
        anchors {
            bottom: parent.bottom; bottomMargin: 65
            left: parent.left; leftMargin: 25
            right: parent.right; rightMargin: 25
        }
        // Keep the viewport centred on the same content point while the
        // zoom factor changes.
        onValueChanged: {
            if (webView.width * value > flickable.width) {
                var xoff = (flickable.width/2 + flickable.contentX) * value / prevScale;
                flickable.contentX = xoff - flickable.width/2;
            }
            if (webView.height * value > flickable.height) {
                var yoff = (flickable.height/2 + flickable.contentY) * value / prevScale;
                flickable.contentY = yoff - flickable.height/2;
            }
            prevScale = value;
        }
        // Force an initial change signal so prevScale/content offsets settle.
        Component.onCompleted: {value=0; value=1; }
    }
}
--- /dev/null
import Qt 4.7

// Combined article list / article reader for one feed: a 'list' view of
// article titles and a horizontally snapping 'flip' view of rendered
// articles, both driven by the same XML model.
Item {
    id: articleViewer
    //width: 480; height: 360;
    width: parent.width; height: parent.height;
    // NOTE(review): feedid is referenced throughout but only declared in the
    // commented-out line below -- presumably provided by the instantiating
    // component; confirm.
    //property string feedid: "61ac1458d761423344998dc76770e36e" //articlesItem.feedid;
    //property string hideReadArticles: "";
    property alias articleShown: articleView.visible;
    property bool zoomEnabled: false;
    property bool vertPanningEnabled: true

    function modulo(x,y) {
        // Fixes modulo for negative numbers
        return ((x%y)+y)%y;
    }

    // Refresh the article model from the controller (empty stub XML when no
    // feed is selected).
    function reload() {
        articles.xml = articleViewer.feedid == "" ? "<?xml version=\"1.0\" encoding=\"utf-8\"?><xml></xml>" : controller.getArticlesXml(articleViewer.feedid);
        articles.reload()
    }

    function next() {
        if (articleView.visible) {
            //articleView.positionViewAtIndex(modulo(articleView.currentIndex+1, articleView.count), ListView.Contain);
            articleView.incrementCurrentIndex();
        }
    }

    function prev() {
        if (articleView.visible) {
            //articleView.positionViewAtIndex(modulo(articleView.currentIndex-1, articleView.count), ListView.Contain);
            articleView.decrementCurrentIndex();
        }
    }

    function markAllAsRead() {
        if (feedid!="") {
            controller.markAllAsRead(feedid)
            articles.reload();
        }
    }

    // Jump the reader view to the article with the given id.
    function viewArticle(articleid) {
        var index = 0;
        for (var i=0; i<articleList.count; ++i) {
            // Bug fix: compare the article at index i -- the original read
            // articles.get(0), so any article other than the first was
            // never found.
            if (articles.get(i).articleid==articleid) {
                index = i;
            }
        }
        articleView.positionViewAtIndex(index, ListView.Contain); articleView.visible = true;
    }

    ListView {
        id: articleList; model: visualModel.parts.list; z: 6
        width: parent.width; height: parent.height; /*x: 0;*/
        cacheBuffer: 100;
        flickDeceleration: 1500
    }

    ListView {
        id: articleView; model: visualModel.parts.flip; orientation: ListView.Horizontal
        width: parent.width; height: parent.height; visible: false; z:8
        //onCurrentIndexChanged: photosGridView.positionViewAtIndex(currentIndex, GridView.Contain)
        highlightRangeMode: ListView.StrictlyEnforceRange; snapMode: ListView.SnapOneItem
        //cacheBuffer: 5;
        // Disable vertical panning of the article while swiping between them.
        onMovementStarted: articleViewer.vertPanningEnabled=false;
        onMovementEnded: articleViewer.vertPanningEnabled=true;
        highlightMoveDuration: 300;
    }

    // Placeholder shown while the model is empty: spinner while loading,
    // "No articles available" once loading finished.
    Rectangle {
        id: noArticle
        //width: parent.width; height: parent.height;
        //color: "#000000"
        anchors.centerIn: parent;
        visible: false;
        z:8;
        Text { id: noText; color: "#ffffff"; anchors.centerIn: parent; text: qsTr("No articles available"); }
        Image { id: loadingImage; anchors.centerIn: parent; source: "common/images/loading.png";
            height: 96; width: 96;
            NumberAnimation on rotation {
                from: 0; to: 360; running: (loadingImage.visible == true); loops: Animation.Infinite; duration: 900
            }
        }

        states: [ State {
            name: "noArticle"; when: articles.count==0 && articles.status==XmlListModel.Ready
            PropertyChanges { target: noArticle; visible: true; }
            PropertyChanges { target: loadingImage; visible: false; }
            PropertyChanges { target: noText; visible: true; }
        }, State {
            name: "loading"; when: articles.count==0 && articles.status != XmlListModel.Ready
            PropertyChanges { target: noArticle; visible: true; }
            PropertyChanges { target: noText; visible: false; }
            PropertyChanges { target: loadingImage; visible: true; }
        }
        ]
    }

    VisualDataModel {
        id: visualModel;
        delegate: Package {
            id: packageItem
            // 'flip' part: the full article, loaded lazily (only when close
            // to the current index) and marked read when shown.
            Item { id: flipItem; Package.name: 'flip'; width: articleViewer.width; height: articleViewer.height;

                property string url: (articleView.visible && Math.abs(articleView.currentIndex-index)<2) ? path: ""; //http://localhost:8000/html/" + articleViewer.feedid + "/" + articleid : "";
                property string html: controller.getArticle(articleViewer.feedid, articleid)
                ArticleDisplay {
                    zoomEnabled: articleViewer.zoomEnabled;
                    property bool vertPanningEnabled: articleViewer.vertPanningEnabled;

                    states: [ State {
                        name: 'articleIsRead';
                        when: articleView.visible && articleView.currentIndex == index;
                        StateChangeScript {
                            name: "myScript"
                            script: {
                                flipItem.url=path; //"http://localhost:8000/html/" + articleViewer.feedid + "/" + articleid;
                                controller.setEntryRead(articleViewer.feedid, articleid)
                            }
                        }
                    }, State {
                        name: 'articleIsClose'; when: articleView.visible && Math.abs(articleView.currentIndex-index)<2;
                        StateChangeScript {
                            script: { flipItem.url=path; } //"http://localhost:8000/html/" + articleViewer.feedid + "/" + articleid;}
                        }
                    }
                    ]
                }
            }

            // 'list' part: one row in the article title listing.
            Item { Package.name: 'list';
                id: wrapper; width: articleViewer.width; height: 86
                Item {
                    id: moveMe
                    Rectangle { id: backRect; color: "black"; opacity: index % 2 ? 0.2 : 0.4; height: 84; width: wrapper.width; y: 1 }
                    Text {
                        anchors.fill: backRect
                        anchors.margins: 5
                        verticalAlignment: Text.AlignVCenter; text: title; color: (unread=="True") ? "white" : "#7b97fd";
                        width: wrapper.width; wrapMode: Text.WordWrap; font.bold: false;
                    }
                }
                MouseArea { anchors.fill: wrapper;
                    onClicked: { articleView.positionViewAtIndex(index, ListView.Contain); articleView.visible = true; }
                }
            }
        }
        model: articles
    }

    XmlListModel {
        id: articles

        //source: articleViewer.feedid == "" ? "" : "http://localhost:8000/articles/" + feedid + "?onlyUnread=" + hideReadArticles
        //xml: articleViewer.feedid == "" ? "" : controller.getArticlesXml(articleViewer.feedid)
        query: "/xml/article"

        XmlRole { name: "title"; query: "title/string()" }
        XmlRole { name: "articleid"; query: "articleid/string()"; isKey: true }
        XmlRole { name: "path"; query: "path/string()" }
        XmlRole { name: "unread"; query: "unread/string()"; isKey: true}
    }


}
--- /dev/null
import Qt 4.7

// Article listing for one feed, backed by an HTTP XML source.
// NOTE(review): this component looks legacy/unused -- the ListView's model
// 'articlesModel', the 'hideReadArticles' in the source URL, the
// 'model.article.*' role accesses and 'container' in the click handler are
// not declared in this file; presumably supplied as context properties (or
// dead code). Confirm before relying on it.
Item {
    //anchors.fill: parent;
    width: parent.width;
    property string feedid : ""
    property alias count: articles.count
    property alias url: articles.source

    x: parent.width; height: parent.height;
    anchors.top: parent.top; anchors.bottom: parent.bottom

    function getArticleid(index) {
        return articles.get(index).articleid
    }

    function reload() {
        //articlesModel.reload()
    }

    ListView {
        id: articleList; model: articlesModel; delegate: articleDelegate; z: 6
        width: parent.width; height: parent.height; /*x: 0;*/
        cacheBuffer: 100;
        flickDeceleration: 1500
    }

    XmlListModel {
        id: articles

        source: feedid == "" ? "" : "http://localhost:8000/articles/" + feedid + "?onlyUnread=" + hideReadArticles
        query: "/xml/article"

        XmlRole { name: "title"; query: "title/string()" }
        XmlRole { name: "articleid"; query: "articleid/string()"; isKey: true }
        XmlRole { name: "path"; query: "path/string()" }
        XmlRole { name: "unread"; query: "unread/string()"; isKey: true}
    }

    Component {
        id: articleDelegate

        // One article row: alternating translucent background, title tinted
        // by read/unread state.
        Item {
            id: wrapper; width: wrapper.ListView.view.width; height: 86
            Item {
                id: moveMe
                Rectangle { id: backRect; color: "black"; opacity: index % 2 ? 0.2 : 0.4; height: 84; width: wrapper.width; y: 1 }
                Text {
                    anchors.fill: backRect
                    anchors.margins: 5
                    verticalAlignment: Text.AlignVCenter; text: title; color: (model.article.unread=="True") ? "white" : "#7b97fd";
                    width: wrapper.width; wrapMode: Text.WordWrap; font.bold: false;
                }
//                Rectangle {
//                    x: 3; y: 4; width: 77; height: 77; color: "#ff0000"; smooth: true
//                }

//                Column {
//                    x: 3;
//                    width: wrapper.width - 3; y: 5; spacing: 2
//                    height: parent.height;
//                    Text { Rectangle {anchors.fill: parent; color: "white"; opacity: 0.5;}
//                        verticalAlignment: Text.AlignVCenter; text: model.article.title; color: (model.article.unread=="True") ? "white" : "#7b97fd"; width: parent.width; wrapMode: Text.WordWrap; font.bold: false; /*elide: Text.ElideRight;*/ /*style: Text.Raised;*/ styleColor: "black"; }
//                    //Text { text: feedname; width: parent.width; elide: Text.ElideLeft; color: "#cccccc"; style: Text.Raised; styleColor: "black" }
//                }
            }
            MouseArea {
                anchors.fill: wrapper;
                onClicked: {
                    container.articleClicked(model.article.articleid, index)
                }
            }
        }

    }

}
--- /dev/null
import Qt 4.7

// Category listing; in edit mode each row shows edit/delete buttons,
// otherwise a tap opens the category (via the enclosing 'container').
Item {
//    anchors.fill: parent;
    width: parent.width; height: parent.height;
    //anchors.top: parent.top; anchors.bottom: parent.bottom
    property bool inEditMode: true

    function reload() {
        categories.reload();
    }

    ListView {
        id: categoryList; model: categories; delegate: categoryDelegate; z: 6;
        cacheBuffer: 100; width: parent.width; height: parent.height;
    }

    XmlListModel {

        id: categories

        xml: controller.getCategoryXml()
        query: "/xml/category"

        // The <catname> element is exposed under the role name 'title'.
        XmlRole { name: "title"; query: "catname/string()" }
        XmlRole { name: "catid"; query: "catid/string()"; isKey: true }
    }

    Component {
        id: categoryDelegate

        Item {

            id: wrapper; width: wrapper.ListView.view.width; height: 86
            Item {
                id: moveMe
                height: parent.height
                Rectangle { color: "black"; opacity: index % 2 ? 0.2 : 0.4; height: 84; width: wrapper.width; y: 1 }
                Rectangle {
                    x: 6; y: 4; width: 77; height: parent.height - 9; color: "white"; smooth: true

                }
                Column {
                    x: 92; width: wrapper.ListView.view.width - 95; y: 15; spacing: 2
                    Text { text: title; color: "white"; width: parent.width; font.bold: true; elide: Text.ElideRight; style: Text.Raised; styleColor: "black" }
                    //Text { text: feedname; width: parent.width; elide: Text.ElideLeft; color: "#cccccc"; style: Text.Raised; styleColor: "black" }
                }
                // Edit button (edit mode only).
                Item {
                    x: wrapper.ListView.view.width - 128; y: 12
                    height: 58; width: 58;
                    //anchors.horizontalCenter: parent.horizontalCenter;
                    Image { source: "common/images/wmEditIcon.png" }
                    MouseArea {
                        // Bug fix: the model role is named 'title' (mapped
                        // from <catname> above); 'catname' was undefined here.
                        anchors.fill: parent; onClicked: { container.categoryEdit(title, catid); }
                    }
                    visible: inEditMode
                }
                // Delete button (edit mode only).
                Item {
                    x: wrapper.ListView.view.width - 64; y: 12
                    height: 58; width: 58;
                    //anchors.horizontalCenter: parent.horizontalCenter;
                    Image { source: "common/images/delete.png" }
                    MouseArea {
                        anchors.fill: parent; onClicked: { container.categoryDeleted(catid); }
                    }
                    visible: inEditMode
                }
            }
            MouseArea { enabled: !inEditMode; anchors.fill: wrapper; onClicked: { container.categoryClicked(catid); } }
        }
    }
}
--- /dev/null
+import Qt 4.7
+import "common" as Common
+// Depends on qt4-declarative-qmlviewer
+
+Item {
+ width: 480
+ height: 640
+ anchors.fill: parent
+ id: screen
+
+ Rectangle {
+ id: container
+ anchors.fill: parent; color: "#343434";
+ anchors.centerIn: parent
+ //transformOrigin: Item.Center
+ property bool editMode: false
+ property bool inPortrait: width < height
+
+ function categoryClicked(catid) {
+ feedsItem.catid = catid;
+ feedsItem.reload();
+ categoriesItem.isShown = false;
+ feedsItem.visible = true;
+ }
+
+ function feedClicked(feedid, updating) {
+ flipper.feedid = feedid;
+ flipper.reload();
+ toolBar.feedUpdating = updating;
+ flipper.visible = true;
+ }
+
        // Navigate one level back: open article -> article list -> feed
        // list -> category list -> quit.
        function backClicked() {
            if (flipper.visible && flipper.articleShown) {
                // We're viewing an article, and going back to article listing
                flipper.articleShown = false;
                flipper.reload()
                //flipper.articleid = "";
                //flipper.value = 1;
                //articlesItem.reload()
                return;
            }
            if (flipper.visible) {
                // Article list -> feed list; clear the flipper's feed.
                feedsItem.reload();
                toolBar.feedUpdating = false;
                flipper.visible = false;
                flipper.feedid = "";
                flipper.reload();
                return;
            }

            if (feedsItem.visible) {
                // Viewing feeds, going back to categories
                //feedsItem.catid = "";
                feedsItem.visible = false;
                //feedsItem.reload();
                categoriesItem.isShown = true;
                return;
            }
            if (!feedsItem.visible) {
                // Viewing categories, quitting
                Qt.quit();
            }
        }
+
+ function categoryDeleted(catid) {
+ confirmationMessage.catid=catid;
+ confirmationMessage.state="deleteCat";
+ }
+
+ function feedDeleted(catid, feedid) {
+ confirmationMessage.catid=catid;
+ confirmationMessage.feedid=feedid;
+ confirmationMessage.state="deleteFeed";
+ }
+
+ function feedEdit(feedname, feedid, url) {
+ addFeed.feedEdit = true;
+ addFeed.feedName = feedname;
+ addFeed.feedUrl = url;
+ addFeed.visible = true;
+ }
+
+ function addCategory(categoryName) {
+ controller.addCategory(categoryName)
+ categoriesItem.reload();
+ addCat.visible=false;
+ }
+
+ function addFeed(catid, feedName, feedURL) {
+ controller.addFeed(feedName, feedURL, catid)
+ var doc = new XMLHttpRequest();
+ feedsItem.reload();
+ addFeedDialog.visible=false;
+ }
+
+ function updateClicked(feedid) {
+ controller.updateFeed(feedid);
+ }
+
+ function updateAllClicked() {
+ controller.updateAll();
+ }
+
+ Common.Menu {
+ id: config
+ z: 5
+ property string hideReadFeeds : "False"
+ property string hideReadArticles : "False"
+
+ property bool isShown: false;
+
+ //width: parent.width; height: parent.height;
+
+ //height: 0
+ states: State {
+ name: "shown"; when: config.isShown == true
+ PropertyChanges { target: config; y: 66 }
+ }
+
+ transitions: Transition {
+ NumberAnimation { properties: "y"; duration: 300; easing.type: "InOutQuad" }
+ }
+
+ }
+
+ Common.ConfirmationMessage {
+ id: confirmationMessage;
+ property string catid: "";
+ property string feedid: "";
+
+ function action() {
+ if (state=="markAll") {
+ flipper.markAllAsRead();
+ state="hidden"
+ feedsItem.reload()
+ return;
+ }
+ if (state=="deleteCat") {
+ var doc = new XMLHttpRequest();
+ var url = "http://localhost:8000/deleteCat/"+catid
+ doc.open("GET", url);
+ doc.send();
+ categoriesItem.reload();
+ state="hidden";
+ return;
+ }
+ if (state=="deleteFeed") {
+ var doc = new XMLHttpRequest();
+ var url = "http://localhost:8000/deleteFeed/"+catid+"/"+feedid
+ doc.open("GET", url);
+ doc.send();
+ feedsItem.reload();
+ state="hidden";
+ return;
+ }
+ }
+ visible: false
+ onOkClicked: action()
+ onCancelClicked: visible=false
+ state: "hidden"
+ states: [ State {name: "markAll";
+ PropertyChanges { target: confirmationMessage; text: qsTr("Do you want to mark all items as read?") }
+ PropertyChanges { target: confirmationMessage; visible: true; }
+
+ }, State {name: "deleteCat";
+ PropertyChanges { target: confirmationMessage; text: qsTr("Do you want to delete this category?") }
+ PropertyChanges { target: confirmationMessage; visible: true; }
+ }, State {name: "deleteFeed";
+ PropertyChanges { target: confirmationMessage; text: qsTr("Do you want to delete this feed and all its articles?") }
+ PropertyChanges { target: confirmationMessage; visible: true; }
+ }, State {name: "hidden";
+ PropertyChanges { target: confirmationMessage; visible: false; }
+ }
+ ]
+
+ }
+
+ Common.ToolBar {
+ id: toolBar; z: 7
+ height: 66; anchors.top: parent.top; width: parent.width; opacity: 0.9
+ menuLabel: qsTr("Config"); backLabel: qsTr("Back")
+ nextLabel: qsTr("Next"); prevLabel: qsTr("Previous")
+ markAllLabel: qsTr("Mark All As Read"); zoomLabel: qsTr("Zoom")
+ taskSwitcherLabel: qsTr("Task Switch")
+ onMenuClicked: config.isShown = !config.isShown;
+ onBackClicked: container.backClicked()
+ onPrevClicked: flipper.prev();
+ onNextClicked: flipper.next();
+ onMarkAllClicked: {
+ confirmationMessage.state = "markAll";
+ }
+ onZoomClicked: { flipper.zoomEnabled = !flipper.zoomEnabled; }
+ onTaskSwitcherClicked: {
+ var doc = new XMLHttpRequest();
+ var url = "http://localhost:8000/task"
+ doc.open("GET", url);
+ doc.send();
+ }
+ onAddClicked: {
+ if (feedsItem.visible) {
+ addFeedDialog.feedName="";
+ addFeedDialog.catid = feedsItem.catid;
+ addFeedDialog.visible = true;
+ return;
+ }
+ if (categoriesItem.visible) {
+ addCat.catName="";
+ addCat.visible=true;
+ return;
+ }
+ }
+ onUpdateClicked: {
+ if (flipper.visible) {
+ toolBar.feedUpdating = true
+ container.updateClicked(flipper.feedid);
+ } else {
+ container.updateAllClicked();
+ }
+ }
+
+ states: [ State {
+ name: "navButtons"; when: flipper.articleShown
+ PropertyChanges { target: toolBar; nextVisible: !container.inPortrait; }
+ PropertyChanges { target: toolBar; prevVisible: !container.inPortrait; }
+ //PropertyChanges { target: toolBar; zoomVisible: true; }
+ PropertyChanges { target: toolBar; addVisible: false; }
+ },
+ State {
+ name: "feedButtons"; when: (flipper.visible)&&(!flipper.articleShown)
+ PropertyChanges { target: toolBar; markAllVisible: true; }
+ PropertyChanges { target: toolBar; addVisible: false; }
+ PropertyChanges { target: toolBar; updateVisible: true; }
+ },
+ State {
+ name: "quitButton"; when: (!feedsItem.visible)
+ PropertyChanges { target: toolBar; quitVisible: true;}
+ PropertyChanges { target: toolBar; updateVisible: true; }
+ //PropertyChanges { target: toolBar; addVisible: true; }
+ }
+ ]
+ }
+
+ Item {
+ id: views
+ //x: 2;
+ //y:66;
+ width: parent.width // - 4
+ height: parent.height-toolBar.height;
+ anchors.top: toolBar.bottom; anchors.bottom: parent.bottom
+ y: toolBar.height;
+
+ Common.AddCat {
+ visible: false;
+ id: addCat
+ width: parent.width;
+ height: parent.height;
+ z: 10;
+ }
+
+ Common.AddFeed {
+ visible: false;
+ id: addFeedDialog
+ width: parent.width;
+ height: parent.height;
+ z: 10;
+ }
+
+ Timer {
+ function checkUpdates() {
+ if (categoriesItem.visible && !feedsItem.visible) {
+ var doc = new XMLHttpRequest();
+ var url = "http://localhost:8000/isUpdating/"
+ doc.onreadystatechange = function() {
+ if (doc.readyState == XMLHttpRequest.DONE) {
+ var xmlDoc = doc.responseXML.documentElement;
+ //var els = xmlDoc.getElementsByTagName("updating");
+ var isUpdating = xmlDoc.firstChild.firstChild.nodeValue;
+
+ //console.log(isUpdating);
+ if (isUpdating=="True") {
+ toolBar.feedUpdating = true;
+ } else {
+ if (toolBar.feedUpdating) {
+ // We changed from updating to not updating, so we reload the listing
+ toolBar.feedUpdating = false;
+ categoriesItem.reload();
+ }
+ }
+ var commands = xmlDoc.lastChild.childNodes;
+ for (var ii = 0; ii < commands.length; ++ii) {
+ // process the commands
+ var command = commands[ii].attributes[0].value; //("c")
+ //console.log(command)
+ if (command=="openFeed") {
+ // Open feed feed
+ var catid = commands[ii].attributes[1].value;
+ var feedid = commands[ii].firstChild.nodeValue;
+ if (!flipper.visible) {
+ container.categoryClicked(catid);
+ container.feedClicked(feedid,false);
+ console.log("feedid: " + feedid);
+ }
+ }
+ if (command=="openArticle") {
+ // Open feed and article
+ var catid = commands[ii].attributes[1].value;
+ var feedid = commands[ii].attributes[2].value; //("key");
+ var articleid = commands[ii].firstChild.nodeValue;
+ if (!flipper.visible) {
+ container.categoryClicked(catid);
+ container.feedClicked(feedid,false);
+ flipper.viewArticle(articleid)
+ }
+ }
+ if (command=="addFeed") {
+ // Open the addFeed dialog
+ var url = commands[ii].firstChild.nodeValue;
+ //console.log("add: "+url)
+
+ }
+ }
+
+ }
+ }
+ doc.open("GET", url);
+ doc.send();
+ //categoriesItem.reload()
+ }
+ if (feedsItem.visible && !flipper.visible) {
+ //feedsItem.reload()
+ }
+ if (flipper.visible) {
+ var doc = new XMLHttpRequest();
+ var url = "http://localhost:8000/isUpdating/" + flipper.feedid
+ doc.onreadystatechange = function() {
+ if (doc.readyState == XMLHttpRequest.DONE) {
+ var xmlDoc = doc.responseXML.documentElement;
+ var isUpdating = xmlDoc.firstChild.firstChild.nodeValue;
+ //console.log(isUpdating);
+ if (isUpdating=="True") {
+ toolBar.feedUpdating = true;
+ } else {
+ if (toolBar.feedUpdating) {
+ // We changed from updating to not updating, so we reload the listing
+ toolBar.feedUpdating = false;
+ flipper.reload();
+ }
+ }
+ }
+ }
+ doc.open("GET", url);
+ doc.send();
+
+ //flipper.reload()
+ }
+ }
+ interval: 2000; running: false; repeat: true
+ onTriggered: checkUpdates();
+ }
+
+ Categories {
+ // Loads the categoryList view and delegate
+ id: categoriesItem
+ property bool isShown: true;
+ inEditMode: container.editMode;
+
+ states: State {
+ name: "shown"; when: categoriesItem.isShown == false
+ PropertyChanges { target: categoriesItem; x: -screen.width }
+ }
+
+ transitions: Transition {
+ NumberAnimation { properties: "x"; duration: 300; easing.type: "InOutQuad" }
+ }
+
+ }
+
+ Feeds {
+
+ // Loads the feedList view and delegate
+ id: feedsItem;
+ property string hideReadFeeds: config.hideReadFeeds
+ visible: false;
+ inEditMode: container.editMode;
+
+ states: [
+ State { name: "articlesShown"; when: flipper.visible; PropertyChanges { target: feedsItem; x: -parent.width } },
+ State { name: "shown"; when: feedsItem.visible; PropertyChanges { target: feedsItem; x: 0 } }
+ ]
+
+ transitions: Transition {
+ NumberAnimation { properties: "x"; duration: 300; easing.type: "InOutQuad" }
+ }
+
+ }
+
+ ArticleViewer {
+ id: flipper
+ visible: false;
+ property string hideReadArticles: config.hideReadArticles
+ property string feedid: ""
+ x: parent.width
+
+ states: State { name: "shown"; when: flipper.visible; PropertyChanges { target: flipper; x: 0 }
+ }
+
+ transitions: Transition {
+ NumberAnimation { properties: "x"; duration: 300; easing.type: "InOutQuad" }
+ }
+ }
+ }
+
+// Text {
+// x: container.width/2
+// y:container.height/2
+// text: runtime.orientation;
+// }
+
+}
+}
--- /dev/null
/* File generated by QtCreator */
/* QmlProject manifest: tells Qt Creator / qmlviewer which files belong to
   this project.  It is a build/IDE aid only and is not read at runtime. */

import QmlProject 1.0

Project {
    /* Include .qml, .js, and image files from current directory and subdirectories */
    QmlFiles {
        directory: "."
    }
    JavaScriptFiles {
        directory: "."
    }
    ImageFiles {
        directory: "."
    }
    /* List of plugin directories passed to QML runtime */
    // importPaths: [ "../exampleplugin" ]
}
--- /dev/null
+<!DOCTYPE QtCreatorProject>
+<qtcreator>
+ <data>
+ <variable>ProjectExplorer.Project.ActiveTarget</variable>
+ <value type="int">0</value>
+ </data>
+ <data>
+ <variable>ProjectExplorer.Project.EditorSettings</variable>
+ <valuemap type="QVariantMap">
+ <value key="EditorConfiguration.Codec" type="QByteArray">Default</value>
+ </valuemap>
+ </data>
+ <data>
+ <variable>ProjectExplorer.Project.Target.0</variable>
+ <valuemap type="QVariantMap">
+ <value key="ProjectExplorer.ProjectConfiguration.DefaultDisplayName" type="QString">QML Viewer</value>
+ <value key="ProjectExplorer.ProjectConfiguration.DisplayName" type="QString">QML Viewer</value>
+ <value key="ProjectExplorer.ProjectConfiguration.Id" type="QString">QmlProjectManager.QmlTarget</value>
+ <value key="ProjectExplorer.Target.ActiveBuildConfiguration" type="int">-1</value>
+ <value key="ProjectExplorer.Target.ActiveDeployConfiguration" type="int">-1</value>
+ <value key="ProjectExplorer.Target.ActiveRunConfiguration" type="int">0</value>
+ <value key="ProjectExplorer.Target.BuildConfigurationCount" type="int">0</value>
+ <value key="ProjectExplorer.Target.DeployConfigurationCount" type="int">0</value>
+ <valuemap key="ProjectExplorer.Target.RunConfiguration.0" type="QVariantMap">
+ <value key="ProjectExplorer.ProjectConfiguration.DefaultDisplayName" type="QString"></value>
+ <value key="ProjectExplorer.ProjectConfiguration.DisplayName" type="QString">QML Viewer</value>
+ <value key="ProjectExplorer.ProjectConfiguration.Id" type="QString">QmlProjectManager.QmlRunConfiguration</value>
+ <value key="QmlProjectManager.QmlRunConfiguration.MainScript" type="QString">CurrentFile</value>
+ <value key="QmlProjectManager.QmlRunConfiguration.QDeclarativeViewerArguments" type="QString"></value>
+ <value key="QmlProjectManager.QmlRunConfiguration.QtVersion" type="int">4</value>
+ <value key="RunConfiguration.QmlDebugServerPort" type="uint">3768</value>
+ <value key="RunConfiguration.UseCppDebugger" type="bool">false</value>
+ <value key="RunConfiguration.UseQmlDebugger" type="bool">true</value>
+ </valuemap>
+ <value key="ProjectExplorer.Target.RunConfigurationCount" type="int">1</value>
+ </valuemap>
+ </data>
+ <data>
+ <variable>ProjectExplorer.Project.TargetCount</variable>
+ <value type="int">1</value>
+ </data>
+ <data>
+ <variable>ProjectExplorer.Project.Updater.EnvironmentId</variable>
+ <value type="QString">{6449687d-a4d3-4afc-95ac-89e1027ef47e}</value>
+ </data>
+ <data>
+ <variable>ProjectExplorer.Project.Updater.FileVersion</variable>
+ <value type="int">8</value>
+ </data>
+</qtcreator>
--- /dev/null
import Qt 4.7

// Feeds page: lists the feeds that belong to one category.
// NOTE(review): this component references ids resolved dynamically from the
// enclosing FeedingIt view (controller, container, feedsItem, config) and
// therefore cannot be previewed standalone.
Item {
    //anchors.fill: parent;
    width: parent.width;
    // Category whose feeds are shown; "" means no category selected yet.
    property string catid : ""
    // When true, a delete button is shown on each feed row.
    property bool inEditMode: true
    x: parent.width; height: parent.height;
    anchors.top: parent.top; anchors.bottom: parent.bottom

    // Refresh the model by asking the controller for this category's feed XML.
    function reload() {
        feeds.xml = catid == "" ? "" : controller.getFeedsXml(catid);
        //feeds.reload()
    }

    //Component.onCompleted: { console.log(x + " /") }

    ListView {
        id: feedList; model: feeds; delegate: feedDelegate; z: 6
        width: parent.width; height: parent.height; /*x: 0;*/
        cacheBuffer: 100;
        flickDeceleration: 1500
    }

    // Parses the <xml><feed>...</feed></xml> document supplied by reload().
    // Roles flagged isKey cause a delegate refresh when their value changes.
    XmlListModel {

        id: feeds

        //source: catid == "" ? "" : "http://localhost:8000/feeds/" + catid //+ "?onlyUnread=" + parent.hideReadArticles
        //xml: catid == "" ? "" : controller.getFeedsXml(catid)
        query: "/xml/feed"

        XmlRole { name: "title"; query: "feedname/string()" }
        XmlRole { name: "feedid"; query: "feedid/string()"; isKey: true }
        XmlRole { name: "unread"; query: "unread/string()"; isKey: true }
        XmlRole { name: "updatedDate"; query: "updatedDate/string()" }
        XmlRole { name: "icon"; query: "icon/string()" }
        XmlRole { name: "updating"; query: "updating/string()"; isKey: true }
        //XmlRole { name: "url"; query: "url/string()"; }
    }

    Component {
        id: feedDelegate

        // One row per feed: icon (spinner while updating), title, update
        // date + unread count, plus an edit-mode-only delete button.
        Item {
            id: wrapper; width: wrapper.ListView.view.width;
            // Collapse rows of fully-read feeds when "hide read feeds" is on.
            visible: (unread == "0" && feedsItem.hideReadFeeds=="True") ? false : true
            height: (visible) ? 86 : 0

            Item {
                id: moveMe
                // Alternating row background.
                Rectangle { color: "black"; opacity: index % 2 ? 0.2 : 0.4; height: 84; width: wrapper.width; y: 1 }
                Rectangle {
                    x: 3; y: 4; width: 77; height: 77; color: "#000000"; smooth: true
                    Image { width:32; height: 32; anchors.verticalCenter: parent.verticalCenter; anchors.horizontalCenter: parent.horizontalCenter;
                        // Spinner while the feed updates, otherwise the feed's
                        // own icon (app icon if the feed has none, i.e. "False").
                        source: (updating=="True")? "common/images/loading.png" : (icon == "False") ? "common/images/feedingit.png" : icon;
                        NumberAnimation on rotation {
                            from: 0; to: 360; running: (updating=="True"); loops: Animation.Infinite; duration: 900
                        }
                    }
                }

                Column {
                    x: 92; width: wrapper.ListView.view.width - 95; y: 5; spacing: 2
                    Text { text: title; color: "white"; width: parent.width; font.bold: true; elide: Text.ElideRight; style: Text.Raised; styleColor: "black" }
                    // Unread count is highlighted blue when non-zero.
                    Text { text: updatedDate + " / " + qsTr("%1 unread items").arg(unread); color: (unread=="0") ? "white" : "#7b97fd"; width: parent.width; font.bold: false; elide: Text.ElideRight; style: Text.Raised; styleColor: "black" }
                    //Text { text: feedname; width: parent.width; elide: Text.ElideLeft; color: "#cccccc"; style: Text.Raised; styleColor: "black" }
                }
//                Item {
//                    x: wrapper.ListView.view.width - 128; y: 12
//                    height: 58; width: 58;
//                    //anchors.horizontalCenter: parent.horizontalCenter;
//                    Image { source: "common/images/wmEditIcon.png" }
//                    MouseArea {
//                        anchors.fill: parent; onClicked: { container.feedEdit(feedname, feedid, url); }
//                    }
//                    visible: inEditMode
//                }
                // Delete button, shown only in edit mode.
                Item {
                    x: wrapper.ListView.view.width - 64; y: 12
                    height: 58; width: 58;
                    //anchors.horizontalCenter: parent.horizontalCenter;
                    Image { source: "common/images/delete.png" }
                    MouseArea {
                        anchors.fill: parent; onClicked: { container.feedDeleted(feedid); }
                    }
                    visible: inEditMode
                }
            }
            MouseArea {
                anchors.fill: wrapper;
                onClicked: {
                    // NOTE(review): the model declares no "feed" role, so
                    // model.feed looks undefined here — confirm whether
                    // feedid was intended.
                    controller.feedClicked(model.feed)
                    container.feedClicked(feedid, updating=="True")

                }
            }
        }

    }

}
--- /dev/null
import QtQuick 1.1
import com.meego 1.0

// Demo page: a hidden "Hello world!" label revealed by pressing the
// button placed directly underneath it.
Page {
    id: mainPage
    tools: commonTools

    Label {
        id: label
        visible: false
        text: qsTr("Hello world!")
        anchors.centerIn: parent
    }

    Button {
        text: qsTr("Click here!")
        anchors {
            horizontalCenter: parent.horizontalCenter
            top: label.bottom
            topMargin: 10
        }
        onClicked: label.visible = true
    }
}
--- /dev/null
import Qt 4.7
import QtWebKit 1.0
import "common" as Common

// Prototype article view: a web page inside a Flickable with a zoom
// slider overlaid near the bottom.
Rectangle {
    width: 380
    height: 480

    //anchors.top: parent.top; anchors.bottom: parent.bottom;
    color: "white";

    // Target page; currently unused — webView.url is hard-coded below.
    property string url: "";
    Flickable {
        id: flickable
        //anchors.fill: screen;
        height: parent.height;
        width: parent.width;
        // NOTE(review): unqualified "scale" here resolves against the
        // Flickable itself, unlike contentHeight below which uses
        // webView.scale — confirm webView.scale was intended.
        contentWidth: webView.width*scale; //Math.max(screen.width,webView.width*webView.scale)
        contentHeight: Math.max(parent.height,webView.height*webView.scale)

        WebView {
            id: webView
            url: "http://www.google.com";
            //url: "/home/user/.feedingit/640fb167aca8bf5318ed721c5162f5eb.d/56a86b6b1675716ab54db83b1a78ab4c.html"
            preferredWidth: flickable.width
            preferredHeight: flickable.height
            settings.defaultFontSize: 32
            // Zoom factor is driven by the slider below.
            scale: slider.value;
            //smooth: false
            //width: 200
            //width: parent.width; height: parent.height;
//            Rectangle {
//                color: "#10000000"
//                anchors.fill: parent
//            }
            //onLoadFinished: {console.log("Hello"); url="javascript:void(document.body.style.background='red');" }
            onLoadFinished: {console.log(url);/* url="javascript:(function() { " +
                "document.getElementsByTagName('body')[0].style.background = 'red'; " +
                "})()"; console.log(url);*/ /*heuristicZoom(0,0,100)*/ }
        }
    }
    Common.Slider {
        id: slider; visible: true
        minimum: 0.2;
        maximum: 2;
        value: 1
        // Previous zoom factor; lets onValueChanged keep the visible
        // midpoint stable while rescaling.
        property real prevScale: 1
        anchors {
            bottom: parent.bottom; bottomMargin: 65
            left: parent.left; leftMargin: 25
            right: parent.right; rightMargin: 25
        }
        onValueChanged: {
            // Re-centre the flickable so zooming keeps the midpoint in view.
            if (webView.width * value > flickable.width) {
                var xoff = (flickable.width/2 + flickable.contentX) * value / prevScale;
                flickable.contentX = xoff - flickable.width/2;
            }
            if (webView.height * value > flickable.height) {
                var yoff = (flickable.height/2 + flickable.contentY) * value / prevScale;
                flickable.contentY = yoff - flickable.height/2;
            }
            prevScale = value;
        }
        // Bounce the value so the change handler runs once at startup.
        Component.onCompleted: { value=0; value=1; }
    }
}
--- /dev/null
import Qt 4.7

// Dialog for creating a new category.  The owner toggles `visible` to
// show or hide it and reads the typed name through the `catName` alias;
// pressing OK forwards the name to container.addCategory().
Rectangle {
    id: addCat;
    width: 200 //parent.width
    height: 172
    color: "white"

    // Text currently entered in the name field.
    property alias catName: nameInput.text

    // Swallow clicks so they do not reach items beneath the dialog.
    MouseArea { anchors.fill: parent; onClicked: {} }

    Column {
        Row {
            height: 86;
            width: addCat.width
            Text {
                text: qsTr("Category name:")
                anchors.verticalCenter: parent.verticalCenter
            }
            LineInput {
                id: nameInput
                focus: true
                width: 140
                anchors.centerIn: parent
            }
        }
        Row {
            width: addCat.width
            Button {
                id: okButton
                text: qsTr("OK")
                anchors.margins: 5; y: 3; width: 80; height: 60
                onClicked: container.addCategory(nameInput.text)
            }
            Button {
                id: cancelButton
                text: qsTr("Cancel")
                anchors.margins: 5; y: 3; width: 80; height: 60
                onClicked: addCat.visible = false;
            }
        }
    }

}
--- /dev/null
import Qt 4.7

// Dialog for subscribing a new feed to a category: name and URL fields
// plus OK/Cancel.  The owner toggles `visible` and receives the result
// through container.addFeed().
Rectangle {
    id: addFeed;
    width: 500 //parent.width
    height: 172
    color: "white"
    // Text of the name field (alias onto the LineInput below).
    property alias feedName: feedName.text
    // Category the new feed will be attached to; set by the opener.
    property string catid
    // Current contents of the URL field.
    property string feedUrl: feedURL.text
    //property boolean feedEdit: false;

    // Swallow clicks so they do not fall through to the view underneath.
    MouseArea { anchors.fill: parent; onClicked: {} }
    Column {
        Row {
            width: addFeed.width
            height: 86;
            Text { anchors.verticalCenter: parent.verticalCenter; text: qsTr("Feed name:") }
            LineInput{
                id: feedName
                anchors.centerIn: parent
                width: 140
                focus: true
            }
        }
        Row {
            width: addFeed.width
            height: 86;
            Text { anchors.verticalCenter: parent.verticalCenter; text: qsTr("Feed URL:") }
            LineInput{
                id: feedURL
                anchors.centerIn: parent
                width: 140
                focus: true
                // Pre-fill the scheme so users only type the host/path.
                text: "http://"
            }
        }
        Row {
            width: addFeed.width
            Button {
                id: ok
                text: qsTr("OK")
                anchors.margins: 5; y: 3; width: 80; height: 60
                onClicked: container.addFeed(catid, feedName.text, feedURL.text)
            }
            Button {
                id: cancel
                text: qsTr("Cancel")
                anchors.margins: 5; y: 3; width: 80; height: 60
                onClicked: addFeed.visible=false;
            }
        }
    }
}
--- /dev/null
import Qt 4.7

// Generic toolbar push-button: themed border image, an optional
// (rotatable) icon or a text caption, and a pressed-state overlay.
Item {
    id: container

    // Emitted when the button is clicked.
    signal clicked

    // Caption; only drawn when no imageSource is set.
    property string text
    // Icon path; empty string means a text-only button.
    property string imageSource: ""
    // Static rotation applied to the icon (lets one arrow image serve
    // several directions).
    property int imageRotation: 0

    // Exposed so owners can animate the icon (e.g. an update spinner).
    property alias iconRotation: icon.rotation

    BorderImage {
        id: buttonImage
        source: "images/toolbutton.sci"
        width: container.width; height: container.height
        //visible: (container.imageSource=="")
    }
    // Same image stacked on top; its opacity flips to 1 while pressed.
    BorderImage {
        id: pressed
        opacity: 0
        source: "images/toolbutton.sci"
        width: container.width; height: container.height
        //visible: (container.imageSource=="")
    }
    Image {
        id: icon
        source: container.imageSource
        rotation: container.imageRotation
        //fillMode: Image.PreserveAspectFit
        smooth: true
        anchors.centerIn: buttonImage;
        //width: container.width; height: container.height
    }
    MouseArea {
        id: mouseRegion
        anchors.fill: buttonImage
        onClicked: { container.clicked(); }
    }
    Text {
        color: "white"
        anchors.centerIn: buttonImage; font.bold: true
        text: container.text; style: Text.Raised; styleColor: "black"
        visible: (container.imageSource=="")
    }
    states: [
        State {
            name: "Pressed"
            when: mouseRegion.pressed == true
            PropertyChanges { target: pressed; opacity: 1 }
        }
    ]
}
--- /dev/null
import Qt 4.7

// Yes/no confirmation overlay: a question above OK and Cancel buttons.
// Owners connect to okClicked/cancelClicked and toggle `visible`.
Rectangle {
    id: confirmationMessage
    signal okClicked
    signal cancelClicked

    // Question text shown above the buttons.
    property alias text: question.text

    border.color: "black";
    border.width : 4;
    radius: 10;
    color: "white"
    // NOTE(review): anchors.fill below overrides these fixed dimensions,
    // so the dialog actually covers its whole parent.
    height: 160;
    width: 160;
    z: 10;
    anchors.fill: parent

    Text {
        id: question
        text: qsTr("Are you sure?")
        width: parent.width; height: 80
        horizontalAlignment: Text.AlignHCenter
        verticalAlignment: Text.AlignVCenter
        anchors.top: parent.top
        //anchors.bottom: parent.bottom
        anchors.margins: 10;
        //anchors.verticalCenter: parent.verticalCenter
    }

    Button {
        id: ok
        text: qsTr("OK")
        width: parent.width/2 - 10;
        anchors.left: parent.left; anchors.margins: 5; y: 3; height: 60
        anchors.top: question.bottom
        //anchors.bottom: parent.bottom
        onClicked: confirmationMessage.okClicked()
    }

    Button {
        id: cancel
        text: qsTr("Cancel")
        width: parent.width/2 - 10;
        anchors.right: parent.right; anchors.margins: 5; y: 3; height: 60
        anchors.top: question.bottom
        //anchors.bottom: parent.bottom
        anchors.left: ok.right
        onClicked: confirmationMessage.cancelClicked()
    }

}
--- /dev/null
import Qt 4.7

// Single-line text entry drawn over a themed border image.
// `text` and `maximumLength` are forwarded from the embedded TextInput.
FocusScope {
    width: 180; height: 28

    property alias text: entry.text
    property alias maximumLength: entry.maximumLength
    //anchors.centerIn: parent

    BorderImage {
        anchors.fill: parent
        source: "images/lineedit.sci"
    }

    TextInput {
        id: entry
        focus: true
        width: parent.width - 16
        anchors.centerIn: parent
        color: "#151515"; selectionColor: "green"
        font.pixelSize: 16; font.bold: true
    }
}
--- /dev/null
import Qt 4.7

// Category list harness.
// NOTE(review): `categories` (model) and `categoryDelegate` are not
// defined in this file; they must be provided by the loading context —
// this component cannot be previewed standalone.
Rectangle {
    width: 640
    height: 480

    ListView {
        id: categoryList; model: categories; delegate: categoryDelegate; z: 6;
        cacheBuffer: 100; width: parent.width; height: parent.height;
    }



}
--- /dev/null
import Qt 4.7

// Settings panel, slid down from the top of the main view.  Each row is
// a Switch bound to a string-valued ("True"/"False") config entry.
// Relies on ids resolved from the enclosing component: config,
// controller, container.
Item {
//    anchors.fill: parent;
    width: 300; //height: 0;
    //anchors.top: parent.top; anchors.bottom: parent.bottom
    // Parked above the visible area until the owner slides it in.
    y: -parent.height

    // Pull the persisted settings from the controller into the shared
    // `config` object.
    function getConfig() {
        config.hideReadFeeds = controller.getConfig("hideReadFeeds");
        config.hideReadArticles = controller.getConfig("hideReadArticles");

    }

    Switch {
        id: hideReadFeedsSwitch;
        text: qsTr("Hide Read Feeds");
        value: config.hideReadFeeds
        // Settings are stored as the strings "True"/"False"; toggle them.
        onClicked: config.hideReadFeeds = (config.hideReadFeeds == "False") ? "True" : "False"
    }

    Switch {
        id: hideReadArticlesSwitch;
        text: qsTr("Hide Read Articles");
        value: config.hideReadArticles
        onClicked: config.hideReadArticles = (config.hideReadArticles == "False") ? "True" : "False"
        anchors.top: hideReadFeedsSwitch.bottom
    }

    Switch {
        id: lockRotation;
        text: qsTr("Lock Rotation");
        value: container.lockRotation ? "True" : "False"
        // Locking pins the current orientation; unlocking returns to auto.
        onClicked: { container.lockRotation=!container.lockRotation;
            container.selectedOrientation = (container.lockRotation) ? container.activeOrientation : Orientation.UnknownOrientation }
        anchors.top: hideReadArticlesSwitch.bottom
    }

    Switch {
        id: editMode;
        text: qsTr("Enter Edit Mode");
        value: container.editMode ? "True" : "False"
        onClicked: { container.editMode=!container.editMode; }
        anchors.top: lockRotation.bottom
    }

    // Gradient bar acting as the panel's close button.
    Rectangle {
        id: closeButton
        height: 50;
        gradient: Gradient {
            GradientStop {
                position: 0.00;
                color: "#343434";
            }
            GradientStop {
                position: 1.00;
                color: "#ffffff";
            }
        }
        radius: 10;
        width: parent.width
        anchors.top: editMode.bottom

        MouseArea {
            id: mouseRegion
            anchors.fill: closeButton
            onClicked: { config.isShown = false }
        }
    }

//    ListView {
//        id: configList; model: configs; delegate: configDelegate; z: 6;
//        cacheBuffer: 100; width: parent.width; height: parent.height;
//    }

//    XmlListModel {

//        id: configs

//        //source: "http://api.flickr.com/services/feeds/photos_public.gne?"+(tags ? "tags="+tags+"&" : "")+"format=rss2"
//        //source: "/home/ymarcoz/feedlist.xml"
//        source: "http://localhost:8000/config"
//        query: "/xml/config"
//        //namespaceDeclarations: "declare namespace media=\"http://search.yahoo.com/mrss/\";"

//        XmlRole { name: "hideReadFeeds"; query: "hideReadFeeds/string()" }
//        XmlRole { name: "hideReadArticles"; query: "hideReadArticles/string()" }
//        //XmlRole { name: "catid"; query: "catid/string()"; isKey: true }

//    }

//    Component {
//        id: configDelegate

//        Item {

//            id: wrapper; width: wrapper.ListView.view.width; height: 86
//            Item {
//                id: moveMe
//                height: parent.height
//                Rectangle { color: "black"; opacity: index % 2 ? 0.2 : 0.4; height: 84; width: wrapper.width; y: 1 }
//                Rectangle {
//                    x: 6; y: 4; width: 77; height: parent.height - 9; color: "white"; smooth: true

//                }
//                Column {
//                    x: 92; width: wrapper.ListView.view.width - 95; y: 15; spacing: 2
//                    Text { text: title; color: "white"; width: parent.width; font.bold: true; elide: Text.ElideRight; style: Text.Raised; styleColor: "black" }
//                    //Text { text: feedname; width: parent.width; elide: Text.ElideLeft; color: "#cccccc"; style: Text.Raised; styleColor: "black" }
//                }
//            }
//            MouseArea { anchors.fill: wrapper; onClicked: { container.categoryClicked(catid); } }
//        }
//    }

    // Load the stored settings as soon as the panel exists.
    Component.onCompleted: getConfig();
}
--- /dev/null
import Qt 4.7

// Horizontal slider with a draggable handle.  `value` maps the handle's
// x position into [minimum, maximum] and may also be set from outside.
Item {
    id: slider; width: 340; height: 48

    // value is read/write.
    property real value
    // Keep the handle in sync when value is assigned programmatically.
    onValueChanged: { handle.x = 2 + (value - minimum) * slider.xMax / (maximum - minimum); }
    property real maximum: 1
    // NOTE(review): the default minimum == maximum (both 1) makes the
    // value<->x mapping divide by zero until the owner sets a real range
    // — confirm whether 0 was intended here.
    property real minimum: 1
    // Horizontal travel available to the handle, in pixels.
    property int xMax: slider.width - handle.width - 4

    // Track background.
    Rectangle {
        anchors.fill: parent
        border.color: "white"; border.width: 0; radius: 8
        gradient: Gradient {
            GradientStop { position: 0.0; color: "#66343434" }
            GradientStop { position: 1.0; color: "#66000000" }
        }
    }

    Rectangle {
        id: handle; smooth: true
        x: slider.width / 2 - handle.width / 2; y: 2; width: 30; height: slider.height-4; radius: 6
        gradient: Gradient {
            GradientStop { position: 0.0; color: "lightgray" }
            GradientStop { position: 1.0; color: "gray" }
        }

        MouseArea {
            anchors.fill: parent; drag.target: parent
            drag.axis: "XAxis"; drag.minimumX: 2; drag.maximumX: slider.xMax+2
            // Dragging the handle writes back into `value`.
            onPositionChanged: { value = (maximum - minimum) * (handle.x-2) / slider.xMax + minimum; }
        }
    }
}
--- /dev/null
import Qt 4.7

// Labelled on/off row used by the Config panel.  `value` is the string
// "True"/"False" (matching the stored config format); the switch holds
// no state itself — the owner flips the setting in response to
// `clicked` and the indicator follows the binding.
Item {
    id: container

    // Emitted when the row is tapped anywhere.
    signal clicked

    // Label shown on the left.
    property string text
    // "True"/"False"; drives the ON/OFF indicator on the right.
    property string value

    width: parent.width;
    height: 86;
    //anchors.fill: parent;

//    BorderImage {
//        id: buttonImage
//        source: "images/toolbutton.sci"
//        width: container.width; height: container.height
//    }
//    BorderImage {
//        id: pressed
//        opacity: 0
//        source: "images/toolbutton.sci"
//        width: container.width; height: container.height
//    }

    // Row background.
    Rectangle {
        id: back
        width: parent.width;
        height: 82;
        color: "#343434";
        border.width : 4;
        border.color: "black";
        radius: 10;
    }

    // Red/green indicator square with an ON/OFF caption.
    Rectangle {
        id: valueSwitch
        color: (value=="False") ? "red" : "green";
        border.width : 4;
        border.color: "black";
        radius: 10;
        height: 40;
        width: 40;
        anchors.verticalCenter: back.verticalCenter
        //anchors.verticalCenter: parent.verticalCenter
        anchors.margins: 10;
        anchors.right: back.right;
        Text {
            color: "white"
            anchors.centerIn: valueSwitch; font.bold: true
            text: (container.value == "False") ? "OFF" : "ON"; style: Text.Raised; styleColor: "black"
        }
    }

    MouseArea {
        id: mouseRegion
        anchors.fill: back
        onClicked: { container.clicked(); }
    }
    Text {
        color: "white"
        /*anchors.centerIn: back;*/ font.bold: true
        anchors.left: parent.left;
        anchors.margins: 10
        anchors.verticalCenter: back.verticalCenter
        text: container.text; style: Text.Raised; styleColor: "black"
    }
//    states: [
//        State {
//            name: "Pressed"
//            when: mouseRegion.pressed == true
//            PropertyChanges { target: pressed; opacity: 1 }
//        }
//    ]
}
--- /dev/null
import Qt 4.7

// Top toolbar of the main view.  Buttons are shown/hidden per screen
// through the *Visible aliases (driven by the owner's states); the
// owner reacts to the *Clicked signals.
Item {
    id: toolbar

    // Text labels (buttons here all use icons; aliases kept for flexibility).
    property alias menuLabel: menuButton.text
    property alias backLabel: backButton.text
    property alias prevLabel: prevButton.text
    property alias nextLabel: nextButton.text
    property alias markAllLabel: markAllButton.text
    property alias zoomLabel: zoomButton.text
    property alias taskSwitcherLabel: taskSwitcherButton.text

    // Per-button visibility switches.
    property alias nextVisible: nextButton.visible
    property alias prevVisible: prevButton.visible
    property alias markAllVisible: markAllButton.visible
    property alias zoomVisible: zoomButton.visible
    property alias quitVisible: quitButton.visible
    property alias addVisible: addButton.visible
    property alias updateVisible: updateFeedButton.visible

    // While true the update button shows a spinning "loading" icon.
    property bool feedUpdating: false

    signal menuClicked
    signal backClicked
    signal prevClicked
    signal nextClicked
    signal markAllClicked
    signal zoomClicked
    signal taskSwitcherClicked
    signal addClicked
    signal updateClicked
    //signal rotateClicked

    //BorderImage { source: "images/titlebar.sci"; width: parent.width; height: parent.height + 14; y: -7 }
    Rectangle {
        anchors.fill: parent; color: "#343434";
        border.color: "black"
        gradient: Gradient {
            GradientStop {
                position: 0.00;
                color: "#343434";
            }
            GradientStop {
                position: 1.00;
                color: "#ffffff";
            }
        }

        Row {
            anchors.fill: parent
            Button {
                id: taskSwitcherButton
                /*anchors.left: parent.left;*/ anchors.leftMargin: 5; y: 3; width: 116; height: 60
                onClicked: toolbar.taskSwitcherClicked()
                imageSource: "images/wmTaskLauncherIcon.png"
                visible: false
            }

            Button {
                id: menuButton
                /*anchors.left: taskSwitcherButton.right;*/ anchors.leftMargin: 5; y: 3; width: 60; height: 60
                onClicked: toolbar.menuClicked()
                imageSource: "images/wmEditIcon.png"
            }

            Button {
                id: addButton
                visible: true; /*anchors.left: menuButton.right;*/
                anchors.rightMargin: 5; y: 3; width: 60; height: 60
                onClicked: toolbar.addClicked()
                imageSource: "images/plus.png"

            }

            // Update button: swaps to a spinning "loading" icon while an
            // update is in progress (states below).
            Button {
                id: updateFeedButton
                visible: false; /*anchors.left: menuButton.right;*/
                anchors.rightMargin: 5; y: 3; width: 60; height: 60
                onClicked: toolbar.updateClicked()
                //imageSource: (!feedUpdating) ? "images/rotate.png" : "images/loading.png"
                NumberAnimation on iconRotation {
                    from: 0; to: 360; running: (visible == true) && (feedUpdating); loops: Animation.Infinite; duration: 900
                }
                state: "update"
                states : [State {name: "loading"; when: (feedUpdating);
                        PropertyChanges {target: updateFeedButton; imageSource: "images/loading2.png" }
                    }, State { name: "update"; when: (!feedUpdating);
                        PropertyChanges {target: updateFeedButton; iconRotation: 0}
                        PropertyChanges {target: updateFeedButton; imageSource: "images/rotate.png"}
                    }
                ]
            }

            Button {
                id: markAllButton
                visible: false
                /*anchors.left: updateFeedButton.right;*/ anchors.rightMargin: 5; y: 3; width: 60; height: 60
                onClicked: toolbar.markAllClicked()
                imageSource: "images/checkmark.png"
            }

            Button {
                id: prevButton
                visible: false
                /*anchors.left: menuButton.right;*/ anchors.rightMargin: 5; y: 3; width: 120; height: 60
                onClicked: toolbar.prevClicked()
                imageSource: "images/InputMethodShiftButtonNormal.png"
                imageRotation: -90;
            }

            Button {
                id: zoomButton
                visible: false
                /*anchors.right: backButton.left; */anchors.rightMargin: 5; y: 3; width: 80; height: 60
                onClicked: toolbar.zoomClicked()
                imageSource: "images/Zoom-In-icon.png"
            }

            Button {
                id: nextButton
                visible: false
                /*anchors.right: zoomButton.left;*/ anchors.rightMargin: 5; y: 3; width: 120; height: 60
                onClicked: toolbar.nextClicked()
                imageSource: "images/InputMethodShiftButtonNormal.png"
                imageRotation: 90
            }

            // Back and quit are mutually exclusive.  NOTE(review): both
            // emit backClicked — the owner apparently distinguishes the
            // two by which screen is active; confirm this is intended.
            Button {
                id: backButton
                anchors.rightMargin: 5; y: 3; width: 116; height: 60
                anchors.right: parent.right
                onClicked: toolbar.backClicked()
                imageSource: "images/wmBackIcon.png"
                visible: !quitButton.visible
            }

            Button {
                id: quitButton
                visible: false
                anchors.rightMargin: 5; y: 3; width: 116; height: 60
                anchors.right: parent.right
                onClicked: toolbar.backClicked()
                imageSource: "images/wmCloseIcon.png"
            }
        }
    }
}
--- /dev/null
+[Dolphin]
+ShowPreview=true
+Timestamp=2010,11,7,0,33,30
--- /dev/null
+border.left: 10
+border.top: 10
+border.bottom: 10
+border.right: 10
+source: lineedit.png
--- /dev/null
+border.left: 15
+border.top: 4
+border.bottom: 4
+border.right: 15
+source: toolbutton.png
--- /dev/null
+<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE TS>
+<TS version="2.0" language="en_CA">
+<context>
+ <name>FeedingItUI2</name>
+ <message>
+ <location filename="../FeedingItUI2.qml" line="53"/>
+ <source>Back</source>
+ <translation type="unfinished">Back 2</translation>
+ </message>
+</context>
+</TS>
--- /dev/null
+<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE TS>
+<TS version="2.0" language="en_CA">
+<context>
+ <name>FeedingItUI2</name>
+ <message>
+ <location filename="../FeedingItUI2.qml" line="75"/>
+ <source>Back</source>
+ <translation>Back 2</translation>
+ </message>
+ <message>
+ <location filename="../FeedingItUI2.qml" line="75"/>
+ <source>Config</source>
+ <translation>Config</translation>
+ </message>
+</context>
+<context>
+ <name>Feeds</name>
+ <message>
+ <source>unreadItems</source>
+ <translation type="obsolete"> %1 unread items</translation>
+ </message>
+ <message>
+ <location filename="../Feeds.qml" line="55"/>
+ <source>%1 unread items</source>
+ <translation>%1 unread items</translation>
+ </message>
+</context>
+</TS>
--- /dev/null
import QtQuick 1.0
import com.nokia.meego 1.0


// Harmattan entry point: hosts the shared FeedingIt UI inside a
// PageStackWindow.  The main component is created dynamically so the
// same FeedingIt.qml can also be loaded on non-MeeGo targets.
PageStackWindow {
    initialPage: mainPage

    Page{
        id: mainPage
        Component.onCompleted: {
            var main = Qt.createComponent("FeedingIt.qml");
            // NOTE(review): createComponent errors are not checked here;
            // createObject returns null when loading fails.
            main.createObject(mainPage);
        }
    }
}
--- /dev/null
+from distutils.core import setup
+import os, sys, glob
+
def read(fname):
    """Return the contents of *fname* resolved relative to this setup.py.

    Used below to pull the long description out of feedingit.longdesc so
    the text lives in one file instead of being inlined here.
    """
    # Context manager ensures the handle is closed promptly; the original
    # leaked it until garbage collection.
    with open(os.path.join(os.path.dirname(__file__), fname)) as f:
        return f.read()
+
# Package registration.  Besides the launcher script (installed into bin/),
# every runtime file is shipped as a data file: the QML UI under
# share/feedingit/qml, the Python sources under share/feedingit, plus the
# .desktop entry, icon and D-Bus service file.
setup(name="feedingit",
      scripts=['feedingit'],  # shell launcher, see the feedingit script
      version='0.1.0',
      maintainer="Yves",
      maintainer_email="yves@marcoz.org",
      description="FeedingIt - RSS Reader",
      long_description=read('feedingit.longdesc'),  # read at build time
      data_files=[('share/applications',['feedingit.desktop']),
                  ('share/icons/hicolor/64x64/apps', ['feedingit.png']),
                  ('share/feedingit/qml', glob.glob('qml/*.qml')),
                  ('share/feedingit/qml/common', glob.glob('qml/common/*.qml')),
                  ('share/feedingit/qml/common/images', glob.glob('qml/common/images/*')),
                  ('share/feedingit/qml/i18n', glob.glob('qml/i18n/*')),
                  ('share/dbus-1/services', ['feedingit_status.service']),
                  ('share/feedingit', glob.glob('pysrc/*.py')) ],)
--- /dev/null
+[DEFAULT]
+XS-Python-Version: 2.6
+Package: feedingit
+Section: user/development
+Depends: python-pyside.qtgui, python-pyside.qtopengl, python-pyside.qtdeclarative, python-dbus, python-gconf
--- /dev/null
+Format: 3.0 (quilt)
+Source: feedingit
+Binary: feedingit
+Architecture: all
+Version: 0.1.0-1
+Maintainer: Yves <yves@marcoz.org>
+Standards-Version: 3.9.1
+Build-Depends: python-all (>= 2.6.6-3), debhelper (>= 7.4.3)
+Checksums-Sha1:
+ 941d0754142974dd20314aece5506464c83dfe7c 169288 feedingit_0.1.0.orig.tar.gz
+ 91664070662f39656a7ae615429aa01024bb8362 859 feedingit_0.1.0-1.debian.tar.gz
+Checksums-Sha256:
+ 714e1c576c226a0001236dc952db962787f50b8484c0006421c67d9648abf6c2 169288 feedingit_0.1.0.orig.tar.gz
+ 3ad60d4661c04b51728eadcdd7a52e6fb9fc974e75978b5766f6b641d7562442 859 feedingit_0.1.0-1.debian.tar.gz
+Files:
+ b21a4a9cd2915faccd2998f0fdb25706 169288 feedingit_0.1.0.orig.tar.gz
+ 3cdefa2c1e40af13c690ec7a992f00fe 859 feedingit_0.1.0-1.debian.tar.gz
--- /dev/null
+Format: 1.8
+Date: Fri, 07 Oct 2011 20:59:08 -0700
+Source: feedingit
+Binary: feedingit
+Architecture: source all
+Version: 0.1.0-1
+Distribution: unstable
+Urgency: low
+Maintainer: Yves <yves@marcoz.org>
+Changed-By: Yves <yves@marcoz.org>
+Description:
+ feedingit - FeedingIt - RSS Reader
+Changes:
+ feedingit (0.1.0-1) unstable; urgency=low
+ .
+ * source package automatically created by stdeb 0.6.0+git
+Checksums-Sha1:
+ 2c2a6756d6048d1822c0a636c661d57497769a31 753 feedingit_0.1.0-1.dsc
+ 941d0754142974dd20314aece5506464c83dfe7c 169288 feedingit_0.1.0.orig.tar.gz
+ 91664070662f39656a7ae615429aa01024bb8362 859 feedingit_0.1.0-1.debian.tar.gz
+ 1744a0ffe2bafd2bda16b1faa9e75dcbfd7ab7cb 163214 feedingit_0.1.0-1_all.deb
+Checksums-Sha256:
+ da5ddb46c667c764b9ce1990e5013c7c50329ef552b9a3fdc8a0912861bb3de4 753 feedingit_0.1.0-1.dsc
+ 714e1c576c226a0001236dc952db962787f50b8484c0006421c67d9648abf6c2 169288 feedingit_0.1.0.orig.tar.gz
+ 3ad60d4661c04b51728eadcdd7a52e6fb9fc974e75978b5766f6b641d7562442 859 feedingit_0.1.0-1.debian.tar.gz
+ e4fec5ae33afaab67b452dcf6791fcb4dab50e0ff806d89e94c1f4417c1c47a4 163214 feedingit_0.1.0-1_all.deb
+Files:
+ 5f67a6a88a9f83e7fd790608449c0e71 753 user/development optional feedingit_0.1.0-1.dsc
+ b21a4a9cd2915faccd2998f0fdb25706 169288 user/development optional feedingit_0.1.0.orig.tar.gz
+ 3cdefa2c1e40af13c690ec7a992f00fe 859 user/development optional feedingit_0.1.0-1.debian.tar.gz
+ 3e9380a5029f1d564e92fb997322476e 163214 user/development optional feedingit_0.1.0-1_all.deb
--- /dev/null
+psa build-deb
+#scp /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit_harmattan/feedingit/deb_dist/feedingit_0.1.0-1_all.deb root@192.168.1.136:
+scp /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/deb_dist/feedingit_0.1.0-1_all.deb root@192.168.1.136:
+ssh root@192.168.1.136 "dpkg -i --force-depends-version feedingit_0.1.0-1_all.deb"
--- /dev/null
#!/bin/sh
# Launcher for FeedingIt.
#
# Invoked two ways:
#   feedingit dbus   -> run the background feed updater (D-Bus service
#                       activation passes "dbus"; see feedingit_status.service)
#   feedingit        -> start the GUI application

case "$1" in
dbus)
    # Background update: run at reduced priority so it does not starve the UI.
    nice python /usr/share/feedingit/update_feeds.py
    ;;
*)
    # The app expects to run from its install directory.
    cd /usr/share/feedingit
    # NOTE(review): '2>&1 >/dev/null' points stderr at the ORIGINAL stdout,
    # then discards stdout only -- errors still reach the terminal. If the
    # intent was to silence both streams it should read '>/dev/null 2>&1';
    # confirm intent before changing.
    python feedingit.py 2>&1 >/dev/null
    ;;

esac
--- /dev/null
+begin-base64 600 /scratchbox/users/ymarcoz/home/ymarcoz/workspace/feedingit-pyside/psa/feedingit/feedingit.png
+iVBORw0KGgoAAAANSUhEUgAAAEAAAABACAYAAACqaXHeAAAAGXRFWHRTb2Z0
+d2FyZQBBZG9iZSBJbWFnZVJlYWR5ccllPAAAEptJREFUeNrkWwl0FFXWvlW9
+pLN0kiaEEAiYsAaQNXBYZJmBYfGI/8jyH/k9uCC4/IAMjMgIDDMIItsoKKsH
+GVEQxWERGRgDCOIgQYUEkElAZTGSBQPpJJ2k93pzX/Wr7qrqqk4Hg2fOmTrn
+ppZUVb/73f2+VxwhBP6bNyP9w3HcL/V70g/x7Fg6J4wEtv9lAbiLzFJGDex3
+TGwvkUEGAGXcj+RTkZddJ3cLGONdYpq+18IojlEsUgy99vupkN3cBklofSIA
+PA/CuUIo2XUQivHUg+SUUT2SC8nNQBGadMDUBzSBCXCM6RjGrBUpidLGJTC6
+fVvI7pQFXVumQhtLjAiK7lZVA/byCiguuQlXPz4KJ954B/Lx8m2kSqQaBobw
+nwIAx1SZSjcBqRlSyvrFMHbEIBiV3R56SjcSjxH8zhgQ6s2iMvtrYkU2qA/m
+eD8Y4jzisTHZBZzFC7zFJz7n9oDzu+vw1ab34NWNO+BLihEzDWgKs7hTACRV
+j2GMp/btDh1WzIPHc+6FXyUngo0IPPgd8eCvs4C/Oh6IjxeHSwSZq2PHRLJy
+f+j/vNkHhmQnmFrVgrGZE34ogdOZQ+Fx/K+dmYlPtie/pA+Q1J2qegpS632b
+4ekxQ2EiVW/iM4Lnpg38tTKmgemJQCNO4A1EUL5QOuckbNxGEMqs4gkF4OBx
++Jya1KxZszpcvHix9tixYxUMDAfzD/47AaKxGkBvRB2GRKSWsx6H4Qumwwtp
+zaE1ZdxbZQOfwxoahjywyaUvyM79Kk1QHScMKoYar89u6w1TqP07HI7tZrM5
+/uTJk5+MGDFiHV4rZT6ilmlEo0DgGyn5GGbnme/+Baa//idY26IZ39pXYwNX
+WRu08QTgDCRAPN0zyRsCv0TPOfk5HzqW/i8dUzK1cgAf64O9ubCX2v2iRYsG
+JyQkpCIAccOHDx9fVVW1Z/369dPwf1nUDJlWysNrk2mAgvmCA7CqV1e4T/Ca
+UeqpQPfBTR6xiVzanML2FdKWaULwOu7jet8Ah+Ch0n+CSreoqGhFdnZ2T/Xg
+CgoKTj3zzDMrvv7660I8rWChMyqT4Bsp+az8A7CaMk+l7bGn44DNQcnKKShZ
+dky1Qk/aHK86xl802FxgSPDAsTz4Bw2Bjz76aLIW83Tr3bv3oAMHDmyaM2fO
+/XjaijnmqDSBj9LmbVTyyPyq3l1hkN+VAF5HamC0RpmaG0J5XxgQPAOBjwAC
+FzqOaWsHlxtcyzbAbrx6y2az1ZSWlhbqDTQtLa31K6+8svq5554bg6fp0YIQ
+yQQkb0+Zz0K1f1WUPGW+NjWYydNH+ZQJwFsHAGfOoAEe/PbD4L+1R6nygtzB
+cSFn6A+pvbhH4i0eiOt2Aw7/E/aPfgL+hFfL2FjSly5d+sC0adOebdmyZSut
+Qbtwmzdv3vx169Z9whxkHTOHRgNgZBldWwxzcx8aCZP9bmS+LjUYysCYCKZ2
+bwKf0D/sYVJfCJ7LjwDx1oTFeAUI/hDjEgiWthVgTHFAr7Hw8PkiOMvCHWGm
+SDPM9Nzc3NmjRo36rR4I8+fPn7d27dpDeFrOUmqhMSYg2X3zpb+H/6HMC15L
+gHmZXRtb/U6TefEFcV3B1OHNkJob1CpPFCov1Yd8jE9kPv9fkIfMl7FxtKBj
+YUKhYFwZPXr0yy/hRplV/7YFtxdffPEPffv27cY02KRnCnoA0AeSMLvr8txj
+MBsIj8ynKUOYJQMMzaco8/iqKkDUAccF27ZtQ7PoD1x8V00npwBBRuZ0u/iu
+l96A7VTqa9asGePxeM589tlny5ChPswZ0+yvZPHixXvmzp27UAsE6hN27Njx
+EnOKVjby8E2jIUKHmIzU6+SHcJhcwTT+fBpx5WcR1zmkC1nEfTGLeEvmEvXW
+q1cvIguEZPbs2cRXvpa4CvC5M1nE+RVSXhapP4l0AulYFqk72o7U5SIdQjqY
+SejvndkPefj8SKQRly9fzpfej3zWY9xfhddzkDJoMobUbebMmXOcuBGNDX3B
+arwnmzlFLhoNoNJPnHg/9L4vB0YKnjgQfHFKz03VOSZD8RBKCM6dO6e4RrUh
+KF255DmZRnAhLTA1rxafw7SXOjDn+PHjszp16tRbel9MTEzsjBkzXvjiiy9e
+wdM2LELdRlCOPP/884u0BPzUU0/NQM3pxLTA2BAAPKvsmmOKSzMs8DlTgoOX
+qzJxfRldqiXUaKq/WvU5g4A5fzVcvwHf/XktHKfJDGrQb7ReOWjQoFGY/Kxm
+WkCZur1x48ZP9+7du1N9LwVt+fLlU5gPsai1gNfy/Cj9nmK8R69PiDE0aBkQ
+xHM6wBzbMjMzITk5WfGyYUP6YFTcE2JczrDq3JDkwPsEeHcfvMdy+5rY2Fi3
+Hq5obvchCKtYzKdbxYQJEzZfu3bte/W9gwcPHpOTk9NeSwt4Dc9vWzgdpgak
+bwsfsEwL/JUvKAA4fvw4HZh4/tBDD8GeLeizSI0+43K7S66GagdUovQ/Zc0P
+e79+/ZZjdjfr5s2bJXogbN++fQpzjLQQKkNTWKUVFVauXPkYiwgx8l+X5wEG
+Vt52dRbCP8xcnIV6/lAqq9xLx1zcSDAkovkZQj6B+G6AULkUBMeRYIwPxnof
+hF0zxNaCuXkFbN8Hmx+bC+g44AZresQx1b3nzJkzL6MUB2jF/ClTpjz9wQcf
+fM66RW3Onj37Rp8+fQaqIpQds0maN1xmGuZXawBVjYQNL8H9tK73e+K1swMV
+Ec8RzPyGglA1FvxVj6BWIJXjufOIQtKc1juC0reLrTBk/gPW8XEzABwsm7uM
+jmxefn5+npZ0MfN7lgmPbpXLli3bqr4PzdOGfuIBFg2MWiZAPWrCwD4wWPRd
+7jjthrZO2kT8hZj1nQbiPt2ohoQhrh44ow/2H4W/0ZyfqbKF9RykfJ46mx9Q
+AxZo2TgthiZOnNiTPVePzvAiVo4X1PcNGDBgCHunWQ2AVPRYszKgC2WeEJV/
+JCqSXSd+ZauLQPi9ROtdVBQJ1WLR88Y2yGXMJ7IQ14FRBrtG09mScePG/VEr
+8Vm4cOFUlibTUVSjyZxS35OVldVFBgAnB4CibHn4AWhP+3mCNybACAlnnBBl
+P0/R0fcrOz66JKEf4xLp868gF1NfmvbGTp48edD58+ffRJs9TOnChQubsRQe
+zECow/99d+LEicNq5rp06dKHJXCUlzrMII9qmcGkSZPaM0fIywEQG5zD+kNX
+Uf09Fn3GibKpEXRuTAsUmhAJQNrriw9If95KeJ8+hYPr+dZbb23t0aPHkKSk
+pGaUunfvPnTLli1/RRD6sSStGgud7VrxHm1/CDMDH4bIsvLy8lL1fd26dctk
+ABjkAFCnYO6UBZ0CAJj1pacqbSVvTlRaoOj4aLyH431gsNTD2YvwTyx6aJgz
+LFmyZDZlRIs5zPmnszjuQeZ+LCws/EZ9X//+/XMYc/RXnCUlJcUaSVQvLQDo
+3uT1Bbwj8fOaKk/UNbwOEbUpaJiF0Rooel7fBjuZkzN07Nixj56z7Ny5c18W
+FumbHJcuXQoDICMjI0sGgNtut5er7/F6vdI0XRgAxux2kC24LCHGhXBbV9b0
+2gR+HU1ge1H6GPtpyfu3Q/AN6+j6I0ULphkc0zMXasAljXviZSHOi5HgigaQ
+Xdg9vDoKhMpFlfS0VF4hdZ+GFugwT8kQFyh6Fr0Gu1jHhjYx64uLiy/rAfDt
+t98WsEhAcwQPFl4lGpUtJ5ta8Ffjphd91VGAC2SFYrKq3cZSaUOYxH0RtED+
+LP6h0r9dBTWHPhMTnSTmuDw7d+58Rw+A/fv30ySpmoVK2LNnT1kDU/BEp/zn
+5Pfx8siMGTFRtLA1WtZEx+Y1fYKgancxAEDgISUZEivz4dW/rhSrznbUwaF3
+z9uwYcNqFFylNOCamprbW7dufRmzvY9ZlkjfZkCPr+cvJMZ5QRDCyn1OrL9D
+4jDKHvLLTYAqQ+DewD/lcBEiK2zUWaI6/MkjgWgaRnDdaINmUAvJNnvylIm+
+aeNGwYR9h2HPk3+Av8+cOfM40jH2pIv5BzujeiliWa1Wmw5zfqljieG0g/qe
+Gzdu/CCfT+RlVu+7fBWK+FinsnEZQfoNOkF/uBYFzjnwVVvBebUtuEtTISnW
+aJsyEabZC+DtrStgAvPSdWySo4ylyLVs4GLShsVODzVzGPauy2aOzVgmJGkU
+T7UyrxUEgJ54TSbwKZwgGyz4NWJ+FKTtDzgFMD67FeqLEIhiBMJitD35vzAV
+gXjn3b/AZDbdlSRbXCG1VQGl21vN3PXr179jTpJqgIV5fMWGpXUp8yNhALhP
+5YPY0+ItLhXTnC5zIvlkpAMSZVyhUarQ6r1lxTy3LbiupEJijNH26Dh4FoH4
+CIH4HevptWEVn+XBBx+0lJaWnlczd+jQoVMsUlCg4tPT0zPU92CNcEk2mxzs
+B9AH0iaNhRHvvw7vespSwGtPUrTAQv08olnS6hVOotMlGrPCGnlF8Bz3plQH
+mO+xiwslaKl84FPYxcplO1PzBKwA750zZ84kLJUHV1ZWXkWG/w+v/0T/h37k
+t+vWrXtNw0/8CndFzLyIBADHuiqdEfW/WyHe5ipOU/btNXp5XADCsChEdFJo
+9RS5VmgNAyLNATHt7eIsMQWCls1PvCBOl9VIEWHYsGFxAwcONKxYseICAyct
+Ly9vDZa/Y+Qjw+zxPBZNT9J5BfY8kXeErGzyc1Ovzvyg+ouZGtLXaGhwOkFI
+qwgStIHQWxsgB8LcCoHohEDEhQFhZ8zUsAYKHXFLj8dTYDKZFF2dHTt2bMKi
+ag0e/sgijKIhQh1DbV4+nKQdWoO1TtfrQyOdINHLDTRqB73s0f2DFWpy20L9
+WfQRRqPt8fHwNGrrLvQRU9nkh43V+jSCOKdPn/7Y6dOnP3G73U6JQcwxDsii
+iWZPkPbfutCeoNEZb3FdSVP28bWkH6UGaGlB+KRpAyYhm90z3+MAS5eQRjAf
+8T7r90n5QgL6hzYLFiwYhqrfBmk+8xG10kjlAHAMwXZoBhtpW7z+QlssjY0K
+5rlI6q/hCEkUviCiP5Afa2w6QOxineV62VulHqNTlisoTIAwM6jaths+FJuV
+Laojhz+95MgXZdocDfNC5FWBHmoan4RMg4XPvWga/8+mznjGuASIL9L0uDQv
+2PHaCdiemQEd6/IDWhCVA4xgBrrpcSQtkJtLlJtcI2qw5jp4HNY8MhveZqbh
+bmhqjDCUKnZ+DGJlZs6wayY+emVwg07Qr/TuTcm8XCOcF1IgMQFSfj0AJmlN
+iUUCgNpH1cJXIbegEE6ZWjgwIriiMoVo1V8zCmhFhZ+xDpRqAJto/SjSYkqt
+FSJSizy1Tzfod2o3vGcSjLF1ZzLEVhkHjcgDACInRZGiws/YjKkuSBhSCljc
+ncseCU/jpauyUrrB6XHJGVbn/wuK3toF62k6Gtu5QlvyPv1aQC9nIIJ2bhDJ
+20e97s8kQPyAcnGNMdr+ErZEpraxS2QICxcVMxfDri/OwhFjah1YOtyOOglS
++4iIUaMJVF5iPmFoqbhfvQWWowALZYurozYBtSnQGqFd0WHYQFd/O79JBU+p
+NWQKOvCRhpIiQWem6Wcyb0jySJOsm1jK61CHvmjXCQbzAqRifOGim7egJLZ7
+BZjTHfpOMIqWeVjbvAmZ/+gI7MCxbmGTqg12m6NZKsuzMEKXxPc6uBXWt0iB
+DM+PVnCeS9XWAqJqj0HTSlvt7eMHlgeZH/csvE4F1pDqN3atcHDpDJ2rx1R5
+OU2Vfbdioe7LNCBeXh8EgLv2GZSpVR3E5VSIGsDUfgtbWyB1jxv85cYsl5c0
+gVZdGfs2w0y6fpAyT5MOmoD8UhtlmGZ7MR2qRW9PHd6i12A/U/uaaCR/JwDI
+HSPt09HvBX4tfS9AtcFVZANfheWuMk9T3dget0UQaJynoY55+59kpW7UOncn
+n8xIa4jjWYRofXQ7zLkvJ7CyhALh/j4RvKXxTSpxyjiVOLV5upZox0ewFUP0
+btY1rmJhu9FfjTTFN0NU95vn3Asd6TdDg/vCGAqEUG8UQfCWxd+RVlCmjalO
+MKXXi8zTjTL+8VH4EG19J0twKpnUvXeaQjXFV2PSN4IUiGYYKdqhadw/agg8
+SE1DupFqBgWC+gx/dYx2CtvcGWScevVgL+8KnD+WB7kz/gwH2ByBxLgH7vBb
+oaYCQAuIBLaaI/nhB6Dz2OEwoFsn6JmeCpktU6FVNC+79iN8X/oTXDtfBAVv
+74ZTZ74RPXsVc3B1TcF4UwMAqllmMwMjFkJfjdK95flp0DUlGZKlr0aD07UG
+8H94EC6jQ7sNyq9GpWOXTNWbLLA2NQBaYMi/GzbJFieoP54WQPnNsFd2ftc+
+qr6bAOhNW/MR2qkC3MUPpbW2fwswAOLT2NCG5vJpAAAAAElFTkSuQmCC
+====
--- /dev/null
+[Desktop Entry]
+Encoding=UTF-8
+Version=1.0
+Type=Application
+Name=FeedingIt RSS Reader
+Exec=invoker --single-instance --type=e /usr/bin/feedingit
+Icon=/usr/share/icons/hicolor/64x64/apps/feedingit.png
+Categories=Development;
--- /dev/null
+This file should contain a writeup describing what your application does,
+and how to use it. The content of this file goes into the long_description
+field of setup.py, which in turn becomes the long version of the Description
+field in the debian/control file of the project.
--- /dev/null
+[Project]
+category = Development
+maintainer = Yves
+appname = PySide app
+section = development
+pyversion = 2.7
+project = feedingit
+email = email@example.com
+desc = A PySide example
+template = harmattan
+
--- /dev/null
+[D-BUS Service]
+Name=org.marcoz.feedingit
+Exec=/usr/bin/feedingit dbus
--- /dev/null
+"""Beautiful Soup
+Elixir and Tonic
+"The Screen-Scraper's Friend"
+http://www.crummy.com/software/BeautifulSoup/
+
+Beautiful Soup parses a (possibly invalid) XML or HTML document into a
+tree representation. It provides methods and Pythonic idioms that make
+it easy to navigate, search, and modify the tree.
+
+A well-formed XML/HTML document yields a well-formed data
+structure. An ill-formed XML/HTML document yields a correspondingly
+ill-formed data structure. If your document is only locally
+well-formed, you can use this library to find and process the
+well-formed part of it.
+
+Beautiful Soup works with Python 2.2 and up. It has no external
+dependencies, but you'll have more success at converting data to UTF-8
+if you also install these three packages:
+
+* chardet, for auto-detecting character encodings
+ http://chardet.feedparser.org/
+* cjkcodecs and iconv_codec, which add more encodings to the ones supported
+ by stock Python.
+ http://cjkpython.i18n.org/
+
+Beautiful Soup defines classes for two main parsing strategies:
+
+ * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
+ language that kind of looks like XML.
+
+ * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
+ or invalid. This class has web browser-like heuristics for
+ obtaining a sensible parse tree in the face of common HTML errors.
+
+Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
+the encoding of an HTML or XML document, and converting it to
+Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
+
+For more than you ever wanted to know about Beautiful Soup, see the
+documentation:
+http://www.crummy.com/software/BeautifulSoup/documentation.html
+
+Here, have some legalese:
+
+Copyright (c) 2004-2010, Leonard Richardson
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following
+ disclaimer in the documentation and/or other materials provided
+ with the distribution.
+
+ * Neither the name of the the Beautiful Soup Consortium and All
+ Night Kosher Bakery nor the names of its contributors may be
+ used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
+
+"""
+from __future__ import generators
+
+__author__ = "Leonard Richardson (leonardr@segfault.org)"
+__version__ = "3.2.0"
+__copyright__ = "Copyright (c) 2004-2010 Leonard Richardson"
+__license__ = "New-style BSD"
+
+from sgmllib import SGMLParser, SGMLParseError
+import codecs
+import markupbase
+import types
+import re
+import sgmllib
+try:
+ from htmlentitydefs import name2codepoint
+except ImportError:
+ name2codepoint = {}
+try:
+ set
+except NameError:
+ from sets import Set as set
+
#These hacks make Beautiful Soup able to parse XML with namespaces
# Widen sgmllib's tag-name pattern so names may contain '-', '_', '.' and ':'
# (namespace prefixes use ':').
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
# Same widening for declaration names parsed by markupbase.
markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match

# Encoding used by default when rendering elements back to byte strings
# (see the various __str__ methods below).
DEFAULT_OUTPUT_ENCODING = "utf-8"
+
+def _match_css_class(str):
+ """Build a RE to match the given CSS class."""
+ return re.compile(r"(^|.*\s)%s($|\s)" % str)
+
+# First, the classes that represent markup elements.
+
class PageElement(object):
    """Contains the navigational information for some part of the page
    (either a tag or a piece of text)"""

    # Tree wiring: every element keeps 'parent', document-order links
    # ('previous'/'next') and sibling links ('previousSibling'/'nextSibling').
    # All the navigation methods below walk these links.

    def setup(self, parent=None, previous=None):
        """Sets up the initial relations between this element and
        other elements."""
        self.parent = parent
        self.previous = previous
        self.next = None
        self.previousSibling = None
        self.nextSibling = None
        if self.parent and self.parent.contents:
            # This element becomes the newest child: link it after the
            # parent's current last child.
            self.previousSibling = self.parent.contents[-1]
            self.previousSibling.nextSibling = self

    def replaceWith(self, replaceWith):
        # Swap this element for 'replaceWith' at the same position in
        # the parent's contents.
        oldParent = self.parent
        myIndex = self.parent.index(self)
        if hasattr(replaceWith, "parent")\
           and replaceWith.parent is self.parent:
            # We're replacing this element with one of its siblings.
            index = replaceWith.parent.index(replaceWith)
            if index and index < myIndex:
                # Furthermore, it comes before this element. That
                # means that when we extract it, the index of this
                # element will change.
                myIndex = myIndex - 1
        self.extract()
        oldParent.insert(myIndex, replaceWith)

    def replaceWithChildren(self):
        # Splice this element's children into its former position
        # (i.e. remove the wrapper but keep its contents).
        myParent = self.parent
        myIndex = self.parent.index(self)
        self.extract()
        reversedChildren = list(self.contents)
        reversedChildren.reverse()
        # Inserting each child at the same index, in reverse, preserves
        # the original child order.
        for child in reversedChildren:
            myParent.insert(myIndex, child)

    def extract(self):
        """Destructively rips this element out of the tree."""
        if self.parent:
            try:
                del self.parent.contents[self.parent.index(self)]
            except ValueError:
                pass

        #Find the two elements that would be next to each other if
        #this element (and any children) hadn't been parsed. Connect
        #the two.
        lastChild = self._lastRecursiveChild()
        nextElement = lastChild.next

        if self.previous:
            self.previous.next = nextElement
        if nextElement:
            nextElement.previous = self.previous
        self.previous = None
        lastChild.next = None

        self.parent = None
        if self.previousSibling:
            self.previousSibling.nextSibling = self.nextSibling
        if self.nextSibling:
            self.nextSibling.previousSibling = self.previousSibling
        self.previousSibling = self.nextSibling = None
        return self

    def _lastRecursiveChild(self):
        "Finds the last element beneath this object to be parsed."
        lastChild = self
        while hasattr(lastChild, 'contents') and lastChild.contents:
            lastChild = lastChild.contents[-1]
        return lastChild

    def insert(self, position, newChild):
        # Insert 'newChild' at 'position' in self.contents and rewire all
        # four link directions around it.
        if isinstance(newChild, basestring) \
            and not isinstance(newChild, NavigableString):
            # Plain strings are promoted to tree-aware text nodes.
            newChild = NavigableString(newChild)

        position = min(position, len(self.contents))
        if hasattr(newChild, 'parent') and newChild.parent is not None:
            # We're 'inserting' an element that's already one
            # of this object's children.
            if newChild.parent is self:
                index = self.index(newChild)
                if index > position:
                    # Furthermore we're moving it further down the
                    # list of this object's children. That means that
                    # when we extract this element, our target index
                    # will jump down one.
                    position = position - 1
            newChild.extract()

        newChild.parent = self
        previousChild = None
        if position == 0:
            newChild.previousSibling = None
            newChild.previous = self
        else:
            previousChild = self.contents[position-1]
            newChild.previousSibling = previousChild
            newChild.previousSibling.nextSibling = newChild
            newChild.previous = previousChild._lastRecursiveChild()
        if newChild.previous:
            newChild.previous.next = newChild

        newChildsLastElement = newChild._lastRecursiveChild()

        if position >= len(self.contents):
            # Appending at the end: the element after this subtree is the
            # next sibling of the nearest ancestor that has one.
            newChild.nextSibling = None

            parent = self
            parentsNextSibling = None
            while not parentsNextSibling:
                parentsNextSibling = parent.nextSibling
                parent = parent.parent
                if not parent: # This is the last element in the document.
                    break
            if parentsNextSibling:
                newChildsLastElement.next = parentsNextSibling
            else:
                newChildsLastElement.next = None
        else:
            nextChild = self.contents[position]
            newChild.nextSibling = nextChild
            if newChild.nextSibling:
                newChild.nextSibling.previousSibling = newChild
            newChildsLastElement.next = nextChild

        if newChildsLastElement.next:
            newChildsLastElement.next.previous = newChildsLastElement
        self.contents.insert(position, newChild)

    def append(self, tag):
        """Appends the given tag to the contents of this tag."""
        self.insert(len(self.contents), tag)

    # NOTE: the shared-default 'attrs={}' below is read-only in practice
    # (it is only ever passed through to SoupStrainer), so the classic
    # mutable-default pitfall does not bite here.

    def findNext(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears after this Tag in the document."""
        return self._findOne(self.findAllNext, name, attrs, text, **kwargs)

    def findAllNext(self, name=None, attrs={}, text=None, limit=None,
                    **kwargs):
        """Returns all items that match the given criteria and appear
        after this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.nextGenerator,
                             **kwargs)

    def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears after this Tag in the document."""
        return self._findOne(self.findNextSiblings, name, attrs, text,
                             **kwargs)

    def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
                         **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear after this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.nextSiblingGenerator, **kwargs)
    fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x

    def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears before this Tag in the document."""
        return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)

    def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
                        **kwargs):
        """Returns all items that match the given criteria and appear
        before this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.previousGenerator,
                             **kwargs)
    fetchPrevious = findAllPrevious # Compatibility with pre-3.x

    def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears before this Tag in the document."""
        return self._findOne(self.findPreviousSiblings, name, attrs, text,
                             **kwargs)

    def findPreviousSiblings(self, name=None, attrs={}, text=None,
                             limit=None, **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear before this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.previousSiblingGenerator, **kwargs)
    fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x

    def findParent(self, name=None, attrs={}, **kwargs):
        """Returns the closest parent of this Tag that matches the given
        criteria."""
        # NOTE: We can't use _findOne because findParents takes a different
        # set of arguments.
        r = None
        l = self.findParents(name, attrs, 1)
        if l:
            r = l[0]
        return r

    def findParents(self, name=None, attrs={}, limit=None, **kwargs):
        """Returns the parents of this Tag that match the given
        criteria."""

        return self._findAll(name, attrs, None, limit, self.parentGenerator,
                             **kwargs)
    fetchParents = findParents # Compatibility with pre-3.x

    #These methods do the real heavy lifting.

    def _findOne(self, method, name, attrs, text, **kwargs):
        # Run the corresponding findAll* with limit=1 and unwrap the result.
        r = None
        l = method(name, attrs, text, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def _findAll(self, name, attrs, text, limit, generator, **kwargs):
        "Iterates over a generator looking for things that match."

        if isinstance(name, SoupStrainer):
            strainer = name
        # (Possibly) special case some findAll*(...) searches
        elif text is None and not limit and not attrs and not kwargs:
            # findAll*(True)
            if name is True:
                return [element for element in generator()
                        if isinstance(element, Tag)]
            # findAll*('tag-name')
            elif isinstance(name, basestring):
                return [element for element in generator()
                        if isinstance(element, Tag) and
                        element.name == name]
            else:
                strainer = SoupStrainer(name, attrs, text, **kwargs)
        # Build a SoupStrainer
        else:
            strainer = SoupStrainer(name, attrs, text, **kwargs)
        results = ResultSet(strainer)
        g = generator()
        while True:
            try:
                i = g.next()
            except StopIteration:
                break
            if i:
                found = strainer.search(i)
                if found:
                    results.append(found)
                    if limit and len(results) >= limit:
                        break
        return results

    #These Generators can be used to navigate starting from both
    #NavigableStrings and Tags.
    # NOTE: each generator yields a trailing None just before terminating;
    # consumers such as _findAll skip falsy items.
    def nextGenerator(self):
        i = self
        while i is not None:
            i = i.next
            yield i

    def nextSiblingGenerator(self):
        i = self
        while i is not None:
            i = i.nextSibling
            yield i

    def previousGenerator(self):
        i = self
        while i is not None:
            i = i.previous
            yield i

    def previousSiblingGenerator(self):
        i = self
        while i is not None:
            i = i.previousSibling
            yield i

    def parentGenerator(self):
        i = self
        while i is not None:
            i = i.parent
            yield i

    # Utility methods
    def substituteEncoding(self, str, encoding=None):
        # Replace the %SOUP-ENCODING% placeholder with the real encoding.
        # (The 'str' parameter shadows the builtin; kept for API stability.)
        encoding = encoding or "utf-8"
        return str.replace("%SOUP-ENCODING%", encoding)

    def toEncoding(self, s, encoding=None):
        """Encodes an object to a string in some encoding, or to Unicode.
        ."""
        if isinstance(s, unicode):
            if encoding:
                s = s.encode(encoding)
        elif isinstance(s, str):
            if encoding:
                s = s.encode(encoding)
            else:
                s = unicode(s)
        else:
            # Not a string at all: stringify first, then recurse.
            if encoding:
                s = self.toEncoding(str(s), encoding)
            else:
                s = unicode(s)
        return s
+
class NavigableString(unicode, PageElement):
    # A text node: subclasses unicode so it behaves as a plain string,
    # and mixes in PageElement so it can be navigated like a tag.

    def __new__(cls, value):
        """Create a new NavigableString.

        When unpickling a NavigableString, this method is called with
        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
        passed in to the superclass's __new__ or the superclass won't know
        how to handle non-ASCII characters.
        """
        if isinstance(value, unicode):
            return unicode.__new__(cls, value)
        return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)

    def __getnewargs__(self):
        # Pickle/copy support: rebuild from the encoded byte string.
        return (NavigableString.__str__(self),)

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper."""
        if attr == 'string':
            return self
        else:
            raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)

    def __unicode__(self):
        # Round-trip through the default encoding to normalize subclasses.
        return str(self).decode(DEFAULT_OUTPUT_ENCODING)

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        # Returns an encoded byte string by default; pass None (falsy)
        # to get the object itself back as unicode.
        if encoding:
            return self.encode(encoding)
        else:
            return self
+
class CData(NavigableString):
    # A text node that renders wrapped in a CDATA section.

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        return "<![CDATA[%s]]>" % NavigableString.__str__(self, encoding)
+
class ProcessingInstruction(NavigableString):
    # A <?...?> processing-instruction node (e.g. an XML declaration).
    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        output = self
        if "%SOUP-ENCODING%" in output:
            # Substitute the actual output encoding into the placeholder.
            output = self.substituteEncoding(output, encoding)
        return "<?%s?>" % self.toEncoding(output, encoding)
+
class Comment(NavigableString):
    # An HTML/XML comment node.
    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        return "<!--%s-->" % NavigableString.__str__(self, encoding)
+
class Declaration(NavigableString):
    # A <!...> declaration node (e.g. a DOCTYPE).
    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        return "<!%s>" % NavigableString.__str__(self, encoding)
+
+class Tag(PageElement):
+
+ """Represents a found HTML tag with its attributes and contents."""
+
    def _invert(h):
        "Cheap function to invert a hash."
        # NOTE: deliberately defined without 'self' -- it is only called
        # once below, at class-construction time, to build the inverse
        # entity table.
        i = {}
        for k,v in h.items():
            i[v] = k
        return i

    # The five XML predefined entities and their literal characters.
    XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
                                      "quot" : '"',
                                      "amp" : "&",
                                      "lt" : "<",
                                      "gt" : ">" }

    XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)
+
+ def _convertEntities(self, match):
+ """Used in a call to re.sub to replace HTML, XML, and numeric
+ entities with the appropriate Unicode characters. If HTML
+ entities are being converted, any unrecognized entities are
+ escaped."""
+ x = match.group(1)
+ if self.convertHTMLEntities and x in name2codepoint:
+ return unichr(name2codepoint[x])
+ elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
+ if self.convertXMLEntities:
+ return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
+ else:
+ return u'&%s;' % x
+ elif len(x) > 0 and x[0] == '#':
+ # Handle numeric entities
+ if len(x) > 1 and x[1] == 'x':
+ return unichr(int(x[2:], 16))
+ else:
+ return unichr(int(x[1:]))
+
+ elif self.escapeUnrecognizedEntities:
+ return u'&%s;' % x
+ else:
+ return u'&%s;' % x
+
    def __init__(self, parser, name, attrs=None, parent=None,
                 previous=None):
        "Basic constructor."

        # We don't actually store the parser object: that lets extracted
        # chunks be garbage-collected
        self.parserClass = parser.__class__
        self.isSelfClosing = parser.isSelfClosingTag(name)
        self.name = name
        if attrs is None:
            attrs = []
        elif isinstance(attrs, dict):
            attrs = attrs.items()
        # Attributes are a list of (key, value) pairs, not a dict, so
        # duplicate attributes from bad HTML are preserved.
        self.attrs = attrs
        self.contents = []
        self.setup(parent, previous)
        self.hidden = False
        self.containsSubstitutions = False
        # Entity-handling policy is copied from the parser at parse time.
        self.convertHTMLEntities = parser.convertHTMLEntities
        self.convertXMLEntities = parser.convertXMLEntities
        self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities

        # Convert any HTML, XML, or numeric entities in the attribute values.
        # (Tuple-unpacking lambda: Python 2-only syntax.)
        convert = lambda(k, val): (k,
                                   re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
                                          self._convertEntities,
                                          val))
        self.attrs = map(convert, self.attrs)
+
    def getString(self):
        # The tag's lone NavigableString child; implicitly returns None
        # when the tag has anything other than exactly one string child.
        if (len(self.contents) == 1
            and isinstance(self.contents[0], NavigableString)):
            return self.contents[0]

    def setString(self, string):
        """Replace the contents of the tag with a string"""
        self.clear()
        self.append(string)

    string = property(getString, setString)
+
+ def getText(self, separator=u""):
+ if not len(self.contents):
+ return u""
+ stopNode = self._lastRecursiveChild().next
+ strings = []
+ current = self.contents[0]
+ while current is not stopNode:
+ if isinstance(current, NavigableString):
+ strings.append(current.strip())
+ current = current.next
+ return separator.join(strings)
+
+ text = property(getText)
+
+ def get(self, key, default=None):
+ """Returns the value of the 'key' attribute for the tag, or
+ the value given for 'default' if it doesn't have that
+ attribute."""
+ return self._getAttrMap().get(key, default)
+
    def clear(self):
        """Extract all children."""
        # Iterate over a copy: extract() mutates self.contents.
        for child in self.contents[:]:
            child.extract()
+
+ def index(self, element):
+ for i, child in enumerate(self.contents):
+ if child is element:
+ return i
+ raise ValueError("Tag.index: element not in tag")
+
    def has_key(self, key):
        # Dict-style attribute membership test (Python 2 idiom).
        return self._getAttrMap().has_key(key)

    def __getitem__(self, key):
        """tag[key] returns the value of the 'key' attribute for the tag,
        and throws an exception if it's not there."""
        return self._getAttrMap()[key]

    def __iter__(self):
        "Iterating over a tag iterates over its contents."
        return iter(self.contents)

    def __len__(self):
        "The length of a tag is the length of its list of contents."
        return len(self.contents)

    def __contains__(self, x):
        # Membership is against direct children only, by equality.
        return x in self.contents

    def __nonzero__(self):
        "A tag is non-None even if it has no contents."
        return True
+
    def __setitem__(self, key, value):
        """Setting tag[key] sets the value of the 'key' attribute for the
        tag."""
        # Keep the cached attrMap and the authoritative attrs list in sync.
        self._getAttrMap()
        self.attrMap[key] = value
        found = False
        for i in range(0, len(self.attrs)):
            # No break: bad HTML can define the same attribute more than
            # once, and every occurrence gets the new value.
            if self.attrs[i][0] == key:
                self.attrs[i] = (key, value)
                found = True
        if not found:
            self.attrs.append((key, value))
        self._getAttrMap()[key] = value

    def __delitem__(self, key):
        "Deleting tag[key] deletes all 'key' attributes for the tag."
        # NOTE(review): this removes from self.attrs while iterating it, so
        # two adjacent duplicates of 'key' could leave one behind; kept as
        # upstream behavior -- confirm before changing.
        for item in self.attrs:
            if item[0] == key:
                self.attrs.remove(item)
                #We don't break because bad HTML can define the same
                #attribute multiple times.
        self._getAttrMap()
        if self.attrMap.has_key(key):
            del self.attrMap[key]
+
+ def __call__(self, *args, **kwargs):
+ """Calling a tag like a function is the same as calling its
+ findAll() method. Eg. tag('a') returns a list of all the A tags
+ found within this tag."""
+ return apply(self.findAll, args, kwargs)
+
    def __getattr__(self, tag):
        #print "Getattr %s.%s" % (self.__class__, tag)
        # soup.fooTag is old-style shorthand for soup.find('foo');
        # soup.foo is the same, for any name not starting with '__'.
        if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
            return self.find(tag[:-3])
        elif tag.find('__') != 0:
            return self.find(tag)
        raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag)
+
    def __eq__(self, other):
        """Returns true iff this tag has the same name, the same attributes,
        and the same contents (recursively) as the given tag.

        NOTE: right now this will return false if two tags have the
        same attributes in a different order. Should this be fixed?"""
        if other is self:
            return True
        # Duck-typed guard: anything without the Tag shape compares unequal.
        if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
            return False
        for i in range(0, len(self.contents)):
            if self.contents[i] != other.contents[i]:
                return False
        return True

    def __ne__(self, other):
        """Returns true iff this tag is not identical to the other tag,
        as defined in __eq__."""
        return not self == other

    def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Renders this tag as a string."""
        return self.__str__(encoding)

    def __unicode__(self):
        # Passing None for the encoding makes __str__ return unicode.
        return self.__str__(None)
+
    # An angle bracket, or an ampersand that is NOT already the start of an
    # HTML/XML/numeric entity reference.
    BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
                                           + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
                                           + ")")

    def _sub_entity(self, x):
        """Used with a regular expression to substitute the
        appropriate XML entity for an XML special character."""
        return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
+
    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
                prettyPrint=False, indentLevel=0):
        """Returns a string or Unicode representation of this tag and
        its contents. To get Unicode, pass None for encoding.

        NOTE: since Python's HTML parser consumes whitespace, this
        method is not certain to reproduce the whitespace present in
        the original string."""

        encodedName = self.toEncoding(self.name, encoding)

        attrs = []
        if self.attrs:
            for key, val in self.attrs:
                fmt = '%s="%s"'
                if isinstance(val, basestring):
                    if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
                        val = self.substituteEncoding(val, encoding)

                    # The attribute value either:
                    #
                    # * Contains no embedded double quotes or single quotes.
                    #   No problem: we enclose it in double quotes.
                    # * Contains embedded single quotes. No problem:
                    #   double quotes work here too.
                    # * Contains embedded double quotes. No problem:
                    #   we enclose it in single quotes.
                    # * Embeds both single _and_ double quotes. This
                    #   can't happen naturally, but it can happen if
                    #   you modify an attribute value after parsing
                    #   the document. Now we have a bit of a
                    #   problem. We solve it by enclosing the
                    #   attribute in single quotes, and escaping any
                    #   embedded single quotes to XML entities.
                    if '"' in val:
                        fmt = "%s='%s'"
                        if "'" in val:
                            # TODO: replace with apos when
                            # appropriate.
                            val = val.replace("'", "&squot;")

                    # Now we're okay w/r/t quotes. But the attribute
                    # value might also contain angle brackets, or
                    # ampersands that aren't part of entities. We need
                    # to escape those to XML entities too.
                    val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)

                attrs.append(fmt % (self.toEncoding(key, encoding),
                                    self.toEncoding(val, encoding)))
        close = ''
        closeTag = ''
        if self.isSelfClosing:
            close = ' /'
        else:
            closeTag = '</%s>' % encodedName

        indentTag, indentContents = 0, 0
        if prettyPrint:
            indentTag = indentLevel
            space = (' ' * (indentTag-1))
            indentContents = indentTag + 1
        contents = self.renderContents(encoding, prettyPrint, indentContents)
        if self.hidden:
            # Hidden tags (e.g. the soup object itself) render only
            # their contents, with no enclosing markup.
            s = contents
        else:
            s = []
            attributeString = ''
            if attrs:
                attributeString = ' ' + ' '.join(attrs)
            if prettyPrint:
                s.append(space)
            s.append('<%s%s%s>' % (encodedName, attributeString, close))
            if prettyPrint:
                s.append("\n")
            s.append(contents)
            if prettyPrint and contents and contents[-1] != "\n":
                s.append("\n")
            if prettyPrint and closeTag:
                s.append(space)
            s.append(closeTag)
            if prettyPrint and closeTag and self.nextSibling:
                s.append("\n")
            s = ''.join(s)
        return s
+
+ def decompose(self):
+ """Recursively destroys the contents of this tree."""
+ self.extract()
+ if len(self.contents) == 0:
+ return
+ current = self.contents[0]
+ while current is not None:
+ next = current.next
+ if isinstance(current, Tag):
+ del current.contents[:]
+ current.parent = None
+ current.previous = None
+ current.previousSibling = None
+ current.next = None
+ current.nextSibling = None
+ current = next
+
+ def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
+ return self.__str__(encoding, True)
+
    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                       prettyPrint=False, indentLevel=0):
        """Renders the contents of this tag as a string in the given
        encoding. If encoding is None, returns a Unicode string."""
        s=[]
        for c in self:
            text = None
            if isinstance(c, NavigableString):
                text = c.__str__(encoding)
            elif isinstance(c, Tag):
                # Child tags render themselves recursively at the same
                # indent level.
                s.append(c.__str__(encoding, prettyPrint, indentLevel))
            # When pretty-printing, surrounding whitespace in text nodes
            # is dropped; a node that strips to '' is omitted entirely.
            if text and prettyPrint:
                text = text.strip()
            if text:
                if prettyPrint:
                    s.append(" " * (indentLevel-1))
                s.append(text)
                if prettyPrint:
                    s.append("\n")
        return ''.join(s)
+
+ #Soup methods
+
+ def find(self, name=None, attrs={}, recursive=True, text=None,
+ **kwargs):
+ """Return only the first child of this Tag matching the given
+ criteria."""
+ r = None
+ l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
+ if l:
+ r = l[0]
+ return r
+ findChild = find
+
+ def findAll(self, name=None, attrs={}, recursive=True, text=None,
+ limit=None, **kwargs):
+ """Extracts a list of Tag objects that match the given
+ criteria. You can specify the name of the Tag and any
+ attributes you want the Tag to have.
+
+ The value of a key-value pair in the 'attrs' map can be a
+ string, a list of strings, a regular expression object, or a
+ callable that takes a string and returns whether or not the
+ string matches for some custom definition of 'matches'. The
+ same is true of the tag name."""
+ generator = self.recursiveChildGenerator
+ if not recursive:
+ generator = self.childGenerator
+ return self._findAll(name, attrs, text, limit, generator, **kwargs)
+ findChildren = findAll
+
+ # Pre-3.x compatibility methods
+ first = find
+ fetch = findAll
+
+ def fetchText(self, text=None, recursive=True, limit=None):
+ return self.findAll(text=text, recursive=recursive, limit=limit)
+
+ def firstText(self, text=None, recursive=True):
+ return self.find(text=text, recursive=recursive)
+
+ #Private methods
+
+ def _getAttrMap(self):
+ """Initializes a map representation of this tag's attributes,
+ if not already initialized."""
+ if not getattr(self, 'attrMap'):
+ self.attrMap = {}
+ for (key, value) in self.attrs:
+ self.attrMap[key] = value
+ return self.attrMap
+
+ #Generator methods
+ def childGenerator(self):
+ # Just use the iterator from the contents
+ return iter(self.contents)
+
+ def recursiveChildGenerator(self):
+ if not len(self.contents):
+ raise StopIteration
+ stopNode = self._lastRecursiveChild().next
+ current = self.contents[0]
+ while current is not stopNode:
+ yield current
+ current = current.next
+
+
+# Next, a couple classes to represent queries and their results.
# Next, a couple classes to represent queries and their results.
class SoupStrainer:
    """Encapsulates a number of ways of matching a markup element (tag or
    text).

    'name' and each value in 'attrs' may be a string, a list of
    strings, a compiled regular expression, True, or a callable; see
    _matches for the exact semantics of each."""

    def __init__(self, name=None, attrs={}, text=None, **kwargs):
        self.name = name
        # A bare string for 'attrs' is shorthand for a CSS class match.
        if isinstance(attrs, basestring):
            kwargs['class'] = _match_css_class(attrs)
            attrs = None
        if kwargs:
            if attrs:
                # Copy before merging so neither the caller's dict nor
                # the shared {} default is ever mutated.
                attrs = attrs.copy()
                attrs.update(kwargs)
            else:
                attrs = kwargs
        self.attrs = attrs
        self.text = text

    def __str__(self):
        if self.text:
            return self.text
        else:
            return "%s|%s" % (self.name, self.attrs)

    def searchTag(self, markupName=None, markupAttrs={}):
        """Returns the matched tag (or tag name) if this strainer
        matches the given tag, else None.

        'markupName' may be a Tag object or a bare tag name; when it is
        a Tag, the Tag itself doubles as the attribute source."""
        found = None
        markup = None
        if isinstance(markupName, Tag):
            markup = markupName
            markupAttrs = markup
        # A callable 'name' acts as a custom matcher over
        # (name, attrs) pairs -- but only when we were not handed a
        # full Tag object.
        callFunctionWithTagData = callable(self.name) \
                                and not isinstance(markupName, Tag)

        if (not self.name) \
               or callFunctionWithTagData \
               or (markup and self._matches(markup, self.name)) \
               or (not markup and self._matches(markupName, self.name)):
            if callFunctionWithTagData:
                match = self.name(markupName, markupAttrs)
            else:
                match = True
                markupAttrMap = None
                # Every attribute constraint must match; the attr map is
                # built lazily, only if there are constraints to check.
                for attr, matchAgainst in self.attrs.items():
                    if not markupAttrMap:
                        if hasattr(markupAttrs, 'get'):
                            markupAttrMap = markupAttrs
                        else:
                            markupAttrMap = {}
                            for k,v in markupAttrs:
                                markupAttrMap[k] = v
                    attrValue = markupAttrMap.get(attr)
                    if not self._matches(attrValue, matchAgainst):
                        match = False
                        break
            if match:
                if markup:
                    found = markup
                else:
                    found = markupName
        return found

    def search(self, markup):
        """Dispatches on the type of 'markup': list-like, Tag, or
        string. Returns the matched element or None."""
        #print 'looking for %s in %s' % (self, markup)
        found = None
        # If given a list of items, scan it for a text element that
        # matches.
        if hasattr(markup, "__iter__") \
                and not isinstance(markup, Tag):
            for element in markup:
                if isinstance(element, NavigableString) \
                       and self.search(element):
                    found = element
                    break
        # If it's a Tag, make sure its name or attributes match.
        # Don't bother with Tags if we're searching for text.
        elif isinstance(markup, Tag):
            if not self.text:
                found = self.searchTag(markup)
        # If it's text, make sure the text matches.
        elif isinstance(markup, NavigableString) or \
                 isinstance(markup, basestring):
            if self._matches(markup, self.text):
                found = markup
        else:
            raise Exception, "I don't know how to match against a %s" \
                  % markup.__class__
        return found

    def _matches(self, markup, matchAgainst):
        """Core matching routine. 'matchAgainst' may be True (match any
        non-None markup), a callable, a regexp, a list, a dict, or a
        string; 'markup' is reduced to a string (or None) for all but
        the first two cases."""
        #print "Matching %s against %s" % (markup, matchAgainst)
        result = False
        if matchAgainst is True:
            result = markup is not None
        elif callable(matchAgainst):
            result = matchAgainst(markup)
        else:
            #Custom match methods take the tag as an argument, but all
            #other ways of matching match the tag name as a string.
            if isinstance(markup, Tag):
                markup = markup.name
            if markup and not isinstance(markup, basestring):
                markup = unicode(markup)
            #Now we know that chunk is either a string, or None.
            if hasattr(matchAgainst, 'match'):
                # It's a regexp object.
                result = markup and matchAgainst.search(markup)
            elif hasattr(matchAgainst, '__iter__'): # list-like
                result = markup in matchAgainst
            elif hasattr(matchAgainst, 'items'):
                # NOTE(review): 'markup' is a string (or None) here and
                # strings have no has_key; this branch looks like it
                # would raise AttributeError. The operands were probably
                # meant to be matchAgainst.has_key(markup) -- confirm
                # before changing.
                result = markup.has_key(matchAgainst)
            elif matchAgainst and isinstance(markup, basestring):
                # Coerce matchAgainst to the same string type as markup
                # before the final equality check.
                if isinstance(markup, unicode):
                    matchAgainst = unicode(matchAgainst)
                else:
                    matchAgainst = str(matchAgainst)

            if not result:
                result = matchAgainst == markup
        return result
+
class ResultSet(list):
    """A ResultSet is just a list that keeps track of the SoupStrainer
    that created it."""
    def __init__(self, source):
        # Fix: initialize *this* list. The original called
        # list.__init__([]), which initialized a throwaway anonymous
        # list and left self untouched (harmless only because a new
        # list instance starts empty anyway).
        list.__init__(self)
        self.source = source
+
+# Now, some helper functions.
+
def buildTagMap(default, *args):
    """Turns a list of maps, lists, or scalars into a single map.
    Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
    NESTING_RESET_TAGS maps out of lists and partial maps."""
    mapping = {}
    for portion in args:
        if hasattr(portion, 'items'):
            # A map: merge it in, later entries overriding earlier ones.
            mapping.update(portion)
        elif hasattr(portion, '__iter__'): # is a list
            # A list: every member maps to the default.
            for key in portion:
                mapping[key] = default
        else:
            # A lone scalar maps to the default.
            mapping[portion] = default
    return mapping
+
+# Now, the parser classes.
+
class BeautifulStoneSoup(Tag, SGMLParser):

    """This class contains the basic parser and search code. It defines
    a parser that knows nothing about tag behavior except for the
    following:

      You can't close a tag without closing all the tags it encloses.
      That is, "<foo><bar></foo>" actually means
      "<foo><bar></bar></foo>".

    [Another possible explanation is "<foo><bar /></foo>", but since
    this class defines no SELF_CLOSING_TAGS, it will never use that
    explanation.]

    This class is useful for parsing XML or made-up markup languages,
    or when BeautifulSoup makes an assumption counter to what you were
    expecting."""

    # Subclasses override these maps to describe tag behavior; here they
    # are all empty, so every tag is treated uniformly.
    SELF_CLOSING_TAGS = {}
    NESTABLE_TAGS = {}
    RESET_NESTING_TAGS = {}
    QUOTE_TAGS = {}
    PRESERVE_WHITESPACE_TAGS = []

    # Default regex fixups applied before parsing; see __init__'s
    # docstring for the two cases they repair.
    MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
                       lambda x: x.group(1) + ' />'),
                      (re.compile('<!\s+([^<>]*)>'),
                       lambda x: '<!' + x.group(1) + '>')
                      ]

    # Name of the hidden root tag the soup object itself represents.
    ROOT_TAG_NAME = u'[document]'

    HTML_ENTITIES = "html"
    XML_ENTITIES = "xml"
    XHTML_ENTITIES = "xhtml"
    # TODO: This only exists for backwards-compatibility
    ALL_ENTITIES = XHTML_ENTITIES

    # Used when determining whether a text node is all whitespace and
    # can be replaced with a single space. A text node that contains
    # fancy Unicode spaces (usually non-breaking) should be left
    # alone.
    STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }

    def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
                 markupMassage=True, smartQuotesTo=XML_ENTITIES,
                 convertEntities=None, selfClosingTags=None, isHTML=False):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser.

        sgmllib will process most bad HTML, and the BeautifulSoup
        class has some tricks for dealing with some HTML that kills
        sgmllib, but Beautiful Soup can nonetheless choke or lose data
        if your data uses self-closing tags or declarations
        incorrectly.

        By default, Beautiful Soup uses regexes to sanitize input,
        avoiding the vast majority of these problems. If the problems
        don't apply to you, pass in False for markupMassage, and
        you'll get better performance.

        The default parser massage techniques fix the two most common
        instances of invalid HTML that choke sgmllib:

         <br/> (No space between name of closing tag and tag close)
         <! --Comment--> (Extraneous whitespace in declaration)

        You can pass in a custom list of (RE object, replace method)
        tuples to get Beautiful Soup to scrub your input the way you
        want."""

        self.parseOnlyThese = parseOnlyThese
        self.fromEncoding = fromEncoding
        self.smartQuotesTo = smartQuotesTo
        self.convertEntities = convertEntities
        # Set the rules for how we'll deal with the entities we
        # encounter
        if self.convertEntities:
            # It doesn't make sense to convert encoded characters to
            # entities even while you're converting entities to Unicode.
            # Just convert it all to Unicode.
            self.smartQuotesTo = None
            if convertEntities == self.HTML_ENTITIES:
                self.convertXMLEntities = False
                self.convertHTMLEntities = True
                self.escapeUnrecognizedEntities = True
            elif convertEntities == self.XHTML_ENTITIES:
                self.convertXMLEntities = True
                self.convertHTMLEntities = True
                self.escapeUnrecognizedEntities = False
            elif convertEntities == self.XML_ENTITIES:
                self.convertXMLEntities = True
                self.convertHTMLEntities = False
                self.escapeUnrecognizedEntities = False
        else:
            self.convertXMLEntities = False
            self.convertHTMLEntities = False
            self.escapeUnrecognizedEntities = False

        self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
        SGMLParser.__init__(self)

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        self.markup = markup
        self.markupMassage = markupMassage
        try:
            self._feed(isHTML=isHTML)
        except StopParsing:
            # Raised (e.g. by start_meta in a subclass) to abort and
            # re-parse; by then _feed has already re-run itself.
            pass
        self.markup = None                 # The markup can now be GCed

    def convert_charref(self, name):
        """This method fixes a bug in Python's SGMLParser."""
        try:
            n = int(name)
        except ValueError:
            return
        if not 0 <= n <= 127 : # ASCII ends at 127, not 255
            return
        return self.convert_codepoint(n)

    def _feed(self, inDocumentEncoding=None, isHTML=False):
        """Converts the markup to Unicode (via UnicodeDammit unless it
        already is Unicode), applies the markupMassage fixups, then
        feeds it to SGMLParser and closes any still-open tags."""
        # Convert the document to Unicode.
        markup = self.markup
        if isinstance(markup, unicode):
            if not hasattr(self, 'originalEncoding'):
                self.originalEncoding = None
        else:
            dammit = UnicodeDammit\
                     (markup, [self.fromEncoding, inDocumentEncoding],
                      smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
            markup = dammit.unicode
            self.originalEncoding = dammit.originalEncoding
            self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
        if markup:
            if self.markupMassage:
                # markupMassage=True selects the default fixups; a
                # non-iterable truthy value behaves the same way.
                if not hasattr(self.markupMassage, "__iter__"):
                    self.markupMassage = self.MARKUP_MASSAGE
                for fix, m in self.markupMassage:
                    markup = fix.sub(m, markup)
                # TODO: We get rid of markupMassage so that the
                # soup object can be deepcopied later on. Some
                # Python installations can't copy regexes. If anyone
                # was relying on the existence of markupMassage, this
                # might cause problems.
                del(self.markupMassage)
        self.reset()

        SGMLParser.feed(self, markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def __getattr__(self, methodName):
        """This method routes method call requests to either the SGMLParser
        superclass or the Tag superclass, depending on the method name."""
        #print "__getattr__ called on %s.%s" % (self.__class__, methodName)

        if methodName.startswith('start_') or methodName.startswith('end_') \
               or methodName.startswith('do_'):
            return SGMLParser.__getattr__(self, methodName)
        elif not methodName.startswith('__'):
            return Tag.__getattr__(self, methodName)
        else:
            raise AttributeError

    def isSelfClosingTag(self, name):
        """Returns true iff the given string is the name of a
        self-closing tag according to this parser."""
        return self.SELF_CLOSING_TAGS.has_key(name) \
               or self.instanceSelfClosingTags.has_key(name)

    def reset(self):
        """Resets all parser state. The soup object itself doubles as
        the hidden root '[document]' tag and sits at the bottom of the
        tag stack."""
        Tag.__init__(self, self, self.ROOT_TAG_NAME)
        self.hidden = 1
        SGMLParser.reset(self)
        self.currentData = []
        self.currentTag = None
        self.tagStack = []
        self.quoteStack = []
        self.pushTag(self)

    def popTag(self):
        """Pops the top tag off the stack and returns the new current
        (i.e. innermost open) tag."""
        tag = self.tagStack.pop()

        #print "Pop", tag.name
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        """Appends the tag to the current tag's contents and makes it
        the new current tag."""
        #print "Push", tag.name
        if self.currentTag:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]

    def endData(self, containerClass=NavigableString):
        """Flushes any accumulated character data into the tree as a
        containerClass node attached to the current tag."""
        if self.currentData:
            currentData = u''.join(self.currentData)
            # Collapse all-ASCII-whitespace runs to a single space (or
            # newline), unless we are inside a whitespace-preserving tag.
            if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
                not set([tag.name for tag in self.tagStack]).intersection(
                    self.PRESERVE_WHITESPACE_TAGS)):
                if '\n' in currentData:
                    currentData = '\n'
                else:
                    currentData = ' '
            self.currentData = []
            # When parsing selectively, drop top-level text the
            # strainer does not want.
            if self.parseOnlyThese and len(self.tagStack) <= 1 and \
                   (not self.parseOnlyThese.text or \
                    not self.parseOnlyThese.search(currentData)):
                return
            o = containerClass(currentData)
            o.setup(self.currentTag, self.previous)
            if self.previous:
                self.previous.next = o
            self.previous = o
            self.currentTag.contents.append(o)


    def _popToTag(self, name, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but *not* including the most recent instance of
        the given tag."""
        #print "Popping to %s" % name
        if name == self.ROOT_TAG_NAME:
            # The hidden root tag is never popped.
            return

        numPops = 0
        mostRecentTag = None
        # Index 0 is the root tag, so the scan stops at index 1.
        for i in range(len(self.tagStack)-1, 0, -1):
            if name == self.tagStack[i].name:
                numPops = len(self.tagStack)-i
                break
        if not inclusivePop:
            numPops = numPops - 1

        for i in range(0, numPops):
            mostRecentTag = self.popTag()
        return mostRecentTag

    def _smartPop(self, name):

        """We need to pop up to the previous tag of this type, unless
        one of this tag's nesting reset triggers comes between this
        tag and the previous tag of this type, OR unless this tag is a
        generic nesting trigger and another generic nesting trigger
        comes between this tag and the previous tag of this type.

        Examples:
         <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
         <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
         <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.

         <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
         <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
         <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
        """

        nestingResetTriggers = self.NESTABLE_TAGS.get(name)
        isNestable = nestingResetTriggers != None
        isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
        popTo = None
        inclusive = True
        for i in range(len(self.tagStack)-1, 0, -1):
            p = self.tagStack[i]
            if (not p or p.name == name) and not isNestable:
                #Non-nestable tags get popped to the top or to their
                #last occurance.
                popTo = name
                break
            if (nestingResetTriggers is not None
                and p.name in nestingResetTriggers) \
                or (nestingResetTriggers is None and isResetNesting
                    and self.RESET_NESTING_TAGS.has_key(p.name)):

                #If we encounter one of the nesting reset triggers
                #peculiar to this tag, or we encounter another tag
                #that causes nesting to reset, pop up to but not
                #including that tag.
                popTo = p.name
                inclusive = False
                break
            # NOTE(review): this assignment is immediately overwritten
            # by 'p = self.tagStack[i]' on the next iteration -- it
            # looks like dead leftover code; confirm before removing.
            p = p.parent
        if popTo:
            self._popToTag(popTo, inclusive)

    def unknown_starttag(self, name, attrs, selfClosing=0):
        """SGMLParser callback for an opening tag: builds a Tag node,
        applying the smart-popping and self-closing rules."""
        #print "Start tag %s: %s" % (name, attrs)
        if self.quoteStack:
            #This is not a real tag.
            #print "<%s> is not real!" % name
            attrs = ''.join([' %s="%s"' % (x, y) for x, y in attrs])
            self.handle_data('<%s%s>' % (name, attrs))
            return
        self.endData()

        if not self.isSelfClosingTag(name) and not selfClosing:
            self._smartPop(name)

        # When parsing selectively, skip (do not build) top-level tags
        # the strainer rejects.
        if self.parseOnlyThese and len(self.tagStack) <= 1 \
               and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
            return

        tag = Tag(self, name, attrs, self.currentTag, self.previous)
        if self.previous:
            self.previous.next = tag
        self.previous = tag
        self.pushTag(tag)
        if selfClosing or self.isSelfClosingTag(name):
            self.popTag()
        if name in self.QUOTE_TAGS:
            #print "Beginning quote (%s)" % name
            # Inside a quote tag (e.g. <script>) nested markup is
            # treated as literal text until the matching end tag.
            self.quoteStack.append(name)
            self.literal = 1
        return tag

    def unknown_endtag(self, name):
        """SGMLParser callback for a closing tag."""
        #print "End tag %s" % name
        if self.quoteStack and self.quoteStack[-1] != name:
            #This is not a real end tag.
            #print "</%s> is not real!" % name
            self.handle_data('</%s>' % name)
            return
        self.endData()
        self._popToTag(name)
        if self.quoteStack and self.quoteStack[-1] == name:
            self.quoteStack.pop()
            self.literal = (len(self.quoteStack) > 0)

    def handle_data(self, data):
        """Accumulates character data until endData() flushes it."""
        self.currentData.append(data)

    def _toStringSubclass(self, text, subclass):
        """Adds a certain piece of text to the tree as a NavigableString
        subclass."""
        self.endData()
        self.handle_data(text)
        self.endData(subclass)

    def handle_pi(self, text):
        """Handle a processing instruction as a ProcessingInstruction
        object, possibly one with a %SOUP-ENCODING% slot into which an
        encoding will be plugged later."""
        if text[:3] == "xml":
            text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
        self._toStringSubclass(text, ProcessingInstruction)

    def handle_comment(self, text):
        "Handle comments as Comment objects."
        self._toStringSubclass(text, Comment)

    def handle_charref(self, ref):
        "Handle character references as data."
        if self.convertEntities:
            data = unichr(int(ref))
        else:
            data = '&#%s;' % ref
        self.handle_data(data)

    def handle_entityref(self, ref):
        """Handle entity references as data, possibly converting known
        HTML and/or XML entity references to the corresponding Unicode
        characters."""
        data = None
        if self.convertHTMLEntities:
            try:
                data = unichr(name2codepoint[ref])
            except KeyError:
                pass

        if not data and self.convertXMLEntities:
                data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)

        if not data and self.convertHTMLEntities and \
            not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
                # TODO: We've got a problem here. We're told this is
                # an entity reference, but it's not an XML entity
                # reference or an HTML entity reference. Nonetheless,
                # the logical thing to do is to pass it through as an
                # unrecognized entity reference.
                #
                # Except: when the input is "&carol;" this function
                # will be called with input "carol". When the input is
                # "AT&T", this function will be called with input
                # "T". We have no way of knowing whether a semicolon
                # was present originally, so we don't know whether
                # this is an unknown entity or just a misplaced
                # ampersand.
                #
                # The more common case is a misplaced ampersand, so I
                # escape the ampersand and omit the trailing semicolon.
                data = "&%s" % ref
        if not data:
            # This case is different from the one above, because we
            # haven't already gone through a supposedly comprehensive
            # mapping of entities to Unicode characters. We might not
            # have gone through any mapping at all. So the chances are
            # very high that this is a real entity, and not a
            # misplaced ampersand.
            data = "&%s;" % ref
        self.handle_data(data)

    def handle_decl(self, data):
        "Handle DOCTYPEs and the like as Declaration objects."
        self._toStringSubclass(data, Declaration)

    def parse_declaration(self, i):
        """Treat a bogus SGML declaration as raw data. Treat a CDATA
        declaration as a CData object."""
        j = None
        if self.rawdata[i:i+9] == '<![CDATA[':
             k = self.rawdata.find(']]>', i)
             if k == -1:
                 # Unterminated CDATA: consume to end of input.
                 k = len(self.rawdata)
             data = self.rawdata[i+9:k]
             j = k+3
             self._toStringSubclass(data, CData)
        else:
            try:
                j = SGMLParser.parse_declaration(self, i)
            except SGMLParseError:
                toHandle = self.rawdata[i:]
                self.handle_data(toHandle)
                j = i + len(toHandle)
        return j
+
class BeautifulSoup(BeautifulStoneSoup):

    """This parser knows the following facts about HTML:

    * Some tags have no closing tag and should be interpreted as being
      closed as soon as they are encountered.

    * The text inside some tags (ie. 'script') may contain tags which
      are not really part of the document and which should be parsed
      as text, not tags. If you want to parse the text as tags, you can
      always fetch it and parse it explicitly.

    * Tag nesting rules:

      Most tags can't be nested at all. For instance, the occurrence of
      a <p> tag should implicitly close the previous <p> tag.

       <p>Para1<p>Para2
        should be transformed into:
       <p>Para1</p><p>Para2

      Some tags can be nested arbitrarily. For instance, the occurrence
      of a <blockquote> tag should _not_ implicitly close the previous
      <blockquote> tag.

       Alice said: <blockquote>Bob said: <blockquote>Blah
        should NOT be transformed into:
       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah

      Some tags can be nested, but the nesting is reset by the
      interposition of other tags. For instance, a <tr> tag should
      implicitly close the previous <tr> tag within the same <table>,
      but not close a <tr> tag in another table.

       <table><tr>Blah<tr>Blah
        should be transformed into:
       <table><tr>Blah</tr><tr>Blah
        but,
       <tr>Blah<table><tr>Blah
        should NOT be transformed into
       <tr>Blah<table></tr><tr>Blah

    Differing assumptions about tag nesting rules are a major source
    of problems with the BeautifulSoup class. If BeautifulSoup is not
    treating as nestable a tag your page author treats as nestable,
    try ICantBelieveItsBeautifulSoup, MinimalSoup, or
    BeautifulStoneSoup before writing your own subclass."""

    def __init__(self, *args, **kwargs):
        # Default smart-quote handling to HTML entities and flag the
        # input as HTML before handing off to the base parser.
        if not kwargs.has_key('smartQuotesTo'):
            kwargs['smartQuotesTo'] = self.HTML_ENTITIES
        kwargs['isHTML'] = True
        BeautifulStoneSoup.__init__(self, *args, **kwargs)

    SELF_CLOSING_TAGS = buildTagMap(None,
                                    ('br' , 'hr', 'input', 'img', 'meta',
                                    'spacer', 'link', 'frame', 'base', 'col'))

    PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])

    # Content of these tags is treated as literal text until the
    # matching end tag (see quoteStack handling in the base class).
    QUOTE_TAGS = {'script' : None, 'textarea' : None}

    #According to the HTML standard, each of these inline tags can
    #contain another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_INLINE_TAGS = ('span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
                            'center')

    #According to the HTML standard, these block tags can contain
    #another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_BLOCK_TAGS = ('blockquote', 'div', 'fieldset', 'ins', 'del')

    #Lists can contain other lists, but there are restrictions.
    NESTABLE_LIST_TAGS = { 'ol' : [],
                           'ul' : [],
                           'li' : ['ul', 'ol'],
                           'dl' : [],
                           'dd' : ['dl'],
                           'dt' : ['dl'] }

    #Tables can contain other tables, but there are restrictions.
    NESTABLE_TABLE_TAGS = {'table' : [],
                           'tr' : ['table', 'tbody', 'tfoot', 'thead'],
                           'td' : ['tr'],
                           'th' : ['tr'],
                           'thead' : ['table'],
                           'tbody' : ['table'],
                           'tfoot' : ['table'],
                           }

    NON_NESTABLE_BLOCK_TAGS = ('address', 'form', 'p', 'pre')

    #If one of these tags is encountered, all tags up to the next tag of
    #this type are popped.
    RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
                                     NON_NESTABLE_BLOCK_TAGS,
                                     NESTABLE_LIST_TAGS,
                                     NESTABLE_TABLE_TAGS)

    NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
                                NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)

    # Used to detect the charset in a META tag; see start_meta
    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)

    def start_meta(self, attrs):
        """Beautiful Soup can detect a charset included in a META tag,
        try to convert the document to that charset, and re-parse the
        document from the beginning."""
        httpEquiv = None
        contentType = None
        contentTypeIndex = None
        tagNeedsEncodingSubstitution = False

        # Pull out http-equiv and content, remembering where content
        # sits so it can be rewritten in place below.
        for i in range(0, len(attrs)):
            key, value = attrs[i]
            key = key.lower()
            if key == 'http-equiv':
                httpEquiv = value
            elif key == 'content':
                contentType = value
                contentTypeIndex = i

        if httpEquiv and contentType: # It's an interesting meta tag.
            match = self.CHARSET_RE.search(contentType)
            if match:
                if (self.declaredHTMLEncoding is not None or
                    self.originalEncoding == self.fromEncoding):
                    # An HTML encoding was sniffed while converting
                    # the document to Unicode, or an HTML encoding was
                    # sniffed during a previous pass through the
                    # document, or an encoding was specified
                    # explicitly and it worked. Rewrite the meta tag.
                    def rewrite(match):
                        return match.group(1) + "%SOUP-ENCODING%"
                    newAttr = self.CHARSET_RE.sub(rewrite, contentType)
                    attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
                                               newAttr)
                    tagNeedsEncodingSubstitution = True
                else:
                    # This is our first pass through the document.
                    # Go through it again with the encoding information.
                    newCharset = match.group(3)
                    if newCharset and newCharset != self.originalEncoding:
                        self.declaredHTMLEncoding = newCharset
                        self._feed(self.declaredHTMLEncoding)
                        raise StopParsing
                    pass
        tag = self.unknown_starttag("meta", attrs)
        if tag and tagNeedsEncodingSubstitution:
            tag.containsSubstitutions = True
+
class StopParsing(Exception):
    """Raised to abort the current parse; caught in
    BeautifulStoneSoup.__init__ after a META-declared encoding forces
    the document to be re-fed."""
    pass
+
class ICantBelieveItsBeautifulSoup(BeautifulSoup):

    """The BeautifulSoup class is oriented towards skipping over
    common HTML errors like unclosed tags. However, sometimes it makes
    errors of its own. For instance, consider this fragment:

     <b>Foo<b>Bar</b></b>

    This is perfectly valid (if bizarre) HTML. However, the
    BeautifulSoup class will implicitly close the first b tag when it
    encounters the second 'b'. It will think the author wrote
    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
    there's no real-world reason to bold something that's already
    bold. When it encounters '</b></b>' it will close two more 'b'
    tags, for a grand total of three tags closed instead of two. This
    can throw off the rest of your document structure. The same is
    true of a number of other tags, listed below.

    It's much more common for someone to forget to close a 'b' tag
    than to actually use nested 'b' tags, and the BeautifulSoup class
    handles the common case. This class handles the not-so-common
    case: where you can't believe someone wrote what they did, but
    it's valid HTML and BeautifulSoup screwed up by assuming it
    wouldn't be."""

    # NOTE(review): 'strong' and 'big' each appear twice in this tuple.
    # Harmless in practice -- buildTagMap deduplicates into dict keys --
    # but probably unintended.
    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
     ('em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
      'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
      'big')

    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ('noscript',)

    NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
+
class MinimalSoup(BeautifulSoup):
    """The MinimalSoup class is for parsing HTML that contains
    pathologically bad markup. It makes no assumptions about tag
    nesting, but it does know which tags are self-closing, that
    <script> tags contain Javascript and should not be parsed, that
    META tags may contain encoding information, and so on.

    This also makes it better for subclassing than BeautifulStoneSoup
    or BeautifulSoup."""

    # NOTE(review): buildTagMap('noscript') passes 'noscript' as the
    # *default* value with no tag arguments, so this evaluates to an
    # empty map -- i.e. no tags reset nesting. If the intent was for
    # 'noscript' to be a nesting-reset tag, this would presumably be
    # buildTagMap(None, 'noscript'); confirm before changing, since
    # callers may rely on the current (empty) behavior.
    RESET_NESTING_TAGS = buildTagMap('noscript')
    NESTABLE_TAGS = {}
+
class BeautifulSOAP(BeautifulStoneSoup):
    """This class will push a tag with only a single string child into
    the tag's parent as an attribute. The attribute's name is the tag
    name, and the value is the string child. An example should give
    the flavor of the change:

    <foo><bar>baz</bar></foo>
     =>
    <foo bar="baz"><bar>baz</bar></foo>

    You can then access fooTag['bar'] instead of fooTag.barTag.string.

    This is, of course, useful for scraping structures that tend to
    use subelements instead of attributes, such as SOAP messages. Note
    that it modifies its input, so don't print the modified version
    out.

    I'm not sure how many people really want to use this class; let me
    know if you do. Mainly I like the name."""

    def popTag(self):
        # Before the standard pop, promote a tag whose only child is a
        # string into an attribute of its parent -- unless the parent
        # already has an attribute of that name.
        if len(self.tagStack) > 1:
            tag = self.tagStack[-1]
            parent = self.tagStack[-2]
            parent._getAttrMap()
            if (isinstance(tag, Tag) and len(tag.contents) == 1 and
                isinstance(tag.contents[0], NavigableString) and
                not parent.attrMap.has_key(tag.name)):
                parent[tag.name] = tag.contents[0]
        BeautifulStoneSoup.popTag(self)
+
+#Enterprise class names! It has come to our attention that some people
+#think the names of the Beautiful Soup parser classes are too silly
+#and "unprofessional" for use in enterprise screen-scraping. We feel
+#your pain! For such-minded folk, the Beautiful Soup Consortium And
+#All-Night Kosher Bakery recommends renaming this file to
+#"RobustParser.py" (or, in cases of extreme enterprisiness,
+#"RobustParserBeanInterface.class") and using the following
+#enterprise-friendly class aliases:
class RobustXMLParser(BeautifulStoneSoup):
    """Enterprise-friendly alias for BeautifulStoneSoup."""
    pass
class RobustHTMLParser(BeautifulSoup):
    """Enterprise-friendly alias for BeautifulSoup."""
    pass
class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
    """Enterprise-friendly alias for ICantBelieveItsBeautifulSoup."""
    pass
class RobustInsanelyWackAssHTMLParser(MinimalSoup):
    """Enterprise-friendly alias for MinimalSoup."""
    pass
class SimplifyingSOAPParser(BeautifulSOAP):
    """Enterprise-friendly alias for BeautifulSOAP."""
    pass
+
+######################################################
+#
+# Bonus library: Unicode, Dammit
+#
+# This class forces XML data into a standard format (usually to UTF-8
+# or Unicode). It is heavily based on code from Mark Pilgrim's
+# Universal Feed Parser. It does not rewrite the XML or HTML to
+# reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
+# (XML) and BeautifulSoup.start_meta (HTML).
+
+# Autodetects character encodings.
+# Download from http://chardet.feedparser.org/
+try:
+ import chardet
+# import chardet.constants
+# chardet.constants._debug = 1
+except ImportError:
+ chardet = None
+
+# cjkcodecs and iconv_codec make Python know about more character encodings.
+# Both are available from http://cjkpython.i18n.org/
+# They're built in if you use Python 2.4.
+try:
+ import cjkcodecs.aliases
+except ImportError:
+ pass
+try:
+ import iconv_codec
+except ImportError:
+ pass
+
class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents."""

    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = { "macintosh" : "mac-roman",
                        "x-sjis" : "shift-jis" }

    def __init__(self, markup, overrideEncodings=[],
                 smartQuotesTo='xml', isHTML=False):
        # NOTE(review): overrideEncodings is a mutable default argument;
        # it is only iterated, never mutated, so it is safe but fragile.
        self.declaredHTMLEncoding = None
        self.markup, documentEncoding, sniffedEncoding = \
                     self._detectEncoding(markup, isHTML)
        self.smartQuotesTo = smartQuotesTo
        self.triedEncodings = []
        # Already-unicode (or empty) input needs no conversion at all.
        if markup == '' or isinstance(markup, unicode):
            self.originalEncoding = None
            self.unicode = unicode(markup)
            return

        # Candidates are tried in priority order: caller overrides,
        # then declared/sniffed encodings, then chardet's guess, then
        # the utf-8 / windows-1252 fallbacks.
        u = None
        for proposedEncoding in overrideEncodings:
            u = self._convertFrom(proposedEncoding)
            if u: break
        if not u:
            for proposedEncoding in (documentEncoding, sniffedEncoding):
                u = self._convertFrom(proposedEncoding)
                if u: break

        # If no luck and we have auto-detection library, try that:
        if not u and chardet and not isinstance(self.markup, unicode):
            u = self._convertFrom(chardet.detect(self.markup)['encoding'])

        # As a last resort, try utf-8 and windows-1252:
        if not u:
            for proposed_encoding in ("utf-8", "windows-1252"):
                u = self._convertFrom(proposed_encoding)
                if u: break

        # self.unicode is None when every candidate failed.
        self.unicode = u
        if not u: self.originalEncoding = None

    def _subMSChar(self, orig):
        """Changes a MS smart quote character to an XML or HTML
        entity."""
        sub = self.MS_CHARS.get(orig)
        if isinstance(sub, tuple):
            if self.smartQuotesTo == 'xml':
                sub = '&#x%s;' % sub[1]
            else:
                sub = '&%s;' % sub[0]
        return sub

    def _convertFrom(self, proposed):
        """Try to decode self.markup with the proposed encoding.

        Returns the converted markup on success, or None when the codec
        is unknown, was already tried, or the conversion raised."""
        proposed = self.find_codec(proposed)
        if not proposed or proposed in self.triedEncodings:
            return None
        self.triedEncodings.append(proposed)
        markup = self.markup

        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if self.smartQuotesTo and proposed.lower() in("windows-1252",
                                                      "iso-8859-1",
                                                      "iso-8859-2"):
            # NOTE: lambda(x) is Python 2-only tuple-parameter syntax.
            markup = re.compile("([\x80-\x9f])").sub \
                     (lambda(x): self._subMSChar(x.group(1)),
                      markup)

        try:
            # print "Trying to convert document to %s" % proposed
            u = self._toUnicode(markup, proposed)
            self.markup = u
            self.originalEncoding = proposed
        except Exception, e:
            # print "That didn't work!"
            # print e
            return None
        #print "Correct encoding: %s" % proposed
        return self.markup

    def _toUnicode(self, data, encoding):
        '''Given a string and its encoding, decodes the string into Unicode.
        %encoding is a string recognized by encodings.aliases'''

        # strip Byte Order Mark (if present); a BOM overrides the
        # caller-supplied encoding.
        if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
               and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
                 and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == '\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == '\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == '\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        newdata = unicode(data, encoding)
        return newdata

    def _detectEncoding(self, xml_data, isHTML=False):
        """Given a document, tries to detect its XML encoding.

        Returns (possibly re-encoded data, declared encoding or None,
        sniffed encoding or None). BOM/byte-pattern sniffing converts
        UTF-16/32 input to UTF-8 before the declaration is searched."""
        xml_encoding = sniffed_xml_encoding = None
        try:
            if xml_data[:4] == '\x4c\x6f\xa7\x94':
                # EBCDIC
                xml_data = self._ebcdic_to_ascii(xml_data)
            elif xml_data[:4] == '\x00\x3c\x00\x3f':
                # UTF-16BE
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
                     and (xml_data[2:4] != '\x00\x00'):
                # UTF-16BE with BOM
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x3f\x00':
                # UTF-16LE
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
                     (xml_data[2:4] != '\x00\x00'):
                # UTF-16LE with BOM
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\x00\x3c':
                # UTF-32BE
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x00\x00':
                # UTF-32LE
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\xfe\xff':
                # UTF-32BE with BOM
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\xff\xfe\x00\x00':
                # UTF-32LE with BOM
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
            elif xml_data[:3] == '\xef\xbb\xbf':
                # UTF-8 with BOM
                sniffed_xml_encoding = 'utf-8'
                xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
            else:
                sniffed_xml_encoding = 'ascii'
                pass
        except:
            xml_encoding_match = None
        # Look for an explicit declaration: XML prolog first, then (for
        # HTML) a <meta ... charset=...> tag.
        xml_encoding_match = re.compile(
            '^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
        if not xml_encoding_match and isHTML:
            regexp = re.compile('<\s*meta[^>]+charset=([^>]*?)[;\'">]', re.I)
            xml_encoding_match = regexp.search(xml_data)
        if xml_encoding_match is not None:
            xml_encoding = xml_encoding_match.groups()[0].lower()
            if isHTML:
                self.declaredHTMLEncoding = xml_encoding
        # The sniffed byte pattern is more specific than a generic
        # UTF-16/UCS-2 style declaration, so prefer it.
        if sniffed_xml_encoding and \
           (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
                             'iso-10646-ucs-4', 'ucs-4', 'csucs4',
                             'utf-16', 'utf-32', 'utf_16', 'utf_32',
                             'utf16', 'u16')):
            xml_encoding = sniffed_xml_encoding
        return xml_data, xml_encoding, sniffed_xml_encoding


    def find_codec(self, charset):
        """Map a declared charset name to a codec Python knows, trying
        the alias table and hyphen-stripped/underscored variants; falls
        back to the original name when nothing resolves."""
        return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
               or (charset and self._codec(charset.replace("-", ""))) \
               or (charset and self._codec(charset.replace("-", "_"))) \
               or charset

    def _codec(self, charset):
        """Return *charset* if Python has a codec for it, else None."""
        if not charset: return charset
        codec = None
        try:
            codecs.lookup(charset)
            codec = charset
        except (LookupError, ValueError):
            pass
        return codec

    # Lazily-built 256-byte translation table shared by all instances.
    EBCDIC_TO_ASCII_MAP = None
    def _ebcdic_to_ascii(self, s):
        """Translate an EBCDIC byte string to its ASCII equivalent."""
        c = self.__class__
        if not c.EBCDIC_TO_ASCII_MAP:
            emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
                    16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
                    128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
                    144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
                    32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
                    38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
                    45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
                    186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
                    195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
                    201,202,106,107,108,109,110,111,112,113,114,203,204,205,
                    206,207,208,209,126,115,116,117,118,119,120,121,122,210,
                    211,212,213,214,215,216,217,218,219,220,221,222,223,224,
                    225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
                    73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
                    82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
                    90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
                    250,251,252,253,254,255)
            import string
            c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
                ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
        return s.translate(c.EBCDIC_TO_ASCII_MAP)

    # Windows-1252 "smart" characters (0x80-0x9f) mapped to
    # (HTML entity name, XML hex codepoint) pairs; bare-string values
    # are substituted literally by _subMSChar.
    MS_CHARS = { '\x80' : ('euro', '20AC'),
                 '\x81' : ' ',
                 '\x82' : ('sbquo', '201A'),
                 '\x83' : ('fnof', '192'),
                 '\x84' : ('bdquo', '201E'),
                 '\x85' : ('hellip', '2026'),
                 '\x86' : ('dagger', '2020'),
                 '\x87' : ('Dagger', '2021'),
                 '\x88' : ('circ', '2C6'),
                 '\x89' : ('permil', '2030'),
                 '\x8A' : ('Scaron', '160'),
                 '\x8B' : ('lsaquo', '2039'),
                 '\x8C' : ('OElig', '152'),
                 '\x8D' : '?',
                 '\x8E' : ('#x17D', '17D'),
                 '\x8F' : '?',
                 '\x90' : '?',
                 '\x91' : ('lsquo', '2018'),
                 '\x92' : ('rsquo', '2019'),
                 '\x93' : ('ldquo', '201C'),
                 '\x94' : ('rdquo', '201D'),
                 '\x95' : ('bull', '2022'),
                 '\x96' : ('ndash', '2013'),
                 '\x97' : ('mdash', '2014'),
                 '\x98' : ('tilde', '2DC'),
                 '\x99' : ('trade', '2122'),
                 '\x9a' : ('scaron', '161'),
                 '\x9b' : ('rsaquo', '203A'),
                 '\x9c' : ('oelig', '153'),
                 '\x9d' : '?',
                 '\x9e' : ('#x17E', '17E'),
                 # NOTE(review): the empty codepoint produces '&#x;' in
                 # xml mode -- matches upstream data, left as-is.
                 '\x9f' : ('Yuml', ''),}
+
+#######################################################################
+
+
#By default, act as an HTML pretty-printer.
# Reads an HTML document from stdin and writes the indented,
# prettified version to stdout (Python 2 print statement).
if __name__ == '__main__':
    import sys
    soup = BeautifulSoup(sys.stdin)
    print soup.prettify()
--- /dev/null
+import sys
+from rss_sqlite import Listing
+from xml import sax
+from cgi import escape
+from re import sub
+from htmlentitydefs import name2codepoint
+from gconf import client_get_default
+
+import logging
+logger = logging.getLogger(__name__)
+
def unescape(text):
    """Replace HTML character references and named entities in *text*
    with the characters they stand for.

    Handles decimal ("&#65;"), hexadecimal in either case ("&#x41;",
    "&#X41;") and named ("&amp;") references; anything unrecognized is
    left untouched.
    """
    # unichr only exists on Python 2; fall back to chr on Python 3.
    try:
        _chr = unichr
    except NameError:
        _chr = chr

    def fixup(m):
        ref = m.group(0)
        if ref[:2] == "&#":
            # numeric character reference
            try:
                # Bug fix: uppercase "&#X41;" was previously left
                # unconverted because only lowercase "&#x" matched.
                if ref[:3].lower() == "&#x":
                    return _chr(int(ref[3:-1], 16))
                return _chr(int(ref[2:-1]))
            except ValueError:
                pass
        else:
            # named entity
            try:
                return _chr(name2codepoint[ref[1:-1]])
            except KeyError:
                pass
        return ref  # leave as is
    return sub(r"&#?\w+;", fixup, text)
+
def sanitize(text):
    """HTML-escape *text* and force it to ASCII, replacing any
    non-ASCII characters with numeric character references.

    The redundant function-local `from cgi import escape` was removed;
    escape() is already imported at module level.
    """
    return escape(text).encode('ascii', 'xmlcharrefreplace')
+
class XmlHandler():
    """Generates the XML consumed by the FeedingIt QML/web frontend
    from a rss_sqlite.Listing.

    do_GET additionally uses BaseHTTPRequestHandler-style attributes
    (self.path, self.send_response, self.wfile, ...), so the class
    appears to be designed for mixing into an HTTP request handler.
    """

    def __init__(self, listing):
        self.listing = listing

    def getConfigXml(self):
        """Return the (currently hard-coded) frontend config document."""
        xml = "<?xml version=\"1.0\" encoding=\"utf-8\"?><xml>"
        xml += "<hideReadFeed>True</hideReadFeed>"
        xml += "<hideReadArticles>True</hideReadArticles>"
        xml += "</xml>"
        return xml

    def generateCategoryXml(self):
        """Return one <category> element per category in the listing."""
        xml = "<?xml version=\"1.0\" encoding=\"utf-8\"?><xml>"
        for cat in self.listing.getListOfCategories():
            xml += "<category>"
            xml += "<catname>%s</catname>" % sanitize(self.listing.getCategoryTitle(cat))
            xml += "<catid>%s</catid>" % cat
            xml += "</category>"
        xml += "</xml>"
        return xml

    def fix_title(self, title):
        """Strip presentation markup from a feed title and re-escape it."""
        return escape(unescape(title).replace("<em>","").replace("</em>","").replace("<nobr>","").replace("</nobr>","").replace("<wbr>","").replace("—","-"))

    def generateFeedsXml(self, catid):
        """Return one <feed> element per feed in category *catid*."""
        xml = "<?xml version=\"1.0\" encoding=\"utf-8\"?><xml>"
        for key in self.listing.getSortedListOfKeys("Manual", category=catid):
            xml += "<feed>"
            xml += "<feedname>%s</feedname>" % sanitize(self.listing.getFeedTitle(key))
            xml += "<feedid>%s</feedid>" % key
            xml += "<unread>%s</unread>" % self.listing.getFeedNumberOfUnreadItems(key)
            xml += "<updatedDate>%s</updatedDate>" % self.listing.getFeedUpdateTime(key)
            xml += "<icon>%s</icon>" % self.listing.getFavicon(key)
            # xml += "<updating>True</updating>"
            xml += "<updating>False</updating>"
            xml += "</feed>"
        xml += "</xml>"
        return xml

    def generateArticlesXml(self, key, onlyUnread, markAllAsRead="False"):
        """Return one <article> element per article of feed *key*.

        onlyUnread ("True"/"False" string) limits the list to unread
        entries. markAllAsRead="True" additionally marks every listed
        entry as read (new parameter: do_GET previously passed it, but
        the method did not accept it, raising TypeError).
        """
        feed = self.listing.getFeed(key)
        xml = "<?xml version=\"1.0\" encoding=\"utf-8\"?><xml>"
        if onlyUnread == "False":
            onlyUnread = False
        for id in feed.getIds(onlyUnread):
            xml += "<article>"
            xml += "<title>%s</title>" % self.fix_title(feed.getTitle(id))
            xml += "<articleid>%s</articleid>" % id
            xml += "<unread>%s</unread>" % str(feed.isEntryRead(id))
            xml += "<updatedDate>%s</updatedDate>" % feed.getDateStamp(id)
            xml += "<path>%s</path>" % feed.getContentLink(id)
            xml += "</article>"
            if markAllAsRead == "True":
                feed.setEntryRead(id)
        if markAllAsRead == "True":
            # Keep the aggregate unread counter consistent, same as the
            # "read" endpoint below does for a single article.
            self.listing.updateUnread(key)
        xml += "</xml>"
        return xml

    def do_GET(self):
        """Dispatch an HTTP GET request to the matching generator.

        Bug fixes: the "articles" branch called generateArticlesXml with
        three arguments (see above), and several branches referenced a
        bare global `listing` that is not defined in this module -- they
        now use self.listing.

        NOTE(review): `updatingFeeds`, self.getCommands,
        self.openTaskSwitch and self.updateAll are not defined in this
        file; confirm they are provided by the mixing-in class.
        """
        (req, sep, arg) = self.path.partition("?")
        request = req.split("/")
        arguments = {}
        if arg != "":
            args = arg.split("&")
            for arg in args:
                ele = arg.split("=")
                arguments[ele[0]] = ele[1]
        if request[1] == "categories":
            xml = self.generateCategoryXml()
        elif request[1] == "feeds":
            catid = request[2]
            xml = self.generateFeedsXml(catid)
        elif request[1] == "articles":
            key = request[2]
            onlyUnread = arguments.get("onlyUnread", "False")
            markAllAsRead = arguments.get("markAllAsRead", "False")
            xml = self.generateArticlesXml(key, onlyUnread, markAllAsRead)
        elif request[1] == "html":
            key = request[2]
            article = request[3]
            feed = self.listing.getFeed(key)
            try:
                file = open(feed.getContentLink(article))
                html = file.read().replace("body", "body bgcolor='#ffffff'", 1)
                file.close()
            except:
                html = "<html><body>Error retrieving article</body></html>"
            self.send_response(200)
            self.send_header("Content-type", "text/html")
            self.end_headers()
            self.wfile.write(html)
            #listing.updateUnread(key)
            return
        elif request[1] == "isUpdating":
            xml = "<xml>"
            key = request[2]
            if (key in updatingFeeds) or ((key == "") and (len(updatingFeeds) > 0)):
                xml += "<updating>True</updating>"
            else:
                xml += "<updating>False</updating>"
            xml += self.getCommands()
            xml += "</xml>"
        elif request[1] == "read":
            key = request[2]
            article = request[3]
            feed = self.listing.getFeed(key)
            feed.setEntryRead(article)
            self.listing.updateUnread(key)
            self.send_response(200)
            self.send_header("Content-type", "text/html")
            self.end_headers()
            self.wfile.write("OK")
            return
        elif request[1] == "config":
            xml = self.getConfigXml()
        elif request[1] == "home":
            file = open(self.path)
            self.send_response(200)
            self.send_header("Content-type", "text/html")
            self.end_headers()
            self.wfile.write(file.read())
            file.close()
            return
        elif request[1] == "task":
            self.openTaskSwitch()
            xml = "<xml>OK</xml>"
        elif request[1] == "deleteCat":
            key = request[2]
            self.listing.removeCategory(key)
            xml = "<xml>OK</xml>"
        elif request[1] == "deleteFeed":
            key = request[3]
            self.listing.removeFeed(key)
            xml = "<xml>OK</xml>"
        elif request[1] == "addFeed":
            cat = request[2]
            name = request[3]
            url = arguments.get("url", "")
            self.listing.addFeed(name, url, category=cat)
            xml = "<xml>OK</xml>"
        elif request[1] == "updateFeed":
            key = request[2]
            self.listing.updateFeed(key, priority=-1)
            #download = Download(listing, [key,])
            #download.start()
            xml = "<xml>OK</xml>"
        elif request[1] == "updateAll":
            #app.automaticUpdate()
            self.updateAll()
            xml = "<xml>OK</xml>"
        elif request[1] == "addCat":
            catName = request[2]
            self.listing.addCategory(catName)
            xml = "<xml>OK</xml>"
        else:
            self.send_error(404, "File not found")
            return
        self.send_response(200)
        self.send_header("Content-type", "text/xml")
        self.end_headers()
        self.wfile.write(xml.encode("utf-8"))
--- /dev/null
+#!/usr/bin/env python2.5
+
+#
+# Copyright (c) 2007-2008 INdT.
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+# ============================================================================
+# Name : FeedingIt.py
+# Author : Yves Marcoz
+# Version : 0.6.1
+# Description : Simple RSS Reader
+# ============================================================================
+#try:
+# import gtk
+# import hildon
+# from gobject import idle_add
+#except:
+# pass
+
+from ConfigParser import RawConfigParser
+from gconf import client_get_default
+from urllib2 import ProxyHandler
+from mainthread import mainthread
+import logging
+logger = logging.getLogger(__name__)
+
# Version string interpolated into the known-issues URL.
VERSION = "52"

# INI section name used by Config.loadConfig/saveConfig.
section = "FeedingIt"
# Allowed values for each enumerated setting shown in the settings dialog.
ranges = { "updateInterval":[0.5, 1, 2, 4, 12, 24], "expiry":[24, 48, 72, 144, 288], "fontSize":range(12,24), "orientation":["Automatic", "Landscape", "Portrait"], "artFontSize":[10, 12, 14, 16, 18, 20], "feedsort":["Manual", "Most unread", "Least unread", "Most recent", "Least recent"] }
# Picker-button titles for each setting.
titles = {"updateInterval":"Auto-update interval", "expiry":"Delete articles", "fontSize":"List font size", "orientation":"Display orientation", "artFontSize":"Article font size","feedsort":"Feed sort order"}
# Subtitle templates; the current value is %-interpolated in.
subtitles = {"updateInterval":"Every %s hours", "expiry":"After %s hours", "fontSize":"%s pixels", "orientation":"%s", "artFontSize":"%s pixels", "feedsort":"%s"}
+
class Config():
    """Settings model plus Hildon settings dialog for FeedingIt.

    Holds all settings in the self.config dict, persisted to an INI
    file via RawConfigParser, and exposes typed getters for the rest
    of the application. GUI toolkit imports (gtk/hildon/gobject) are
    deferred to the methods that need them so the class can be used
    headless."""

    def __init__(self, parent, configFilename):
        self.configFilename = configFilename
        self.parent = parent
        # Load config
        self.loadConfig()

        # Backup current settings for later restore
        self.config_backup = dict(self.config)
        self.do_restore_backup = True

    def on_save_button_clicked(self, button):
        """'Save' pressed: keep the edited settings (skip the restore
        onExit would otherwise perform) and close the dialog."""
        self.do_restore_backup = False
        self.window.destroy()

    def createDialog(self):
        """Build and show the Hildon settings dialog; returns it."""
        import gtk
        import hildon
        from gobject import idle_add
        self.window = gtk.Dialog("Settings", self.parent)
        self.window.set_geometry_hints(min_height=600)

        save_button = self.window.add_button(gtk.STOCK_SAVE, gtk.RESPONSE_OK)
        save_button.connect('clicked', self.on_save_button_clicked)
        #self.window.set_default_size(-1, 600)
        panArea = hildon.PannableArea()

        vbox = gtk.VBox(False, 2)
        self.buttons = {}

        def heading(text):
            # Section header: small spacer followed by a framed label.
            l = gtk.Label()
            l.set_size_request(-1, 6)
            vbox.pack_start(l, expand=False)
            vbox.pack_start(gtk.Frame(text), expand=False)

        def add_setting(setting):
            # One picker button per enumerated setting (see `ranges`).
            picker = hildon.PickerButton(gtk.HILDON_SIZE_FINGER_HEIGHT, hildon.BUTTON_ARRANGEMENT_VERTICAL)
            selector = self.create_selector(ranges[setting], setting)
            picker.set_selector(selector)
            picker.set_title(titles[setting])
            picker.set_text(titles[setting], subtitles[setting] % self.config[setting])
            picker.set_name('HildonButton-finger')
            picker.set_alignment(0,0,1,1)
            self.buttons[setting] = picker
            vbox.pack_start(picker, expand=False)

        button = hildon.Button(gtk.HILDON_SIZE_FINGER_HEIGHT, hildon.BUTTON_ARRANGEMENT_VERTICAL)
        button.set_label("View Known Issues and Tips")
        button.connect("clicked", self.button_tips_clicked)
        button.set_alignment(0,0,1,1)
        vbox.pack_start(button, expand=False)

        heading('Display')
        add_setting('fontSize')
        add_setting('artFontSize')
        add_setting('orientation')
        add_setting('feedsort')
        button = hildon.CheckButton(gtk.HILDON_SIZE_FINGER_HEIGHT)
        button.set_label("Hide read feeds")
        button.set_active(self.config["hidereadfeeds"])
        button.connect("toggled", self.button_toggled, "hidereadfeeds")
        vbox.pack_start(button, expand=False)

        button = hildon.CheckButton(gtk.HILDON_SIZE_FINGER_HEIGHT)
        button.set_label("Hide read articles")
        button.set_active(self.config["hidereadarticles"])
        button.connect("toggled", self.button_toggled, "hidereadarticles")
        vbox.pack_start(button, expand=False)


        heading('Updating')
        button = hildon.CheckButton(gtk.HILDON_SIZE_FINGER_HEIGHT)
        button.set_label("Automatically update feeds")
        button.set_active(self.config["autoupdate"])
        button.connect("toggled", self.button_toggled, "autoupdate")
        vbox.pack_start(button, expand=False)
        add_setting('updateInterval')
        add_setting('expiry')

        heading('Network')
        button = hildon.CheckButton(gtk.HILDON_SIZE_FINGER_HEIGHT)
        button.set_label('Cache images')
        button.set_active(self.config["imageCache"])
        button.connect("toggled", self.button_toggled, "imageCache")
        vbox.pack_start(button, expand=False)

        button = hildon.CheckButton(gtk.HILDON_SIZE_FINGER_HEIGHT)
        button.set_label("Use HTTP proxy")
        button.set_active(self.config["proxy"])
        button.connect("toggled", self.button_toggled, "proxy")
        vbox.pack_start(button, expand=False)

        button = hildon.CheckButton(gtk.HILDON_SIZE_FINGER_HEIGHT)
        button.set_label('Open links in external browser')
        button.set_active(self.config["extBrowser"])
        button.connect("toggled", self.button_toggled, "extBrowser")
        vbox.pack_start(button, expand=False)

        panArea.add_with_viewport(vbox)

        self.window.vbox.add(panArea)
        self.window.connect("destroy", self.onExit)
        #self.window.add(self.vbox)
        self.window.set_default_size(-1, 600)
        self.window.show_all()
        return self.window

    def button_tips_clicked(self, *widget):
        """Open the known-issues page in the system browser via D-Bus."""
        import dbus
        bus = dbus.SessionBus()
        proxy = bus.get_object("com.nokia.osso_browser", "/com/nokia/osso_browser/request")
        iface = dbus.Interface(proxy, 'com.nokia.osso_browser')
        iface.open_new_window("http://feedingit.marcoz.org/news/?page_id=%s" % VERSION)

    def onExit(self, *widget):
        """Dialog closed: restore the pre-dialog settings unless Save
        was pressed, then persist and tear down the window."""
        # When the dialog is closed without hitting
        # the "Save" button, restore the configuration
        if self.do_restore_backup:
            logger.debug('Restoring configuration')
            self.config = self.config_backup

        self.saveConfig()
        self.window.destroy()

    def button_toggled(self, widget, configName):
        """Mirror a CheckButton's state into self.config and persist."""
        #print "widget", widget.get_active()
        if (widget.get_active()):
            self.config[configName] = True
        else:
            self.config[configName] = False
        #print "autoup", self.autoupdate
        self.saveConfig()

    def selection_changed(self, selector, button, setting):
        """TouchSelector changed: store the new value, refresh the
        picker-button label on idle, and persist."""
        from gobject import idle_add
        current_selection = selector.get_current_text()
        if current_selection:
            self.config[setting] = current_selection
            idle_add(self.updateButton, setting)
            self.saveConfig()

    def updateButton(self, setting):
        # Refresh a picker button's subtitle with the current value.
        self.buttons[setting].set_text(titles[setting], subtitles[setting] % self.config[setting])

    def loadConfig(self):
        """Populate self.config from the INI file; any failure in a
        group falls back to that group's built-in defaults."""
        self.config = {}
        try:
            configParser = RawConfigParser()
            configParser.read(self.configFilename)
            self.config["fontSize"] = configParser.getint(section, "fontSize")
            self.config["artFontSize"] = configParser.getint(section, "artFontSize")
            self.config["expiry"] = configParser.getint(section, "expiry")
            self.config["autoupdate"] = configParser.getboolean(section, "autoupdate")
            self.config["updateInterval"] = configParser.getfloat(section, "updateInterval")
            self.config["orientation"] = configParser.get(section, "orientation")
            self.config["imageCache"] = configParser.getboolean(section, "imageCache")
        except:
            # NOTE(review): bare except (deliberate best-effort fallback
            # to defaults); later try blocks reuse configParser bound in
            # the first try.
            self.config["fontSize"] = 17
            self.config["artFontSize"] = 14
            self.config["expiry"] = 24
            self.config["autoupdate"] = False
            self.config["updateInterval"] = 4
            self.config["orientation"] = "Automatic"
            self.config["imageCache"] = False
        try:
            self.config["proxy"] = configParser.getboolean(section, "proxy")
        except:
            self.config["proxy"] = True
        try:
            self.config["hidereadfeeds"] = configParser.getboolean(section, "hidereadfeeds")
            self.config["hidereadarticles"] = configParser.getboolean(section, "hidereadarticles")
        except:
            self.config["hidereadfeeds"] = False
            self.config["hidereadarticles"] = False
        try:
            self.config["extBrowser"] = configParser.getboolean(section, "extBrowser")
        except:
            self.config["extBrowser"] = False
        try:
            self.config["feedsort"] = configParser.get(section, "feedsort")
        except:
            self.config["feedsort"] = "Manual"

    def saveConfig(self):
        """Serialize self.config back to the INI file."""
        configParser = RawConfigParser()
        configParser.add_section(section)
        configParser.set(section, 'fontSize', str(self.config["fontSize"]))
        configParser.set(section, 'artFontSize', str(self.config["artFontSize"]))
        configParser.set(section, 'expiry', str(self.config["expiry"]))
        configParser.set(section, 'autoupdate', str(self.config["autoupdate"]))
        configParser.set(section, 'updateInterval', str(self.config["updateInterval"]))
        configParser.set(section, 'orientation', str(self.config["orientation"]))
        configParser.set(section, 'imageCache', str(self.config["imageCache"]))
        configParser.set(section, 'proxy', str(self.config["proxy"]))
        configParser.set(section, 'hidereadfeeds', str(self.config["hidereadfeeds"]))
        configParser.set(section, 'hidereadarticles', str(self.config["hidereadarticles"]))
        configParser.set(section, 'extBrowser', str(self.config["extBrowser"]))
        configParser.set(section, 'feedsort', str(self.config["feedsort"]))

        # Writing our configuration file
        file = open(self.configFilename, 'wb')
        configParser.write(file)
        file.close()

    def create_selector(self, choices, setting):
        """Build a TouchSelector listing *choices*, pre-selecting the
        current value of *setting*."""
        import gtk
        import hildon
        from gobject import idle_add
        #self.pickerDialog = hildon.PickerDialog(self.parent)
        selector = hildon.TouchSelector(text=True)
        index = 0
        for item in choices:
            iter = selector.append_text(str(item))
            if str(self.config[setting]) == str(item):
                selector.set_active(0, index)
            index += 1
        selector.connect("changed", self.selection_changed, setting)
        #self.pickerDialog.set_selector(selector)
        return selector
        #self.pickerDialog.show_all()

    # --- Typed accessors used throughout the application -------------
    def getFontSize(self):
        return self.config["fontSize"]
    def getArtFontSize(self):
        return self.config["artFontSize"]
    def getExpiry(self):
        return self.config["expiry"]
    def isAutoUpdateEnabled(self):
        return self.config["autoupdate"]
    def getUpdateInterval(self):
        return float(self.config["updateInterval"])
    def getReadFont(self):
        return "sans italic %s" % self.config["fontSize"]
    def getUnreadFont(self):
        return "sans %s" % self.config["fontSize"]
    def getOrientation(self):
        # Index into ranges["orientation"], not the string itself.
        return ranges["orientation"].index(self.config["orientation"])
    def getImageCache(self):
        return self.config["imageCache"]
    @mainthread
    def getProxy(self):
        """Return (use_proxy, ProxyHandler-or-None), consulting the
        system-wide GConf proxy settings; runs on the main thread
        because GConf access is not thread-safe here."""
        if self.config["proxy"] == False:
            return (False, None)
        if client_get_default().get_bool('/system/http_proxy/use_http_proxy'):
            port = client_get_default().get_int('/system/http_proxy/port')
            http = client_get_default().get_string('/system/http_proxy/host')
            proxy = ProxyHandler( {"http":"http://%s:%s/"% (http,port)} )
            return (True, proxy)
        return (False, None)
    def getHideReadFeeds(self):
        return self.config["hidereadfeeds"]
    def getHideReadArticles(self):
        return self.config["hidereadarticles"]
    def getOpenInExternalBrowser(self):
        return self.config["extBrowser"]
    def getFeedSortOrder(self):
        return self.config["feedsort"]
--- /dev/null
+# Copyright (c) 2011 Neal H. Walfield
+#
+# This software is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This software is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+from __future__ import with_statement
+import os
+import logging
+import itertools
+import sys
+import string
+import traceback
+import time
+import errno
+import glob
+
# Module-level state populated by init(): the logger used by
# my_excepthook, and the previous sys.excepthook we chain to.
logger = None
original_excepthook = None
+
def my_excepthook(exctype, value, tb):
    """Log uncaught exceptions, then delegate to the previous hook.

    Installed as sys.excepthook by init(); relies on the module globals
    `logger` and `original_excepthook` that init() sets up.
    """
    # Lazy %-style arguments: logging formats the message only if the
    # record is actually emitted (was an eager "%"-format before).
    logger.error(
        "Uncaught exception: %s",
        ''.join(traceback.format_exception(exctype, value, tb)))
    original_excepthook(exctype, value, tb)
+
+def init(dot_directory, debug=False, max_logfiles=1, program_name=None):
+ if not os.path.isabs(dot_directory):
+ dot_directory = os.path.join(os.path.expanduser("~"), dot_directory)
+
+ logging_directory = os.path.join(dot_directory, "logging")
+ try:
+ os.makedirs(logging_directory)
+ except OSError, e:
+ if e.errno != errno.EEXIST:
+ raise
+
+ if program_name is None:
+ program_name = os.path.basename(sys.argv[0])
+ string.translate(program_name, string.maketrans(' .', '__'))
+
+ timestamp = time.strftime("%Y%m%d")
+
+ logfiles = glob.glob(os.path.join(logging_directory,
+ program_name + '-*.log'))
+ if len(logfiles) >= max_logfiles:
+ logfiles.sort()
+ for f in logfiles[:-(max_logfiles+1)]:
+ print "Purging old log file %s" % (f,)
+ try:
+ os.remove(f)
+ except OSError, e:
+ print "Removing %s: %s" % (f, str(e))
+
+ logfile = os.path.join(logging_directory,
+ program_name + '-' + timestamp + '.log')
+
+ print "Sending output to %s" % logfile
+
+ global logger
+ logger = logging.getLogger(__name__)
+
+ if debug:
+ level = logging.DEBUG
+ else:
+ level = logging.INFO
+
+ logging.basicConfig(
+ level=level,
+ format=('%(asctime)s (pid: ' + str(os.getpid()) + ') '
+ + '%(levelname)-8s %(message)s'),
+ filename=logfile,
+ filemode='a')
+
+ # Log uncaught exceptions.
+ global original_excepthook
+ original_excepthook = sys.excepthook
+ sys.excepthook = my_excepthook
+
+ def redirect(thing):
+ filename = os.path.join(logging_directory, program_name + '.' + thing)
+ try:
+ with open(filename, "r") as fhandle:
+ contents = fhandle.read()
+ except IOError, e:
+ if e.errno in (errno.ENOENT,):
+ fhandle = None
+ contents = ""
+ else:
+ logging.error("Reading %s: %s" % (filename, str(e)))
+ raise
+
+ logging.error("std%s of last run: %s" % (thing, contents))
+
+ if fhandle is not None:
+ os.remove(filename)
+
+ print "Redirecting std%s to %s" % (thing, filename)
+ return open(filename, "w", 0)
+
+ sys.stderr = redirect('err')
+ sys.stdout = redirect('out')
+
--- /dev/null
# Bug fix: "import Thread" imported a (non-existent) module named
# Thread; the Thread class lives in the threading module.
from threading import Thread


class Download(Thread):
    """Background thread that updates a single feed, honoring the
    configured proxy and image-cache settings."""

    def __init__(self, listing, key, config):
        Thread.__init__(self)
        self.listing = listing
        self.key = key
        self.config = config

    def run(self):
        (use_proxy, proxy) = self.config.getProxy()
        # NOTE(review): get_lock is neither defined nor imported in this
        # module; run() raises NameError unless the importing
        # environment provides it -- confirm where it comes from.
        key_lock = get_lock(self.key)
        if key_lock is not None:
            if use_proxy:
                self.listing.updateFeed(self.key, self.config.getExpiry(), proxy=proxy, imageCache=self.config.getImageCache())
            else:
                self.listing.updateFeed(self.key, self.config.getExpiry(), imageCache=self.config.getImageCache())
        # Drop the reference so the lock is released promptly.
        del key_lock
\ No newline at end of file
--- /dev/null
+#!/usr/bin/python
+
+import sys
+
+from PySide import QtGui
+from PySide import QtDeclarative
+import os
+from os import mkdir, remove, stat, environ
+from os.path import isfile, isdir, exists
+
+# Comment the line below if you don't want to use OpenGL for QML rendering or if it is not supported
+from PySide import QtOpenGL, QtCore
+
+from rss_sqlite import Listing
+CONFIGDIR = environ.get("HOME", "/home/user") + "/.feedingit"
+#CONFIGDIR = "/home/user/.feedingit"
+
+import logging
+#logger = logging.getLogger(__name__)
+
+import debugging
+debugging.init(dot_directory=".feedingit", program_name="feedingit-pyside")
+
+from cgi import escape
+from re import sub
+
class FeedWrapper(QtCore.QObject):
    """QObject exposing one feed (identified by its key in the
    module-global `listing`) as QML-consumable properties."""

    def __init__(self, key):
        QtCore.QObject.__init__(self)
        self._key = key
    def _name(self):
        return listing.getFeedTitle(self._key)
    def _unread(self):
        return listing.getFeedNumberOfUnreadItems(self._key)
    def _updatedDate(self):
        return listing.getFeedUpdateTime(self._key)
    def _icon(self):
        return listing.getFavicon(self._key)
    def _feedid(self):
        return self._key
    def _updating(self):
        # Bug fix: was `return false`, a NameError at runtime.  Update
        # state is not tracked here, so always report False.
        return False

    changed = QtCore.Signal()

    title = QtCore.Property(unicode, _name, notify=changed)
    feedid = QtCore.Property(unicode, _feedid, notify=changed)
    unread = QtCore.Property(unicode, _unread, notify=changed)
    updatedDate= QtCore.Property(unicode, _updatedDate, notify=changed)
    icon = QtCore.Property(unicode, _icon, notify=changed)
    # Bug fix: `updating` was wired to _icon (copy/paste); it now
    # reports _updating as the name promises.
    updating = QtCore.Property(unicode, _updating, notify=changed)
+
class FeedsModel(QtCore.QAbstractListModel):
    """List model exposing the feeds of one category to QML.

    Each row is published under the single role 'feed' as a FeedWrapper.
    """

    COLUMNS = ('feed', )
    # Category whose feeds are listed. Left at None here and never assigned
    # in this class -- presumably set elsewhere before use; TODO confirm.
    _category = None

    def __init__(self):
        QtCore.QAbstractListModel.__init__(self)
        self._feeds = listing.getListOfFeeds(self._category)
        # Role ids are the COLUMNS indices, i.e. {0: 'feed'}.
        self.setRoleNames(dict(enumerate(FeedsModel.COLUMNS)))

    def rowCount(self, parent=QtCore.QModelIndex()):
        return len(self._feeds)

    def data(self, index, role):
        if index.isValid() and role == FeedsModel.COLUMNS.index('feed'):
            print self._feeds[index.row()]  # debug output
            # A fresh wrapper is built per request with no QObject parent;
            # NOTE(review): Python may garbage-collect it while QML still
            # holds a reference -- confirm ownership.
            return FeedWrapper(self._feeds[index.row()])
        return None
+
class ArticleWrapper(QtCore.QObject):
    """QObject wrapper exposing one article of a feed to QML."""

    def __init__(self, feed, articleid):
        QtCore.QObject.__init__(self)
        self._feed = feed            # Feed object owning the article
        self._articleid = articleid  # id of the article within the feed

    def _name(self):
        # NOTE(review): fix_title is defined neither on this class nor
        # anywhere visible in this file -- calling this getter would raise
        # AttributeError; confirm where fix_title is meant to come from.
        return self.fix_title(self._feed.getTitle(self._articleid))

    def _unread(self):
        # Stringified so it matches the unicode-typed property below.
        return str(self._feed.isEntryRead(self._articleid))

    def _getarticleid(self):
        return self._articleid

    def _updatedDate(self):
        return self._feed.getDateStamp(self._articleid)

    def _path(self):
        # Link to the stored article content.
        return self._feed.getContentLink(self._articleid)

    changed = QtCore.Signal()

    title = QtCore.Property(unicode, _name, notify=changed)
    articleid = QtCore.Property(unicode, _getarticleid, notify=changed)
    unread = QtCore.Property(unicode, _unread, notify=changed)
    updatedDate = QtCore.Property(unicode, _updatedDate, notify=changed)
    path = QtCore.Property(unicode, _path, notify=changed)
+
+class ArticlesModel(QtCore.QAbstractListModel):
+ COLUMNS = ('article', )
+ _articles = []
+ _key = None
+ _feed = None
+
+ def __init__(self,):
+ QtCore.QAbstractListModel.__init__(self)
+ self.setRoleNames(dict(enumerate(ArticlesModel.COLUMNS)))
+
+ def updateModel(self, key):
+ self._key = key
+ self._feed = listing.getFeed(self._key)
+ self._articles = self._feed.getIds()
+
+ def rowCount(self, parent=QtCore.QModelIndex()):
+ print "art " + str(len(self._articles))
+ return len(self._articles)
+
+ def data(self, index, role):
+ print "data" + str(index) + " " + str(role)
+ if index.isValid() and role == ArticlesModel.COLUMNS.index('article'):
+ return ArticleWrapper(self._articles[index.row()])
+ return None
+
+class Controller(QtCore.QObject):
+
+ def __init__(self, listing):
+ QtCore.QObject.__init__(self)
+ from XmlHandler import XmlHandler
+ self._handler = XmlHandler(listing)
+
+ @QtCore.Slot(str,str, result=str)
+ def getArticle(self, key, article):
+ feed = listing.getFeed(key)
+ try:
+ file = open(feed.getContentLink(article))
+ html = file.read().replace("body", "body bgcolor='#ffffff'", 1)
+ file.close()
+ except:
+ html = "<html><body>Error retrieving article</body></html>"
+ return html
+
+ @QtCore.Slot(str, result=str)
+ def getFeedsXml(self, catid):
+ return self._handler.generateFeedsXml(catid)
+
+ @QtCore.Slot(str,result=str)
+ def getArticlesXml(self, key):
+ #onlyUnread = arguments.get("onlyUnread","False")
+ return self._handler.generateArticlesXml(key, "False")
+
+ @QtCore.Slot(result=str)
+ def getCategoryXml(self):
+ return self._handler.generateCategoryXml()
+
+ @QtCore.Slot(QtCore.QObject)
+ def feedClicked(self, wrapper):
+ #print 'User clicked on:', wrapper._key
+ #articlesModel.updateModel(wrapper._key)
+ pass
+
+ @QtCore.Slot(str)
+ def updateFeed(self, key):
+ print 'updating feed ', key
+ listing.updateFeed(key)
+
+ @QtCore.Slot()
+ def updateAll(self):
+ for feed in listing.getListOfFeeds("Manual"):
+ listing.updateFeed(feed)
+
+ @QtCore.Slot(str,str,str)
+ def addFeed(self, title, url, catid):
+ listing.addFeed(title,url, category=catid)
+
+ @QtCore.Slot(str)
+ def addCategory(self, name):
+ listing.addCategory(name)
+
+ @QtCore.Slot(str)
+ def markAllAsRead(self, key):
+ feed = listing.getFeed(key)
+ feed.markAllAsRead()
+
+ @QtCore.Slot(str, str)
+ def setEntryRead(self, key, articleid):
+ feed = listing.getFeed(key)
+ feed.setEntryRead(articleid)
+ listing.updateUnread(key)
+
+ @QtCore.Slot(str, result=str)
+ def getConfig(self, item):
+ if (item == "hideReadFeed"):
+ return "True"
+ if (item == "hideReadArticles"):
+ return "False"
+ return ""
+
def main():
    """Create the config and listing backends, then launch the QML UI."""

    if not isdir(CONFIGDIR):
        try:
            mkdir(CONFIGDIR)
        except OSError:
            # BUG FIX: this branch referenced an undefined name `logger`
            # (its module-level definition is commented out); log through
            # the logging module and exit via the already-imported sys.
            logging.error("Error: Can't create configuration directory")
            sys.exit(1)

    from config import Config
    global config
    # BUG FIX: CONFIGDIR carries no trailing slash, so CONFIGDIR+"config.ini"
    # produced "~/.feedingitconfig.ini", OUTSIDE the directory created above;
    # join the path components instead.
    config = Config(None, os.path.join(CONFIGDIR, "config.ini"))

    global listing
    # NOTE(review): Listing receives CONFIGDIR without a trailing slash --
    # confirm rss_sqlite joins paths rather than concatenating strings.
    listing = Listing(config, CONFIGDIR)

    import mainthread
    mainthread.init()

    from jobmanager import JobManager
    JobManager(True)

    app = QtGui.QApplication(sys.argv)
    view = QtDeclarative.QDeclarativeView()

    global articlesModel
    feedsModel = FeedsModel()
    articlesModel = ArticlesModel()

    controller = Controller(listing)

    rc = view.rootContext()
    rc.setContextProperty('controller', controller)
    rc.setContextProperty('feedsModel', feedsModel)
    rc.setContextProperty('articlesModel', articlesModel)

    # Comment the two lines below if you don't want to use OpenGL for QML
    # rendering or if it is not supported.
    glw = QtOpenGL.QGLWidget()
    view.setViewport(glw)

    # Prefer the system-installed QML tree; fall back to the local one.
    if os.path.exists('/usr/share/feedingit/qml'):
        view.setSource('/usr/share/feedingit/qml/main.qml')
    else:
        #view.setSource(os.path.join('qml','main.qml'))
        view.setSource(os.path.join('qml', 'FeedingIt.qml'))

    #view.showFullScreen()
    view.show()
    sys.exit(app.exec_())
+
+if __name__ == "__main__":
+
+ main()
--- /dev/null
+#!/usr/bin/env python
+"""Universal feed parser
+
+Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds
+
+Visit http://feedparser.org/ for the latest version
+Visit http://feedparser.org/docs/ for the latest documentation
+
+Required: Python 2.4 or later
+Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
+"""
+
+__version__ = "5.0.1"
+__license__ = """Copyright (c) 2002-2008, Mark Pilgrim, All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE."""
+__author__ = "Mark Pilgrim <http://diveintomark.org/>"
+__contributors__ = ["Jason Diamond <http://injektilo.org/>",
+ "John Beimler <http://john.beimler.org/>",
+ "Fazal Majid <http://www.majid.info/mylos/weblog/>",
+ "Aaron Swartz <http://aaronsw.com/>",
+ "Kevin Marks <http://epeus.blogspot.com/>",
+ "Sam Ruby <http://intertwingly.net/>",
+ "Ade Oshineye <http://blog.oshineye.com/>",
+ "Martin Pool <http://sourcefrog.net/>",
+ "Kurt McKee <http://kurtmckee.org/>"]
+
+# HTTP "User-Agent" header to send to servers when downloading feeds.
+# If you are embedding feedparser in a larger application, you should
+# change this to your application name and URL.
+USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % __version__
+
+# HTTP "Accept" header to send to servers when downloading feeds. If you don't
+# want to send an Accept header, set this to None.
+ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"
+
+# List of preferred XML parsers, by SAX driver name. These will be tried first,
+# but if they're not installed, Python will keep searching through its own list
+# of pre-installed parsers until it finds one that supports everything we need.
+PREFERRED_XML_PARSERS = ["drv_libxml2"]
+
+# If you want feedparser to automatically run HTML markup through HTML Tidy, set
+# this to 1. Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
+# or utidylib <http://utidylib.berlios.de/>.
+TIDY_MARKUP = 0
+
+# List of Python interfaces for HTML Tidy, in order of preference. Only useful
+# if TIDY_MARKUP = 1
+PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"]
+
+# If you want feedparser to automatically resolve all relative URIs, set this
+# to 1.
+RESOLVE_RELATIVE_URIS = 1
+
+# If you want feedparser to automatically sanitize all potentially unsafe
+# HTML content, set this to 1.
+SANITIZE_HTML = 1
+
+# ---------- Python 3 modules (make it work if possible) ----------
+try:
+ import rfc822
+except ImportError:
+ from email import _parseaddr as rfc822
+
+try:
+ # Python 3.1 introduces bytes.maketrans and simultaneously
+ # deprecates string.maketrans; use bytes.maketrans if possible
+ _maketrans = bytes.maketrans
+except (NameError, AttributeError):
+ import string
+ _maketrans = string.maketrans
+
+# base64 support for Atom feeds that contain embedded binary data
+try:
+ import base64, binascii
+except ImportError:
+ base64 = binascii = None
+else:
+ # Python 3.1 deprecates decodestring in favor of decodebytes
+ _base64decode = getattr(base64, 'decodebytes', base64.decodestring)
+
+def _s2bytes(s):
+ # Convert a UTF-8 str to bytes if the interpreter is Python 3
+ try:
+ return bytes(s, 'utf8')
+ except (NameError, TypeError):
+ # In Python 2.5 and below, bytes doesn't exist (NameError)
+ # In Python 2.6 and above, bytes and str are the same (TypeError)
+ return s
+
+def _l2bytes(l):
+ # Convert a list of ints to bytes if the interpreter is Python 3
+ try:
+ if bytes is not str:
+ # In Python 2.6 and above, this call won't raise an exception
+ # but it will return bytes([65]) as '[65]' instead of 'A'
+ return bytes(l)
+ raise NameError
+ except NameError:
+ return ''.join(map(chr, l))
+
+# If you want feedparser to allow all URL schemes, set this to ()
+# List culled from Python's urlparse documentation at:
+# http://docs.python.org/library/urlparse.html
+# as well as from "URI scheme" at Wikipedia:
+# https://secure.wikimedia.org/wikipedia/en/wiki/URI_scheme
+# Many more will likely need to be added!
+ACCEPTABLE_URI_SCHEMES = (
+ 'file', 'ftp', 'gopher', 'h323', 'hdl', 'http', 'https', 'imap', 'mailto',
+ 'mms', 'news', 'nntp', 'prospero', 'rsync', 'rtsp', 'rtspu', 'sftp',
+ 'shttp', 'sip', 'sips', 'snews', 'svn', 'svn+ssh', 'telnet', 'wais',
+ # Additional common-but-unofficial schemes
+ 'aim', 'callto', 'cvs', 'facetime', 'feed', 'git', 'gtalk', 'irc', 'ircs',
+ 'irc6', 'itms', 'mms', 'msnim', 'skype', 'ssh', 'smb', 'svn', 'ymsg',
+)
+#ACCEPTABLE_URI_SCHEMES = ()
+
+# ---------- required modules (should come with any Python distribution) ----------
+import cgi
+import copy
+import datetime
+import re
+import struct
+import sys
+import time
+import types
+import urllib
+import urllib2
+import urlparse
+
+from htmlentitydefs import name2codepoint, codepoint2name, entitydefs
+
+try:
+ from io import BytesIO as _StringIO
+except ImportError:
+ try:
+ from cStringIO import StringIO as _StringIO
+ except ImportError:
+ from StringIO import StringIO as _StringIO
+
+# ---------- optional modules (feedparser will work without these, but with reduced functionality) ----------
+
+# gzip is included with most Python distributions, but may not be available if you compiled your own
+try:
+ import gzip
+except ImportError:
+ gzip = None
+try:
+ import zlib
+except ImportError:
+ zlib = None
+
+# If a real XML parser is available, feedparser will attempt to use it. feedparser has
+# been tested with the built-in SAX parser, PyXML, and libxml2. On platforms where the
+# Python distribution does not come with an XML parser (such as Mac OS X 10.2 and some
+# versions of FreeBSD), feedparser will quietly fall back on regex-based parsing.
try:
    import xml.sax
    from xml.sax.saxutils import escape as _xmlescape
except ImportError:
    _XML_AVAILABLE = 0
    def _xmlescape(data, entities={}):
        # Minimal fallback for xml.sax.saxutils.escape.
        # BUG FIX: the replacement strings had been de-entitized in this
        # copy ('&' was replaced by '&', etc., making the calls no-ops);
        # restore the XML entity escapes.
        data = data.replace('&', '&amp;')
        data = data.replace('>', '&gt;')
        data = data.replace('<', '&lt;')
        # NOTE(review): iterating `entities` directly yields dict keys, not
        # (char, entity) pairs -- confirm callers pass pair iterables.
        for char, entity in entities:
            data = data.replace(char, entity)
        return data
else:
    try:
        xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers
    except xml.sax.SAXReaderNotAvailable:
        _XML_AVAILABLE = 0
    else:
        _XML_AVAILABLE = 1
+
+# sgmllib is not available by default in Python 3; if the end user doesn't have
+# it available then we'll lose illformed XML parsing, content santizing, and
+# microformat support (at least while feedparser depends on BeautifulSoup).
+try:
+ import sgmllib
+except ImportError:
+ # This is probably Python 3, which doesn't include sgmllib anymore
+ _SGML_AVAILABLE = 0
+
+ # Mock sgmllib enough to allow subclassing later on
+ class sgmllib(object):
+ class SGMLParser(object):
+ def goahead(self, i):
+ pass
+ def parse_starttag(self, i):
+ pass
+else:
+ _SGML_AVAILABLE = 1
+
+ # sgmllib defines a number of module-level regular expressions that are
+ # insufficient for the XML parsing feedparser needs. Rather than modify
+ # the variables directly in sgmllib, they're defined here using the same
+ # names, and the compiled code objects of several sgmllib.SGMLParser
+ # methods are copied into _BaseHTMLProcessor so that they execute in
+ # feedparser's scope instead of sgmllib's scope.
+ charref = re.compile('&#(\d+|[xX][0-9a-fA-F]+);')
+ tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
+ attrfind = re.compile(
+ r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)[$]?(\s*=\s*'
+ r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?'
+ )
+
+ # Unfortunately, these must be copied over to prevent NameError exceptions
+ entityref = sgmllib.entityref
+ incomplete = sgmllib.incomplete
+ interesting = sgmllib.interesting
+ shorttag = sgmllib.shorttag
+ shorttagopen = sgmllib.shorttagopen
+ starttagopen = sgmllib.starttagopen
+
+ class _EndBracketRegEx:
+ def __init__(self):
+ # Overriding the built-in sgmllib.endbracket regex allows the
+ # parser to find angle brackets embedded in element attributes.
+ self.endbracket = re.compile('''([^'"<>]|"[^"]*"(?=>|/|\s|\w+=)|'[^']*'(?=>|/|\s|\w+=))*(?=[<>])|.*?(?=[<>])''')
+ def search(self, target, index=0):
+ match = self.endbracket.match(target, index)
+ if match is not None:
+ # Returning a new object in the calling thread's context
+ # resolves a thread-safety.
+ return EndBracketMatch(match)
+ return None
+ class EndBracketMatch:
+ def __init__(self, match):
+ self.match = match
+ def start(self, n):
+ return self.match.end(n)
+ endbracket = _EndBracketRegEx()
+
+
+# cjkcodecs and iconv_codec provide support for more character encodings.
+# Both are available from http://cjkpython.i18n.org/
+try:
+ import cjkcodecs.aliases
+except ImportError:
+ pass
+try:
+ import iconv_codec
+except ImportError:
+ pass
+
+# chardet library auto-detects character encodings
+# Download from http://chardet.feedparser.org/
+try:
+ import chardet
+except ImportError:
+ chardet = None
+
+# BeautifulSoup parser used for parsing microformats from embedded HTML content
+# http://www.crummy.com/software/BeautifulSoup/
+# feedparser is tested with BeautifulSoup 3.0.x, but it might work with the
+# older 2.x series. If it doesn't, and you can figure out why, I'll accept a
+# patch and modify the compatibility statement accordingly.
+try:
+ import BeautifulSoup
+except ImportError:
+ BeautifulSoup = None
+
+# ---------- don't touch these ----------
# Internal exception hierarchy: the three encoding/content-type conditions
# share ThingsNobodyCaresAboutButMe as a common base so they can be caught
# together; UndeclaredNamespace is raised independently of that base.
class ThingsNobodyCaresAboutButMe(Exception): pass
class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass
class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe): pass
class NonXMLContentType(ThingsNobodyCaresAboutButMe): pass
class UndeclaredNamespace(Exception): pass
+
+SUPPORTED_VERSIONS = {'': u'unknown',
+ 'rss090': u'RSS 0.90',
+ 'rss091n': u'RSS 0.91 (Netscape)',
+ 'rss091u': u'RSS 0.91 (Userland)',
+ 'rss092': u'RSS 0.92',
+ 'rss093': u'RSS 0.93',
+ 'rss094': u'RSS 0.94',
+ 'rss20': u'RSS 2.0',
+ 'rss10': u'RSS 1.0',
+ 'rss': u'RSS (unknown version)',
+ 'atom01': u'Atom 0.1',
+ 'atom02': u'Atom 0.2',
+ 'atom03': u'Atom 0.3',
+ 'atom10': u'Atom 1.0',
+ 'atom': u'Atom (unknown version)',
+ 'cdf': u'CDF',
+ }
+
class FeedParserDict(dict):
    """Dict subclass used for parsed feeds and entries.

    Adds three conveniences over a plain dict: attribute-style access
    (d.title == d['title']), aliasing of legacy key names onto their modern
    equivalents via `keymap`, and a few computed keys ('category',
    'enclosures', 'license', 'categories') derived from 'tags'/'links'.
    """

    # Legacy key -> canonical key. A list value means "try these canonical
    # keys in order and return the first one present".
    keymap = {'channel': 'feed',
              'items': 'entries',
              'guid': 'id',
              'date': 'updated',
              'date_parsed': 'updated_parsed',
              'description': ['summary', 'subtitle'],
              'url': ['href'],
              'modified': 'updated',
              'modified_parsed': 'updated_parsed',
              'issued': 'published',
              'issued_parsed': 'published_parsed',
              'copyright': 'rights',
              'copyright_detail': 'rights_detail',
              'tagline': 'subtitle',
              'tagline_detail': 'subtitle_detail'}

    def __getitem__(self, key):
        # Computed keys are handled first, then keymap aliases, and finally
        # the plain dict lookup at the bottom (which raises KeyError).
        if key == 'category':
            try:
                # Term of the first tag, if any.
                return dict.__getitem__(self, 'tags')[0]['term']
            except IndexError:
                raise KeyError, "object doesn't have key 'category'"
        elif key == 'enclosures':
            # All links with rel='enclosure', each with its 'rel' stripped.
            norel = lambda link: FeedParserDict([(name,value) for (name,value) in link.items() if name!='rel'])
            return [norel(link) for link in dict.__getitem__(self, 'links') if link['rel']==u'enclosure']
        elif key == 'license':
            for link in dict.__getitem__(self, 'links'):
                if link['rel']==u'license' and link.has_key('href'):
                    return link['href']
            # No rel='license' link: falls through to the plain lookup below.
        elif key == 'categories':
            return [(tag['scheme'], tag['term']) for tag in dict.__getitem__(self, 'tags')]
        else:
            realkey = self.keymap.get(key, key)
            if isinstance(realkey, list):
                for k in realkey:
                    if dict.__contains__(self, k):
                        return dict.__getitem__(self, k)
            elif dict.__contains__(self, realkey):
                return dict.__getitem__(self, realkey)
        return dict.__getitem__(self, key)

    def __contains__(self, key):
        # Defined via __getitem__ so aliases and computed keys count as
        # "contained" too.
        try:
            self.__getitem__(key)
        except KeyError:
            return False
        else:
            return True

    has_key = __contains__  # legacy dict API

    def get(self, key, default=None):
        # Same alias-aware semantics as __getitem__.
        try:
            return self.__getitem__(key)
        except KeyError:
            return default

    def __setitem__(self, key, value):
        # Writes go through the alias map too; a list alias stores under
        # its first (preferred) canonical key.
        key = self.keymap.get(key, key)
        if isinstance(key, list):
            key = key[0]
        return dict.__setitem__(self, key, value)

    def setdefault(self, key, value):
        if key not in self:
            self[key] = value
            return value
        return self[key]

    def __getattr__(self, key):
        # __getattribute__() is called first; this will be called
        # only if an attribute was not already found
        try:
            return self.__getitem__(key)
        except KeyError:
            raise AttributeError, "object has no attribute '%s'" % key
+
+
# Lazily-built EBCDIC->ASCII translation table; filled in on first call.
_ebcdic_to_ascii_map = None
def _ebcdic_to_ascii(s):
    """Translate the EBCDIC-encoded byte string *s* to ASCII."""
    global _ebcdic_to_ascii_map
    if not _ebcdic_to_ascii_map:
        # emap[i] is the ASCII code point for EBCDIC byte value i.
        emap = (
            0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
            16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
            128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
            144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
            32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
            38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
            45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
            186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
            195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,201,
            202,106,107,108,109,110,111,112,113,114,203,204,205,206,207,208,
            209,126,115,116,117,118,119,120,121,122,210,211,212,213,214,215,
            216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,
            123,65,66,67,68,69,70,71,72,73,232,233,234,235,236,237,
            125,74,75,76,77,78,79,80,81,82,238,239,240,241,242,243,
            92,159,83,84,85,86,87,88,89,90,244,245,246,247,248,249,
            48,49,50,51,52,53,54,55,56,57,250,251,252,253,254,255
            )
        # Cache the table once; _maketrans/_l2bytes abstract over the
        # Python 2/3 bytes differences handled earlier in this module.
        _ebcdic_to_ascii_map = _maketrans( \
            _l2bytes(range(256)), _l2bytes(emap))
    return s.translate(_ebcdic_to_ascii_map)
+
# Map of the Windows-1252 "smart" characters (code points 0x80-0x9f) to
# their true Unicode equivalents, used to repair text from feeds that
# mislabel cp1252 content as ISO-8859-1 / ASCII.
_cp1252 = {
  unichr(128): unichr(8364), # euro sign
  unichr(130): unichr(8218), # single low-9 quotation mark
  unichr(131): unichr( 402), # latin small letter f with hook
  unichr(132): unichr(8222), # double low-9 quotation mark
  unichr(133): unichr(8230), # horizontal ellipsis
  unichr(134): unichr(8224), # dagger
  unichr(135): unichr(8225), # double dagger
  unichr(136): unichr( 710), # modifier letter circumflex accent
  unichr(137): unichr(8240), # per mille sign
  unichr(138): unichr( 352), # latin capital letter s with caron
  unichr(139): unichr(8249), # single left-pointing angle quotation mark
  unichr(140): unichr( 338), # latin capital ligature oe
  unichr(142): unichr( 381), # latin capital letter z with caron
  unichr(145): unichr(8216), # left single quotation mark
  unichr(146): unichr(8217), # right single quotation mark
  unichr(147): unichr(8220), # left double quotation mark
  unichr(148): unichr(8221), # right double quotation mark
  unichr(149): unichr(8226), # bullet
  unichr(150): unichr(8211), # en dash
  unichr(151): unichr(8212), # em dash
  unichr(152): unichr( 732), # small tilde
  unichr(153): unichr(8482), # trade mark sign
  unichr(154): unichr( 353), # latin small letter s with caron
  unichr(155): unichr(8250), # single right-pointing angle quotation mark
  unichr(156): unichr( 339), # latin small ligature oe
  unichr(158): unichr( 382), # latin small letter z with caron
  unichr(159): unichr( 376)} # latin capital letter y with diaeresis

# Captures (scheme://)(extra slashes)(rest); _urljoin substitutes \1\3 to
# drop superfluous slashes immediately after the scheme.
_urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')
def _urljoin(base, uri):
    """Join *uri* onto *base*, first collapsing any extra slashes that
    immediately follow the scheme, and always return a unicode string."""
    cleaned = _urifixer.sub(r'\1\3', uri)
    joined = urlparse.urljoin(base, cleaned)
    if isinstance(joined, unicode):
        return joined
    return joined.decode('utf-8', 'ignore')
+
+class _FeedParserMixin:
+ namespaces = {'': '',
+ 'http://backend.userland.com/rss': '',
+ 'http://blogs.law.harvard.edu/tech/rss': '',
+ 'http://purl.org/rss/1.0/': '',
+ 'http://my.netscape.com/rdf/simple/0.9/': '',
+ 'http://example.com/newformat#': '',
+ 'http://example.com/necho': '',
+ 'http://purl.org/echo/': '',
+ 'uri/of/echo/namespace#': '',
+ 'http://purl.org/pie/': '',
+ 'http://purl.org/atom/ns#': '',
+ 'http://www.w3.org/2005/Atom': '',
+ 'http://purl.org/rss/1.0/modules/rss091#': '',
+
+ 'http://webns.net/mvcb/': 'admin',
+ 'http://purl.org/rss/1.0/modules/aggregation/': 'ag',
+ 'http://purl.org/rss/1.0/modules/annotate/': 'annotate',
+ 'http://media.tangent.org/rss/1.0/': 'audio',
+ 'http://backend.userland.com/blogChannelModule': 'blogChannel',
+ 'http://web.resource.org/cc/': 'cc',
+ 'http://backend.userland.com/creativeCommonsRssModule': 'creativeCommons',
+ 'http://purl.org/rss/1.0/modules/company': 'co',
+ 'http://purl.org/rss/1.0/modules/content/': 'content',
+ 'http://my.theinfo.org/changed/1.0/rss/': 'cp',
+ 'http://purl.org/dc/elements/1.1/': 'dc',
+ 'http://purl.org/dc/terms/': 'dcterms',
+ 'http://purl.org/rss/1.0/modules/email/': 'email',
+ 'http://purl.org/rss/1.0/modules/event/': 'ev',
+ 'http://rssnamespace.org/feedburner/ext/1.0': 'feedburner',
+ 'http://freshmeat.net/rss/fm/': 'fm',
+ 'http://xmlns.com/foaf/0.1/': 'foaf',
+ 'http://www.w3.org/2003/01/geo/wgs84_pos#': 'geo',
+ 'http://postneo.com/icbm/': 'icbm',
+ 'http://purl.org/rss/1.0/modules/image/': 'image',
+ 'http://www.itunes.com/DTDs/PodCast-1.0.dtd': 'itunes',
+ 'http://example.com/DTDs/PodCast-1.0.dtd': 'itunes',
+ 'http://purl.org/rss/1.0/modules/link/': 'l',
+ 'http://search.yahoo.com/mrss': 'media',
+ #Version 1.1.2 of the Media RSS spec added the trailing slash on the namespace
+ 'http://search.yahoo.com/mrss/': 'media',
+ 'http://madskills.com/public/xml/rss/module/pingback/': 'pingback',
+ 'http://prismstandard.org/namespaces/1.2/basic/': 'prism',
+ 'http://www.w3.org/1999/02/22-rdf-syntax-ns#': 'rdf',
+ 'http://www.w3.org/2000/01/rdf-schema#': 'rdfs',
+ 'http://purl.org/rss/1.0/modules/reference/': 'ref',
+ 'http://purl.org/rss/1.0/modules/richequiv/': 'reqv',
+ 'http://purl.org/rss/1.0/modules/search/': 'search',
+ 'http://purl.org/rss/1.0/modules/slash/': 'slash',
+ 'http://schemas.xmlsoap.org/soap/envelope/': 'soap',
+ 'http://purl.org/rss/1.0/modules/servicestatus/': 'ss',
+ 'http://hacks.benhammersley.com/rss/streaming/': 'str',
+ 'http://purl.org/rss/1.0/modules/subscription/': 'sub',
+ 'http://purl.org/rss/1.0/modules/syndication/': 'sy',
+ 'http://schemas.pocketsoap.com/rss/myDescModule/': 'szf',
+ 'http://purl.org/rss/1.0/modules/taxonomy/': 'taxo',
+ 'http://purl.org/rss/1.0/modules/threading/': 'thr',
+ 'http://purl.org/rss/1.0/modules/textinput/': 'ti',
+ 'http://madskills.com/public/xml/rss/module/trackback/':'trackback',
+ 'http://wellformedweb.org/commentAPI/': 'wfw',
+ 'http://purl.org/rss/1.0/modules/wiki/': 'wiki',
+ 'http://www.w3.org/1999/xhtml': 'xhtml',
+ 'http://www.w3.org/1999/xlink': 'xlink',
+ 'http://www.w3.org/XML/1998/namespace': 'xml'
+}
+ _matchnamespaces = {}
+
+ can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'icon', 'logo']
+ can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
+ can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
+ html_types = [u'text/html', u'application/xhtml+xml']
+
+ def __init__(self, baseuri=None, baselang=None, encoding=u'utf-8'):
+ if not self._matchnamespaces:
+ for k, v in self.namespaces.items():
+ self._matchnamespaces[k.lower()] = v
+ self.feeddata = FeedParserDict() # feed-level data
+ self.encoding = encoding # character encoding
+ self.entries = [] # list of entry-level data
+ self.version = u'' # feed type/version, see SUPPORTED_VERSIONS
+ self.namespacesInUse = {} # dictionary of namespaces defined by the feed
+
+ # the following are used internally to track state;
+ # this is really out of control and should be refactored
+ self.infeed = 0
+ self.inentry = 0
+ self.incontent = 0
+ self.intextinput = 0
+ self.inimage = 0
+ self.inauthor = 0
+ self.incontributor = 0
+ self.inpublisher = 0
+ self.insource = 0
+ self.sourcedata = FeedParserDict()
+ self.contentparams = FeedParserDict()
+ self._summaryKey = None
+ self.namespacemap = {}
+ self.elementstack = []
+ self.basestack = []
+ self.langstack = []
+ self.baseuri = baseuri or u''
+ self.lang = baselang or None
+ self.svgOK = 0
+ self.hasTitle = 0
+ if baselang:
+ self.feeddata['language'] = baselang.replace('_','-')
+
+ def _normalize_attributes(self, kv):
+ k = kv[0].lower()
+ v = k in ('rel', 'type') and kv[1].lower() or kv[1]
+ # the sgml parser doesn't handle entities in attributes, nor
+ # does it pass the attribute values through as unicode, while
+ # strict xml parsers do -- account for this difference
+ if isinstance(self, _LooseFeedParser):
+ v = v.replace('&', '&')
+ if not isinstance(v, unicode):
+ v = v.decode('utf-8')
+ return (k, v)
+
+ def unknown_starttag(self, tag, attrs):
+ # normalize attrs
+ attrs = map(self._normalize_attributes, attrs)
+
+ # track xml:base and xml:lang
+ attrsD = dict(attrs)
+ baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri
+ if not isinstance(baseuri, unicode):
+ baseuri = baseuri.decode(self.encoding, 'ignore')
+ # ensure that self.baseuri is always an absolute URI that
+ # uses a whitelisted URI scheme (e.g. not `javscript:`)
+ if self.baseuri:
+ self.baseuri = _makeSafeAbsoluteURI(self.baseuri, baseuri) or self.baseuri
+ else:
+ self.baseuri = _urljoin(self.baseuri, baseuri)
+ lang = attrsD.get('xml:lang', attrsD.get('lang'))
+ if lang == '':
+ # xml:lang could be explicitly set to '', we need to capture that
+ lang = None
+ elif lang is None:
+ # if no xml:lang is specified, use parent lang
+ lang = self.lang
+ if lang:
+ if tag in ('feed', 'rss', 'rdf:RDF'):
+ self.feeddata['language'] = lang.replace('_','-')
+ self.lang = lang
+ self.basestack.append(self.baseuri)
+ self.langstack.append(lang)
+
+ # track namespaces
+ for prefix, uri in attrs:
+ if prefix.startswith('xmlns:'):
+ self.trackNamespace(prefix[6:], uri)
+ elif prefix == 'xmlns':
+ self.trackNamespace(None, uri)
+
+ # track inline content
+ if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', u'xml').endswith(u'xml'):
+ if tag in ['xhtml:div', 'div']:
+ return # typepad does this 10/2007
+ # element declared itself as escaped markup, but it isn't really
+ self.contentparams['type'] = u'application/xhtml+xml'
+ if self.incontent and self.contentparams.get('type') == u'application/xhtml+xml':
+ if tag.find(':') <> -1:
+ prefix, tag = tag.split(':', 1)
+ namespace = self.namespacesInUse.get(prefix, '')
+ if tag=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
+ attrs.append(('xmlns',namespace))
+ if tag=='svg' and namespace=='http://www.w3.org/2000/svg':
+ attrs.append(('xmlns',namespace))
+ if tag == 'svg':
+ self.svgOK += 1
+ return self.handle_data('<%s%s>' % (tag, self.strattrs(attrs)), escape=0)
+
+ # match namespaces
+ if tag.find(':') <> -1:
+ prefix, suffix = tag.split(':', 1)
+ else:
+ prefix, suffix = '', tag
+ prefix = self.namespacemap.get(prefix, prefix)
+ if prefix:
+ prefix = prefix + '_'
+
+ # special hack for better tracking of empty textinput/image elements in illformed feeds
+ if (not prefix) and tag not in ('title', 'link', 'description', 'name'):
+ self.intextinput = 0
+ if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'):
+ self.inimage = 0
+
+ # call special handler (if defined) or default handler
+ methodname = '_start_' + prefix + suffix
+ try:
+ method = getattr(self, methodname)
+ return method(attrsD)
+ except AttributeError:
+ # Since there's no handler or something has gone wrong we explicitly add the element and its attributes
+ unknown_tag = prefix + suffix
+ if len(attrsD) == 0:
+ # No attributes so merge it into the encosing dictionary
+ return self.push(unknown_tag, 1)
+ else:
+ # Has attributes so create it in its own dictionary
+ context = self._getContext()
+ context[unknown_tag] = attrsD
+
+ def unknown_endtag(self, tag):
+ # match namespaces
+ if tag.find(':') <> -1:
+ prefix, suffix = tag.split(':', 1)
+ else:
+ prefix, suffix = '', tag
+ prefix = self.namespacemap.get(prefix, prefix)
+ if prefix:
+ prefix = prefix + '_'
+ if suffix == 'svg' and self.svgOK:
+ self.svgOK -= 1
+
+ # call special handler (if defined) or default handler
+ methodname = '_end_' + prefix + suffix
+ try:
+ if self.svgOK:
+ raise AttributeError()
+ method = getattr(self, methodname)
+ method()
+ except AttributeError:
+ self.pop(prefix + suffix)
+
+ # track inline content
+ if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', u'xml').endswith(u'xml'):
+ # element declared itself as escaped markup, but it isn't really
+ if tag in ['xhtml:div', 'div']:
+ return # typepad does this 10/2007
+ self.contentparams['type'] = u'application/xhtml+xml'
+ if self.incontent and self.contentparams.get('type') == u'application/xhtml+xml':
+ tag = tag.split(':')[-1]
+ self.handle_data('</%s>' % tag, escape=0)
+
+ # track xml:base and xml:lang going out of scope
+ if self.basestack:
+ self.basestack.pop()
+ if self.basestack and self.basestack[-1]:
+ self.baseuri = self.basestack[-1]
+ if self.langstack:
+ self.langstack.pop()
+ if self.langstack: # and (self.langstack[-1] is not None):
+ self.lang = self.langstack[-1]
+
+ def handle_charref(self, ref):
+ # called for each character reference, e.g. for ' ', ref will be '160'
+ if not self.elementstack:
+ return
+ ref = ref.lower()
+ if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
+ text = '&#%s;' % ref
+ else:
+ if ref[0] == 'x':
+ c = int(ref[1:], 16)
+ else:
+ c = int(ref)
+ text = unichr(c).encode('utf-8')
+ self.elementstack[-1][2].append(text)
+
+ def handle_entityref(self, ref):
+ # called for each entity reference, e.g. for '©', ref will be 'copy'
+ if not self.elementstack:
+ return
+ if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
+ text = '&%s;' % ref
+ elif ref in self.entities.keys():
+ text = self.entities[ref]
+ if text.startswith('&#') and text.endswith(';'):
+ return self.handle_entityref(text)
+ else:
+ try:
+ name2codepoint[ref]
+ except KeyError:
+ text = '&%s;' % ref
+ else:
+ text = unichr(name2codepoint[ref]).encode('utf-8')
+ self.elementstack[-1][2].append(text)
+
    def handle_data(self, text, escape=1):
        """Append a block of plain character data to the innermost open
        element, escaping it first when inside inline XHTML content."""
        # called for each block of plain text, i.e. outside of any tag and
        # not containing any character or entity references
        if not self.elementstack:
            return
        if escape and self.contentparams.get('type') == u'application/xhtml+xml':
            text = _xmlescape(text)
        self.elementstack[-1][2].append(text)

    def handle_comment(self, text):
        """Ignore comments."""
        # called for each comment, e.g. <!-- insert message here -->
        pass

    def handle_pi(self, text):
        """Ignore processing instructions."""
        # called for each processing instruction, e.g. <?instruction>
        pass

    def handle_decl(self, text):
        """Ignore markup declarations."""
        pass
+
    def parse_declaration(self, i):
        """Handle a '<!...>' declaration starting at index i in rawdata.

        Overrides the base parser so CDATA sections are captured as
        escaped character data.  Returns the index just past the
        construct, or an index that makes the caller wait for more data
        when the construct is incomplete.
        """
        # override internal declaration handler to handle CDATA blocks
        if self.rawdata[i:i+9] == '<![CDATA[':
            k = self.rawdata.find(']]>', i)
            if k == -1:
                # CDATA block began but didn't finish
                k = len(self.rawdata)
                return k
            # feed the CDATA payload through as already-escaped text
            self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
            return k+3
        else:
            k = self.rawdata.find('>', i)
            if k >= 0:
                return k+1
            else:
                # We have an incomplete declaration (no closing '>') —
                # returning -1 tells the caller to wait for more data.
                return k
+
+ def mapContentType(self, contentType):
+ contentType = contentType.lower()
+ if contentType == 'text' or contentType == 'plain':
+ contentType = u'text/plain'
+ elif contentType == 'html':
+ contentType = u'text/html'
+ elif contentType == 'xhtml':
+ contentType = u'application/xhtml+xml'
+ return contentType
+
+ def trackNamespace(self, prefix, uri):
+ loweruri = uri.lower()
+ if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version:
+ self.version = u'rss090'
+ if loweruri == 'http://purl.org/rss/1.0/' and not self.version:
+ self.version = u'rss10'
+ if loweruri == 'http://www.w3.org/2005/atom' and not self.version:
+ self.version = u'atom10'
+ if loweruri.find(u'backend.userland.com/rss') <> -1:
+ # match any backend.userland.com namespace
+ uri = u'http://backend.userland.com/rss'
+ loweruri = uri
+ if self._matchnamespaces.has_key(loweruri):
+ self.namespacemap[prefix] = self._matchnamespaces[loweruri]
+ self.namespacesInUse[self._matchnamespaces[loweruri]] = uri
+ else:
+ self.namespacesInUse[prefix or ''] = uri
+
    def resolveURI(self, uri):
        """Resolve uri against the current xml:base (empty base if none)."""
        return _urljoin(self.baseuri or u'', uri)

    def decodeEntities(self, element, data):
        """Hook for subclasses to decode entities; base class is a no-op."""
        return data

    def strattrs(self, attrs):
        """Serialize (name, value) attribute pairs back to ' name="value"'
        markup text, escaping values (double quotes become &quot;)."""
        return ''.join([' %s="%s"' % (t[0],_xmlescape(t[1],{'"':'&quot;'})) for t in attrs])
+
+ def push(self, element, expectingText):
+ self.elementstack.append([element, expectingText, []])
+
    def pop(self, element, stripWhitespace=1):
        """Close the element on top of the stack and post-process its text.

        The accumulated text pieces are joined, then run through a fixed
        pipeline: base64 decoding, relative-URI resolution, entity
        decoding, HTML sniffing, microformat parsing, sanitization and
        encoding repair.  Finally the value is stored into the current
        entry or feed context.  Returns the processed text, or None when
        the stack top does not match `element`.

        NOTE(review): the steps below are order-dependent; do not reorder.
        """
        if not self.elementstack:
            return
        if self.elementstack[-1][0] != element:
            return

        element, expectingText, pieces = self.elementstack.pop()

        if self.version == u'atom10' and self.contentparams.get('type', u'text') == u'application/xhtml+xml':
            # remove enclosing child element, but only if it is a <div> and
            # only if all the remaining content is nested underneath it.
            # This means that the divs would be retained in the following:
            # <div>foo</div><div>bar</div>
            while pieces and len(pieces)>1 and not pieces[-1].strip():
                del pieces[-1]
            while pieces and len(pieces)>1 and not pieces[0].strip():
                del pieces[0]
            if pieces and (pieces[0] == '<div>' or pieces[0].startswith('<div ')) and pieces[-1]=='</div>':
                depth = 0
                for piece in pieces[:-1]:
                    if piece.startswith('</'):
                        depth -= 1
                        if depth == 0:
                            break
                    elif piece.startswith('<') and not piece.endswith('/>'):
                        depth += 1
                else:
                    # for-else: loop ran to completion, so the outer <div>
                    # wraps everything and can be stripped
                    pieces = pieces[1:-1]

        # Ensure each piece is a str for Python 3
        for (i, v) in enumerate(pieces):
            if not isinstance(v, unicode):
                pieces[i] = v.decode('utf-8')

        output = u''.join(pieces)
        if stripWhitespace:
            output = output.strip()
        if not expectingText:
            return output

        # decode base64 content
        if base64 and self.contentparams.get('base64', 0):
            try:
                output = _base64decode(output)
            except binascii.Error:
                pass
            except binascii.Incomplete:
                pass
            except TypeError:
                # In Python 3, base64 takes and outputs bytes, not str
                # This may not be the most correct way to accomplish this
                output = _base64decode(output.encode('utf-8')).decode('utf-8')

        # resolve relative URIs
        if (element in self.can_be_relative_uri) and output:
            output = self.resolveURI(output)

        # decode entities within embedded markup
        if not self.contentparams.get('base64', 0):
            output = self.decodeEntities(element, output)

        # some feed formats require consumers to guess
        # whether the content is html or plain text
        if not self.version.startswith(u'atom') and self.contentparams.get('type') == u'text/plain':
            if self.lookslikehtml(output):
                self.contentparams['type'] = u'text/html'

        # remove temporary cruft from contentparams
        try:
            del self.contentparams['mode']
        except KeyError:
            pass
        try:
            del self.contentparams['base64']
        except KeyError:
            pass

        is_htmlish = self.mapContentType(self.contentparams.get('type', u'text/html')) in self.html_types
        # resolve relative URIs within embedded markup
        if is_htmlish and RESOLVE_RELATIVE_URIS:
            if element in self.can_contain_relative_uris:
                output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', u'text/html'))

        # parse microformats
        # (must do this before sanitizing because some microformats
        # rely on elements that we sanitize)
        if is_htmlish and element in ['content', 'description', 'summary']:
            mfresults = _parseMicroformats(output, self.baseuri, self.encoding)
            if mfresults:
                for tag in mfresults.get('tags', []):
                    self._addTag(tag['term'], tag['scheme'], tag['label'])
                for enclosure in mfresults.get('enclosures', []):
                    self._start_enclosure(enclosure)
                for xfn in mfresults.get('xfn', []):
                    self._addXFN(xfn['relationships'], xfn['href'], xfn['name'])
                vcard = mfresults.get('vcard')
                if vcard:
                    self._getContext()['vcard'] = vcard

        # sanitize embedded markup
        if is_htmlish and SANITIZE_HTML:
            if element in self.can_contain_dangerous_markup:
                output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', u'text/html'))

        if self.encoding and not isinstance(output, unicode):
            output = output.decode(self.encoding, 'ignore')

        # address common error where people take data that is already
        # utf-8, presume that it is iso-8859-1, and re-encode it.
        if self.encoding in (u'utf-8', u'utf-8_INVALID_PYTHON_3') and isinstance(output, unicode):
            try:
                output = output.encode('iso-8859-1').decode('utf-8')
            except (UnicodeEncodeError, UnicodeDecodeError):
                pass

        # map win-1252 extensions to the proper code points
        if isinstance(output, unicode):
            output = u''.join([c in _cp1252.keys() and _cp1252[c] or c for c in output])

        # categories/tags/keywords/whatever are handled in _end_category
        if element == 'category':
            return output

        if element == 'title' and self.hasTitle:
            return output

        # store output in appropriate place(s)
        if self.inentry and not self.insource:
            if element == 'content':
                self.entries[-1].setdefault(element, [])
                contentparams = copy.deepcopy(self.contentparams)
                contentparams['value'] = output
                self.entries[-1][element].append(contentparams)
            elif element == 'link':
                if not self.inimage:
                    # query variables in urls in link elements are improperly
                    # converted from `?a=1&b=2` to `?a=1&b;=2` as if they're
                    # unhandled character references. fix this special case.
                    output = re.sub("&([A-Za-z0-9_]+);", "&\g<1>", output)
                    self.entries[-1][element] = output
                    if output:
                        self.entries[-1]['links'][-1]['href'] = output
            else:
                if element == 'description':
                    element = 'summary'
                self.entries[-1][element] = output
                if self.incontent:
                    contentparams = copy.deepcopy(self.contentparams)
                    contentparams['value'] = output
                    self.entries[-1][element + '_detail'] = contentparams
        elif (self.infeed or self.insource):# and (not self.intextinput) and (not self.inimage):
            context = self._getContext()
            if element == 'description':
                element = 'subtitle'
            context[element] = output
            if element == 'link':
                # fix query variables; see above for the explanation
                output = re.sub("&([A-Za-z0-9_]+);", "&\g<1>", output)
                context[element] = output
                context['links'][-1]['href'] = output
            elif self.incontent:
                contentparams = copy.deepcopy(self.contentparams)
                contentparams['value'] = output
                context[element + '_detail'] = contentparams
        return output
+
    def pushContent(self, tag, attrsD, defaultContentType, expectingText):
        """Open a content-bearing element: bump the incontent counter,
        reset contentparams (type/language/base/base64) and push the tag."""
        self.incontent += 1
        if self.lang:
            # normalize underscore locale form ('en_US') to RFC tags ('en-US')
            self.lang=self.lang.replace('_','-')
        self.contentparams = FeedParserDict({
            'type': self.mapContentType(attrsD.get('type', defaultContentType)),
            'language': self.lang,
            'base': self.baseuri})
        self.contentparams['base64'] = self._isBase64(attrsD, self.contentparams)
        self.push(tag, expectingText)

    def popContent(self, tag):
        """Close a content-bearing element and return its processed text."""
        value = self.pop(tag)
        self.incontent -= 1
        self.contentparams.clear()
        return value
+
+ # a number of elements in a number of RSS variants are nominally plain
+ # text, but this is routinely ignored. This is an attempt to detect
+ # the most common cases. As false positives often result in silent
+ # data loss, this function errs on the conservative side.
+ @staticmethod
+ def lookslikehtml(s):
+ # must have a close tag or a entity reference to qualify
+ if not (re.search(r'</(\w+)>',s) or re.search("&#?\w+;",s)):
+ return
+
+ # all tags must be in a restricted subset of valid HTML tags
+ if filter(lambda t: t.lower() not in _HTMLSanitizer.acceptable_elements,
+ re.findall(r'</?(\w+)',s)):
+ return
+
+ # all entities must have been defined as valid HTML entities
+ if filter(lambda e: e not in entitydefs.keys(), re.findall(r'&(\w+);', s)):
+ return
+
+ return 1
+
+ def _mapToStandardPrefix(self, name):
+ colonpos = name.find(':')
+ if colonpos <> -1:
+ prefix = name[:colonpos]
+ suffix = name[colonpos+1:]
+ prefix = self.namespacemap.get(prefix, prefix)
+ name = prefix + ':' + suffix
+ return name
+
    def _getAttribute(self, attrsD, name):
        """Look up an attribute after canonicalizing its namespace prefix."""
        return attrsD.get(self._mapToStandardPrefix(name))

    def _isBase64(self, attrsD, contentparams):
        """Return 1 if the current content should be base64-decoded:
        either mode="base64" was declared, or the MIME type is neither
        text/* nor an XML type."""
        if attrsD.get('mode', '') == 'base64':
            return 1
        if self.contentparams['type'].startswith(u'text/'):
            return 0
        if self.contentparams['type'].endswith(u'+xml'):
            return 0
        if self.contentparams['type'].endswith(u'/xml'):
            return 0
        return 1

    def _itsAnHrefDamnIt(self, attrsD):
        """Normalize url/uri/href attribute spellings to a single 'href'
        key, removing the url/uri variants.  Mutates and returns attrsD."""
        href = attrsD.get('url', attrsD.get('uri', attrsD.get('href', None)))
        if href:
            try:
                del attrsD['url']
            except KeyError:
                pass
            try:
                del attrsD['uri']
            except KeyError:
                pass
            attrsD['href'] = href
        return attrsD

    def _save(self, key, value, overwrite=False):
        """Store value under key in the current context; by default keep
        an existing value, with overwrite=True replace it."""
        context = self._getContext()
        if overwrite:
            context[key] = value
        else:
            context.setdefault(key, value)
+
    def _start_rss(self, attrsD):
        """Handle <rss>: derive the feed version string from the version
        attribute when it was not already established."""
        versionmap = {'0.91': u'rss091u',
                      '0.92': u'rss092',
                      '0.93': u'rss093',
                      '0.94': u'rss094'}
        #If we're here then this is an RSS feed.
        #If we don't have a version or have a version that starts with something
        #other than RSS then there's been a mistake. Correct it.
        if not self.version or not self.version.startswith(u'rss'):
            attr_version = attrsD.get('version', '')
            version = versionmap.get(attr_version)
            if version:
                self.version = version
            elif attr_version.startswith('2.'):
                self.version = u'rss20'
            else:
                self.version = u'rss'

    def _start_channel(self, attrsD):
        """Handle <channel>: enter feed scope and process CDF attributes."""
        self.infeed = 1
        self._cdf_common(attrsD)

    def _cdf_common(self, attrsD):
        """Translate CDF-style lastmod/href attributes into synthetic
        modified/link elements by faking start/data/end events."""
        if attrsD.has_key('lastmod'):
            self._start_modified({})
            self.elementstack[-1][-1] = attrsD['lastmod']
            self._end_modified()
        if attrsD.has_key('href'):
            self._start_link({})
            self.elementstack[-1][-1] = attrsD['href']
            self._end_link()

    def _start_feed(self, attrsD):
        """Handle Atom <feed>: enter feed scope and derive the Atom
        version from the version attribute if not already known."""
        self.infeed = 1
        versionmap = {'0.1': u'atom01',
                      '0.2': u'atom02',
                      '0.3': u'atom03'}
        if not self.version:
            attr_version = attrsD.get('version')
            version = versionmap.get(attr_version)
            if version:
                self.version = version
            else:
                self.version = u'atom'
+
    def _end_channel(self):
        """Leave feed scope."""
        self.infeed = 0
    _end_feed = _end_channel

    def _start_image(self, attrsD):
        """Handle <image>: create feed['image'] (outside entries) and
        enter image scope; image children don't count as feed title."""
        context = self._getContext()
        if not self.inentry:
            context.setdefault('image', FeedParserDict())
        self.inimage = 1
        self.hasTitle = 0
        self.push('image', 0)

    def _end_image(self):
        """Leave image scope."""
        self.pop('image')
        self.inimage = 0

    def _start_textinput(self, attrsD):
        """Handle <textinput>: create feed['textinput'] and enter its scope."""
        context = self._getContext()
        context.setdefault('textinput', FeedParserDict())
        self.intextinput = 1
        self.hasTitle = 0
        self.push('textinput', 0)
    _start_textInput = _start_textinput

    def _end_textinput(self):
        """Leave textinput scope."""
        self.pop('textinput')
        self.intextinput = 0
    _end_textInput = _end_textinput
+
    def _start_author(self, attrsD):
        """Enter author scope and start a new entry in context['authors']."""
        self.inauthor = 1
        self.push('author', 1)
        # Append a new FeedParserDict when expecting an author
        context = self._getContext()
        context.setdefault('authors', [])
        context['authors'].append(FeedParserDict())
    _start_managingeditor = _start_author
    _start_dc_author = _start_author
    _start_dc_creator = _start_author
    _start_itunes_author = _start_author

    def _end_author(self):
        """Leave author scope and reconcile author/author_detail fields."""
        self.pop('author')
        self.inauthor = 0
        self._sync_author_detail()
    _end_managingeditor = _end_author
    _end_dc_author = _end_author
    _end_dc_creator = _end_author
    _end_itunes_author = _end_author

    def _start_itunes_owner(self, attrsD):
        """Enter publisher scope for <itunes:owner>."""
        self.inpublisher = 1
        self.push('publisher', 0)

    def _end_itunes_owner(self):
        """Leave publisher scope and reconcile publisher_detail."""
        self.pop('publisher')
        self.inpublisher = 0
        self._sync_author_detail('publisher')

    def _start_contributor(self, attrsD):
        """Enter contributor scope and start a new contributors entry."""
        self.incontributor = 1
        context = self._getContext()
        context.setdefault('contributors', [])
        context['contributors'].append(FeedParserDict())
        self.push('contributor', 0)

    def _end_contributor(self):
        """Leave contributor scope."""
        self.pop('contributor')
        self.incontributor = 0

    def _start_dc_contributor(self, attrsD):
        """Enter contributor scope for <dc:contributor>; its text is the
        contributor name, so push a 'name' element instead."""
        self.incontributor = 1
        context = self._getContext()
        context.setdefault('contributors', [])
        context['contributors'].append(FeedParserDict())
        self.push('name', 0)

    def _end_dc_contributor(self):
        """Close the name element and leave contributor scope."""
        self._end_name()
        self.incontributor = 0

    def _start_name(self, attrsD):
        """Open a <name> element."""
        self.push('name', 0)
    _start_itunes_name = _start_name

    def _end_name(self):
        """Route the name text to whichever person/textinput scope is open."""
        value = self.pop('name')
        if self.inpublisher:
            self._save_author('name', value, 'publisher')
        elif self.inauthor:
            self._save_author('name', value)
        elif self.incontributor:
            self._save_contributor('name', value)
        elif self.intextinput:
            context = self._getContext()
            context['name'] = value
    _end_itunes_name = _end_name
+
    def _start_width(self, attrsD):
        """Open a <width> element."""
        self.push('width', 0)

    def _end_width(self):
        """Store the width (as int, defaulting to 0 on bad input) on the
        image context."""
        value = self.pop('width')
        try:
            value = int(value)
        except ValueError:
            value = 0
        if self.inimage:
            context = self._getContext()
            context['width'] = value

    def _start_height(self, attrsD):
        """Open a <height> element."""
        self.push('height', 0)

    def _end_height(self):
        """Store the height (as int, defaulting to 0 on bad input) on the
        image context."""
        value = self.pop('height')
        try:
            value = int(value)
        except ValueError:
            value = 0
        if self.inimage:
            context = self._getContext()
            context['height'] = value
+
    def _start_url(self, attrsD):
        """Open a URL-valued element; stored under the 'href' key."""
        self.push('href', 1)
    _start_homepage = _start_url
    _start_uri = _start_url

    def _end_url(self):
        """Attach the URL to the open author or contributor, if any."""
        value = self.pop('href')
        if self.inauthor:
            self._save_author('href', value)
        elif self.incontributor:
            self._save_contributor('href', value)
    _end_homepage = _end_url
    _end_uri = _end_url

    def _start_email(self, attrsD):
        """Open an <email> element."""
        self.push('email', 0)
    _start_itunes_email = _start_email

    def _end_email(self):
        """Attach the email to the open publisher, author or contributor."""
        value = self.pop('email')
        if self.inpublisher:
            self._save_author('email', value, 'publisher')
        elif self.inauthor:
            self._save_author('email', value)
        elif self.incontributor:
            self._save_contributor('email', value)
    _end_itunes_email = _end_email
+
+ def _getContext(self):
+ if self.insource:
+ context = self.sourcedata
+ elif self.inimage and self.feeddata.has_key('image'):
+ context = self.feeddata['image']
+ elif self.intextinput:
+ context = self.feeddata['textinput']
+ elif self.inentry:
+ context = self.entries[-1]
+ else:
+ context = self.feeddata
+ return context
+
    def _save_author(self, key, value, prefix='author'):
        """Store a person sub-field both in <prefix>_detail and on the
        last entry of context['authors'], then re-sync the flat field."""
        context = self._getContext()
        context.setdefault(prefix + '_detail', FeedParserDict())
        context[prefix + '_detail'][key] = value
        self._sync_author_detail()
        context.setdefault('authors', [FeedParserDict()])
        context['authors'][-1][key] = value

    def _save_contributor(self, key, value):
        """Store a person sub-field on the last contributors entry."""
        context = self._getContext()
        context.setdefault('contributors', [FeedParserDict()])
        context['contributors'][-1][key] = value
+
    def _sync_author_detail(self, key='author'):
        """Keep the flat '<key>' string and structured '<key>_detail'
        dict consistent, whichever of the two was populated.

        If the detail dict exists, rebuild the flat string as
        'name (email)'.  Otherwise, try to split a flat 'Name <email>' /
        'email (Name)' style string into name and email parts.
        """
        context = self._getContext()
        detail = context.get('%s_detail' % key)
        if detail:
            name = detail.get('name')
            email = detail.get('email')
            if name and email:
                context[key] = u'%s (%s)' % (name, email)
            elif name:
                context[key] = name
            elif email:
                context[key] = email
        else:
            author, email = context.get(key), None
            if not author:
                return
            emailmatch = re.search(ur'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))(\?subject=\S+)?''', author)
            if emailmatch:
                email = emailmatch.group(0)
                # probably a better way to do the following, but it passes all the tests
                author = author.replace(email, u'')
                author = author.replace(u'()', u'')
                author = author.replace(u'<>', u'')
                # NOTE(review): the duplicated replace below looks redundant;
                # kept as-is to preserve behavior
                author = author.replace(u'<>', u'')
                author = author.strip()
                if author and (author[0] == u'('):
                    author = author[1:]
                if author and (author[-1] == u')'):
                    author = author[:-1]
                author = author.strip()
            if author or email:
                context.setdefault('%s_detail' % key, FeedParserDict())
            if author:
                context['%s_detail' % key]['name'] = author
            if email:
                context['%s_detail' % key]['email'] = email
+
    def _start_subtitle(self, attrsD):
        """Open a subtitle element as plain-text content."""
        self.pushContent('subtitle', attrsD, u'text/plain', 1)
    _start_tagline = _start_subtitle
    _start_itunes_subtitle = _start_subtitle

    def _end_subtitle(self):
        """Close the subtitle content element."""
        self.popContent('subtitle')
    _end_tagline = _end_subtitle
    _end_itunes_subtitle = _end_subtitle

    def _start_rights(self, attrsD):
        """Open a rights/copyright element as plain-text content."""
        self.pushContent('rights', attrsD, u'text/plain', 1)
    _start_dc_rights = _start_rights
    _start_copyright = _start_rights

    def _end_rights(self):
        """Close the rights content element."""
        self.popContent('rights')
    _end_dc_rights = _end_rights
    _end_copyright = _end_rights
+
    def _start_item(self, attrsD):
        """Begin a new entry: append a dict to self.entries, enter entry
        scope, and record rdf:about as the entry id if present."""
        self.entries.append(FeedParserDict())
        self.push('item', 0)
        self.inentry = 1
        self.guidislink = 0
        self.hasTitle = 0
        id = self._getAttribute(attrsD, 'rdf:about')
        if id:
            context = self._getContext()
            context['id'] = id
        self._cdf_common(attrsD)
    _start_entry = _start_item

    def _end_item(self):
        """Leave entry scope."""
        self.pop('item')
        self.inentry = 0
    _end_entry = _end_item

    def _start_dc_language(self, attrsD):
        """Open a language element."""
        self.push('language', 1)
    _start_language = _start_dc_language

    def _end_dc_language(self):
        """Store the language text as the current xml:lang value."""
        self.lang = self.pop('language')
    _end_language = _end_dc_language
+
    def _start_dc_publisher(self, attrsD):
        """Open a publisher element."""
        self.push('publisher', 1)
    _start_webmaster = _start_dc_publisher

    def _end_dc_publisher(self):
        """Close the publisher element and reconcile publisher_detail."""
        self.pop('publisher')
        self._sync_author_detail('publisher')
    _end_webmaster = _end_dc_publisher

    def _start_published(self, attrsD):
        """Open a published/issued date element."""
        self.push('published', 1)
    _start_dcterms_issued = _start_published
    _start_issued = _start_published

    def _end_published(self):
        """Parse and store the published date as published_parsed."""
        value = self.pop('published')
        self._save('published_parsed', _parse_date(value), overwrite=True)
    _end_dcterms_issued = _end_published
    _end_issued = _end_published

    def _start_updated(self, attrsD):
        """Open an updated/modified date element (many vocabulary aliases)."""
        self.push('updated', 1)
    _start_modified = _start_updated
    _start_dcterms_modified = _start_updated
    _start_pubdate = _start_updated
    _start_dc_date = _start_updated
    _start_lastbuilddate = _start_updated

    def _end_updated(self):
        """Parse and store the updated date as updated_parsed."""
        value = self.pop('updated')
        parsed_value = _parse_date(value)
        self._save('updated_parsed', parsed_value, overwrite=True)
    _end_modified = _end_updated
    _end_dcterms_modified = _end_updated
    _end_pubdate = _end_updated
    _end_dc_date = _end_updated
    _end_lastbuilddate = _end_updated

    def _start_created(self, attrsD):
        """Open a created date element."""
        self.push('created', 1)
    _start_dcterms_created = _start_created

    def _end_created(self):
        """Parse and store the created date as created_parsed."""
        value = self.pop('created')
        self._save('created_parsed', _parse_date(value), overwrite=True)
    _end_dcterms_created = _end_created

    def _start_expirationdate(self, attrsD):
        """Open an <expirationDate> element."""
        self.push('expired', 1)

    def _end_expirationdate(self):
        """Parse and store the expiration date as expired_parsed."""
        self._save('expired_parsed', _parse_date(self.pop('expired')), overwrite=True)
+
    def _start_cc_license(self, attrsD):
        """Handle <cc:license>: record the rdf:resource URL as a
        rel='license' link on the current context."""
        context = self._getContext()
        value = self._getAttribute(attrsD, 'rdf:resource')
        attrsD = FeedParserDict()
        attrsD['rel'] = u'license'
        if value:
            attrsD['href']=value
        context.setdefault('links', []).append(attrsD)

    def _start_creativecommons_license(self, attrsD):
        """Open a <creativeCommons:license> element."""
        self.push('license', 1)
    _start_creativeCommons_license = _start_creativecommons_license

    def _end_creativecommons_license(self):
        """Turn the license text into a rel='license' link and drop the
        temporary 'license' key from the context."""
        value = self.pop('license')
        context = self._getContext()
        attrsD = FeedParserDict()
        attrsD['rel'] = u'license'
        if value:
            attrsD['href'] = value
        context.setdefault('links', []).append(attrsD)
        del context['license']
    _end_creativeCommons_license = _end_creativecommons_license
+
    def _addXFN(self, relationships, href, name):
        """Append an XFN relationship record to context['xfn'],
        skipping exact duplicates."""
        context = self._getContext()
        xfn = context.setdefault('xfn', [])
        value = FeedParserDict({'relationships': relationships, 'href': href, 'name': name})
        if value not in xfn:
            xfn.append(value)

    def _addTag(self, term, scheme, label):
        """Append a category tag to context['tags'], ignoring completely
        empty tags and exact duplicates."""
        context = self._getContext()
        tags = context.setdefault('tags', [])
        if (not term) and (not scheme) and (not label):
            return
        value = FeedParserDict({'term': term, 'scheme': scheme, 'label': label})
        if value not in tags:
            tags.append(value)
+
    def _start_category(self, attrsD):
        """Handle a category element: record term/scheme/label from the
        attributes and open the element for possible text content."""
        term = attrsD.get('term')
        scheme = attrsD.get('scheme', attrsD.get('domain'))
        label = attrsD.get('label')
        self._addTag(term, scheme, label)
        self.push('category', 1)
    _start_dc_subject = _start_category
    _start_keywords = _start_category

    def _start_media_category(self, attrsD):
        """Handle <media:category>: like _start_category but with the
        Media RSS default scheme."""
        attrsD.setdefault('scheme', u'http://search.yahoo.com/mrss/category_schema')
        self._start_category(attrsD)

    def _end_itunes_keywords(self):
        """Split whitespace-separated itunes keywords into iTunes tags."""
        for term in self.pop('itunes_keywords').split():
            self._addTag(term, u'http://www.itunes.com/', None)

    def _start_itunes_category(self, attrsD):
        """Handle <itunes:category>: the category is in the 'text' attribute."""
        self._addTag(attrsD.get('text'), u'http://www.itunes.com/', None)
        self.push('category', 1)

    def _end_category(self):
        """Close a category element; text content either fills in the
        term of the last attribute-only tag, or becomes a new tag."""
        value = self.pop('category')
        if not value:
            return
        context = self._getContext()
        tags = context['tags']
        if value and len(tags) and not tags[-1]['term']:
            tags[-1]['term'] = value
        else:
            self._addTag(value, None, None)
    _end_dc_subject = _end_category
    _end_keywords = _end_category
    _end_itunes_category = _end_category
    _end_media_category = _end_category
+
    def _start_cloud(self, attrsD):
        """Handle <cloud>: store its attributes as context['cloud']."""
        self._getContext()['cloud'] = FeedParserDict(attrsD)
+
+ def _start_link(self, attrsD):
+ attrsD.setdefault('rel', u'alternate')
+ if attrsD['rel'] == u'self':
+ attrsD.setdefault('type', u'application/atom+xml')
+ else:
+ attrsD.setdefault('type', u'text/html')
+ context = self._getContext()
+ attrsD = self._itsAnHrefDamnIt(attrsD)
+ if attrsD.has_key('href'):
+ attrsD['href'] = self.resolveURI(attrsD['href'])
+ expectingText = self.infeed or self.inentry or self.insource
+ context.setdefault('links', [])
+ if not (self.inentry and self.inimage):
+ context['links'].append(FeedParserDict(attrsD))
+ if attrsD.has_key('href'):
+ expectingText = 0
+ if (attrsD.get('rel') == u'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types):
+ context['link'] = attrsD['href']
+ else:
+ self.push('link', expectingText)
+
    def _end_link(self):
        """Close a <link> element; the text value was already routed by pop()."""
        value = self.pop('link')
        context = self._getContext()

    def _start_guid(self, attrsD):
        """Handle <guid>: remember whether it doubles as the permalink."""
        self.guidislink = (attrsD.get('ispermalink', 'true') == 'true')
        self.push('id', 1)

    def _end_guid(self):
        """Store the guid as 'id' and, when it is a permalink and no
        explicit link exists yet, also as the entry link."""
        value = self.pop('id')
        self._save('guidislink', self.guidislink and not self._getContext().has_key('link'))
        if self.guidislink:
            # guid acts as link, but only if 'ispermalink' is not present or is 'true',
            # and only if the item doesn't already have a link element
            self._save('link', value)
+
    def _start_title(self, attrsD):
        """Open a <title> element; inside inline SVG it is treated as
        plain unknown markup instead."""
        if self.svgOK:
            return self.unknown_starttag('title', attrsD.items())
        self.pushContent('title', attrsD, u'text/plain', self.infeed or self.inentry or self.insource)
    _start_dc_title = _start_title
    _start_media_title = _start_title

    def _end_title(self):
        """Close a <title> element and mark the context as having a title."""
        if self.svgOK:
            return
        value = self.popContent('title')
        if not value:
            return
        context = self._getContext()
        self.hasTitle = 1
    _end_dc_title = _end_title

    def _end_media_title(self):
        """Close <media:title> without letting it claim the title slot."""
        hasTitle = self.hasTitle
        self._end_title()
        self.hasTitle = hasTitle
+
    def _start_description(self, attrsD):
        """Open a description; if a summary already exists, treat this
        one as full content instead."""
        context = self._getContext()
        if context.has_key('summary'):
            self._summaryKey = 'content'
            self._start_content(attrsD)
        else:
            self.pushContent('description', attrsD, u'text/html', self.infeed or self.inentry or self.insource)
    _start_dc_description = _start_description

    def _start_abstract(self, attrsD):
        """Open an abstract as a plain-text description."""
        self.pushContent('description', attrsD, u'text/plain', self.infeed or self.inentry or self.insource)

    def _end_description(self):
        """Close the description, routing to _end_content when the start
        handler redirected it there."""
        if self._summaryKey == 'content':
            self._end_content()
        else:
            value = self.popContent('description')
        self._summaryKey = None
    _end_abstract = _end_description
    _end_dc_description = _end_description
+
    def _start_info(self, attrsD):
        """Open an <info> element as plain-text content."""
        self.pushContent('info', attrsD, u'text/plain', 1)
    _start_feedburner_browserfriendly = _start_info

    def _end_info(self):
        """Close the <info> content element."""
        self.popContent('info')
    _end_feedburner_browserfriendly = _end_info

    def _start_generator(self, attrsD):
        """Handle <generator>: record its href attribute (if any) in
        generator_detail and open the element for the generator name."""
        if attrsD:
            attrsD = self._itsAnHrefDamnIt(attrsD)
            if attrsD.has_key('href'):
                attrsD['href'] = self.resolveURI(attrsD['href'])
        self._getContext()['generator_detail'] = FeedParserDict(attrsD)
        self.push('generator', 1)

    def _end_generator(self):
        """Close <generator> and store its text as the generator name."""
        value = self.pop('generator')
        context = self._getContext()
        if context.has_key('generator_detail'):
            context['generator_detail']['name'] = value
+
    def _start_admin_generatoragent(self, attrsD):
        """Handle <admin:generatorAgent>: its rdf:resource URL becomes
        both the generator text and generator_detail['href']."""
        self.push('generator', 1)
        value = self._getAttribute(attrsD, 'rdf:resource')
        if value:
            self.elementstack[-1][2].append(value)
        self.pop('generator')
        self._getContext()['generator_detail'] = FeedParserDict({'href': value})

    def _start_admin_errorreportsto(self, attrsD):
        """Handle <admin:errorReportsTo>: store its rdf:resource URL as
        the element text."""
        self.push('errorreportsto', 1)
        value = self._getAttribute(attrsD, 'rdf:resource')
        if value:
            self.elementstack[-1][2].append(value)
        self.pop('errorreportsto')
+
    def _start_summary(self, attrsD):
        """Open a summary; when a summary already exists, treat this
        element as full content instead."""
        context = self._getContext()
        if context.has_key('summary'):
            self._summaryKey = 'content'
            self._start_content(attrsD)
        else:
            self._summaryKey = 'summary'
            self.pushContent(self._summaryKey, attrsD, u'text/plain', 1)
    _start_itunes_summary = _start_summary

    def _end_summary(self):
        """Close the summary, routing to _end_content when the start
        handler redirected it there."""
        if self._summaryKey == 'content':
            self._end_content()
        else:
            self.popContent(self._summaryKey or 'summary')
        self._summaryKey = None
    _end_itunes_summary = _end_summary
+
    def _start_enclosure(self, attrsD):
        """Handle <enclosure>: record it as a rel='enclosure' link."""
        attrsD = self._itsAnHrefDamnIt(attrsD)
        context = self._getContext()
        attrsD['rel'] = u'enclosure'
        context.setdefault('links', []).append(FeedParserDict(attrsD))

    def _start_source(self, attrsD):
        """Enter source scope; an RSS 2.0 url attribute becomes the
        source href."""
        if 'url' in attrsD:
            # This means that we're processing a source element from an RSS 2.0 feed
            self.sourcedata['href'] = attrsD[u'url']
        self.push('source', 1)
        self.insource = 1
        self.hasTitle = 0

    def _end_source(self):
        """Leave source scope: text becomes the source title and the
        collected sourcedata is copied onto the current context."""
        self.insource = 0
        value = self.pop('source')
        if value:
            self.sourcedata['title'] = value
        self._getContext()['source'] = copy.deepcopy(self.sourcedata)
        self.sourcedata.clear()
+
    def _start_content(self, attrsD):
        """Open a <content> element, recording an out-of-line src URL."""
        self.pushContent('content', attrsD, u'text/plain', 1)
        src = attrsD.get('src')
        if src:
            self.contentparams['src'] = src
        self.push('content', 1)

    def _start_body(self, attrsD):
        """Open an XHTML <body> as inline content."""
        self.pushContent('content', attrsD, u'application/xhtml+xml', 1)
    _start_xhtml_body = _start_body

    def _start_content_encoded(self, attrsD):
        """Open <content:encoded> as HTML content."""
        self.pushContent('content', attrsD, u'text/html', 1)
    _start_fullitem = _start_content_encoded

    def _end_content(self):
        """Close a content element; textual content is also copied into
        the summary slot."""
        copyToSummary = self.mapContentType(self.contentparams.get('type')) in ([u'text/plain'] + self.html_types)
        value = self.popContent('content')
        if copyToSummary:
            self._save('summary', value)

    _end_body = _end_content
    _end_xhtml_body = _end_content
    _end_content_encoded = _end_content
    _end_fullitem = _end_content
+
    def _start_itunes_image(self, attrsD):
        """Handle <itunes:image>: its href attribute becomes the image."""
        self.push('itunes_image', 0)
        if attrsD.get('href'):
            self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')})
    _start_itunes_link = _start_itunes_image

    def _end_itunes_block(self):
        """Store <itunes:block> as 1 for 'yes', else 0."""
        value = self.pop('itunes_block', 0)
        self._getContext()['itunes_block'] = (value == 'yes') and 1 or 0

    def _end_itunes_explicit(self):
        """Store <itunes:explicit> as True/False/None."""
        value = self.pop('itunes_explicit', 0)
        # Convert 'yes' -> True, 'clean' to False, and any other value to None
        # False and None both evaluate as False, so the difference can be ignored
        # by applications that only need to know if the content is explicit.
        self._getContext()['itunes_explicit'] = (None, False, True)[(value == 'yes' and 2) or value == 'clean' or 0]
+
    def _start_media_content(self, attrsD):
        """Handle <media:content>: append its attributes to media_content."""
        context = self._getContext()
        context.setdefault('media_content', [])
        context['media_content'].append(attrsD)

    def _start_media_thumbnail(self, attrsD):
        """Handle <media:thumbnail>: append its attributes to
        media_thumbnail and open a url element for possible text content."""
        context = self._getContext()
        context.setdefault('media_thumbnail', [])
        self.push('url', 1) # new
        context['media_thumbnail'].append(attrsD)
+
+ def _end_media_thumbnail(self):
+ url = self.pop('url')
+ context = self._getContext()
+ if url != None and len(url.strip()) != 0:
+ if not context['media_thumbnail'][-1].has_key('url'):
+ context['media_thumbnail'][-1]['url'] = url
+
    def _start_media_player(self, attrsD):
        """Handle <media:player>: store its attributes as media_player."""
        self.push('media_player', 0)
        self._getContext()['media_player'] = FeedParserDict(attrsD)

    def _end_media_player(self):
        """Close <media:player>: its text becomes media_player['content']."""
        value = self.pop('media_player')
        context = self._getContext()
        context['media_player']['content'] = value

    def _start_newlocation(self, attrsD):
        """Open a <newLocation> element."""
        self.push('newlocation', 1)

    def _end_newlocation(self):
        """Store the feed's new location URL (feed-level contexts only)."""
        url = self.pop('newlocation')
        context = self._getContext()
        # don't set newlocation if the context isn't right
        if context is not self.feeddata:
            return
        context['newlocation'] = _makeSafeAbsoluteURI(self.baseuri, url.strip())
+
if _XML_AVAILABLE:
    class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):
        """SAX content handler used when a real XML parser is available.

        Translates namespaced SAX events into the 'prefix:localname'
        strings that _FeedParserMixin.unknown_starttag/unknown_endtag
        expect.  Recoverable parse errors are recorded via the bozo bit
        rather than raised; fatal errors are recorded and re-raised.
        """
        def __init__(self, baseuri, baselang, encoding):
            xml.sax.handler.ContentHandler.__init__(self)
            _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
            # bozo flag + captured exception for malformed documents
            self.bozo = 0
            self.exc = None
            # xmlns declarations collected in startPrefixMapping and
            # re-emitted as attributes on the next element start
            self.decls = {}

        def startPrefixMapping(self, prefix, uri):
            """Track an xmlns declaration seen before its element starts."""
            if not uri:
                return
            # Jython uses '' instead of None; standardize on None
            prefix = prefix or None
            self.trackNamespace(prefix, uri)
            if prefix and uri == 'http://www.w3.org/1999/xlink':
                # keep xlink declarations so they survive serialization
                self.decls['xmlns:' + prefix] = uri

        def startElementNS(self, name, qname, attrs):
            """Translate a namespaced start event into unknown_starttag()."""
            namespace, localname = name
            lowernamespace = str(namespace or '').lower()
            # '<>' is the Python 2 spelling of '!='
            if lowernamespace.find(u'backend.userland.com/rss') <> -1:
                # match any backend.userland.com namespace
                namespace = u'http://backend.userland.com/rss'
                lowernamespace = namespace
            if qname and qname.find(':') > 0:
                givenprefix = qname.split(':')[0]
            else:
                givenprefix = None
            prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
            # a prefixed element whose prefix was never declared is an error
            if givenprefix and (prefix == None or (prefix == '' and lowernamespace == '')) and not self.namespacesInUse.has_key(givenprefix):
                raise UndeclaredNamespace, "'%s' is not associated with a namespace" % givenprefix
            localname = str(localname).lower()

            # qname implementation is horribly broken in Python 2.1 (it
            # doesn't report any), and slightly broken in Python 2.2 (it
            # doesn't report the xml: namespace). So we match up namespaces
            # with a known list first, and then possibly override them with
            # the qnames the SAX parser gives us (if indeed it gives us any
            # at all). Thanks to MatejC for helping me test this and
            # tirelessly telling me that it didn't work yet.
            attrsD, self.decls = self.decls, {}
            if localname=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
                attrsD['xmlns']=namespace
            if localname=='svg' and namespace=='http://www.w3.org/2000/svg':
                attrsD['xmlns']=namespace

            if prefix:
                localname = prefix.lower() + ':' + localname
            elif namespace and not qname: #Expat
                # Expat gives no qname; recover the prefix from the
                # namespaces currently in use
                for name,value in self.namespacesInUse.items():
                    if name and value == namespace:
                        localname = name + ':' + localname
                        break

            for (namespace, attrlocalname), attrvalue in attrs.items():
                lowernamespace = (namespace or '').lower()
                prefix = self._matchnamespaces.get(lowernamespace, '')
                if prefix:
                    attrlocalname = prefix + ':' + attrlocalname
                attrsD[str(attrlocalname).lower()] = attrvalue
            for qname in attrs.getQNames():
                attrsD[str(qname).lower()] = attrs.getValueByQName(qname)
            self.unknown_starttag(localname, attrsD.items())

        def characters(self, text):
            """Forward character data to the mixin's data handler."""
            self.handle_data(text)

        def endElementNS(self, name, qname):
            """Translate a namespaced end event into unknown_endtag()."""
            namespace, localname = name
            lowernamespace = str(namespace or '').lower()
            if qname and qname.find(':') > 0:
                givenprefix = qname.split(':')[0]
            else:
                givenprefix = ''
            prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
            if prefix:
                localname = prefix + ':' + localname
            elif namespace and not qname: #Expat
                for name,value in self.namespacesInUse.items():
                    if name and value == namespace:
                        localname = name + ':' + localname
                        break
            localname = str(localname).lower()
            self.unknown_endtag(localname)

        def error(self, exc):
            """Record a recoverable parse error (sets the bozo bit)."""
            self.bozo = 1
            self.exc = exc

        # drv_libxml2 calls warning() in some cases
        warning = error

        def fatalError(self, exc):
            """Record a fatal parse error, then re-raise it."""
            self.error(exc)
            raise exc
+
class _BaseHTMLProcessor(sgmllib.SGMLParser):
    """SGMLParser subclass that reconstructs the HTML it parses.

    Markup fragments are accumulated in self.pieces by the handler
    methods; subclasses override those handlers to filter or rewrite
    markup, and the processed document is retrieved with output().
    """
    # characters that need special handling when re-emitting attributes
    special = re.compile('''[<>'"]''')
    # an '&' that does not begin a numeric or named character reference
    bare_ampersand = re.compile("&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")
    # HTML void elements: re-emitted as '<tag />' and given no end tag
    elements_no_end_tag = [
      'area', 'base', 'basefont', 'br', 'col', 'command', 'embed', 'frame',
      'hr', 'img', 'input', 'isindex', 'keygen', 'link', 'meta', 'param',
      'source', 'track', 'wbr'
    ]

    def __init__(self, encoding, _type):
        # _type is the document's media type, e.g. 'application/xhtml+xml'
        self.encoding = encoding
        self._type = _type
        sgmllib.SGMLParser.__init__(self)

    def reset(self):
        # self.pieces holds the reconstructed output fragments
        self.pieces = []
        sgmllib.SGMLParser.reset(self)

    def _shorttag_replace(self, match):
        """Regex callback: expand '<tag/>' to '<tag />' or '<tag></tag>'."""
        tag = match.group(1)
        if tag in self.elements_no_end_tag:
            return '<' + tag + ' />'
        else:
            return '<' + tag + '></' + tag + '>'

    # By declaring these methods and overriding their compiled code
    # with the code from sgmllib, the original code will execute in
    # feedparser's scope instead of sgmllib's. This means that the
    # `tagfind` and `charref` regular expressions will be found as
    # they're declared above, not as they're declared in sgmllib.
    def goahead(self, i):
        pass
    goahead.func_code = sgmllib.SGMLParser.goahead.func_code

    def __parse_starttag(self, i):
        pass
    __parse_starttag.func_code = sgmllib.SGMLParser.parse_starttag.func_code

    def parse_starttag(self,i):
        """Parse a start tag; for XHTML, synthesize the matching end-tag
        event when the tag is self-closing ('.../>' )."""
        j = self.__parse_starttag(i)
        if self._type == 'application/xhtml+xml':
            if j>2 and self.rawdata[j-2:j]=='/>':
                self.unknown_endtag(self.lasttag)
        return j

    def feed(self, data):
        """Normalize raw markup, then feed it to the SGML parser."""
        # NOTE(review): several literals in this method look like entity
        # escaping/decoding whose entity names were lost in transit (e.g.
        # '&lt;!' and '&#39;'/'&#34;'); as written the first replace below
        # is malformed and the second is a no-op -- verify against
        # upstream feedparser before relying on this code.
        data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'<!\1', data)
        #data = re.sub(r'<(\S+?)\s*?/>', self._shorttag_replace, data) # bug [ 1399464 ] Bad regexp for _shorttag_replace
        data = re.sub(r'<([^<>\s]+?)\s*/>', self._shorttag_replace, data)
        data = data.replace(''', "'")
        data = data.replace('"', '"')
        try:
            # Python 3 detection: 'bytes' exists and is distinct from 'str'
            bytes
            if bytes is str:
                raise NameError
            self.encoding = self.encoding + u'_INVALID_PYTHON_3'
        except NameError:
            # Python 2: encode unicode input to the declared encoding
            if self.encoding and isinstance(data, unicode):
                data = data.encode(self.encoding)
        sgmllib.SGMLParser.feed(self, data)
        sgmllib.SGMLParser.close(self)

    def normalize_attrs(self, attrs):
        """Lowercase attribute names (and rel/type values), dedupe and sort."""
        if not attrs:
            return attrs
        # utility method to be called by descendants
        attrs = dict([(k.lower(), v) for k, v in attrs]).items()
        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
        attrs.sort()
        return attrs

    def unknown_starttag(self, tag, attrs):
        # called for each start tag
        # attrs is a list of (attr, value) tuples
        # e.g. for <pre class='screen'>, tag='pre', attrs=[('class', 'screen')]
        uattrs = []
        strattrs=''
        if attrs:
            for key, value in attrs:
                # NOTE(review): the chained replace() calls below look like
                # entity escaping (likely '&gt;', '&lt;', '&quot;') whose
                # entity names were lost; as written they are no-ops --
                # verify against upstream feedparser.
                value=value.replace('>','>').replace('<','<').replace('"','"')
                value = self.bare_ampersand.sub("&", value)
                # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
                if not isinstance(value, unicode):
                    value = value.decode(self.encoding, 'ignore')
                try:
                    # Currently, in Python 3 the key is already a str, and cannot be decoded again
                    uattrs.append((unicode(key, self.encoding), value))
                except TypeError:
                    uattrs.append((key, value))
            strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs])
            if self.encoding:
                try:
                    strattrs = strattrs.encode(self.encoding)
                except (UnicodeEncodeError, LookupError):
                    pass
        if tag in self.elements_no_end_tag:
            self.pieces.append('<%(tag)s%(strattrs)s />' % locals())
        else:
            self.pieces.append('<%(tag)s%(strattrs)s>' % locals())

    def unknown_endtag(self, tag):
        # called for each end tag, e.g. for </pre>, tag will be 'pre'
        # Reconstruct the original end tag.
        if tag not in self.elements_no_end_tag:
            self.pieces.append("</%(tag)s>" % locals())

    def handle_charref(self, ref):
        # called for each character reference, e.g. for ' ', ref will be '160'
        # Reconstruct the original character reference.
        if ref.startswith('x'):
            value = unichr(int(ref[1:],16))
        else:
            value = unichr(int(ref))

        # re-emit cp1252 characters as hexadecimal references
        if value in _cp1252.keys():
            self.pieces.append('&#%s;' % hex(ord(_cp1252[value]))[1:])
        else:
            self.pieces.append('&#%(ref)s;' % locals())

    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '©', ref will be 'copy'
        # Reconstruct the original entity reference.
        # NOTE(review): the else-branch literal looks like it originally
        # escaped the ampersand of an unknown entity ('&amp;%(ref)s') and
        # lost the entity name in transit -- verify against upstream.
        if name2codepoint.has_key(ref):
            self.pieces.append('&%(ref)s;' % locals())
        else:
            self.pieces.append('&%(ref)s' % locals())

    def handle_data(self, text):
        # called for each block of plain text, i.e. outside of any tag and
        # not containing any character or entity references
        # Store the original text verbatim.
        self.pieces.append(text)

    def handle_comment(self, text):
        # called for each HTML comment, e.g. <!-- insert Javascript code here -->
        # Reconstruct the original comment.
        self.pieces.append('<!--%(text)s-->' % locals())

    def handle_pi(self, text):
        # called for each processing instruction, e.g. <?instruction>
        # Reconstruct original processing instruction.
        self.pieces.append('<?%(text)s>' % locals())

    def handle_decl(self, text):
        # called for the DOCTYPE, if present, e.g.
        # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
        # "http://www.w3.org/TR/html4/loose.dtd">
        # Reconstruct original DOCTYPE
        self.pieces.append('<!%(text)s>' % locals())

    _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
    def _scan_name(self, i, declstartpos):
        """Scan a declaration name starting at offset i; returns
        (name, end-offset) or (None, -1) at end of buffer / no match."""
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = self._new_declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1 # end of buffer
            return name.lower(), m.end()
        else:
            # unparseable declaration: pass the raw data through verbatim
            self.handle_data(rawdata)
# self.updatepos(declstartpos, i)
            return None, -1

    def convert_charref(self, name):
        # keep character references unexpanded in the output
        return '&#%s;' % name

    def convert_entityref(self, name):
        # keep entity references unexpanded in the output
        return '&%s;' % name

    def output(self):
        '''Return processed HTML as a single string'''
        return ''.join([str(p) for p in self.pieces])

    def parse_declaration(self, i):
        """Parse a declaration, escaping it and moving on if malformed."""
        try:
            return sgmllib.SGMLParser.parse_declaration(self, i)
        except sgmllib.SGMLParseError:
            # escape the doctype declaration and continue parsing
            self.handle_data('<')
            return i+1
+
class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):
    """Fallback feed parser built on sgmllib, used when strict XML
    parsing is unavailable or the document is not well-formed."""
    def __init__(self, baseuri, baselang, encoding, entities):
        sgmllib.SGMLParser.__init__(self)
        _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
        _BaseHTMLProcessor.__init__(self, encoding, 'application/xhtml+xml')
        # entity -> replacement map supplied by the caller
        self.entities=entities

    def decodeEntities(self, element, data):
        """Normalize character entities in element content."""
        # NOTE(review): this cascade looks like character-entity
        # normalization (e.g. '&#60;' -> '&lt;', then '&lt;' -> '<' for
        # non-XML content types) whose entity names were lost in transit;
        # as written most replace() calls are no-ops and one literal is
        # malformed -- verify against upstream feedparser.
        data = data.replace('<', '<')
        data = data.replace('<', '<')
        data = data.replace('<', '<')
        data = data.replace('>', '>')
        data = data.replace('>', '>')
        data = data.replace('>', '>')
        data = data.replace('&', '&')
        data = data.replace('&', '&')
        data = data.replace('"', '"')
        data = data.replace('"', '"')
        data = data.replace(''', ''')
        data = data.replace(''', ''')
        if self.contentparams.has_key('type') and not self.contentparams.get('type', u'xml').endswith(u'xml'):
            data = data.replace('<', '<')
            data = data.replace('>', '>')
            data = data.replace('&', '&')
            data = data.replace('"', '"')
            data = data.replace(''', "'")
        return data

    def strattrs(self, attrs):
        """Serialize (name, value) attribute pairs back into markup text."""
        # NOTE(review): the inner replace() looks like it originally
        # escaped '"' as '&quot;' -- as written it is a no-op; verify
        # against upstream feedparser.
        return ''.join([' %s="%s"' % (n,v.replace('"','"')) for n,v in attrs])
+
class _MicroformatsParser:
    """Extracts microformat data from an HTML document parsed with
    BeautifulSoup: rel-tag links, downloadable enclosures, XFN
    relationships and hCard vCards.

    Results accumulate in self.tags, self.enclosures, self.xfn and
    self.vcard (the latter is produced by findVCards()).
    """
    # property-type selectors for getPropertyValue()
    STRING = 1
    DATE = 2
    URI = 3
    NODE = 4
    EMAIL = 5

    # rel values recognized as XFN relationships
    known_xfn_relationships = ['contact', 'acquaintance', 'friend', 'met', 'co-worker', 'coworker', 'colleague', 'co-resident', 'coresident', 'neighbor', 'child', 'parent', 'sibling', 'brother', 'sister', 'spouse', 'wife', 'husband', 'kin', 'relative', 'muse', 'crush', 'date', 'sweetheart', 'me']
    # file extensions treated as probably-downloadable enclosures
    known_binary_extensions = ['zip','rar','exe','gz','tar','tgz','tbz2','bz2','z','7z','dmg','img','sit','sitx','hqx','deb','rpm','bz2','jar','rar','iso','bin','msi','mp2','mp3','ogg','ogm','mp4','m4v','m4a','avi','wma','wmv']

    def __init__(self, data, baseuri, encoding):
        self.document = BeautifulSoup.BeautifulSoup(data)
        self.baseuri = baseuri
        self.encoding = encoding
        # NOTE(review): the encoded form of 'data' below is never used --
        # the soup above was already built from the original value; verify
        # the intended ordering against upstream.
        if isinstance(data, unicode):
            data = data.encode(encoding)
        self.tags = []
        self.enclosures = []
        self.xfn = []
        self.vcard = None

    def vcardEscape(self, s):
        """Escape a string for inclusion in a vCard value (RFC 2426)."""
        if isinstance(s, basestring):
            s = s.replace(',', '\\,').replace(';', '\\;').replace('\n', '\\n')
        return s

    def vcardFold(self, s):
        """Fold a vCard line to 75 octets with continuation lines
        (RFC 2426 line folding)."""
        s = re.sub(';+$', '', s)
        sFolded = ''
        iMax = 75
        sPrefix = ''
        while len(s) > iMax:
            sFolded += sPrefix + s[:iMax] + '\n'
            s = s[iMax:]
            sPrefix = ' '
            iMax = 74
        sFolded += sPrefix + s
        return sFolded

    def normalize(self, s):
        """Collapse runs of whitespace to single spaces and strip ends."""
        return re.sub(r'\s+', ' ', s).strip()

    def unique(self, aList):
        """Return aList with duplicates removed, preserving order."""
        results = []
        for element in aList:
            if element not in results:
                results.append(element)
        return results

    def toISO8601(self, dt):
        """Format a time struct as an ISO 8601 UTC timestamp string."""
        return time.strftime('%Y-%m-%dT%H:%M:%SZ', dt)

    def getPropertyValue(self, elmRoot, sProperty, iPropertyType=4, bAllowMultiple=0, bAutoEscape=0):
        """Look up a microformat property under elmRoot by class name.

        iPropertyType selects how matched nodes are converted (STRING,
        DATE, URI, NODE or EMAIL); bAllowMultiple returns a list of all
        matches instead of the first value; bAutoEscape applies vCard
        escaping to string results.
        """
        all = lambda x: 1  # matches any tag name (shadows the builtin here)
        sProperty = sProperty.lower()
        bFound = 0
        bNormalize = 1
        propertyMatch = {'class': re.compile(r'\b%s\b' % sProperty)}
        if bAllowMultiple and (iPropertyType != self.NODE):
            snapResults = []
            # list items inside a matching ul/ol each count as one value
            containers = elmRoot(['ul', 'ol'], propertyMatch)
            for container in containers:
                snapResults.extend(container('li'))
            bFound = (len(snapResults) != 0)
        if not bFound:
            snapResults = elmRoot(all, propertyMatch)
            bFound = (len(snapResults) != 0)
        if (not bFound) and (sProperty == 'value'):
            # 'value' may also be carried by a bare <pre> element
            snapResults = elmRoot('pre')
            bFound = (len(snapResults) != 0)
            bNormalize = not bFound
            if not bFound:
                snapResults = [elmRoot]
                bFound = (len(snapResults) != 0)
        arFilter = []
        if sProperty == 'vcard':
            # exclude vcards nested inside other vcards
            snapFilter = elmRoot(all, propertyMatch)
            for node in snapFilter:
                if node.findParent(all, propertyMatch):
                    arFilter.append(node)
        arResults = []
        for node in snapResults:
            if node not in arFilter:
                arResults.append(node)
        bFound = (len(arResults) != 0)
        if not bFound:
            # type-appropriate empty result
            if bAllowMultiple:
                return []
            elif iPropertyType == self.STRING:
                return ''
            elif iPropertyType == self.DATE:
                return None
            elif iPropertyType == self.URI:
                return ''
            elif iPropertyType == self.NODE:
                return None
            else:
                return None
        arValues = []
        for elmResult in arResults:
            sValue = None
            if iPropertyType == self.NODE:
                if bAllowMultiple:
                    arValues.append(elmResult)
                    continue
                else:
                    return elmResult
            sNodeName = elmResult.name.lower()
            # extraction order: mailto href, abbr title, URI attributes,
            # img alt, then rendered text content
            if (iPropertyType == self.EMAIL) and (sNodeName == 'a'):
                sValue = (elmResult.get('href') or '').split('mailto:').pop().split('?')[0]
            if sValue:
                sValue = bNormalize and self.normalize(sValue) or sValue.strip()
            if (not sValue) and (sNodeName == 'abbr'):
                sValue = elmResult.get('title')
            if sValue:
                sValue = bNormalize and self.normalize(sValue) or sValue.strip()
            if (not sValue) and (iPropertyType == self.URI):
                if sNodeName == 'a':
                    sValue = elmResult.get('href')
                elif sNodeName == 'img':
                    sValue = elmResult.get('src')
                elif sNodeName == 'object':
                    sValue = elmResult.get('data')
            if sValue:
                sValue = bNormalize and self.normalize(sValue) or sValue.strip()
            if (not sValue) and (sNodeName == 'img'):
                sValue = elmResult.get('alt')
            if sValue:
                sValue = bNormalize and self.normalize(sValue) or sValue.strip()
            if not sValue:
                # fall back to the node's text content, tags stripped
                sValue = elmResult.renderContents()
                sValue = re.sub(r'<\S[^>]*>', '', sValue)
                sValue = sValue.replace('\r\n', '\n')
                sValue = sValue.replace('\r', '\n')
            if sValue:
                sValue = bNormalize and self.normalize(sValue) or sValue.strip()
            if not sValue:
                continue
            if iPropertyType == self.DATE:
                sValue = _parse_date_iso8601(sValue)
            if bAllowMultiple:
                arValues.append(bAutoEscape and self.vcardEscape(sValue) or sValue)
            else:
                return bAutoEscape and self.vcardEscape(sValue) or sValue
        return arValues

    def findVCards(self, elmRoot, bAgentParsing=0):
        """Assemble RFC 2426 vCard text for every hCard under elmRoot.

        bAgentParsing is set when recursing into an embedded AGENT card,
        in which case elmRoot itself is treated as the card.
        """
        sVCards = ''

        if not bAgentParsing:
            arCards = self.getPropertyValue(elmRoot, 'vcard', bAllowMultiple=1)
        else:
            arCards = [elmRoot]

        for elmCard in arCards:
            arLines = []

            def processSingleString(sProperty):
                # emit 'PROP:value' for a single string-valued property
                sValue = self.getPropertyValue(elmCard, sProperty, self.STRING, bAutoEscape=1).decode(self.encoding)
                if sValue:
                    arLines.append(self.vcardFold(sProperty.upper() + ':' + sValue))
                return sValue or u''

            def processSingleURI(sProperty):
                # emit a URI-valued property, handling data: URIs inline
                sValue = self.getPropertyValue(elmCard, sProperty, self.URI)
                if sValue:
                    sContentType = ''
                    sEncoding = ''
                    sValueKey = ''
                    if sValue.startswith('data:'):
                        sEncoding = ';ENCODING=b'
                        sContentType = sValue.split(';')[0].split('/').pop()
                        sValue = sValue.split(',', 1).pop()
                    else:
                        elmValue = self.getPropertyValue(elmCard, sProperty)
                        if elmValue:
                            if sProperty != 'url':
                                sValueKey = ';VALUE=uri'
                            sContentType = elmValue.get('type', '').strip().split('/').pop().strip()
                    sContentType = sContentType.upper()
                    if sContentType == 'OCTET-STREAM':
                        sContentType = ''
                    if sContentType:
                        sContentType = ';TYPE=' + sContentType.upper()
                    arLines.append(self.vcardFold(sProperty.upper() + sEncoding + sContentType + sValueKey + ':' + sValue))

            def processTypeValue(sProperty, arDefaultType, arForceType=None):
                # emit 'PROP;TYPE=a,b:value' lines for typed properties
                arResults = self.getPropertyValue(elmCard, sProperty, bAllowMultiple=1)
                for elmResult in arResults:
                    arType = self.getPropertyValue(elmResult, 'type', self.STRING, 1, 1)
                    if arForceType:
                        arType = self.unique(arForceType + arType)
                    if not arType:
                        arType = arDefaultType
                    sValue = self.getPropertyValue(elmResult, 'value', self.EMAIL, 0)
                    if sValue:
                        arLines.append(self.vcardFold(sProperty.upper() + ';TYPE=' + ','.join(arType) + ':' + sValue))

            # AGENT
            # must do this before all other properties because it is destructive
            # (removes nested class="vcard" nodes so they don't interfere with
            # this vcard's other properties)
            arAgent = self.getPropertyValue(elmCard, 'agent', bAllowMultiple=1)
            for elmAgent in arAgent:
                # NOTE(review): get('class') may return None here, which
                # would make search() raise -- verify expected inputs.
                if re.compile(r'\bvcard\b').search(elmAgent.get('class')):
                    sAgentValue = self.findVCards(elmAgent, 1) + '\n'
                    sAgentValue = sAgentValue.replace('\n', '\\n')
                    sAgentValue = sAgentValue.replace(';', '\\;')
                    if sAgentValue:
                        arLines.append(self.vcardFold('AGENT:' + sAgentValue))
                    # Completely remove the agent element from the parse tree
                    elmAgent.extract()
                else:
                    sAgentValue = self.getPropertyValue(elmAgent, 'value', self.URI, bAutoEscape=1);
                    if sAgentValue:
                        arLines.append(self.vcardFold('AGENT;VALUE=uri:' + sAgentValue))

            # FN (full name)
            sFN = processSingleString('fn')

            # N (name)
            elmName = self.getPropertyValue(elmCard, 'n')
            if elmName:
                sFamilyName = self.getPropertyValue(elmName, 'family-name', self.STRING, bAutoEscape=1)
                sGivenName = self.getPropertyValue(elmName, 'given-name', self.STRING, bAutoEscape=1)
                arAdditionalNames = self.getPropertyValue(elmName, 'additional-name', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'additional-names', self.STRING, 1, 1)
                arHonorificPrefixes = self.getPropertyValue(elmName, 'honorific-prefix', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'honorific-prefixes', self.STRING, 1, 1)
                arHonorificSuffixes = self.getPropertyValue(elmName, 'honorific-suffix', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'honorific-suffixes', self.STRING, 1, 1)
                arLines.append(self.vcardFold('N:' + sFamilyName + ';' +
                               sGivenName + ';' +
                               ','.join(arAdditionalNames) + ';' +
                               ','.join(arHonorificPrefixes) + ';' +
                               ','.join(arHonorificSuffixes)))
            elif sFN:
                # implied "N" optimization
                # http://microformats.org/wiki/hcard#Implied_.22N.22_Optimization
                arNames = self.normalize(sFN).split()
                if len(arNames) == 2:
                    bFamilyNameFirst = (arNames[0].endswith(',') or
                                        len(arNames[1]) == 1 or
                                        ((len(arNames[1]) == 2) and (arNames[1].endswith('.'))))
                    if bFamilyNameFirst:
                        arLines.append(self.vcardFold('N:' + arNames[0] + ';' + arNames[1]))
                    else:
                        arLines.append(self.vcardFold('N:' + arNames[1] + ';' + arNames[0]))

            # SORT-STRING
            sSortString = self.getPropertyValue(elmCard, 'sort-string', self.STRING, bAutoEscape=1)
            if sSortString:
                arLines.append(self.vcardFold('SORT-STRING:' + sSortString))

            # NICKNAME
            arNickname = self.getPropertyValue(elmCard, 'nickname', self.STRING, 1, 1)
            if arNickname:
                arLines.append(self.vcardFold('NICKNAME:' + ','.join(arNickname)))

            # PHOTO
            processSingleURI('photo')

            # BDAY
            dtBday = self.getPropertyValue(elmCard, 'bday', self.DATE)
            if dtBday:
                arLines.append(self.vcardFold('BDAY:' + self.toISO8601(dtBday)))

            # ADR (address)
            arAdr = self.getPropertyValue(elmCard, 'adr', bAllowMultiple=1)
            for elmAdr in arAdr:
                arType = self.getPropertyValue(elmAdr, 'type', self.STRING, 1, 1)
                if not arType:
                    arType = ['intl','postal','parcel','work'] # default adr types, see RFC 2426 section 3.2.1
                sPostOfficeBox = self.getPropertyValue(elmAdr, 'post-office-box', self.STRING, 0, 1)
                sExtendedAddress = self.getPropertyValue(elmAdr, 'extended-address', self.STRING, 0, 1)
                sStreetAddress = self.getPropertyValue(elmAdr, 'street-address', self.STRING, 0, 1)
                sLocality = self.getPropertyValue(elmAdr, 'locality', self.STRING, 0, 1)
                sRegion = self.getPropertyValue(elmAdr, 'region', self.STRING, 0, 1)
                sPostalCode = self.getPropertyValue(elmAdr, 'postal-code', self.STRING, 0, 1)
                sCountryName = self.getPropertyValue(elmAdr, 'country-name', self.STRING, 0, 1)
                arLines.append(self.vcardFold('ADR;TYPE=' + ','.join(arType) + ':' +
                               sPostOfficeBox + ';' +
                               sExtendedAddress + ';' +
                               sStreetAddress + ';' +
                               sLocality + ';' +
                               sRegion + ';' +
                               sPostalCode + ';' +
                               sCountryName))

            # LABEL
            processTypeValue('label', ['intl','postal','parcel','work'])

            # TEL (phone number)
            processTypeValue('tel', ['voice'])

            # EMAIL
            processTypeValue('email', ['internet'], ['internet'])

            # MAILER
            processSingleString('mailer')

            # TZ (timezone)
            processSingleString('tz')

            # GEO (geographical information)
            elmGeo = self.getPropertyValue(elmCard, 'geo')
            if elmGeo:
                sLatitude = self.getPropertyValue(elmGeo, 'latitude', self.STRING, 0, 1)
                sLongitude = self.getPropertyValue(elmGeo, 'longitude', self.STRING, 0, 1)
                arLines.append(self.vcardFold('GEO:' + sLatitude + ';' + sLongitude))

            # TITLE
            processSingleString('title')

            # ROLE
            processSingleString('role')

            # LOGO
            processSingleURI('logo')

            # ORG (organization)
            elmOrg = self.getPropertyValue(elmCard, 'org')
            if elmOrg:
                sOrganizationName = self.getPropertyValue(elmOrg, 'organization-name', self.STRING, 0, 1)
                if not sOrganizationName:
                    # implied "organization-name" optimization
                    # http://microformats.org/wiki/hcard#Implied_.22organization-name.22_Optimization
                    sOrganizationName = self.getPropertyValue(elmCard, 'org', self.STRING, 0, 1)
                    if sOrganizationName:
                        arLines.append(self.vcardFold('ORG:' + sOrganizationName))
                else:
                    arOrganizationUnit = self.getPropertyValue(elmOrg, 'organization-unit', self.STRING, 1, 1)
                    arLines.append(self.vcardFold('ORG:' + sOrganizationName + ';' + ';'.join(arOrganizationUnit)))

            # CATEGORY
            arCategory = self.getPropertyValue(elmCard, 'category', self.STRING, 1, 1) + self.getPropertyValue(elmCard, 'categories', self.STRING, 1, 1)
            if arCategory:
                arLines.append(self.vcardFold('CATEGORIES:' + ','.join(arCategory)))

            # NOTE
            processSingleString('note')

            # REV
            processSingleString('rev')

            # SOUND
            processSingleURI('sound')

            # UID
            processSingleString('uid')

            # URL
            processSingleURI('url')

            # CLASS
            processSingleString('class')

            # KEY
            processSingleURI('key')

            if arLines:
                arLines = [u'BEGIN:vCard',u'VERSION:3.0'] + arLines + [u'END:vCard']
                # XXX - this is super ugly; properly fix this with issue 148
                for i, s in enumerate(arLines):
                    if not isinstance(s, unicode):
                        arLines[i] = s.decode('utf-8', 'ignore')
                sVCards += u'\n'.join(arLines) + u'\n'

        return sVCards.strip()

    def isProbablyDownloadable(self, elm):
        """Heuristic: does this link's type or file extension suggest a
        downloadable (enclosure-like) resource?"""
        attrsD = elm.attrMap
        if not attrsD.has_key('href'):
            return 0
        linktype = attrsD.get('type', '').strip()
        if linktype.startswith('audio/') or \
           linktype.startswith('video/') or \
           (linktype.startswith('application/') and not linktype.endswith('xml')):
            return 1
        path = urlparse.urlparse(attrsD['href'])[2]
        if path.find('.') == -1:
            return 0
        fileext = path.split('.').pop().lower()
        return fileext in self.known_binary_extensions

    def findTags(self):
        """Collect rel-tag links into self.tags as FeedParserDicts with
        'term', 'scheme' and 'label' keys."""
        all = lambda x: 1
        for elm in self.document(all, {'rel': re.compile(r'\btag\b')}):
            href = elm.get('href')
            if not href:
                continue
            urlscheme, domain, path, params, query, fragment = \
                       urlparse.urlparse(_urljoin(self.baseuri, href))
            segments = path.split('/')
            tag = segments.pop()
            if not tag:
                # trailing slash: the tag is the previous path segment
                if segments:
                    tag = segments.pop()
                else:
                    # there are no tags
                    continue
            tagscheme = urlparse.urlunparse((urlscheme, domain, '/'.join(segments), '', '', ''))
            if not tagscheme.endswith('/'):
                tagscheme += '/'
            self.tags.append(FeedParserDict({"term": tag, "scheme": tagscheme, "label": elm.string or ''}))

    def findEnclosures(self):
        """Collect rel-enclosure links (or probably-downloadable links)
        into self.enclosures."""
        all = lambda x: 1
        enclosure_match = re.compile(r'\benclosure\b')
        for elm in self.document(all, {'href': re.compile(r'.+')}):
            if not enclosure_match.search(elm.get('rel', u'')) and not self.isProbablyDownloadable(elm):
                continue
            if elm.attrMap not in self.enclosures:
                self.enclosures.append(elm.attrMap)
                if elm.string and not elm.get('title'):
                    self.enclosures[-1]['title'] = elm.string

    def findXFN(self):
        """Collect XFN relationship links into self.xfn."""
        all = lambda x: 1
        for elm in self.document(all, {'rel': re.compile('.+'), 'href': re.compile('.+')}):
            rels = elm.get('rel', u'').split()
            xfn_rels = []
            for rel in rels:
                if rel in self.known_xfn_relationships:
                    xfn_rels.append(rel)
            if xfn_rels:
                self.xfn.append({"relationships": xfn_rels, "href": elm.get('href', ''), "name": elm.string})
+
def _parseMicroformats(htmlSource, baseURI, encoding):
    """Extract microformat data (tags, enclosures, XFN, hCard) from HTML.

    Returns a dict with keys 'tags', 'enclosures', 'xfn' and 'vcard', or
    None when BeautifulSoup is unavailable or the source cannot be parsed.
    """
    if not BeautifulSoup:
        return
    try:
        parser = _MicroformatsParser(htmlSource, baseURI, encoding)
    except UnicodeEncodeError:
        # sgmllib throws this exception when performing lookups of tags
        # with non-ASCII characters in them.
        return
    parser.vcard = parser.findVCards(parser.document)
    parser.findTags()
    parser.findEnclosures()
    parser.findXFN()
    return {
        "tags": parser.tags,
        "enclosures": parser.enclosures,
        "xfn": parser.xfn,
        "vcard": parser.vcard,
    }
+
class _RelativeURIResolver(_BaseHTMLProcessor):
    """HTML processor that rewrites relative URIs in known URI-carrying
    attributes against a base URI (with unsafe schemes blanked out by
    _makeSafeAbsoluteURI)."""
    # (tag, attribute) pairs whose values are URIs that may be relative
    relative_uris = [('a', 'href'),
                     ('applet', 'codebase'),
                     ('area', 'href'),
                     ('blockquote', 'cite'),
                     ('body', 'background'),
                     ('del', 'cite'),
                     ('form', 'action'),
                     ('frame', 'longdesc'),
                     ('frame', 'src'),
                     ('iframe', 'longdesc'),
                     ('iframe', 'src'),
                     ('head', 'profile'),
                     ('img', 'longdesc'),
                     ('img', 'src'),
                     ('img', 'usemap'),
                     ('input', 'src'),
                     ('input', 'usemap'),
                     ('ins', 'cite'),
                     ('link', 'href'),
                     ('object', 'classid'),
                     ('object', 'codebase'),
                     ('object', 'data'),
                     ('object', 'usemap'),
                     ('q', 'cite'),
                     ('script', 'src')]

    def __init__(self, baseuri, encoding, _type):
        _BaseHTMLProcessor.__init__(self, encoding, _type)
        self.baseuri = baseuri

    def resolveURI(self, uri):
        """Resolve uri against the base URI; returns u'' when the result
        has an unacceptable scheme."""
        return _makeSafeAbsoluteURI(_urljoin(self.baseuri, uri.strip()))

    def unknown_starttag(self, tag, attrs):
        """Rewrite URI-carrying attributes, then delegate re-emission."""
        attrs = self.normalize_attrs(attrs)
        # Bug fix: the previous 'cond and self.resolveURI(value) or value'
        # expression fell back to the ORIGINAL attribute value whenever
        # resolveURI() returned the falsy u'' -- which is exactly what it
        # returns when the URI is rejected as unsafe.  Use an explicit
        # conditional so rejected URIs stay blanked out.
        newattrs = []
        for key, value in attrs:
            if (tag, key) in self.relative_uris:
                value = self.resolveURI(value)
            newattrs.append((key, value))
        _BaseHTMLProcessor.unknown_starttag(self, tag, newattrs)
+
def _resolveRelativeURIs(htmlSource, baseURI, encoding, _type):
    """Return htmlSource with relative URIs resolved against baseURI.

    When no SGML parser is available, the input is returned unchanged.
    """
    if not _SGML_AVAILABLE:
        return htmlSource
    resolver = _RelativeURIResolver(baseURI, encoding, _type)
    resolver.feed(htmlSource)
    return resolver.output()
+
def _makeSafeAbsoluteURI(base, rel=None):
    """Join base and rel into an absolute URI, returning u'' unless the
    resulting scheme is in ACCEPTABLE_URI_SCHEMES.

    An empty ACCEPTABLE_URI_SCHEMES list disables scheme filtering.
    """
    if not ACCEPTABLE_URI_SCHEMES:
        # filtering disabled: plain join
        return _urljoin(base, rel or u'')
    if not base:
        return rel or u''
    if rel:
        uri = _urljoin(base, rel)
        if uri.strip().split(':', 1)[0] in ACCEPTABLE_URI_SCHEMES:
            return uri
        return u''
    # no relative part: vet the base URI's scheme on its own
    scheme = urlparse.urlparse(base)[0]
    if scheme and scheme not in ACCEPTABLE_URI_SCHEMES:
        return u''
    return base
+
+class _HTMLSanitizer(_BaseHTMLProcessor):
+ acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area',
+ 'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button',
+ 'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
+ 'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',
+ 'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',
+ 'figcaption', 'figure', 'footer', 'font', 'form', 'header', 'h1',
+ 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins',
+ 'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter',
+ 'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option',
+ 'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
+ 'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',
+ 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',
+ 'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video', 'noscript']
+
+ acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
+ 'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis',
+ 'background', 'balance', 'bgcolor', 'bgproperties', 'border',
+ 'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding',
+ 'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff',
+ 'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color', 'cols',
+ 'colspan', 'compact', 'contenteditable', 'controls', 'coords', 'data',
+ 'datafld', 'datapagesize', 'datasrc', 'datetime', 'default', 'delay',
+ 'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end', 'face', 'for',
+ 'form', 'frame', 'galleryimg', 'gutter', 'headers', 'height', 'hidefocus',
+ 'hidden', 'high', 'href', 'hreflang', 'hspace', 'icon', 'id', 'inputmode',
+ 'ismap', 'keytype', 'label', 'leftspacing', 'lang', 'list', 'longdesc',
+ 'loop', 'loopcount', 'loopend', 'loopstart', 'low', 'lowsrc', 'max',
+ 'maxlength', 'media', 'method', 'min', 'multiple', 'name', 'nohref',
+ 'noshade', 'nowrap', 'open', 'optimum', 'pattern', 'ping', 'point-size',
+ 'prompt', 'pqg', 'radiogroup', 'readonly', 'rel', 'repeat-max',
+ 'repeat-min', 'replace', 'required', 'rev', 'rightspacing', 'rows',
+ 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', 'span', 'src',
+ 'start', 'step', 'summary', 'suppress', 'tabindex', 'target', 'template',
+ 'title', 'toppadding', 'type', 'unselectable', 'usemap', 'urn', 'valign',
+ 'value', 'variable', 'volume', 'vspace', 'vrml', 'width', 'wrap',
+ 'xml:lang']
+
+ unacceptable_elements_with_end_tag = ['script', 'applet', 'style']
+
+ acceptable_css_properties = ['azimuth', 'background-color',
+ 'border-bottom-color', 'border-collapse', 'border-color',
+ 'border-left-color', 'border-right-color', 'border-top-color', 'clear',
+ 'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
+ 'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
+ 'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
+ 'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
+ 'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
+ 'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
+ 'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
+ 'white-space', 'width']
+
+ # survey of common keywords found in feeds
+ acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue',
+ 'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
+ 'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
+ 'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
+ 'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
+ 'transparent', 'underline', 'white', 'yellow']
+
+ valid_css_values = re.compile('^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|' +
+ '\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$')
+
+ mathml_elements = ['annotation', 'annotation-xml', 'maction', 'math',
+ 'merror', 'mfenced', 'mfrac', 'mi', 'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded',
+ 'mphantom', 'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle',
+ 'msub', 'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
+ 'munderover', 'none', 'semantics']
+
+ mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
+ 'columnalign', 'close', 'columnlines', 'columnspacing', 'columnspan', 'depth',
+ 'display', 'displaystyle', 'encoding', 'equalcolumns', 'equalrows',
+ 'fence', 'fontstyle', 'fontweight', 'frame', 'height', 'linethickness',
+ 'lspace', 'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant',
+ 'maxsize', 'minsize', 'open', 'other', 'rowalign', 'rowalign', 'rowalign',
+ 'rowlines', 'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
+ 'separator', 'separators', 'stretchy', 'width', 'width', 'xlink:href',
+ 'xlink:show', 'xlink:type', 'xmlns', 'xmlns:xlink']
+
+ # svgtiny - foreignObject + linearGradient + radialGradient + stop
+ svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
+ 'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'foreignObject',
+ 'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
+ 'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph', 'mpath',
+ 'path', 'polygon', 'polyline', 'radialGradient', 'rect', 'set', 'stop',
+ 'svg', 'switch', 'text', 'title', 'tspan', 'use']
+
+ # svgtiny + class + opacity + offset + xmlns + xmlns:xlink
+ svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
+ 'arabic-form', 'ascent', 'attributeName', 'attributeType',
+ 'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
+ 'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx',
+ 'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-opacity',
+ 'fill-rule', 'font-family', 'font-size', 'font-stretch', 'font-style',
+ 'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2',
+ 'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x',
+ 'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints', 'keySplines',
+ 'keyTimes', 'lang', 'mathematical', 'marker-end', 'marker-mid',
+ 'marker-start', 'markerHeight', 'markerUnits', 'markerWidth', 'max',
+ 'min', 'name', 'offset', 'opacity', 'orient', 'origin',
+ 'overline-position', 'overline-thickness', 'panose-1', 'path',
+ 'pathLength', 'points', 'preserveAspectRatio', 'r', 'refX', 'refY',
+ 'repeatCount', 'repeatDur', 'requiredExtensions', 'requiredFeatures',
+ 'restart', 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv',
+ 'stop-color', 'stop-opacity', 'strikethrough-position',
+ 'strikethrough-thickness', 'stroke', 'stroke-dasharray',
+ 'stroke-dashoffset', 'stroke-linecap', 'stroke-linejoin',
+ 'stroke-miterlimit', 'stroke-opacity', 'stroke-width', 'systemLanguage',
+ 'target', 'text-anchor', 'to', 'transform', 'type', 'u1', 'u2',
+ 'underline-position', 'underline-thickness', 'unicode', 'unicode-range',
+ 'units-per-em', 'values', 'version', 'viewBox', 'visibility', 'width',
+ 'widths', 'x', 'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
+ 'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type',
+ 'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1',
+ 'y2', 'zoomAndPan']
+
+ svg_attr_map = None
+ svg_elem_map = None
+
+ acceptable_svg_properties = [ 'fill', 'fill-opacity', 'fill-rule',
+ 'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
+ 'stroke-opacity']
+
+ def reset(self):
+ _BaseHTMLProcessor.reset(self)
+ self.unacceptablestack = 0
+ self.mathmlOK = 0
+ self.svgOK = 0
+
+ def unknown_starttag(self, tag, attrs):
+ acceptable_attributes = self.acceptable_attributes
+ keymap = {}
+ if not tag in self.acceptable_elements or self.svgOK:
+ if tag in self.unacceptable_elements_with_end_tag:
+ self.unacceptablestack += 1
+
+ # add implicit namespaces to html5 inline svg/mathml
+ if self._type.endswith('html'):
+ if not dict(attrs).get('xmlns'):
+ if tag=='svg':
+ attrs.append( ('xmlns','http://www.w3.org/2000/svg') )
+ if tag=='math':
+ attrs.append( ('xmlns','http://www.w3.org/1998/Math/MathML') )
+
+ # not otherwise acceptable, perhaps it is MathML or SVG?
+ if tag=='math' and ('xmlns','http://www.w3.org/1998/Math/MathML') in attrs:
+ self.mathmlOK += 1
+ if tag=='svg' and ('xmlns','http://www.w3.org/2000/svg') in attrs:
+ self.svgOK += 1
+
+ # chose acceptable attributes based on tag class, else bail
+ if self.mathmlOK and tag in self.mathml_elements:
+ acceptable_attributes = self.mathml_attributes
+ elif self.svgOK and tag in self.svg_elements:
+ # for most vocabularies, lowercasing is a good idea. Many
+ # svg elements, however, are camel case
+ if not self.svg_attr_map:
+ lower=[attr.lower() for attr in self.svg_attributes]
+ mix=[a for a in self.svg_attributes if a not in lower]
+ self.svg_attributes = lower
+ self.svg_attr_map = dict([(a.lower(),a) for a in mix])
+
+ lower=[attr.lower() for attr in self.svg_elements]
+ mix=[a for a in self.svg_elements if a not in lower]
+ self.svg_elements = lower
+ self.svg_elem_map = dict([(a.lower(),a) for a in mix])
+ acceptable_attributes = self.svg_attributes
+ tag = self.svg_elem_map.get(tag,tag)
+ keymap = self.svg_attr_map
+ elif not tag in self.acceptable_elements:
+ return
+
+ # declare xlink namespace, if needed
+ if self.mathmlOK or self.svgOK:
+ if filter(lambda (n,v): n.startswith('xlink:'),attrs):
+ if not ('xmlns:xlink','http://www.w3.org/1999/xlink') in attrs:
+ attrs.append(('xmlns:xlink','http://www.w3.org/1999/xlink'))
+
+ clean_attrs = []
+ for key, value in self.normalize_attrs(attrs):
+ if key in acceptable_attributes:
+ key=keymap.get(key,key)
+ # make sure the uri uses an acceptable uri scheme
+ if key == u'href':
+ value = _makeSafeAbsoluteURI(value)
+ clean_attrs.append((key,value))
+ elif key=='style':
+ clean_value = self.sanitize_style(value)
+ if clean_value:
+ clean_attrs.append((key,clean_value))
+ _BaseHTMLProcessor.unknown_starttag(self, tag, clean_attrs)
+
+ def unknown_endtag(self, tag):
+ if not tag in self.acceptable_elements:
+ if tag in self.unacceptable_elements_with_end_tag:
+ self.unacceptablestack -= 1
+ if self.mathmlOK and tag in self.mathml_elements:
+ if tag == 'math' and self.mathmlOK:
+ self.mathmlOK -= 1
+ elif self.svgOK and tag in self.svg_elements:
+ tag = self.svg_elem_map.get(tag,tag)
+ if tag == 'svg' and self.svgOK:
+ self.svgOK -= 1
+ else:
+ return
+ _BaseHTMLProcessor.unknown_endtag(self, tag)
+
+ def handle_pi(self, text):
+ pass
+
+ def handle_decl(self, text):
+ pass
+
+ def handle_data(self, text):
+ if not self.unacceptablestack:
+ _BaseHTMLProcessor.handle_data(self, text)
+
+ def sanitize_style(self, style):
+ # disallow urls
+ style=re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ',style)
+
+ # gauntlet
+ if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
+ return ''
+ # This replaced a regexp that used re.match and was prone to pathological back-tracking.
+ if re.sub("\s*[-\w]+\s*:\s*[^:;]*;?", '', style).strip():
+ return ''
+
+ clean = []
+ for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style):
+ if not value:
+ continue
+ if prop.lower() in self.acceptable_css_properties:
+ clean.append(prop + ': ' + value + ';')
+ elif prop.split('-')[0].lower() in ['background','border','margin','padding']:
+ for keyword in value.split():
+ if not keyword in self.acceptable_css_keywords and \
+ not self.valid_css_values.match(keyword):
+ break
+ else:
+ clean.append(prop + ': ' + value + ';')
+ elif self.svgOK and prop.lower() in self.acceptable_svg_properties:
+ clean.append(prop + ': ' + value + ';')
+
+ return ' '.join(clean)
+
+ def parse_comment(self, i, report=1):
+ ret = _BaseHTMLProcessor.parse_comment(self, i, report)
+ if ret >= 0:
+ return ret
+ # if ret == -1, this may be a malicious attempt to circumvent
+ # sanitization, or a page-destroying unclosed comment
+ match = re.compile(r'--[^>]*>').search(self.rawdata, i+4)
+ if match:
+ return match.end()
+ # unclosed comment; deliberately fail to handle_data()
+ return len(self.rawdata)
+
+
+def _sanitizeHTML(htmlSource, encoding, _type):
+ if not _SGML_AVAILABLE:
+ return htmlSource
+ p = _HTMLSanitizer(encoding, _type)
+    htmlSource = htmlSource.replace('<![CDATA[', '&lt;![CDATA[')
+ p.feed(htmlSource)
+ data = p.output()
+ if TIDY_MARKUP:
+ # loop through list of preferred Tidy interfaces looking for one that's installed,
+ # then set up a common _tidy function to wrap the interface-specific API.
+ _tidy = None
+ for tidy_interface in PREFERRED_TIDY_INTERFACES:
+ try:
+ if tidy_interface == "uTidy":
+ from tidy import parseString as _utidy
+ def _tidy(data, **kwargs):
+ return str(_utidy(data, **kwargs))
+ break
+ elif tidy_interface == "mxTidy":
+ from mx.Tidy import Tidy as _mxtidy
+ def _tidy(data, **kwargs):
+ nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, **kwargs)
+ return data
+ break
+ except:
+ pass
+ if _tidy:
+ utf8 = isinstance(data, unicode)
+ if utf8:
+ data = data.encode('utf-8')
+ data = _tidy(data, output_xhtml=1, numeric_entities=1, wrap=0, char_encoding="utf8")
+ if utf8:
+ data = unicode(data, 'utf-8')
+ if data.count('<body'):
+ data = data.split('<body', 1)[1]
+ if data.count('>'):
+ data = data.split('>', 1)[1]
+ if data.count('</body'):
+ data = data.split('</body', 1)[0]
+ data = data.strip().replace('\r\n', '\n')
+ return data
+
+class _FeedURLHandler(urllib2.HTTPDigestAuthHandler, urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler):
+ def http_error_default(self, req, fp, code, msg, headers):
+ # The default implementation just raises HTTPError.
+ # Forget that.
+ fp.status = code
+ return fp
+
+ def http_error_301(self, req, fp, code, msg, hdrs):
+ result = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp,
+ code, msg, hdrs)
+ result.status = code
+ result.newurl = result.geturl()
+ return result
+ # The default implementations in urllib2.HTTPRedirectHandler
+ # are identical, so hardcoding a http_error_301 call above
+ # won't affect anything
+ http_error_300 = http_error_301
+ http_error_302 = http_error_301
+ http_error_303 = http_error_301
+ http_error_307 = http_error_301
+
+ def http_error_401(self, req, fp, code, msg, headers):
+ # Check if
+ # - server requires digest auth, AND
+ # - we tried (unsuccessfully) with basic auth, AND
+ # If all conditions hold, parse authentication information
+ # out of the Authorization header we sent the first time
+ # (for the username and password) and the WWW-Authenticate
+ # header the server sent back (for the realm) and retry
+ # the request with the appropriate digest auth headers instead.
+ # This evil genius hack has been brought to you by Aaron Swartz.
+ host = urlparse.urlparse(req.get_full_url())[1]
+ if base64 is None or 'Authorization' not in req.headers \
+ or 'WWW-Authenticate' not in headers:
+ return self.http_error_default(req, fp, code, msg, headers)
+ auth = _base64decode(req.headers['Authorization'].split(' ')[1])
+ user, passw = auth.split(':')
+ realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0]
+ self.add_password(realm, host, user, passw)
+ retry = self.http_error_auth_reqed('www-authenticate', host, req, headers)
+ self.reset_retry_count()
+ return retry
+
+def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers):
+ """URL, filename, or string --> stream
+
+ This function lets you define parsers that take any input source
+ (URL, pathname to local or network file, or actual data as a string)
+ and deal with it in a uniform manner. Returned object is guaranteed
+ to have all the basic stdio read methods (read, readline, readlines).
+ Just .close() the object when you're done with it.
+
+ If the etag argument is supplied, it will be used as the value of an
+ If-None-Match request header.
+
+ If the modified argument is supplied, it can be a tuple of 9 integers
+ (as returned by gmtime() in the standard Python time module) or a date
+ string in any format supported by feedparser. Regardless, it MUST
+ be in GMT (Greenwich Mean Time). It will be reformatted into an
+ RFC 1123-compliant date and used as the value of an If-Modified-Since
+ request header.
+
+ If the agent argument is supplied, it will be used as the value of a
+ User-Agent request header.
+
+ If the referrer argument is supplied, it will be used as the value of a
+ Referer[sic] request header.
+
+ If handlers is supplied, it is a list of handlers used to build a
+ urllib2 opener.
+
+ if request_headers is supplied it is a dictionary of HTTP request headers
+ that will override the values generated by FeedParser.
+ """
+
+ if hasattr(url_file_stream_or_string, 'read'):
+ return url_file_stream_or_string
+
+ if url_file_stream_or_string == '-':
+ return sys.stdin
+
+ if isinstance(url_file_stream_or_string, basestring) \
+ and urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp', 'file', 'feed'):
+ # Deal with the feed URI scheme
+ if url_file_stream_or_string.startswith('feed:http'):
+ url_file_stream_or_string = url_file_stream_or_string[5:]
+ elif url_file_stream_or_string.startswith('feed:'):
+ url_file_stream_or_string = 'http:' + url_file_stream_or_string[5:]
+ if not agent:
+ agent = USER_AGENT
+ # test for inline user:password for basic auth
+ auth = None
+ if base64:
+ urltype, rest = urllib.splittype(url_file_stream_or_string)
+ realhost, rest = urllib.splithost(rest)
+ if realhost:
+ user_passwd, realhost = urllib.splituser(realhost)
+ if user_passwd:
+ url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest)
+ auth = base64.standard_b64encode(user_passwd).strip()
+
+ # iri support
+ if isinstance(url_file_stream_or_string, unicode):
+ url_file_stream_or_string = _convert_to_idn(url_file_stream_or_string)
+
+ # try to open with urllib2 (to use optional headers)
+ request = _build_urllib2_request(url_file_stream_or_string, agent, etag, modified, referrer, auth, request_headers)
+ opener = apply(urllib2.build_opener, tuple(handlers + [_FeedURLHandler()]))
+ opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
+ try:
+ return opener.open(request)
+ finally:
+ opener.close() # JohnD
+
+ # try to open with native open function (if url_file_stream_or_string is a filename)
+ try:
+ return open(url_file_stream_or_string, 'rb')
+ except IOError:
+ pass
+
+ # treat url_file_stream_or_string as string
+ if isinstance(url_file_stream_or_string, unicode):
+ return _StringIO(url_file_stream_or_string.encode('utf-8'))
+ return _StringIO(url_file_stream_or_string)
+
+def _convert_to_idn(url):
+ """Convert a URL to IDN notation"""
+ # this function should only be called with a unicode string
+ # strategy: if the host cannot be encoded in ascii, then
+ # it'll be necessary to encode it in idn form
+ parts = list(urlparse.urlsplit(url))
+ try:
+ parts[1].encode('ascii')
+ except UnicodeEncodeError:
+ # the url needs to be converted to idn notation
+ host = parts[1].rsplit(':', 1)
+ newhost = []
+ port = u''
+ if len(host) == 2:
+ port = host.pop()
+ for h in host[0].split('.'):
+ newhost.append(h.encode('idna').decode('utf-8'))
+ parts[1] = '.'.join(newhost)
+ if port:
+ parts[1] += ':' + port
+ return urlparse.urlunsplit(parts)
+ else:
+ return url
+
+def _build_urllib2_request(url, agent, etag, modified, referrer, auth, request_headers):
+ request = urllib2.Request(url)
+ request.add_header('User-Agent', agent)
+ if etag:
+ request.add_header('If-None-Match', etag)
+ if isinstance(modified, basestring):
+ modified = _parse_date(modified)
+ elif isinstance(modified, datetime.datetime):
+ modified = modified.utctimetuple()
+ if modified:
+ # format into an RFC 1123-compliant timestamp. We can't use
+ # time.strftime() since the %a and %b directives can be affected
+ # by the current locale, but RFC 2616 states that dates must be
+ # in English.
+ short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
+ months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
+ request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))
+ if referrer:
+ request.add_header('Referer', referrer)
+ if gzip and zlib:
+ request.add_header('Accept-encoding', 'gzip, deflate')
+ elif gzip:
+ request.add_header('Accept-encoding', 'gzip')
+ elif zlib:
+ request.add_header('Accept-encoding', 'deflate')
+ else:
+ request.add_header('Accept-encoding', '')
+ if auth:
+ request.add_header('Authorization', 'Basic %s' % auth)
+ if ACCEPT_HEADER:
+ request.add_header('Accept', ACCEPT_HEADER)
+ # use this for whatever -- cookies, special headers, etc
+ # [('Cookie','Something'),('x-special-header','Another Value')]
+ for header_name, header_value in request_headers.items():
+ request.add_header(header_name, header_value)
+ request.add_header('A-IM', 'feed') # RFC 3229 support
+ return request
+
+_date_handlers = []
+def registerDateHandler(func):
+ '''Register a date handler function (takes string, returns 9-tuple date in GMT)'''
+ _date_handlers.insert(0, func)
+
+# ISO-8601 date parsing routines written by Fazal Majid.
+# The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
+# parser is beyond the scope of feedparser and would be a worthwhile addition
+# to the Python library.
+# A single regular expression cannot parse ISO 8601 date formats into groups
+# as the standard is highly irregular (for instance is 030104 2003-01-04 or
+# 0301-04-01), so we use templates instead.
+# Please note the order in templates is significant because we need a
+# greedy match.
+_iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-0MM?-?DD', 'YYYY-MM', 'YYYY-?OOO',
+ 'YY-?MM-?DD', 'YY-?OOO', 'YYYY',
+ '-YY-?MM', '-OOO', '-YY',
+ '--MM-?DD', '--MM',
+ '---DD',
+ 'CC', '']
+_iso8601_re = [
+ tmpl.replace(
+ 'YYYY', r'(?P<year>\d{4})').replace(
+ 'YY', r'(?P<year>\d\d)').replace(
+ 'MM', r'(?P<month>[01]\d)').replace(
+ 'DD', r'(?P<day>[0123]\d)').replace(
+ 'OOO', r'(?P<ordinal>[0123]\d\d)').replace(
+ 'CC', r'(?P<century>\d\d$)')
+ + r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})'
+ + r'(:(?P<second>\d{2}))?'
+ + r'(\.(?P<fracsecond>\d+))?'
+ + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'
+ for tmpl in _iso8601_tmpl]
+try:
+ del tmpl
+except NameError:
+ pass
+_iso8601_matches = [re.compile(regex).match for regex in _iso8601_re]
+try:
+ del regex
+except NameError:
+ pass
+def _parse_date_iso8601(dateString):
+ '''Parse a variety of ISO-8601-compatible formats like 20040105'''
+ m = None
+ for _iso8601_match in _iso8601_matches:
+ m = _iso8601_match(dateString)
+ if m:
+ break
+ if not m:
+ return
+ if m.span() == (0, 0):
+ return
+ params = m.groupdict()
+ ordinal = params.get('ordinal', 0)
+ if ordinal:
+ ordinal = int(ordinal)
+ else:
+ ordinal = 0
+ year = params.get('year', '--')
+ if not year or year == '--':
+ year = time.gmtime()[0]
+ elif len(year) == 2:
+ # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993
+ year = 100 * int(time.gmtime()[0] / 100) + int(year)
+ else:
+ year = int(year)
+ month = params.get('month', '-')
+ if not month or month == '-':
+ # ordinals are NOT normalized by mktime, we simulate them
+ # by setting month=1, day=ordinal
+ if ordinal:
+ month = 1
+ else:
+ month = time.gmtime()[1]
+ month = int(month)
+ day = params.get('day', 0)
+ if not day:
+ # see above
+ if ordinal:
+ day = ordinal
+ elif params.get('century', 0) or \
+ params.get('year', 0) or params.get('month', 0):
+ day = 1
+ else:
+ day = time.gmtime()[2]
+ else:
+ day = int(day)
+ # special case of the century - is the first year of the 21st century
+ # 2000 or 2001 ? The debate goes on...
+ if 'century' in params.keys():
+ year = (int(params['century']) - 1) * 100 + 1
+ # in ISO 8601 most fields are optional
+ for field in ['hour', 'minute', 'second', 'tzhour', 'tzmin']:
+ if not params.get(field, None):
+ params[field] = 0
+ hour = int(params.get('hour', 0))
+ minute = int(params.get('minute', 0))
+ second = int(float(params.get('second', 0)))
+ # weekday is normalized by mktime(), we can ignore it
+ weekday = 0
+ daylight_savings_flag = -1
+ tm = [year, month, day, hour, minute, second, weekday,
+ ordinal, daylight_savings_flag]
+ # ISO 8601 time zone adjustments
+ tz = params.get('tz')
+ if tz and tz != 'Z':
+ if tz[0] == '-':
+ tm[3] += int(params.get('tzhour', 0))
+ tm[4] += int(params.get('tzmin', 0))
+ elif tz[0] == '+':
+ tm[3] -= int(params.get('tzhour', 0))
+ tm[4] -= int(params.get('tzmin', 0))
+ else:
+ return None
+ # Python's time.mktime() is a wrapper around the ANSI C mktime(3c)
+ # which is guaranteed to normalize d/m/y/h/m/s.
+ # Many implementations have bugs, but we'll pretend they don't.
+ return time.localtime(time.mktime(tuple(tm)))
+registerDateHandler(_parse_date_iso8601)
+
+# 8-bit date handling routines written by ytrewq1.
+_korean_year = u'\ub144' # b3e2 in euc-kr
+_korean_month = u'\uc6d4' # bff9 in euc-kr
+_korean_day = u'\uc77c' # c0cf in euc-kr
+_korean_am = u'\uc624\uc804' # bfc0 c0fc in euc-kr
+_korean_pm = u'\uc624\ud6c4' # bfc0 c8c4 in euc-kr
+
+_korean_onblog_date_re = \
+ re.compile('(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % \
+ (_korean_year, _korean_month, _korean_day))
+_korean_nate_date_re = \
+ re.compile(u'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \
+ (_korean_am, _korean_pm))
+def _parse_date_onblog(dateString):
+ '''Parse a string according to the OnBlog 8-bit date format'''
+ m = _korean_onblog_date_re.match(dateString)
+ if not m:
+ return
+ w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
+ {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
+ 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
+ 'zonediff': '+09:00'}
+ return _parse_date_w3dtf(w3dtfdate)
+registerDateHandler(_parse_date_onblog)
+
+def _parse_date_nate(dateString):
+ '''Parse a string according to the Nate 8-bit date format'''
+ m = _korean_nate_date_re.match(dateString)
+ if not m:
+ return
+ hour = int(m.group(5))
+ ampm = m.group(4)
+ if (ampm == _korean_pm):
+ hour += 12
+ hour = str(hour)
+ if len(hour) == 1:
+ hour = '0' + hour
+ w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
+ {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
+ 'hour': hour, 'minute': m.group(6), 'second': m.group(7),\
+ 'zonediff': '+09:00'}
+ return _parse_date_w3dtf(w3dtfdate)
+registerDateHandler(_parse_date_nate)
+
+_mssql_date_re = \
+ re.compile('(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?')
+def _parse_date_mssql(dateString):
+ '''Parse a string according to the MS SQL date format'''
+ m = _mssql_date_re.match(dateString)
+ if not m:
+ return
+ w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
+ {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
+ 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
+ 'zonediff': '+09:00'}
+ return _parse_date_w3dtf(w3dtfdate)
+registerDateHandler(_parse_date_mssql)
+
+# Unicode strings for Greek date strings
+_greek_months = \
+ { \
+ u'\u0399\u03b1\u03bd': u'Jan', # c9e1ed in iso-8859-7
+ u'\u03a6\u03b5\u03b2': u'Feb', # d6e5e2 in iso-8859-7
+ u'\u039c\u03ac\u03ce': u'Mar', # ccdcfe in iso-8859-7
+ u'\u039c\u03b1\u03ce': u'Mar', # cce1fe in iso-8859-7
+ u'\u0391\u03c0\u03c1': u'Apr', # c1f0f1 in iso-8859-7
+ u'\u039c\u03ac\u03b9': u'May', # ccdce9 in iso-8859-7
+ u'\u039c\u03b1\u03ca': u'May', # cce1fa in iso-8859-7
+ u'\u039c\u03b1\u03b9': u'May', # cce1e9 in iso-8859-7
+ u'\u0399\u03bf\u03cd\u03bd': u'Jun', # c9effded in iso-8859-7
+ u'\u0399\u03bf\u03bd': u'Jun', # c9efed in iso-8859-7
+ u'\u0399\u03bf\u03cd\u03bb': u'Jul', # c9effdeb in iso-8859-7
+ u'\u0399\u03bf\u03bb': u'Jul', # c9f9eb in iso-8859-7
+ u'\u0391\u03cd\u03b3': u'Aug', # c1fde3 in iso-8859-7
+ u'\u0391\u03c5\u03b3': u'Aug', # c1f5e3 in iso-8859-7
+ u'\u03a3\u03b5\u03c0': u'Sep', # d3e5f0 in iso-8859-7
+ u'\u039f\u03ba\u03c4': u'Oct', # cfeaf4 in iso-8859-7
+ u'\u039d\u03bf\u03ad': u'Nov', # cdefdd in iso-8859-7
+ u'\u039d\u03bf\u03b5': u'Nov', # cdefe5 in iso-8859-7
+ u'\u0394\u03b5\u03ba': u'Dec', # c4e5ea in iso-8859-7
+ }
+
+_greek_wdays = \
+ { \
+ u'\u039a\u03c5\u03c1': u'Sun', # caf5f1 in iso-8859-7
+ u'\u0394\u03b5\u03c5': u'Mon', # c4e5f5 in iso-8859-7
+ u'\u03a4\u03c1\u03b9': u'Tue', # d4f1e9 in iso-8859-7
+ u'\u03a4\u03b5\u03c4': u'Wed', # d4e5f4 in iso-8859-7
+ u'\u03a0\u03b5\u03bc': u'Thu', # d0e5ec in iso-8859-7
+ u'\u03a0\u03b1\u03c1': u'Fri', # d0e1f1 in iso-8859-7
+ u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7
+ }
+
+_greek_date_format_re = \
+ re.compile(u'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)')
+
+def _parse_date_greek(dateString):
+ '''Parse a string according to a Greek 8-bit date format.'''
+ m = _greek_date_format_re.match(dateString)
+ if not m:
+ return
+ wday = _greek_wdays[m.group(1)]
+ month = _greek_months[m.group(3)]
+ rfc822date = '%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s' % \
+ {'wday': wday, 'day': m.group(2), 'month': month, 'year': m.group(4),\
+ 'hour': m.group(5), 'minute': m.group(6), 'second': m.group(7),\
+ 'zonediff': m.group(8)}
+ return _parse_date_rfc822(rfc822date)
+registerDateHandler(_parse_date_greek)
+
+# Unicode strings for Hungarian date strings
+_hungarian_months = \
+ { \
+ u'janu\u00e1r': u'01', # e1 in iso-8859-2
+ u'febru\u00e1ri': u'02', # e1 in iso-8859-2
+ u'm\u00e1rcius': u'03', # e1 in iso-8859-2
+ u'\u00e1prilis': u'04', # e1 in iso-8859-2
+ u'm\u00e1ujus': u'05', # e1 in iso-8859-2
+ u'j\u00fanius': u'06', # fa in iso-8859-2
+ u'j\u00falius': u'07', # fa in iso-8859-2
+ u'augusztus': u'08',
+ u'szeptember': u'09',
+ u'okt\u00f3ber': u'10', # f3 in iso-8859-2
+ u'november': u'11',
+ u'december': u'12',
+ }
+
+_hungarian_date_format_re = \
+ re.compile(u'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))')
+
+def _parse_date_hungarian(dateString):
+ '''Parse a string according to a Hungarian 8-bit date format.'''
+ m = _hungarian_date_format_re.match(dateString)
+ if not m or m.group(2) not in _hungarian_months:
+ return None
+ month = _hungarian_months[m.group(2)]
+ day = m.group(3)
+ if len(day) == 1:
+ day = '0' + day
+ hour = m.group(4)
+ if len(hour) == 1:
+ hour = '0' + hour
+ w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s' % \
+ {'year': m.group(1), 'month': month, 'day': day,\
+ 'hour': hour, 'minute': m.group(5),\
+ 'zonediff': m.group(6)}
+ return _parse_date_w3dtf(w3dtfdate)
+registerDateHandler(_parse_date_hungarian)
+
+# W3DTF-style date parsing adapted from PyXML xml.utils.iso8601, written by
+# Drake and licensed under the Python license. Removed all range checking
+# for month, day, hour, minute, and second, since mktime will normalize
+# these later
+def _parse_date_w3dtf(dateString):
+ def __extract_date(m):
+ year = int(m.group('year'))
+ if year < 100:
+ year = 100 * int(time.gmtime()[0] / 100) + int(year)
+ if year < 1000:
+ return 0, 0, 0
+ julian = m.group('julian')
+ if julian:
+ julian = int(julian)
+ month = julian / 30 + 1
+ day = julian % 30 + 1
+ jday = None
+ while jday != julian:
+ t = time.mktime((year, month, day, 0, 0, 0, 0, 0, 0))
+ jday = time.gmtime(t)[-2]
+ diff = abs(jday - julian)
+ if jday > julian:
+ if diff < day:
+ day = day - diff
+ else:
+ month = month - 1
+ day = 31
+ elif jday < julian:
+ if day + diff < 28:
+ day = day + diff
+ else:
+ month = month + 1
+ return year, month, day
+ month = m.group('month')
+ day = 1
+ if month is None:
+ month = 1
+ else:
+ month = int(month)
+ day = m.group('day')
+ if day:
+ day = int(day)
+ else:
+ day = 1
+ return year, month, day
+
+ def __extract_time(m):
+ if not m:
+ return 0, 0, 0
+ hours = m.group('hours')
+ if not hours:
+ return 0, 0, 0
+ hours = int(hours)
+ minutes = int(m.group('minutes'))
+ seconds = m.group('seconds')
+ if seconds:
+ seconds = int(seconds)
+ else:
+ seconds = 0
+ return hours, minutes, seconds
+
+ def __extract_tzd(m):
+ '''Return the Time Zone Designator as an offset in seconds from UTC.'''
+ if not m:
+ return 0
+ tzd = m.group('tzd')
+ if not tzd:
+ return 0
+ if tzd == 'Z':
+ return 0
+ hours = int(m.group('tzdhours'))
+ minutes = m.group('tzdminutes')
+ if minutes:
+ minutes = int(minutes)
+ else:
+ minutes = 0
+ offset = (hours*60 + minutes) * 60
+ if tzd[0] == '+':
+ return -offset
+ return offset
+
+ __date_re = ('(?P<year>\d\d\d\d)'
+ '(?:(?P<dsep>-|)'
+ '(?:(?P<month>\d\d)(?:(?P=dsep)(?P<day>\d\d))?'
+ '|(?P<julian>\d\d\d)))?')
+ __tzd_re = '(?P<tzd>[-+](?P<tzdhours>\d\d)(?::?(?P<tzdminutes>\d\d))|Z)'
+ __tzd_rx = re.compile(__tzd_re)
+ __time_re = ('(?P<hours>\d\d)(?P<tsep>:|)(?P<minutes>\d\d)'
+ '(?:(?P=tsep)(?P<seconds>\d\d)(?:[.,]\d+)?)?'
+ + __tzd_re)
+ __datetime_re = '%s(?:T%s)?' % (__date_re, __time_re)
+ __datetime_rx = re.compile(__datetime_re)
+ m = __datetime_rx.match(dateString)
+ if (m is None) or (m.group() != dateString):
+ return
+ gmt = __extract_date(m) + __extract_time(m) + (0, 0, 0)
+ if gmt[0] == 0:
+ return
+ return time.gmtime(time.mktime(gmt) + __extract_tzd(m) - time.timezone)
+registerDateHandler(_parse_date_w3dtf)
+
+def _parse_date_rfc822(dateString):
+ '''Parse an RFC822, RFC1123, RFC2822, or asctime-style date'''
+ data = dateString.split()
+ if not data:
+ return None
+ if data[0][-1] in (',', '.') or data[0].lower() in rfc822._daynames:
+ del data[0]
+ if len(data) == 4:
+ s = data[3]
+ i = s.find('+')
+ if i > 0:
+ data[3:] = [s[:i], s[i+1:]]
+ else:
+ data.append('')
+ dateString = " ".join(data)
+ # Account for the Etc/GMT timezone by stripping 'Etc/'
+ elif len(data) == 5 and data[4].lower().startswith('etc/'):
+ data[4] = data[4][4:]
+ dateString = " ".join(data)
+ if len(data) < 5:
+ dateString += ' 00:00:00 GMT'
+ tm = rfc822.parsedate_tz(dateString)
+ if tm:
+ # Jython doesn't adjust for 2-digit years like CPython does,
+ # so account for it by shifting the year so that it's in the
+ # range 1970-2069 (1970 being the year of the Unix epoch).
+ if tm[0] < 100:
+ tm = (tm[0] + (1900, 2000)[tm[0] < 70],) + tm[1:]
+ return time.gmtime(rfc822.mktime_tz(tm))
+# rfc822.py defines several time zones, but we define some extra ones.
+# 'ET' is equivalent to 'EST', etc.
+_additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -800}
+rfc822._timezones.update(_additional_timezones)
+registerDateHandler(_parse_date_rfc822)
+
+def _parse_date_perforce(aDateString):
+ """parse a date in yyyy/mm/dd hh:mm:ss TTT format"""
+ # Fri, 2006/09/15 08:19:53 EDT
+ _my_date_pattern = re.compile( \
+ r'(\w{,3}), (\d{,4})/(\d{,2})/(\d{2}) (\d{,2}):(\d{2}):(\d{2}) (\w{,3})')
+
+ m = _my_date_pattern.search(aDateString)
+ if m is None:
+ return None
+ dow, year, month, day, hour, minute, second, tz = m.groups()
+ months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
+ dateString = "%s, %s %s %s %s:%s:%s %s" % (dow, day, months[int(month) - 1], year, hour, minute, second, tz)
+ tm = rfc822.parsedate_tz(dateString)
+ if tm:
+ return time.gmtime(rfc822.mktime_tz(tm))
+registerDateHandler(_parse_date_perforce)
+
+def _parse_date(dateString):
+ '''Parses a variety of date formats into a 9-tuple in GMT'''
+ if not dateString:
+ return None
+ for handler in _date_handlers:
+ try:
+ date9tuple = handler(dateString)
+ except (KeyError, OverflowError, ValueError):
+ continue
+ if not date9tuple:
+ continue
+ if len(date9tuple) != 9:
+ continue
+ return date9tuple
+ return None
+
def _getCharacterEncoding(http_headers, xml_data):
    '''Get the character encoding of the XML document

    http_headers is a dictionary
    xml_data is a raw string (not Unicode)

    This is so much trickier than it sounds, it's not even funny.
    According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
    is application/xml, application/*+xml,
    application/xml-external-parsed-entity, or application/xml-dtd,
    the encoding given in the charset parameter of the HTTP Content-Type
    takes precedence over the encoding given in the XML prefix within the
    document, and defaults to 'utf-8' if neither are specified. But, if
    the HTTP Content-Type is text/xml, text/*+xml, or
    text/xml-external-parsed-entity, the encoding given in the XML prefix
    within the document is ALWAYS IGNORED and only the encoding given in
    the charset parameter of the HTTP Content-Type header should be
    respected, and it defaults to 'us-ascii' if not specified.

    Furthermore, discussion on the atom-syntax mailing list with the
    author of RFC 3023 leads me to the conclusion that any document
    served with a Content-Type of text/* and no charset parameter
    must be treated as us-ascii. (We now do this.) And also that it
    must always be flagged as non-well-formed. (We now do this too.)

    If Content-Type is unspecified (input was local file or non-HTTP source)
    or unrecognized (server just got it totally wrong), then go by the
    encoding given in the XML prefix of the document and default to
    'iso-8859-1' as per the HTTP specification (RFC 2616).

    Then, assuming we didn't find a character encoding in the HTTP headers
    (and the HTTP Content-type allowed us to look in the body), we need
    to sniff the first few bytes of the XML data and try to determine
    whether the encoding is ASCII-compatible. Section F of the XML
    specification shows the way here:
    http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info

    If the sniffed encoding is not ASCII-compatible, we need to make it
    ASCII compatible so that we can sniff further into the XML declaration
    to find the encoding attribute, which will tell us the true encoding.

    Of course, none of this guarantees that we will be able to parse the
    feed in the declared character encoding (assuming it was declared
    correctly, which many are not). CJKCodecs and iconv_codec help a lot;
    you should definitely install them if you can.
    http://cjkpython.i18n.org/

    Returns a 5-tuple: (true_encoding, http_encoding, xml_encoding,
    sniffed_xml_encoding, acceptable_content_type).
    '''

    def _parseHTTPContentType(content_type):
        '''takes HTTP Content-Type header and returns (content type, charset)

        If no charset is specified, returns (content type, '')
        If no content type is specified, returns ('', '')
        Both return parameters are guaranteed to be lowercase strings
        '''
        content_type = content_type or ''
        content_type, params = cgi.parse_header(content_type)
        # Some servers wrap the charset value in single quotes; strip them.
        charset = params.get('charset', '').replace("'", "")
        if not isinstance(charset, unicode):
            charset = charset.decode('utf-8', 'ignore')
        return content_type, charset

    sniffed_xml_encoding = u''
    xml_encoding = u''
    true_encoding = u''
    http_content_type, http_encoding = _parseHTTPContentType(http_headers.get('content-type', http_headers.get('Content-type')))
    # Must sniff for non-ASCII-compatible character encodings before
    # searching for XML declaration. This heuristic is defined in
    # section F of the XML specification:
    # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
    try:
        if xml_data[:4] == _l2bytes([0x4c, 0x6f, 0xa7, 0x94]):
            # EBCDIC
            xml_data = _ebcdic_to_ascii(xml_data)
        elif xml_data[:4] == _l2bytes([0x00, 0x3c, 0x00, 0x3f]):
            # UTF-16BE
            sniffed_xml_encoding = u'utf-16be'
            xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
        elif (len(xml_data) >= 4) and (xml_data[:2] == _l2bytes([0xfe, 0xff])) and (xml_data[2:4] != _l2bytes([0x00, 0x00])):
            # UTF-16BE with BOM
            sniffed_xml_encoding = u'utf-16be'
            xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
        elif xml_data[:4] == _l2bytes([0x3c, 0x00, 0x3f, 0x00]):
            # UTF-16LE
            sniffed_xml_encoding = u'utf-16le'
            xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
        elif (len(xml_data) >= 4) and (xml_data[:2] == _l2bytes([0xff, 0xfe])) and (xml_data[2:4] != _l2bytes([0x00, 0x00])):
            # UTF-16LE with BOM.  The != 00 00 guard keeps a UTF-32LE BOM
            # (ff fe 00 00) from being misidentified here.
            sniffed_xml_encoding = u'utf-16le'
            xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
        elif xml_data[:4] == _l2bytes([0x00, 0x00, 0x00, 0x3c]):
            # UTF-32BE
            sniffed_xml_encoding = u'utf-32be'
            xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
        elif xml_data[:4] == _l2bytes([0x3c, 0x00, 0x00, 0x00]):
            # UTF-32LE
            sniffed_xml_encoding = u'utf-32le'
            xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
        elif xml_data[:4] == _l2bytes([0x00, 0x00, 0xfe, 0xff]):
            # UTF-32BE with BOM
            sniffed_xml_encoding = u'utf-32be'
            xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
        elif xml_data[:4] == _l2bytes([0xff, 0xfe, 0x00, 0x00]):
            # UTF-32LE with BOM
            sniffed_xml_encoding = u'utf-32le'
            xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
        elif xml_data[:3] == _l2bytes([0xef, 0xbb, 0xbf]):
            # UTF-8 with BOM
            sniffed_xml_encoding = u'utf-8'
            xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
        else:
            # ASCII-compatible
            pass
        # xml_data is now ASCII-compatible, so the encoding attribute of
        # the XML declaration (if any) can be read with a plain regex.
        xml_encoding_match = re.compile(_s2bytes('^<\?.*encoding=[\'"](.*?)[\'"].*\?>')).match(xml_data)
    except UnicodeDecodeError:
        xml_encoding_match = None
    if xml_encoding_match:
        xml_encoding = xml_encoding_match.groups()[0].decode('utf-8').lower()
        # If the declaration names a multi-byte family (UTF-16/UTF-32/UCS)
        # the sniffed byte order is more specific; prefer it.
        if sniffed_xml_encoding and (xml_encoding in (u'iso-10646-ucs-2', u'ucs-2', u'csunicode', u'iso-10646-ucs-4', u'ucs-4', u'csucs4', u'utf-16', u'utf-32', u'utf_16', u'utf_32', u'utf16', u'u16')):
            xml_encoding = sniffed_xml_encoding
    acceptable_content_type = 0
    application_content_types = (u'application/xml', u'application/xml-dtd', u'application/xml-external-parsed-entity')
    text_content_types = (u'text/xml', u'text/xml-external-parsed-entity')
    if (http_content_type in application_content_types) or \
            (http_content_type.startswith(u'application/') and http_content_type.endswith(u'+xml')):
        acceptable_content_type = 1
        true_encoding = http_encoding or xml_encoding or u'utf-8'
    # NOTE: 'and' binds tighter than 'or', so the next condition parses as
    # A or (B and C) -- mirroring the application/* branch above despite
    # the misleading parenthesization.
    elif (http_content_type in text_content_types) or \
            (http_content_type.startswith(u'text/')) and http_content_type.endswith(u'+xml'):
        acceptable_content_type = 1
        true_encoding = http_encoding or u'us-ascii'
    elif http_content_type.startswith(u'text/'):
        true_encoding = http_encoding or u'us-ascii'
    elif http_headers and (not (http_headers.has_key('content-type') or http_headers.has_key('Content-type'))):
        true_encoding = xml_encoding or u'iso-8859-1'
    else:
        true_encoding = xml_encoding or u'utf-8'
    # some feeds claim to be gb2312 but are actually gb18030.
    # apparently MSIE and Firefox both do the following switch:
    if true_encoding.lower() == u'gb2312':
        true_encoding = u'gb18030'
    return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
+
def _toUTF8(data, encoding):
    '''Changes an XML data stream on the fly to specify a new encoding

    data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
    encoding is a string recognized by encodings.aliases

    Returns the document re-encoded as UTF-8 bytes with a matching XML
    declaration.  Raises UnicodeDecodeError/LookupError if the data is not
    actually decodable as the given (or BOM-detected) encoding.
    '''
    # strip Byte Order Mark (if present).  A recognized BOM also overrides
    # the caller-supplied encoding, since it unambiguously identifies the
    # byte order actually used.
    if (len(data) >= 4) and (data[:2] == _l2bytes([0xfe, 0xff])) and (data[2:4] != _l2bytes([0x00, 0x00])):
        encoding = 'utf-16be'
        data = data[2:]
    elif (len(data) >= 4) and (data[:2] == _l2bytes([0xff, 0xfe])) and (data[2:4] != _l2bytes([0x00, 0x00])):
        # The != 00 00 guard keeps a UTF-32LE BOM (ff fe 00 00) from being
        # misread as UTF-16LE; it falls through to the utf-32le case below.
        encoding = 'utf-16le'
        data = data[2:]
    elif data[:3] == _l2bytes([0xef, 0xbb, 0xbf]):
        encoding = 'utf-8'
        data = data[3:]
    elif data[:4] == _l2bytes([0x00, 0x00, 0xfe, 0xff]):
        encoding = 'utf-32be'
        data = data[4:]
    elif data[:4] == _l2bytes([0xff, 0xfe, 0x00, 0x00]):
        encoding = 'utf-32le'
        data = data[4:]
    newdata = unicode(data, encoding)
    # Replace (or prepend) the XML declaration so it advertises utf-8.
    declmatch = re.compile('^<\?xml[^>]*?>')
    newdecl = '''<?xml version='1.0' encoding='utf-8'?>'''
    if declmatch.search(newdata):
        newdata = declmatch.sub(newdecl, newdata)
    else:
        newdata = newdecl + u'\n' + newdata
    return newdata.encode('utf-8')
+
def _stripDoctype(data):
    '''Strips DOCTYPE from XML document, returns (rss_version, stripped_data)

    rss_version may be 'rss091n' or None
    stripped_data is the same XML document, minus the DOCTYPE

    Also returns, as a third element, a dict mapping the names of 'safe'
    inline entity definitions found in the DOCTYPE to their replacement
    text.
    '''
    # Split at the first element: 'head' is everything up to and including
    # the '<' of the first tag, 'data' is the remainder.
    start = re.search(_s2bytes('<\w'), data)
    start = start and start.start() or -1
    head,data = data[:start+1], data[start+1:]

    entity_pattern = re.compile(_s2bytes(r'^\s*<!ENTITY([^>]*?)>'), re.MULTILINE)
    entity_results=entity_pattern.findall(head)
    head = entity_pattern.sub(_s2bytes(''), head)
    doctype_pattern = re.compile(_s2bytes(r'^\s*<!DOCTYPE([^>]*?)>'), re.MULTILINE)
    doctype_results = doctype_pattern.findall(head)
    doctype = doctype_results and doctype_results[0] or _s2bytes('')
    # Netscape's RSS 0.91 DOCTYPE identifies the nonstandard dialect.
    if doctype.lower().count(_s2bytes('netscape')):
        version = u'rss091n'
    else:
        version = None

    # only allow in 'safe' inline entity definitions
    replacement=_s2bytes('')
    if len(doctype_results)==1 and entity_results:
        # An entity is 'safe' if its value is a single character reference
        # or contains no '&' / '"' characters (no nested expansion).
        safe_pattern=re.compile(_s2bytes('\s+(\w+)\s+"(&#\w+;|[^&"]*)"'))
        safe_entities=filter(lambda e: safe_pattern.match(e),entity_results)
        if safe_entities:
            replacement=_s2bytes('<!DOCTYPE feed [\n <!ENTITY') + _s2bytes('>\n <!ENTITY ').join(safe_entities) + _s2bytes('>\n]>')
    data = doctype_pattern.sub(replacement, head) + data

    # safe_pattern is referenced below only when 'replacement' is truthy,
    # which can only happen via the branch above that defines it.
    return version, data, dict(replacement and [(k.decode('utf-8'), v.decode('utf-8')) for k, v in safe_pattern.findall(replacement)])
+
def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, response_headers=None):
    '''Parse a feed from a URL, file, stream, or string.

    request_headers, if given, is a dict from http header name to value to add
    to the request; this overrides internally generated values.

    Returns a FeedParserDict with (at least) 'feed', 'entries' and 'bozo'
    keys.  'bozo' is 1 when something went wrong (download failure,
    non-XML content type, undecodable or not-well-formed data), in which
    case 'bozo_exception' holds the offending exception.
    '''

    if handlers is None:
        handlers = []
    if request_headers is None:
        request_headers = {}
    if response_headers is None:
        response_headers = {}

    result = FeedParserDict()
    result['feed'] = FeedParserDict()
    result['entries'] = []
    result['bozo'] = 0
    if not isinstance(handlers, list):
        handlers = [handlers]
    # Download (or open) the resource; any failure is reported via bozo
    # rather than raised to the caller.
    try:
        f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers)
        data = f.read()
    except Exception, e:
        result['bozo'] = 1
        result['bozo_exception'] = e
        data = None
        f = None

    if hasattr(f, 'headers'):
        result['headers'] = dict(f.headers)
    # overwrite existing headers using response_headers
    if 'headers' in result:
        result['headers'].update(response_headers)
    elif response_headers:
        result['headers'] = copy.deepcopy(response_headers)

    # if feed is gzip-compressed, decompress it
    if f and data and 'headers' in result:
        if gzip and 'gzip' in (result['headers'].get('content-encoding'), result['headers'].get('Content-Encoding')):
            try:
                data = gzip.GzipFile(fileobj=_StringIO(data)).read()
            except (IOError, struct.error), e:
                # IOError can occur if the gzip header is bad
                # struct.error can occur if the data is damaged
                # Some feeds claim to be gzipped but they're not, so
                # we get garbage. Ideally, we should re-request the
                # feed without the 'Accept-encoding: gzip' header,
                # but we don't.
                result['bozo'] = 1
                result['bozo_exception'] = e
                data = None
        elif zlib and 'deflate' in (result['headers'].get('content-encoding'), result['headers'].get('Content-Encoding')):
            try:
                data = zlib.decompress(data)
            except zlib.error, e:
                result['bozo'] = 1
                result['bozo_exception'] = e
                data = None

    # save HTTP headers
    if 'headers' in result:
        if 'etag' in result['headers'] or 'ETag' in result['headers']:
            etag = result['headers'].get('etag', result['headers'].get('ETag', u''))
            if not isinstance(etag, unicode):
                etag = etag.decode('utf-8', 'ignore')
            if etag:
                result['etag'] = etag
        if 'last-modified' in result['headers'] or 'Last-Modified' in result['headers']:
            modified = result['headers'].get('last-modified', result['headers'].get('Last-Modified'))
            if modified:
                result['modified'] = _parse_date(modified)
    if hasattr(f, 'url'):
        if not isinstance(f.url, unicode):
            result['href'] = f.url.decode('utf-8', 'ignore')
        else:
            result['href'] = f.url
        # Default status; overridden below if the handler reports one.
        result['status'] = 200
    if hasattr(f, 'status'):
        result['status'] = f.status
    if hasattr(f, 'close'):
        f.close()

    if data is None:
        return result

    # there are four encodings to keep track of:
    # - http_encoding is the encoding declared in the Content-Type HTTP header
    # - xml_encoding is the encoding declared in the <?xml declaration
    # - sniffed_encoding is the encoding sniffed from the first 4 bytes of the XML data
    # - result['encoding'] is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
    http_headers = result.get('headers', {})
    result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type = \
        _getCharacterEncoding(http_headers, data)
    if http_headers and (not acceptable_content_type):
        if http_headers.has_key('content-type') or http_headers.has_key('Content-type'):
            bozo_message = '%s is not an XML media type' % http_headers.get('content-type', http_headers.get('Content-type'))
        else:
            bozo_message = 'no Content-type specified'
        result['bozo'] = 1
        result['bozo_exception'] = NonXMLContentType(bozo_message)

    if data is not None:
        result['version'], data, entities = _stripDoctype(data)

    # ensure that baseuri is an absolute uri using an acceptable URI scheme
    contentloc = http_headers.get('content-location', http_headers.get('Content-Location', u''))
    href = result.get('href', u'')
    baseuri = _makeSafeAbsoluteURI(href, contentloc) or _makeSafeAbsoluteURI(contentloc) or href

    baselang = http_headers.get('content-language', http_headers.get('Content-Language', None))
    if not isinstance(baselang, unicode) and baselang is not None:
        baselang = baselang.decode('utf-8', 'ignore')

    # if server sent 304, we're done
    if result.get('status', 0) == 304:
        result['version'] = u''
        result['debug_message'] = 'The feed has not changed since you last checked, ' + \
            'so the server sent no data. This is a feature, not a bug!'
        return result

    # if there was a problem downloading, we're done
    if data is None:
        return result

    # determine character encoding
    use_strict_parser = 0
    known_encoding = 0
    tried_encodings = []
    # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM
    for proposed_encoding in (result['encoding'], xml_encoding, sniffed_xml_encoding):
        if not proposed_encoding:
            continue
        if proposed_encoding in tried_encodings:
            continue
        tried_encodings.append(proposed_encoding)
        try:
            data = _toUTF8(data, proposed_encoding)
        except (UnicodeDecodeError, LookupError):
            pass
        else:
            known_encoding = use_strict_parser = 1
            break
    # if no luck and we have auto-detection library, try that
    if (not known_encoding) and chardet:
        proposed_encoding = chardet.detect(data)['encoding']
        if proposed_encoding and (proposed_encoding not in tried_encodings):
            tried_encodings.append(proposed_encoding)
            try:
                data = _toUTF8(data, proposed_encoding)
            except (UnicodeDecodeError, LookupError):
                pass
            else:
                known_encoding = use_strict_parser = 1
    # if still no luck and we haven't tried utf-8 yet, try that
    if (not known_encoding) and (u'utf-8' not in tried_encodings):
        proposed_encoding = u'utf-8'
        tried_encodings.append(proposed_encoding)
        try:
            data = _toUTF8(data, proposed_encoding)
        except UnicodeDecodeError:
            pass
        else:
            known_encoding = use_strict_parser = 1
    # if still no luck and we haven't tried windows-1252 yet, try that
    if (not known_encoding) and (u'windows-1252' not in tried_encodings):
        proposed_encoding = u'windows-1252'
        tried_encodings.append(proposed_encoding)
        try:
            data = _toUTF8(data, proposed_encoding)
        except UnicodeDecodeError:
            pass
        else:
            known_encoding = use_strict_parser = 1
    # if still no luck and we haven't tried iso-8859-2 yet, try that.
    if (not known_encoding) and (u'iso-8859-2' not in tried_encodings):
        proposed_encoding = u'iso-8859-2'
        tried_encodings.append(proposed_encoding)
        try:
            data = _toUTF8(data, proposed_encoding)
        except UnicodeDecodeError:
            pass
        else:
            known_encoding = use_strict_parser = 1
    # if still no luck, give up
    if not known_encoding:
        result['bozo'] = 1
        result['bozo_exception'] = CharacterEncodingUnknown( \
            'document encoding unknown, I tried ' + \
            '%s, %s, utf-8, windows-1252, and iso-8859-2 but nothing worked' % \
            (result['encoding'], xml_encoding))
        result['encoding'] = u''
    elif proposed_encoding != result['encoding']:
        # We could decode the data, but only with an encoding other than
        # the declared one: record the override and flag the feed as bozo.
        result['bozo'] = 1
        result['bozo_exception'] = CharacterEncodingOverride( \
            'document declared as %s, but parsed as %s' % \
            (result['encoding'], proposed_encoding))
        result['encoding'] = proposed_encoding

    if not _XML_AVAILABLE:
        use_strict_parser = 0
    if use_strict_parser:
        # initialize the SAX parser
        feedparser = _StrictFeedParser(baseuri, baselang, 'utf-8')
        saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)
        saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
        try:
            # disable downloading external doctype references, if possible
            saxparser.setFeature(xml.sax.handler.feature_external_ges, 0)
        except xml.sax.SAXNotSupportedException:
            pass
        saxparser.setContentHandler(feedparser)
        saxparser.setErrorHandler(feedparser)
        source = xml.sax.xmlreader.InputSource()
        source.setByteStream(_StringIO(data))
        if hasattr(saxparser, '_ns_stack'):
            # work around bug in built-in SAX parser (doesn't recognize xml: namespace)
            # PyXML doesn't have this problem, and it doesn't have _ns_stack either
            saxparser._ns_stack.append({'http://www.w3.org/XML/1998/namespace':'xml'})
        try:
            saxparser.parse(source)
        except xml.sax.SAXParseException, e:
            # Strict parse failed; fall through to the loose parser below.
            result['bozo'] = 1
            result['bozo_exception'] = feedparser.exc or e
            use_strict_parser = 0
    if not use_strict_parser and _SGML_AVAILABLE:
        feedparser = _LooseFeedParser(baseuri, baselang, 'utf-8', entities)
        feedparser.feed(data.decode('utf-8', 'replace'))
    result['feed'] = feedparser.feeddata
    result['entries'] = feedparser.entries
    result['version'] = result['version'] or feedparser.version
    result['namespaces'] = feedparser.namespacesInUse
    return result
--- /dev/null
+#!/usr/bin/env python2.5
+
+# Copyright (c) 2011 Neal H. Walfield <neal@walfield.org>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import urllib2
+import httplib
+import time
+import logging
+logger = logging.getLogger(__name__)
+
class ProgressSocket(object):
    """
    Monitor what is being sent and received.

    Wraps a socket (or the file-like object returned by makefile) and
    mirrors byte counts into the owning connection's 'stats' dict and the
    opener's 'stats' dict, invoking the connection's callback after each
    chunk of I/O.
    """
    def __init__(self, socket, connection):
        # The wrapped socket (or file object) and the
        # HTTPProgressConnection whose stats/callback we update.
        self.socket = socket
        self.connection = connection

    def __getattribute__(self, attr):
        # logger.debug("%s.__getattribute__(%s)"
        # % (self.__class__.__name__, attr))

        def send(data):
            # 100k at a time, so the callback fires periodically during
            # large uploads instead of once at the end.
            bs = 100 * 1024
            sent = 0
            while sent < len (data):
                remaining = len (data) - sent
                if remaining < bs:
                    amount = remaining
                else:
                    amount = bs

                self.socket.sendall(data[sent:sent+amount])
                sent += amount
                # Account bytes both per-connection and per-opener.
                self.connection.stats['sent'] += amount
                self.connection.opener.stats['sent'] += amount

                if self.connection.callback is not None:
                    self.connection.callback ()

        def read(*args, **kwargs):
            data = self.socket.read (*args, **kwargs)
            # print "GOT: %s" % (data[0:240],)
            self.connection.stats['received'] += len (data)
            self.connection.opener.stats['received'] += len (data)
            if self.connection.callback is not None:
                self.connection.callback ()
            return data

        # Intercept the I/O entry points; 'send' and 'sendall' share the
        # chunked implementation above (it calls sendall internally).
        if attr == 'send' or attr == 'sendall':
            return send
        if attr == 'read':
            return read

        try:
            # Our own attributes first (socket, connection, makefile, close).
            return super (ProgressSocket, self).__getattribute__(attr)
        except AttributeError:
            # Anything else is delegated to the wrapped socket.
            socket = super (ProgressSocket, self).__getattribute__('socket')
            return socket.__getattribute__(attr)

    def makefile(self, mode, bufsize):
        # Wrap the returned file object too, so reads through it are
        # also counted.
        return ProgressSocket (socket=self.socket.makefile(mode, bufsize),
                               connection=self.connection)

    def close(self):
        return self.socket.close ()
+
def HTTPProgressConnectionBuilder(callback, opener):
    """
    Return an httplib.HTTPConnection subclass bound to *callback* and
    *opener*.

    The returned class wraps its socket in a ProgressSocket, so every
    send/read updates the connection's and the opener's byte counters and
    invokes *callback*.
    """
    class HTTPProgressConnection(httplib.HTTPConnection):
        def __init__(self, *args, **kwargs):
            # Remember the request's method/URL for progress reporting.
            self.method = None
            self.url = None
            return httplib.HTTPConnection.__init__ (self, *args, **kwargs)

        def putrequest(self, method, url, *args, **kwargs):
            self.method = method
            self.url = url
            return httplib.HTTPConnection.putrequest (
                self, method, url, *args, **kwargs)

        def connect(self):
            httplib.HTTPConnection.connect(self)
            # Wrap the socket.
            self.sock = ProgressSocket(socket=self.sock,
                                       connection=self)

    HTTPProgressConnection.callback = callback
    HTTPProgressConnection.opener = opener
    # NOTE: 'stats' is a class attribute, so all connections created from
    # this particular returned class share a single counter dictionary.
    HTTPProgressConnection.stats \
        = {'sent': 0, 'received': 0, 'started':time.time()}
    return HTTPProgressConnection
+
class HTTPProgressHandler(urllib2.HTTPHandler):
    """urllib2 HTTP handler whose connections report transfer progress.

    Each connection opened through this handler updates the handler's
    cumulative 'stats' counters and invokes *callback* as bytes move.
    """

    def __init__(self, callback):
        # Opener-wide byte counters, shared with every connection class
        # built in http_open below.
        self.callback = callback
        self.stats = {'sent': 0, 'received': 0, 'started': time.time()}
        urllib2.HTTPHandler.__init__(self)

    def http_open(self, request):
        # Build a connection class bound to this handler and its callback,
        # and let urllib2 drive it.
        connection_factory = HTTPProgressConnectionBuilder(self.callback, self)
        return self.do_open(connection_factory, request)
+
+if __name__ == '__main__':
+ def callback(connection):
+ req = ""
+ if connection.method:
+ req += connection.method + " "
+ req += connection.host + ':' + str (connection.port)
+ if connection.url:
+ req += connection.url
+
+ cstats = connection.stats
+ ostats = connection.opener.stats
+
+ print(
+ ("%s: connection: %d sent, %d received: %d kb/s; "
+ + "opener: %d sent, %d received, %d kb/s")
+ % (req,
+ cstats['sent'], cstats['received'],
+ ((cstats['sent'] + cstats['received'])
+ / (time.time() - cstats['started']) / 1024),
+ ostats['sent'], ostats['received'],
+ ((ostats['sent'] + ostats['received'])
+ / (time.time() - ostats['started']) / 1024)))
+
+ opener = urllib2.build_opener(HTTPProgressHandler(callback))
+
+ data = opener.open ('http://google.com')
+ downloaded = 0
+ for d in data:
+ downloaded += len (d)
+ print "Document is %d bytes in size" % (downloaded,)
--- /dev/null
+#!/usr/bin/env python2.5
+
+# Copyright (c) 2011 Neal H. Walfield <neal@walfield.org>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import threading
+import thread
+import traceback
+import heapq
+import sys
+import mainthread
+import logging
+logger = logging.getLogger(__name__)
+
def debug(*args):
    """Log *args* (joined with spaces) at debug level; currently disabled."""
    # Tracing is compiled out; flip this flag to re-enable it.
    enabled = False
    if enabled:
        logger.debug(' '.join(args))
+
# The default priority. Like nice(), a smaller numeric priority
# corresponds to a higher priority class.  Used as the default for
# _JobManager.execute().
default_priority = 0
+
class JobRunner(threading.Thread):
    """Worker thread that drains jobs from a _JobManager's queue.

    Runs until the queue is empty, the manager is paused or quitting, or
    there are more runners than num_threads allows.
    """
    def __init__(self, job_manager):
        threading.Thread.__init__(self)
        self.job_manager = job_manager

    def run (self):
        # 'have_lock' tracks whether this thread currently holds the
        # manager's lock, so the finally clause releases it exactly once.
        have_lock = True
        self.job_manager.lock.acquire ()
        try:
            while (self.job_manager.pause == 0
                   and not self.job_manager.do_quit
                   and (len (self.job_manager.threads)
                        <= self.job_manager.num_threads)):
                try:
                    # Queue items are [[priority, seq], key, job]; heappop
                    # yields the numerically smallest (= highest) priority.
                    _, key, job = heapq.heappop (self.job_manager.queue)
                except IndexError:
                    # Queue drained: exit (thread removal happens in the
                    # outer finally).
                    return

                try:
                    self.job_manager.in_progress.append (key)
                    # Run the job without holding the lock.
                    self.job_manager.lock.release ()
                    have_lock = False

                    # Execute the job.
                    try:
                        job ()
                    except KeyboardInterrupt:
                        # This is handled below and doesn't require a
                        # traceback.
                        raise
                    except:
                        # Log and keep going: one failing job must not
                        # kill the runner.
                        print ("Executing job %s (%s) from thread %s: %s"
                               % (str (key), str (job),
                                  threading.currentThread(),
                                  traceback.format_exc ()))

                    self.job_manager.lock.acquire ()
                    have_lock = True

                    assert key in self.job_manager.in_progress
                finally:
                    try:
                        self.job_manager.in_progress.remove (key)
                    except ValueError:
                        pass

                debug("Finished executing job %s (%s)" % (key, job,))

                self.job_manager._stats_hooks_run ({'job':job, 'key':key})
        except KeyboardInterrupt:
            # Forward Ctrl-C raised inside a job to the main thread.
            debug("%s: KeyboardInterrupt" % threading.currentThread())
            thread.interrupt_main()
            debug("%s: Forwarded KeyboardInterrupt to main thread"
                  % threading.currentThread())
        finally:
            if have_lock:
                self.job_manager.lock.release ()

            # Deregister this runner so tickle() can spawn replacements.
            assert self in self.job_manager.threads
            self.job_manager.threads.remove (self)

            debug ("Job runner %s (%d left) exiting."
                   % (threading.currentThread(),
                      len (self.job_manager.threads)))
+
# Module-level singleton; created lazily by JobManager().
_jm = None
def JobManager(start=False):
    """
    Return the job manager instance.  The job manager will not start
    executing jobs until this is called with start set to True.  Note:
    you can still queue jobs.
    """
    global _jm
    if _jm is None:
        _jm = _JobManager()

    manager = _jm
    if start and not manager.started:
        manager.started = True
        # Flush stats and kick off runners for jobs queued before the
        # manager was allowed to run.
        if manager.jobs > 0:
            manager._stats_hooks_run()
        manager.tickle()

    return manager
+
+class _JobManager(object):
+ def __init__(self, started=False, num_threads=4):
+ """
+ Initialize the job manager.
+
+ If started is false, jobs may be queued, but jobs will not be
+ started until start() is called.
+ """
+ # A reentrant lock so that a job runner can call stat without
+ # dropping the lock.
+ self.lock = threading.RLock()
+
+ # If we can start executing jobs.
+ self.started = started
+
+ # The maximum number of threads to use for executing jobs.
+ self._num_threads = num_threads
+
+ # List of jobs (priority, key, job) that are queued for
+ # execution.
+ self.queue = []
+ # List of keys of the jobs that are being executed.
+ self.in_progress = []
+ # List of threads.
+ self.threads = []
+
+ # If 0, jobs may execute, otherwise, job execution is paused.
+ self.pause = 0
+
+ # The total number of jobs that this manager ever executed.
+ self.jobs = 0
+
+ # A list of status hooks to execute when the stats change.
+ self._stats_hooks = []
+ self._current_stats = self.stats ()
+
+ self.do_quit = False
+
+ def _lock(f):
+ def wrapper(*args, **kwargs):
+ self = args[0]
+ self.lock.acquire ()
+ try:
+ return f(*args, **kwargs)
+ finally:
+ self.lock.release()
+ return wrapper
+
+ def get_num_threads(self):
+ return self._num_threads
+ def set_num_threads(self, value):
+ self._num_threads = value
+ self.tickle ()
+ num_threads = property(get_num_threads, set_num_threads)
+
+ @_lock
+ def start(self):
+ """
+ Start executing jobs.
+ """
+ if self.started:
+ return
+ if self.jobs > 0:
+ self._stats_hooks_run ()
+ self.tickle ()
+
+ @_lock
+ def tickle(self):
+ """
+ Ensure that there are enough job runners for the number of
+ pending jobs.
+ """
+ if self.do_quit:
+ debug("%s.quit called, not creating new threads."
+ % self.__class__.__name__)
+ return
+
+ if self.pause > 0:
+ # Job execution is paused. Don't start any new threads.
+ debug("%s.tickle(): Not doing anything: paused"
+ % (self.__class__.__name__))
+ return
+
+ debug("%s.tickle: Have %d threads (can start %d); %d jobs queued"
+ % (self.__class__.__name__,
+ len (self.threads), self.num_threads, len (self.queue)))
+ if len (self.threads) < self.num_threads:
+ for _ in range (min (len (self.queue),
+ self.num_threads - len (self.threads))):
+ thread = JobRunner (self)
+ # Setting threads as daemons means faster shutdown
+ # when the main thread exists, but it results in
+ # exceptions and occassional setfaults.
+ # thread.setDaemon(True)
+ self.threads.append (thread)
+ thread.start ()
+ debug("Now have %d threads" % len (self.threads))
+
+ @_lock
+ def execute(self, job, key=None, priority=default_priority):
+ """
+ Enqueue a job for execution. job is a function to execute.
+ If key is not None, the job is only enqueued if there is no
+ job that is inprogress or enqueued with the same key.
+ priority is the job's priority. Like nice(), a smaller
+ numeric priority corresponds to a higher priority class. Jobs
+ are executed highest priority first, in the order that they
+ were added.
+ """
+ if self.do_quit:
+ debug("%s.quit called, not enqueuing new jobs."
+ % self.__class__.__name__)
+
+ if key is not None:
+ if key in self.in_progress:
+ return
+ for item in self.queue:
+ if item[1] == key:
+ if item[0][0] < priority:
+ # Priority raised.
+ item[0][0] = priority
+ self.queue = heapq.heapify (self.queue)
+ return
+
+ # To ensure that jobs with the same priority are executed
+ # in the order they are added, we set the priority to
+ # [priority, next (monotomic counter)].
+ self.jobs += 1
+ heapq.heappush (self.queue, [[priority, self.jobs], key, job])
+
+ if self.started:
+ self._stats_hooks_run ()
+ self.tickle ()
+ else:
+ debug("%s not initialized. delaying execution of %s (%s)"
+ % (self.__class__.__name__, key, str (job),))
+
+ @_lock
+ def pause(self):
+ """
+ Increasement the pause count. When the pause count is greater
+ than 0, job execution is suspended.
+ """
+ self.pause += 1
+
+ if self.pause == 1:
+ self._stats_hooks_run ()
+
+ @_lock
+ def resume(self):
+ """
+ Decrement the pause count. If the pause count is greater than
+ 0 and this decrement brings it to 0, enqueued jobs are
+ resumed.
+ """
+ assert self.pause > 0
+ self.pause -= 1
+ if not self.paused():
+ self._stats_hooks_run ()
+ self.tickle ()
+
+ @_lock
+ def paused(self):
+ """
+ Returns whether job execution is paused.
+ """
+ return self.pause > 0
+
+ @_lock
+ def cancel(self):
+ """
+ Cancel any pending jobs.
+ """
+ self.queue = []
+ self._stats_hooks_run ()
+
+ def quit(self):
+ self.cancel ()
+ self.do_quit = True
+
+ @_lock
+ def stats(self):
+ """
+ Return a dictionary consisting of:
+
+ - 'paused': whether execution is paused
+ - 'jobs': the total number of jobs this manager has
+ executed, is executing or are queued
+ - 'jobs-completed': the numer of jobs that have completed
+ - 'jobs-in-progress': the number of jobs in progress
+ - 'jobs-queued': the number of jobs currently queued
+ """
+ return {'paused': self.paused(),
+ 'jobs': self.jobs,
+ 'jobs-completed':
+ self.jobs - len (self.in_progress) - len (self.queue),
+ 'jobs-in-progress': len (self.in_progress),
+ 'jobs-queued': len (self.queue)
+ }
+
+ def stats_hook_register(self, func, *args, **kwargs):
+ """
+ Registers a function to be called when the job status changes.
+ Passed the following parameters:
+
+ - the JobManager instance.
+ - the previous stats (as returned by stats)
+ - the current stats
+ - the job that was completed (or None)
+
+ Note: the hook may not be run in the main thread!
+ """
+ mainthread=False
+ try:
+ mainthread = kwargs['run_in_main_thread']
+ del kwargs['run_in_main_thread']
+ except KeyError:
+ pass
+ self._stats_hooks.append ([func, mainthread, args, kwargs])
+
+ def _stats_hooks_run(self, completed_job=None):
+ """
+ Run the stats hooks.
+ """
+ # if not self._stats_hooks:
+ # return
+
+ self.lock.acquire ()
+ try:
+ old_stats = self._current_stats
+ self._current_stats = self.stats ()
+ current_stats = self._current_stats
+ finally:
+ self.lock.release ()
+
+ debug("%s -> %s" % (str (old_stats), str (current_stats)))
+
+ for (f, run_in_main_thread, args, kwargs) in self._stats_hooks:
+ if run_in_main_thread:
+ debug("JobManager._stats_hooks_run: Running %s in main thread"
+ % f)
+ mainthread.execute(
+ f, self, old_stats, current_stats, completed_job,
+ async=True, *args, **kwargs)
+ else:
+ debug("JobManager._stats_hooks_run: Running %s in any thread"
+ % f)
+ f(self, old_stats, current_stats, completed_job,
+ *args, **kwargs)
--- /dev/null
+#!/usr/bin/env python2.5
+
+# Copyright (c) 2011 Neal H. Walfield <neal@walfield.org>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import threading
+import traceback
+import logging
+logger = logging.getLogger(__name__)
+
+_run_in_main_thread = None
+_main_thread = None
+
def init(run_in_main_thread=None):
    """
    Remember which thread is the main thread and how to schedule a
    callable on it.  May only be called once.

    run_in_main_thread is a function taking a single argument, a
    callable; it must arrange for that callable to be run in the main
    thread (the callable returns False so gobject-style idle handlers
    fire only once).  If you are using glib, gobject.idle_add (the
    default) is sufficient.  (gobject.idle_add is thread-safe.)
    """
    global _run_in_main_thread, _main_thread

    if run_in_main_thread is None:
        import gobject
        run_in_main_thread = gobject.idle_add

    assert _run_in_main_thread is None
    _run_in_main_thread = run_in_main_thread
    _main_thread = threading.currentThread ()
+
def execute(func, *args, **kwargs):
    """
    Execute FUNC in the main thread.

    If kwargs['async'] exists and is True, the function is executed
    asynchronously (i.e., the thread does not wait for the function to
    return in which case the function's return value is discarded).
    Otherwise, this function waits until the function is executed and
    returns its return value.
    """
    # NOTE: 'async' is a legal identifier in Python 2 (this module's
    # target); it became a reserved word in Python 3.7.
    async = False
    try:
        async = kwargs['async']
        del kwargs['async']
    except KeyError:
        pass

    if threading.currentThread() == _main_thread:
        # Already in the main thread: just call func directly.
        if async:
            try:
                func (*args, **kwargs)
            except:
                # Best effort: exceptions from an async call are only
                # logged, never propagated.
                logger.debug("mainthread.execute: Executing %s: %s"
                             % (func, traceback.format_exc ()))
            return
        else:
            return func (*args, **kwargs)

    assert _run_in_main_thread is not None, \
        "You can't call this function from a non-main thread until you've called init()"

    if not async:
        cond = threading.Condition()

    result = {}
    result['done'] = False

    def doit():
        def it():
            # Execute the function.
            assert threading.currentThread() == _main_thread

            try:
                result['result'] = func (*args, **kwargs)
            except:
                logger.debug("mainthread.execute: Executing %s: %s"
                             % (func, traceback.format_exc ()))

            if not async:
                cond.acquire ()
            result['done'] = True
            if not async:
                # Wake the waiting caller.
                cond.notify ()
                cond.release ()

            # Returning False removes a gobject idle handler after one
            # invocation.
            return False
        return it

    if not async:
        cond.acquire ()
    _run_in_main_thread (doit())

    if async:
        # Don't wait for the method to complete execution.
        return

    # Wait for the result to become available.
    while not result['done']:
        cond.wait ()

    return result.get ('result', None)
+
+if __name__ == "__main__":
+ import sys
+ import gobject
+
+ init()
+
+ def in_main_thread(test_num):
+ assert threading.currentThread() == _main_thread, \
+ "Test %d failed" % (test_num,)
+ return test_num
+
+ mainloop = gobject.MainLoop()
+ gobject.threads_init()
+
+ assert execute (in_main_thread, 1) == 1
+ assert (execute (in_main_thread, 2, async=False) == 2)
+ execute (in_main_thread, 3, async=True)
+
+ class T(threading.Thread):
+ def __init__(self):
+ threading.Thread.__init__(self)
+
+ def run(self):
+ assert threading.currentThread() != _main_thread
+
+ assert execute (in_main_thread, 4) == 4
+ assert (execute (in_main_thread, 5, async=False) == 5)
+ execute (in_main_thread, 6, async=True)
+ execute (mainloop.quit, async=False)
+
+ def start_thread():
+ t = T()
+ t.start()
+ return False
+
+ gobject.idle_add (start_thread)
+ mainloop.run()
+
def mainthread(f):
    """
    Decorator: arrange for the decorated function to always run in the
    main thread via execute() (synchronously: the caller waits for and
    receives the return value).
    """
    from functools import wraps

    # functools.wraps preserves f's __name__/__doc__ on the wrapper,
    # which the original version lost.
    @wraps(f)
    def wrapper(*args, **kwargs):
        return execute (f, *args, **kwargs)
    return wrapper
--- /dev/null
+#!/usr/bin/env python2.5
+
+#
+# Copyright (c) 2007-2008 INdT.
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+# ============================================================================
+# Name : FeedingIt.py
+# Author : Yves Marcoz
+# Version : 0.2.2
+# Description : Simple RSS Reader
+# ============================================================================
+
+from xml.dom.minidom import parse, parseString
+import urllib2
+import gtk
+import hildon
+import gobject
+import time
+from os.path import isfile, dirname
+import gobject
+import logging
+logger = logging.getLogger(__name__)
+
class ExportOpmlData():
    """
    Modal helper: ask the user for a destination file (Hildon file
    chooser) and write the feed listing to it as an OPML document.
    """
    def __init__(self, parent, listing):
        # parent: parent gtk window.
        # listing: feed listing object; must provide getListOfFeeds,
        # getFeedTitle and getFeedUrl (see getOpmlText below).
        fs = hildon.FileSystemModel()
        dialog = hildon.FileChooserDialog(parent, gtk.FILE_CHOOSER_ACTION_SAVE, fs)
        #(gtk.STOCK_CANCEL, gtk.RESPONSE_CANCEL,
        #gtk.STOCK_SAVE, gtk.RESPONSE_OK))
        #)
        #dialog = gobject.new(hildon.FileChooserDialog, \
        #        action=gtk.FILE_CHOOSER_ACTION_SAVE)
        #dialog.set_default_response(gtk.RESPONSE_OK)
        #dialog.set_property('autonaming',False)
        #dialog.set_property('show-files',True)
        dialog.set_current_folder('/home/user/MyDocs/')
        dialog.set_current_name('feedingit-export')
        dialog.set_extension('opml')
        response = dialog.run()
        dialog.hide()
        if response == gtk.RESPONSE_OK:
            filename = dialog.get_filename()
            logger.debug("ExportOpmlData: %s" % filename)
            #try:

            cont = True
            if isfile(filename):
                note = "File already exists. Aborted"
                confirm = hildon.Note ("confirmation", parent, "File already exists. Are you sure you want to overwrite it?", gtk.STOCK_DIALOG_WARNING )
                confirm.set_button_texts ("Yes", "Cancel")
                response = confirm.run()
                confirm.destroy()
                if response == gtk.RESPONSE_OK:
                    cont = True
                else:
                    note = "Operation cancelled."
                    cont = False
            if cont:
                file = open(filename, "w")
                file.write(self.getOpmlText(listing))
                file.close()
                note = "Feeds exported to %s" %filename
            #except:
            # NOTE(review): leftover from the commented-out try/except
            # above -- this unconditionally overwrites 'note'.  Inert
            # today only because the dialog displaying 'note' is also
            # commented out below.
            note = "Failed to export feeds"

            #dialog.destroy()
            #dialog = hildon.Note ("information", parent, note , gtk.STOCK_DIALOG_INFO )
            #dialog.run()
            #dialog.destroy()
        elif response == gtk.RESPONSE_CANCEL:
            dialog.destroy()

    def getOpmlText(self, listing):
        """Return the feed listing serialized as an OPML 1.0 document."""
        # NOTE(review): time_now is computed but never used.
        time_now = time.strftime('%a, %d %b %Y %H:%M:%S GMT', time.gmtime())
        opml_text = """<?xml version="1.0" encoding="UTF-8"?>
<opml version="1.0">
<head>
    <title>Feeding It Export</title>
</head>
<body>
"""
        for key in listing.getListOfFeeds():
            title = listing.getFeedTitle(key)
            url = listing.getFeedUrl(key)
            # The special "Archived Articles" pseudo-feed is not exported.
            if not title == "Archived Articles":
                opml_text += """\n\t\t<outline type="rss" text="%s" title="%s" xmlUrl="%s"/>""" % (self.sanitize(title), self.sanitize(title), self.sanitize(url))
        opml_text += """\n</body>\n</opml>\n"""
        return opml_text

    def sanitize(self, text):
        # Escape XML special characters; non-ASCII characters become
        # numeric character references so the output is pure ASCII.
        from cgi import escape
        return escape(text).encode('ascii', 'xmlcharrefreplace')
+
+
+
class GetOpmlData():
    """
    Modal helper: obtain OPML data either from a local file or by
    downloading a URL, as chosen by the user.
    """
    def __init__(self, parent):
        self.parent = parent
        dialog = hildon.Note ("confirmation", parent, "What type of OPML?", gtk.STOCK_DIALOG_WARNING )
        dialog.set_button_texts ("File", "URL")
        response = dialog.run()
        dialog.destroy()

        if response == gtk.RESPONSE_OK:
            # Choose a file
            self.data = self.askForFile()
        else:
            # Download a URL
            self.data = self.downloadFile()

    def getData(self):
        """
        Show the feed-selection dialog for the fetched OPML and return
        the list of (title, url) pairs the user accepted ([] when
        nothing was fetched or the dialog was cancelled).
        """
        if not self.data == None:
            dialog = OpmlDialog(self.parent, self.data)
            response = dialog.run()
            if response == gtk.RESPONSE_ACCEPT:
                items = dialog.getItems()
            else:
                items = []
            dialog.destroy()
            return items
        return []

    def downloadFile(self):
        """Prompt for a URL and return its raw contents, or None."""
        dlg = gtk.Dialog("Import OPML from web", self.parent, gtk.DIALOG_DESTROY_WITH_PARENT,
                         ('Import', gtk.RESPONSE_OK,
                          gtk.STOCK_CANCEL, gtk.RESPONSE_CANCEL))
        hb = gtk.HBox(False, 5)
        hb.pack_start(gtk.Label('URL:'), expand=False)
        entry = hildon.Entry(0)
        entry.set_text("http://")
        entry.select_region(-1, -1)
        hb.pack_start(entry, expand=True)
        hb.show_all()
        dlg.vbox.pack_start(hb, False)

        resp = dlg.run()
        url = entry.get_text()
        dlg.destroy()
        if resp == gtk.RESPONSE_CANCEL:
            return None
        try:
            f = urllib2.urlopen(url)
            data = f.read()
            f.close()
        except:
            # NOTE(review): bare except -- any download error is
            # silently mapped to None and no error note is shown.
            #Show error note
            return None
        return data

    def askForFile(self):
        """Prompt for a local OPML file and return its contents, or None."""
        #dialog = hildon.FileChooserDialog(self.parent,
        #        gtk.FILE_CHOOSER_ACTION_OPEN)
        #dialog = gobject.new(hildon.FileChooserDialog, \
        #        action=gtk.FILE_CHOOSER_ACTION_OPEN)
        #dialog.set_default_response(gtk.RESPONSE_OK)
        fs = hildon.FileSystemModel()
        dialog = hildon.FileChooserDialog(self.parent, gtk.FILE_CHOOSER_ACTION_OPEN, fs)

        filter = gtk.FileFilter()
        filter.set_name("All files")
        filter.add_pattern("*")
        dialog.add_filter(filter)

        filter = gtk.FileFilter()
        filter.set_name("OPML")
        filter.add_pattern("*.xml")
        filter.add_pattern("*.opml")
        dialog.add_filter(filter)

        response = dialog.run()
        if response == gtk.RESPONSE_OK:
            file = open(dialog.get_filename())
            data = file.read()
            file.close()
            dialog.destroy()
            return data
        elif response == gtk.RESPONSE_CANCEL:
            dialog.destroy()
            return None
+
+
class OpmlDialog(gtk.Dialog):
    """
    Dialog listing the feeds found in an OPML document so the user can
    select which ones to import (all selected by default).
    """
    def parse(self, opmlData):
        # Collect (title, url) for every <outline> element; fall back
        # to htmlUrl when xmlUrl is absent, skip outlines with neither.
        self.feeds = []
        dom1 = parseString(opmlData)

        outlines = dom1.getElementsByTagName('outline')
        for outline in outlines:
            title = outline.getAttribute('text')
            url = outline.getAttribute('xmlUrl')
            if url == "":
                url = outline.getAttribute('htmlUrl')
            if not url == "":
                self.feeds.append( (title, url) )

    def getFeedLinks(self):
        # All (title, url) pairs parsed from the document.
        return self.feeds

    def __init__(self, parent, opmlData):
        # opmlData: the OPML document as a string.
        self.parse(opmlData)
        gtk.Dialog.__init__(self, "Select OPML Feeds", parent, gtk.DIALOG_DESTROY_WITH_PARENT, (gtk.STOCK_OK, gtk.RESPONSE_ACCEPT))

        self.pannableArea = hildon.PannableArea()
        self.treestore = gtk.ListStore(gobject.TYPE_STRING, gobject.TYPE_STRING)
        self.treeview = gtk.TreeView(self.treestore)

        self.displayFeeds()

        self.set_default_size(-1, 600)
        self.vbox.pack_start(self.pannableArea)

        button = hildon.GtkButton(gtk.HILDON_SIZE_AUTO)
        button.set_label("Select All")
        button.connect("clicked", self.button_select_all_clicked)
        self.action_area.pack_end(button)

        button = hildon.GtkButton(gtk.HILDON_SIZE_AUTO)
        button.set_label("Unselect All")
        button.connect("clicked", self.button_select_none_clicked)
        self.action_area.pack_end(button)

        self.show_all()

    def button_select_all_clicked(self, button):
        self.treeview.get_selection().select_all()

    def button_select_none_clicked(self, button):
        self.treeview.get_selection().unselect_all()

    def displayFeeds(self):
        # (Re)build the tree view from self.feeds with multi-selection
        # enabled; everything starts selected.
        self.treeview.destroy()
        self.treestore = gtk.ListStore(gobject.TYPE_STRING, gobject.TYPE_STRING)
        self.treeview = gtk.TreeView()

        self.treeview.get_selection().set_mode(gtk.SELECTION_MULTIPLE)
        hildon.hildon_gtk_tree_view_set_ui_mode(self.treeview, gtk.HILDON_UI_MODE_EDIT)
        self.refreshList()
        self.treeview.append_column(gtk.TreeViewColumn('Feed Name', gtk.CellRendererText(), text = 0))

        self.pannableArea.add(self.treeview)
        self.pannableArea.show_all()
        self.treeview.get_selection().select_all()

    def refreshList(self, selected=None, offset=0):
        # NOTE(review): 'selected' and 'offset' are accepted but never
        # used; 'y' below is computed and discarded.
        rect = self.treeview.get_visible_rect()
        y = rect.y+rect.height
        self.treestore = gtk.ListStore(gobject.TYPE_STRING, gobject.TYPE_STRING)
        self.treeview.set_model(self.treestore)
        for (title, url) in self.feeds:
            item = self.treestore.append([title, url])
            self.treeview.get_selection().select_iter(item)
        #self.treeview.get_selection().select_all()
        self.pannableArea.show_all()

    def getItems(self):
        """Return [(title, url)] for the currently selected rows."""
        list = []
        treeselection = self.treeview.get_selection()
        (model, pathlist) = treeselection.get_selected_rows()
        for path in pathlist:
            list.append( (model.get_value(model.get_iter(path),0), model.get_value(model.get_iter(path),1)) )
        return list
+
def showOpmlData(widget, parent, button):
    """Button callback: run the OPML import flow and log what was chosen."""
    importer = GetOpmlData(parent)
    logger.debug("showOpmlData: %s" % importer.getData())
+
+if __name__ == "__main__":
+ window = hildon.Window()
+ window.set_title("Test App")
+
+
+ button = gtk.Button("Click to confirm.")
+ window.add(button)
+ button.connect("clicked", showOpmlData, window, button)
+ window.connect("destroy", gtk.main_quit)
+ window.show_all()
+
+ gtk.main()
+ window.destroy()
--- /dev/null
+#!/usr/bin/env python2.5
+
+#
+# Copyright (c) 2007-2008 INdT.
+# Copyright (c) 2011 Neal H. Walfield
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+# ============================================================================
+# Name : FeedingIt.py
+# Author : Yves Marcoz
+# Version : 0.5.4
+# Description : Simple RSS Reader
+# ============================================================================
+
+from __future__ import with_statement
+
+import sqlite3
+from os.path import isfile, isdir
+from shutil import rmtree
+from os import mkdir, remove, utime
+import os
+import md5
+import feedparser
+import time
+import urllib2
+from BeautifulSoup import BeautifulSoup
+from urlparse import urljoin
+from calendar import timegm
+import threading
+import traceback
+from wc import wc, wc_init, woodchuck
+import subprocess
+import dbus
+from updatedbus import update_server_object
+
+from jobmanager import JobManager
+import mainthread
+from httpprogresshandler import HTTPProgressHandler
+import random
+import sys
+import logging
+logger = logging.getLogger(__name__)
+
def getId(string):
    """
    Return the hex MD5 digest of string; used as a stable identifier
    for URLs and feed entries.

    Uses hashlib (available since Python 2.5) instead of the
    deprecated md5 module; the digests are identical.
    """
    import hashlib
    return hashlib.md5(string).hexdigest()
+
def download_callback(connection):
    """Abort the in-flight transfer when the job manager is shutting down."""
    manager = JobManager()
    if manager.do_quit:
        raise KeyboardInterrupt
+
def downloader(progress_handler=None, proxy=None):
    """
    Build and return a urllib2 opener.

    progress_handler: handler that reports transfer progress; when
    None, an HTTPProgressHandler wired to download_callback is used.
    proxy: optional proxy handler to install as well.
    """
    if progress_handler is None:
        progress_handler = HTTPProgressHandler(download_callback)

    handlers = [progress_handler]
    if proxy:
        handlers.append(proxy)

    return urllib2.build_opener(*handlers)
+
def transfer_stats(sent, received, **kwargs):
    """
    Record a baseline of bytes sent/received and return a continuation.

    The continuation accepts the same two arguments (extra keyword
    arguments are ignored) and returns a tuple of (bytes sent since
    the baseline, bytes received since the baseline, seconds elapsed
    since this function was invoked).
    """
    started = time.time()
    base_sent = sent
    base_received = received

    def delta(sent, received, **kwargs):
        elapsed = time.time() - started
        return (sent - base_sent, received - base_received, elapsed)

    return delta
+
+# If not None, a subprocess.Popen object corresponding to a
+# update_feeds.py process.
+update_feed_process = None
+
+update_feeds_iface = None
+
+jobs_at_start = 0
+
class BaseObject(object):
    """
    Mix-in providing a simple, time-limited (60 second) cache in front
    of a DB-API connection supplied by the subclass as self.db.

    Subclasses that want caching set cached_columns to a sequence of
    (table, column) pairs naming the columns to cache.  Note that both
    names are case sensitive.
    """
    # Columns to cache, as (table, column) pairs.
    cached_columns = ()

    def cache_invalidate(self, table=None):
        """
        Invalidate the cache.

        If table is not None, invalidate only the specified table.
        Otherwise, drop the whole cache.
        """
        if not hasattr(self, 'cache'):
            # Nothing has been cached yet.
            return

        if table is None:
            del self.cache
        else:
            if table in self.cache:
                del self.cache[table]

    def lookup(self, table, column, id=None):
        """
        Look up a column or value.  Uses a cache for columns in
        cached_columns.  Note: the column is returned unsorted.

        If id is None, return the column's value for every row;
        otherwise, return the column's value for the row whose id
        column equals id (None when there is no matching row).
        """
        if not hasattr(self, 'cache'):
            self.cache = {}

        # Cache data for at most 60 seconds.
        now = time.time()
        try:
            cache = self.cache[table]

            if time.time() - cache[None] > 60:
                # Cache too old: clear it.
                del self.cache[table]
                cache = None
        except KeyError:
            cache = None

        if (cache is None
            or (table, column) not in self.cached_columns):
            # The cache is empty or the caller wants a column that we
            # don't cache.
            if (table, column) in self.cached_columns:
                # Rebuild the cache for this table: fetch all of its
                # cached columns in one query, keyed by id.
                do_cache = True

                self.cache[table] = cache = {}
                columns = []
                for t, c in self.cached_columns:
                    if table == t:
                        cache[c] = {}
                        columns.append(c)

                columns.append('id')
                where = ""
                params = ()
            else:
                do_cache = False

                # BUG FIX: this used to read "colums" (an undefined
                # name), so looking up any uncached column raised
                # NameError.
                columns = (column,)
                if id is not None:
                    # Bind id as a parameter instead of interpolating
                    # it into the SQL string.
                    where = "where id = ?"
                    params = (id,)
                else:
                    where = ""
                    params = ()

            results = self.db.execute(
                "SELECT %s FROM %s %s" % (','.join(columns), table, where),
                params)

            if do_cache:
                for r in results:
                    values = list(r)
                    i = values.pop()
                    for index, value in enumerate(values):
                        cache[columns[index]][i] = value

                cache[None] = now
                # Fall through to the common cache lookup below.
            else:
                # BUG FIX: this used to rebind results to [] and then
                # iterate over that empty list (referencing an
                # undefined "values"), so uncached lookups always
                # returned [].  Collect the single selected column.
                found = [row[0] for row in results]
                if id is not None:
                    # Mirror the cached path: None when no row matches.
                    if found:
                        return found[0]
                    return None
                return found
        else:
            cache = self.cache[table]

        try:
            if id is not None:
                return cache[column][id]
            else:
                return cache[column].values()
        except KeyError:
            # No such row in the cache.
            return None
+
+class Feed(BaseObject):
+ # Columns to cache.
+ cached_columns = (('feed', 'read'),
+ ('feed', 'title'))
+
+ serial_execution_lock = threading.Lock()
+
def _getdb(self):
    """
    Per-thread sqlite connection to this feed's database.

    sqlite3 connections must not be shared between threads, so the
    connection is kept in thread-local storage (self.tls) and opened
    lazily on first use in each thread.  The 120 s timeout tolerates
    long-running updates holding the database lock.
    """
    try:
        db = self.tls.db
    except AttributeError:
        db = sqlite3.connect("%s/%s.db" % (self.dir, self.key), timeout=120)
        self.tls.db = db
    return db
# Expose the per-thread connection as the read-only property self.db.
db = property(_getdb)
+
def __init__(self, configdir, key):
    """
    configdir: the application's configuration directory.
    key: this feed's unique identifier (used to name its directory
    and database file).

    Creates the feed's directory and database schema on first use.
    """
    self.key = key
    self.configdir = configdir
    self.dir = "%s/%s.d" %(self.configdir, self.key)
    self.tls = threading.local ()

    if not isdir(self.dir):
        mkdir(self.dir)
    if not isfile("%s/%s.db" %(self.dir, self.key)):
        self.db.execute("CREATE TABLE feed (id text, title text, contentLink text, date float, updated float, link text, read int);")
        self.db.execute("CREATE TABLE images (id text, imagePath text);")
        self.db.commit()
+
def addImage(self, configdir, key, baseurl, url, proxy=None, opener=None):
    """
    Download the image at url (resolved against baseurl) into the
    feed's directory, unless already present, and return the local
    filename (None on failure).  An existing file is just touched so
    the expiry sweep keeps it.
    """
    filename = configdir+key+".d/"+getId(url)
    if not isfile(filename):
        try:
            if not opener:
                opener = downloader(proxy=proxy)

            abs_url = urljoin(baseurl,url)
            f = opener.open(abs_url)
            try:
                # NOTE(review): opened in text mode "w"; harmless on
                # POSIX, would corrupt binary data on Windows.
                with open(filename, "w") as outf:
                    for data in f:
                        outf.write(data)
            finally:
                f.close()
        # NOTE(review): if downloader()/urljoin raised, abs_url is
        # unbound in these handlers and the logging itself would fail.
        except (urllib2.HTTPError, urllib2.URLError, IOError), exception:
            logger.info("Could not download image %s: %s"
                        % (abs_url, str (exception)))
            return None
        except:
            exception = sys.exc_info()[0]

            logger.info("Downloading image %s: %s" %
                        (abs_url, traceback.format_exc()))
            try:
                # Remove any partially written file.
                remove(filename)
            except OSError:
                pass

            return None
    else:
        #open(filename,"a").close() # "Touch" the file
        file = open(filename,"a")
        utime(filename, None)
        file.close()
    return filename
+
def updateFeed(self, configdir, url, etag, modified, expiryTime=24, proxy=None, imageCache=False, priority=0, postFeedUpdateFunc=None, *postFeedUpdateFuncArgs):
    """
    Update this feed.

    When running inside update_feeds.py (the update daemon), queue the
    real update (_updateFeed) with the job manager.  Otherwise ask a
    running daemon to do it over D-Bus, starting the daemon first if
    necessary.
    """
    if (os.path.basename(sys.argv[0]) == 'update_feeds.py'):
        # We are the update daemon: do the work in-process.
        def doit():
            def it():
                self._updateFeed(configdir, url, etag, modified, expiryTime, proxy, imageCache, postFeedUpdateFunc, *postFeedUpdateFuncArgs)
            return it
        JobManager().execute(doit(), self.key, priority=priority)
    else:
        def send_update_request():
            # Returns True on success, None on failure (and drops the
            # cached interface so it is rebound on the next attempt).
            global update_feeds_iface
            if update_feeds_iface is None:
                bus=dbus.SessionBus()
                remote_object = bus.get_object(
                    "org.marcoz.feedingit", # Connection name
                    "/org/marcoz/feedingit/update" # Object's path
                )
                update_feeds_iface = dbus.Interface(
                    remote_object, 'org.marcoz.feedingit')

            try:
                update_feeds_iface.Update(self.key)
            except Exception, e:
                logger.error("Invoking org.marcoz.feedingit.Update: %s"
                             % str(e))
                update_feeds_iface = None
            else:
                return True

        if send_update_request():
            # Success! It seems we were able to start the update
            # daemon via dbus (or, it was already running).
            return

        global update_feed_process
        if (update_feed_process is None
            or update_feed_process.poll() is not None):
            # The update_feeds process is not running.  Start it.
            update_feeds = os.path.join(os.path.dirname(__file__),
                                        'update_feeds.py')
            argv = ['/usr/bin/env', 'python', update_feeds, '--daemon' ]
            logger.debug("Starting update_feeds: running %s"
                         % (str(argv),))
            update_feed_process = subprocess.Popen(argv)
            # Make sure the dbus calls go to the right process:
            # rebind.
            update_feeds_iface = None

        # Give the daemon a few seconds to come up, retrying the
        # request once a second.
        for _ in xrange(5):
            if send_update_request():
                break
            time.sleep(1)
+
def _updateFeed(self, configdir, url, etag, modified, expiryTime=24, proxy=None, imageCache=False, postFeedUpdateFunc=None, *postFeedUpdateFuncArgs):
    """
    Fetch and process the feed (runs on a job-manager thread).

    Parses the feed with feedparser (honoring etag/modified), stores
    new/updated entries in the feed database, optionally caches
    images, registers transfers with Woodchuck, expires old entries
    and files, and finally invokes postFeedUpdateFunc with the update
    time, etag, modified value and title.

    NOTE: calls of the form mainthread.execute(..., async=True) use
    Python 2 syntax ('async' is reserved from Python 3.7).
    """
    logger.debug("Updating %s" % url)

    success = False
    have_serial_execution_lock = False
    try:
        update_start = time.time ()

        progress_handler = HTTPProgressHandler(download_callback)

        openers = [progress_handler]
        if proxy:
            openers.append (proxy)
        kwargs = {'handlers':openers}

        feed_transfer_stats = transfer_stats(0, 0)

        tmp=feedparser.parse(url, etag=etag, modified=modified, **kwargs)
        download_duration = time.time () - update_start

        opener = downloader(progress_handler, proxy)

        if JobManager().do_quit:
            raise KeyboardInterrupt

        process_start = time.time()

        # Expiry time is in hours
        expiry = float(expiryTime) * 3600.

        currentTime = 0

        updated_objects = 0
        new_objects = 0

        def wc_success():
            # Report a successful stream update to Woodchuck.
            try:
                wc().stream_register (self.key, "", 6 * 60 * 60)
            except woodchuck.ObjectExistsError:
                pass
            try:
                wc()[self.key].updated (
                    indicator=(woodchuck.Indicator.ApplicationVisual
                               |woodchuck.Indicator.StreamWide),
                    transferred_down=progress_handler.stats['received'],
                    transferred_up=progress_handler.stats['sent'],
                    transfer_time=update_start,
                    transfer_duration=download_duration,
                    new_objects=new_objects,
                    updated_objects=updated_objects,
                    objects_inline=new_objects + updated_objects)
            except KeyError:
                logger.warn(
                    "Failed to register update of %s with woodchuck!"
                    % (self.key))

        http_status = tmp.get ('status', 200)

        # Check if the parse was successful.  If the http status code
        # is 304, then the download was successful, but there is
        # nothing new.  Indeed, no content is returned.  This makes a
        # 304 look like an error because there are no entries and the
        # parse fails.  But really, everything went great!  Check for
        # this first.
        if http_status == 304:
            logger.debug("%s: No changes to feed." % (self.key,))
            mainthread.execute(wc_success, async=True)
            success = True
        elif len(tmp["entries"])==0 and not tmp.version:
            # An error occurred fetching or parsing the feed.  (Version
            # will be either None if e.g. the connection timed out or
            # '' if the data is not a proper feed)
            logger.error(
                "Error fetching %s: version is: %s: error: %s"
                % (url, str (tmp.version),
                   str (tmp.get ('bozo_exception', 'Unknown error'))))
            logger.debug(tmp)
            def register_stream_update_failed(http_status):
                def doit():
                    logger.debug("%s: stream update failed!" % self.key)

                    try:
                        # It's not easy to get the feed's title from here.
                        # At the latest, the next time the application is
                        # started, we'll fix up the human readable name.
                        wc().stream_register (self.key, "", 6 * 60 * 60)
                    except woodchuck.ObjectExistsError:
                        pass
                    # Map the http status to a Woodchuck error class.
                    ec = woodchuck.TransferStatus.TransientOther
                    if 300 <= http_status and http_status < 400:
                        ec = woodchuck.TransferStatus.TransientNetwork
                    if 400 <= http_status and http_status < 500:
                        ec = woodchuck.TransferStatus.FailureGone
                    if 500 <= http_status and http_status < 600:
                        ec = woodchuck.TransferStatus.TransientNetwork
                    wc()[self.key].update_failed(ec)
                return doit
            if wc().available:
                mainthread.execute(
                    register_stream_update_failed(
                        http_status=http_status),
                    async=True)
        else:
            currentTime = time.time()
            # The etag and modified value should only be updated if the content was not null
            try:
                etag = tmp["etag"]
            except KeyError:
                etag = None
            try:
                modified = tmp["modified"]
            except KeyError:
                modified = None
            try:
                # Best-effort favicon download.
                abs_url = urljoin(tmp["feed"]["link"],"/favicon.ico")
                f = opener.open(abs_url)
                data = f.read()
                f.close()
                outf = open(self.dir+"/favicon.ico", "w")
                outf.write(data)
                outf.close()
                del data
            except (urllib2.HTTPError, urllib2.URLError), exception:
                logger.debug("Could not download favicon %s: %s"
                             % (abs_url, str (exception)))

            self.serial_execution_lock.acquire ()
            have_serial_execution_lock = True

            #reversedEntries = self.getEntries()
            #reversedEntries.reverse()

            ids = self.getIds()

            tmp["entries"].reverse()
            for entry in tmp["entries"]:
                # Yield so as to make the main thread a bit more
                # responsive.
                time.sleep(0)

                entry_transfer_stats = transfer_stats(
                    *feed_transfer_stats(**progress_handler.stats)[0:2])

                if JobManager().do_quit:
                    raise KeyboardInterrupt

                object_size = 0

                date = self.extractDate(entry)
                # Fill in any missing entry fields with defaults.
                try:
                    entry["title"]
                except KeyError:
                    entry["title"] = "No Title"
                try :
                    entry["link"]
                except KeyError:
                    entry["link"] = ""
                try:
                    entry["author"]
                except KeyError:
                    entry["author"] = None
                if(not(entry.has_key("id"))):
                    entry["id"] = None
                content = self.extractContent(entry)
                object_size = len (content)
                tmpEntry = {"title":entry["title"], "content":content,
                            "date":date, "link":entry["link"], "author":entry["author"], "id":entry["id"]}
                id = self.generateUniqueId(tmpEntry)

                current_version \
                    = self.db.execute('select date from feed where id=?',
                                      (id,)).fetchone()
                if (current_version is not None
                    and current_version[0] == date):
                    logger.debug("ALREADY DOWNLOADED %s (%s)"
                                 % (entry["title"], entry["link"]))
                    continue

                if current_version is not None:
                    # The version was updated.  Mark it as unread.
                    logger.debug("UPDATED: %s (%s)"
                                 % (entry["title"], entry["link"]))
                    self.setEntryUnread(id)
                    updated_objects += 1
                else:
                    logger.debug("NEW: %s (%s)"
                                 % (entry["title"], entry["link"]))
                    new_objects += 1

                #articleTime = time.mktime(self.entries[id]["dateTuple"])
                soup = BeautifulSoup(self.getArticle(tmpEntry)) #tmpEntry["content"])
                images = soup('img')
                baseurl = tmpEntry["link"]
                #if not id in ids:
                if imageCache and len(images) > 0:
                    # Release the serial-execution lock while doing
                    # (slow) image downloads.
                    self.serial_execution_lock.release ()
                    have_serial_execution_lock = False
                    for img in images:
                        filename = self.addImage(
                            configdir, self.key, baseurl, img['src'],
                            opener=opener)
                        if filename:
                            # Rewrite the img tag to the local copy.
                            img['src']="file://%s" %filename
                            count = self.db.execute("SELECT count(1) FROM images where id=? and imagePath=?;", (id, filename )).fetchone()[0]
                            if count == 0:
                                self.db.execute("INSERT INTO images (id, imagePath) VALUES (?, ?);", (id, filename) )
                                self.db.commit()

                            try:
                                object_size += os.path.getsize (filename)
                            except os.error, exception:
                                logger.error ("Error getting size of %s: %s"
                                              % (filename, exception))
                    self.serial_execution_lock.acquire ()
                    have_serial_execution_lock = True

                # Write the (possibly rewritten) article HTML to disk.
                tmpEntry["contentLink"] = configdir+self.key+".d/"+id+".html"
                file = open(tmpEntry["contentLink"], "w")
                file.write(soup.prettify())
                file.close()
                if id in ids:
                    self.db.execute("UPDATE feed SET updated=? WHERE id=?;", (currentTime, id) )
                    self.db.commit()
                else:
                    values = (id, tmpEntry["title"], tmpEntry["contentLink"], tmpEntry["date"], currentTime, tmpEntry["link"], 0)
                    self.db.execute("INSERT INTO feed (id, title, contentLink, date, updated, link, read) VALUES (?, ?, ?, ?, ?, ?, ?);", values)
                    self.db.commit()
# else:
# try:
# self.db.execute("UPDATE feed SET updated=? WHERE id=?;", (currentTime, id) )
# self.db.commit()
# filename = configdir+self.key+".d/"+id+".html"
# file = open(filename,"a")
# utime(filename, None)
# file.close()
# images = self.db.execute("SELECT imagePath FROM images where id=?;", (id, )).fetchall()
# for image in images:
# file = open(image[0],"a")
# utime(image[0], None)
# file.close()
# except:
# pass

                # Register the object with Woodchuck and mark it as
                # downloaded.
                def register_object_transferred(
                        id, title, publication_time,
                        sent, received, object_size):
                    def doit():
                        logger.debug("Registering transfer of object %s"
                                     % title)
                        try:
                            obj = wc()[self.key].object_register(
                                object_identifier=id,
                                human_readable_name=title)
                        except woodchuck.ObjectExistsError:
                            obj = wc()[self.key][id]
                        else:
                            obj.publication_time = publication_time
                            obj.transferred(
                                indicator=(
                                    woodchuck.Indicator.ApplicationVisual
                                    |woodchuck.Indicator.StreamWide),
                                transferred_down=received,
                                transferred_up=sent,
                                object_size=object_size)
                    return doit
                if wc().available:
                    # If the entry does not contain a publication
                    # time, the attribute won't exist.
                    pubtime = entry.get('date_parsed', None)
                    if pubtime:
                        publication_time = time.mktime (pubtime)
                    else:
                        publication_time = None

                    sent, received, _ \
                        = entry_transfer_stats(**progress_handler.stats)
                    # sent and received are for objects (in
                    # particular, images) associated with this
                    # item.  We also want to attribute the data
                    # transferred for the item's content.  This is
                    # a good first approximation.
                    received += len(content)

                    mainthread.execute(
                        register_object_transferred(
                            id=id,
                            title=tmpEntry["title"],
                            publication_time=publication_time,
                            sent=sent, received=received,
                            object_size=object_size),
                        async=True)
            self.db.commit()

            sent, received, _ \
                = feed_transfer_stats(**progress_handler.stats)
            logger.debug (
                "%s: Update successful: transferred: %d/%d; objects: %d)"
                % (url, sent, received, len (tmp.entries)))
            mainthread.execute (wc_success, async=True)
            success = True

            # Expire old entries: read entries after one expiry
            # period, unread entries after two.
            rows = self.db.execute("SELECT id FROM feed WHERE (read=0 AND updated<?) OR (read=1 AND updated<?);", (currentTime-2*expiry, currentTime-expiry))
            for row in rows:
                self.removeEntry(row[0])

            from glob import glob
            from os import stat
            for file in glob(configdir+self.key+".d/*"):
                #
                stats = stat(file)
                #
                # put the two dates into matching format
                #
                lastmodDate = stats[8]
                #
                expDate = time.time()-expiry*3
                # check if image-last-modified-date is outdated
                #
                if expDate > lastmodDate:
                    #
                    try:
                        #
                        #print 'Removing', file
                        #
                        # XXX: Tell woodchuck.
                        remove(file) # commented out for testing
                        #
                    except OSError, exception:
                        #
                        logger.error('Could not remove %s: %s'
                                     % (file, str (exception)))
        logger.debug("updated %s: %fs in download, %fs in processing"
                     % (self.key, download_duration,
                        time.time () - process_start))
    except:
        logger.error("Updating %s: %s" % (self.key, traceback.format_exc()))
    finally:
        self.db.commit ()

        if have_serial_execution_lock:
            self.serial_execution_lock.release ()

        updateTime = 0
        try:
            rows = self.db.execute("SELECT MAX(date) FROM feed;")
            for row in rows:
                updateTime=row[0]
        except Exception, e:
            logger.error("Fetching update time: %s: %s"
                         % (str(e), traceback.format_exc()))
        finally:
            if not success:
                etag = None
                modified = None
            title = None
            try:
                title = tmp.feed.title
            except (AttributeError, UnboundLocalError), exception:
                pass
            if postFeedUpdateFunc is not None:
                postFeedUpdateFunc (self.key, updateTime, etag, modified,
                                    title, *postFeedUpdateFuncArgs)

    self.cache_invalidate()
+
+ def setEntryRead(self, id):
+ self.db.execute("UPDATE feed SET read=1 WHERE id=?;", (id,) )
+ self.db.commit()
+
+ def doit():
+ try:
+ wc()[self.key][id].used()
+ except KeyError:
+ pass
+ if wc().available():
+ mainthread.execute(doit, async=True)
+ self.cache_invalidate('feed')
+
+ def setEntryUnread(self, id):
+ self.db.execute("UPDATE feed SET read=0 WHERE id=?;", (id,) )
+ self.db.commit()
+ self.cache_invalidate('feed')
+
+ def markAllAsRead(self):
+ self.db.execute("UPDATE feed SET read=1 WHERE read=0;")
+ self.db.commit()
+ self.cache_invalidate('feed')
+
+ def isEntryRead(self, id):
+ return self.lookup('feed', 'read', id) == 1
+
+ def getTitle(self, id):
+ return self.lookup('feed', 'title', id)
+
+ def getContentLink(self, id):
+ return self.db.execute("SELECT contentLink FROM feed WHERE id=?;", (id,) ).fetchone()[0]
+
+ def getExternalLink(self, id):
+ return self.db.execute("SELECT link FROM feed WHERE id=?;", (id,) ).fetchone()[0]
+
+ def getDate(self, id):
+ dateStamp = self.db.execute("SELECT date FROM feed WHERE id=?;", (id,) ).fetchone()[0]
+ return time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime(dateStamp))
+
+ def getDateTuple(self, id):
+ dateStamp = self.db.execute("SELECT date FROM feed WHERE id=?;", (id,) ).fetchone()[0]
+ return time.localtime(dateStamp)
+
+ def getDateStamp(self, id):
+ return self.db.execute("SELECT date FROM feed WHERE id=?;", (id,) ).fetchone()[0]
+
+ def generateUniqueId(self, entry):
+ """
+ Generate a stable identifier for the article. For the same
+ entry, this should result in the same identifier. If
+ possible, the identifier should remain the same even if the
+ article is updated.
+ """
+ # Prefer the entry's id, which is supposed to be globally
+ # unique.
+ key = entry.get('id', None)
+ if not key:
+ # Next, try the link to the content.
+ key = entry.get('link', None)
+ if not key:
+ # Ok, the title and the date concatenated are likely to be
+ # relatively stable.
+ key = entry.get('title', None) + entry.get('date', None)
+ if not key:
+ # Hmm, the article's content will at least guarantee no
+ # false negatives (i.e., missing articles)
+ key = entry.get('content', None)
+ if not key:
+ # If all else fails, just use a random number.
+ key = str (random.random ())
+ return getId (key)
+
+ def getIds(self, onlyUnread=False):
+ if onlyUnread:
+ rows = self.db.execute("SELECT id FROM feed where read=0 ORDER BY date DESC;").fetchall()
+ else:
+ rows = self.db.execute("SELECT id FROM feed ORDER BY date DESC;").fetchall()
+ ids = []
+ for row in rows:
+ ids.append(row[0])
+ #ids.reverse()
+ return ids
+
+ def getNextId(self, id, forward=True):
+ if forward:
+ delta = 1
+ else:
+ delta = -1
+ ids = self.getIds()
+ index = ids.index(id)
+ return ids[(index + delta) % len(ids)]
+
+ def getPreviousId(self, id):
+ return self.getNextId(id, forward=False)
+
+ def getNumberOfUnreadItems(self):
+ return self.db.execute("SELECT count(*) FROM feed WHERE read=0;").fetchone()[0]
+
+ def getNumberOfEntries(self):
+ return self.db.execute("SELECT count(*) FROM feed;").fetchone()[0]
+
+ def getArticle(self, entry):
+ #self.setEntryRead(id)
+ #entry = self.entries[id]
+ title = entry['title']
+ #content = entry.get('content', entry.get('summary_detail', {}))
+ content = entry["content"]
+
+ link = entry['link']
+ author = entry['author']
+ date = time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime(entry["date"]) )
+
+ #text = '''<div style="color: black; background-color: white;">'''
+ text = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">'
+ text += "<html><head><title>" + title + "</title>"
+ text += '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>\n'
+ #text += '<style> body {-webkit-user-select: none;} </style>'
+ text += '</head><body bgcolor=\"#ffffff\"><div><a href=\"' + link + '\">' + title + "</a>"
+ if author != None:
+ text += "<BR /><small><i>Author: " + author + "</i></small>"
+ text += "<BR /><small><i>Date: " + date + "</i></small></div>"
+ text += "<BR /><BR />"
+ text += content
+ text += "</body></html>"
+ return text
+
+ def getContent(self, id):
+ contentLink = self.db.execute("SELECT contentLink FROM feed WHERE id=?;", (id,)).fetchone()[0]
+ try:
+ file = open(self.entries[id]["contentLink"])
+ content = file.read()
+ file.close()
+ except:
+ content = "Content unavailable"
+ return content
+
+ def extractDate(self, entry):
+ if entry.has_key("updated_parsed"):
+ return timegm(entry["updated_parsed"])
+ elif entry.has_key("published_parsed"):
+ return timegm(entry["published_parsed"])
+ else:
+ return time.time()
+
+ def extractContent(self, entry):
+ content = ""
+ if entry.has_key('summary'):
+ content = entry.get('summary', '')
+ if entry.has_key('content'):
+ if len(entry.content[0].value) > len(content):
+ content = entry.content[0].value
+ if content == "":
+ content = entry.get('description', '')
+ return content
+
    def removeEntry(self, id):
        """
        Delete article *id*: its cached content file (if any), its rows
        in the feed and images tables, and its Woodchuck registration.
        """
        contentLink = self.db.execute("SELECT contentLink FROM feed WHERE id=?;", (id,)).fetchone()[0]
        if contentLink:
            try:
                remove(contentLink)
            except OSError, exception:
                logger.error("Deleting %s: %s" % (contentLink, str (exception)))
        self.db.execute("DELETE FROM feed WHERE id=?;", (id,) )
        self.db.execute("DELETE FROM images WHERE id=?;", (id,) )
        self.db.commit()

        # Tell Woodchuck the files are gone; DBus calls must run on the
        # main thread, hence mainthread.execute.
        def doit():
            try:
                wc()[self.key][id].files_deleted (
                    woodchuck.DeletionResponse.Deleted)
                del wc()[self.key][id]
            except KeyError:
                # The feed or article was never registered with Woodchuck.
                pass
        if wc().available():
            mainthread.execute (doit, async=True)
+
class ArchivedArticles(Feed):
    """A pseudo-feed holding articles the user explicitly archived."""

    def addArchivedArticle(self, title, link, date, configdir):
        """Insert a new archived article whose page has not been fetched yet."""
        id = self.generateUniqueId({"date":date, "title":title})
        values = (id, title, link, date, 0, link, 0)
        self.db.execute("INSERT INTO feed (id, title, contentLink, date, updated, link, read) VALUES (?, ?, ?, ?, ?, ?, ?);", values)
        self.db.commit()

    def updateFeed(self, configdir, url, etag, modified, expiryTime=24, proxy=None, imageCache=False, priority=0, postFeedUpdateFunc=None, *postFeedUpdateFuncArgs):
        """
        Download the page (and images) of every not-yet-fetched archived
        article.

        Bug fix: accept the same trailing arguments as Feed.updateFeed
        (priority, postFeedUpdateFunc, ...) so Listing.updateFeed can call
        either implementation; the old, narrower signature raised
        TypeError when the archived-articles feed was updated.
        """
        currentTime = 0
        rows = self.db.execute("SELECT id, link FROM feed WHERE updated=0;")
        for row in rows:
            currentTime = time.time()
            id = row[0]
            link = row[1]
            f = urllib2.urlopen(link)
            html = f.read()
            f.close()
            soup = BeautifulSoup(html)
            images = soup('img')
            baseurl = link
            for img in images:
                filename = self.addImage(configdir, self.key, baseurl, img['src'], proxy=proxy)
                img['src']=filename
                self.db.execute("INSERT INTO images (id, imagePath) VALUES (?, ?);", (id, filename) )
                self.db.commit()
            contentLink = configdir+self.key+".d/"+id+".html"
            file = open(contentLink, "w")
            file.write(soup.prettify())
            file.close()

            self.db.execute("UPDATE feed SET read=0, contentLink=?, updated=? WHERE id=?;", (contentLink, time.time(), id) )
            self.db.commit()
        # Honor the post-update callback contract used by Feed.updateFeed.
        if postFeedUpdateFunc is not None:
            postFeedUpdateFunc (self.key, currentTime, None, None, None,
                                *postFeedUpdateFuncArgs)
        return (currentTime, None, None)

    def purgeReadArticles(self):
        """Delete every archived article that has been marked as read."""
        rows = self.db.execute("SELECT id FROM feed WHERE read=1;")
        for row in rows:
            self.removeArticle(row[0])

    def removeArticle(self, id):
        """Remove article *id* and any images no other article references."""
        rows = self.db.execute("SELECT imagePath FROM images WHERE id=?;", (id,) )
        for row in rows:
            try:
                count = self.db.execute("SELECT count(*) FROM images WHERE id!=? and imagePath=?;", (id,row[0]) ).fetchone()[0]
                if count == 0:
                    os.remove(row[0])
            except:
                # Best effort: a missing file is not fatal.
                pass
        self.removeEntry(id)
+
class Listing(BaseObject):
    """The set of subscribed feeds and categories, backed by feeds.db."""

    # Columns to cache.
    cached_columns = (('feeds', 'updateTime'),
                      ('feeds', 'unread'),
                      ('feeds', 'title'),
                      ('categories', 'title'))
+
+ def _getdb(self):
+ try:
+ db = self.tls.db
+ except AttributeError:
+ db = sqlite3.connect("%s/feeds.db" % self.configdir, timeout=120)
+ self.tls.db = db
+ return db
+ db = property(_getdb)
+
+ # Lists all the feeds in a dictionary, and expose the data
+ def __init__(self, config, configdir):
+ self.config = config
+ self.configdir = configdir
+
+ self.tls = threading.local ()
+
+ try:
+ table = self.db.execute("SELECT sql FROM sqlite_master").fetchone()
+ if table == None:
+ self.db.execute("CREATE TABLE feeds(id text, url text, title text, unread int, updateTime float, rank int, etag text, modified text, widget int, category int);")
+ self.db.execute("CREATE TABLE categories(id text, title text, unread int, rank int);")
+ self.addCategory("Default Category")
+ if isfile(self.configdir+"feeds.pickle"):
+ self.importOldFormatFeeds()
+ else:
+ self.addFeed("Maemo News", "http://maemo.org/news/items.xml")
+ else:
+ from string import find, upper
+ if find(upper(table[0]), "WIDGET")<0:
+ self.db.execute("ALTER TABLE feeds ADD COLUMN widget int;")
+ self.db.execute("UPDATE feeds SET widget=1;")
+ self.db.commit()
+ if find(upper(table[0]), "CATEGORY")<0:
+ self.db.execute("CREATE TABLE categories(id text, title text, unread int, rank int);")
+ self.addCategory("Default Category")
+ self.db.execute("ALTER TABLE feeds ADD COLUMN category int;")
+ self.db.execute("UPDATE feeds SET category=1;")
+ self.db.commit()
+ except:
+ pass
+
+ # Check that Woodchuck's state is up to date with respect our
+ # state.
+ updater = os.path.basename(sys.argv[0]) == 'update_feeds.py'
+ wc_init (self, True if updater else False)
+ if wc().available() and updater:
+ # The list of known streams.
+ streams = wc().streams_list ()
+ stream_ids = [s.identifier for s in streams]
+
+ # Register any unknown streams. Remove known streams from
+ # STREAMS_IDS.
+ for key in self.getListOfFeeds():
+ title = self.getFeedTitle(key)
+ # XXX: We should also check whether the list of
+ # articles/objects in each feed/stream is up to date.
+ if key not in stream_ids:
+ logger.debug(
+ "Registering previously unknown channel: %s (%s)"
+ % (key, title,))
+ # Use a default refresh interval of 6 hours.
+ wc().stream_register (key, title, 6 * 60 * 60)
+ else:
+ # Make sure the human readable name is up to date.
+ if wc()[key].human_readable_name != title:
+ wc()[key].human_readable_name = title
+ stream_ids.remove (key)
+
+
+ # Unregister any streams that are no longer subscribed to.
+ for id in stream_ids:
+ logger.debug("Unregistering %s" % (id,))
+ w.stream_unregister (id)
+
+ def importOldFormatFeeds(self):
+ """This function loads feeds that are saved in an outdated format, and converts them to sqlite"""
+ import rss
+ listing = rss.Listing(self.configdir)
+ rank = 0
+ for id in listing.getListOfFeeds():
+ try:
+ rank += 1
+ values = (id, listing.getFeedTitle(id) , listing.getFeedUrl(id), 0, time.time(), rank, None, "None", 1)
+ self.db.execute("INSERT INTO feeds (id, title, url, unread, updateTime, rank, etag, modified, widget, category) VALUES (?, ?, ? ,? ,? ,?, ?, ?, ?, 1);", values)
+ self.db.commit()
+
+ feed = listing.getFeed(id)
+ new_feed = self.getFeed(id)
+
+ items = feed.getIds()[:]
+ items.reverse()
+ for item in items:
+ if feed.isEntryRead(item):
+ read_status = 1
+ else:
+ read_status = 0
+ date = timegm(feed.getDateTuple(item))
+ title = feed.getTitle(item)
+ newId = new_feed.generateUniqueId({"date":date, "title":title})
+ values = (newId, title , feed.getContentLink(item), date, tuple(time.time()), feed.getExternalLink(item), read_status)
+ new_feed.db.execute("INSERT INTO feed (id, title, contentLink, date, updated, link, read) VALUES (?, ?, ?, ?, ?, ?, ?);", values)
+ new_feed.db.commit()
+ try:
+ images = feed.getImages(item)
+ for image in images:
+ new_feed.db.execute("INSERT INTO images (id, imagePath) VALUES (?, ?);", (item, image) )
+ new_feed.db.commit()
+ except:
+ pass
+ self.updateUnread(id)
+ except:
+ logger.error("importOldFormatFeeds: %s"
+ % (traceback.format_exc(),))
+ remove(self.configdir+"feeds.pickle")
+
+
+ def addArchivedArticle(self, key, index):
+ feed = self.getFeed(key)
+ title = feed.getTitle(index)
+ link = feed.getExternalLink(index)
+ date = feed.getDate(index)
+ count = self.db.execute("SELECT count(*) FROM feeds where id=?;", ("ArchivedArticles",) ).fetchone()[0]
+ if count == 0:
+ self.addFeed("Archived Articles", "", id="ArchivedArticles")
+
+ archFeed = self.getFeed("ArchivedArticles")
+ archFeed.addArchivedArticle(title, link, date, self.configdir)
+ self.updateUnread("ArchivedArticles")
+
+ def updateFeed(self, key, expiryTime=None, proxy=None, imageCache=None,
+ priority=0):
+ if expiryTime is None:
+ expiryTime = self.config.getExpiry()
+ if not expiryTime:
+ # Default to 24 hours
+ expriyTime = 24
+ if proxy is None:
+ (use_proxy, proxy) = self.config.getProxy()
+ if not use_proxy:
+ proxy = None
+ if imageCache is None:
+ imageCache = self.config.getImageCache()
+
+ feed = self.getFeed(key)
+ (url, etag, modified) = self.db.execute("SELECT url, etag, modified FROM feeds WHERE id=?;", (key,) ).fetchone()
+ try:
+ modified = time.struct_time(eval(modified))
+ except:
+ modified = None
+ feed.updateFeed(
+ self.configdir, url, etag, modified, expiryTime, proxy, imageCache,
+ priority, postFeedUpdateFunc=self._queuePostFeedUpdate)
+
    def _queuePostFeedUpdate(self, *args, **kwargs):
        # Run _postFeedUpdate on the main thread: it touches the database
        # and emits DBus signals.
        mainthread.execute (self._postFeedUpdate, async=True, *args, **kwargs)
+
+ def _postFeedUpdate(self, key, updateTime, etag, modified, title):
+ if modified==None:
+ modified="None"
+ else:
+ modified=str(tuple(modified))
+ if updateTime > 0:
+ self.db.execute("UPDATE feeds SET updateTime=?, etag=?, modified=? WHERE id=?;", (updateTime, etag, modified, key) )
+ else:
+ self.db.execute("UPDATE feeds SET etag=?, modified=? WHERE id=?;", (etag, modified, key) )
+
+ if title is not None:
+ self.db.execute("UPDATE feeds SET title=(case WHEN title=='' THEN ? ELSE title END) where id=?;",
+ (title, key))
+ self.db.commit()
+ self.cache_invalidate('feeds')
+ self.updateUnread(key)
+
+ update_server_object().ArticleCountUpdated()
+
+ stats = JobManager().stats()
+ global jobs_at_start
+ completed = stats['jobs-completed'] - jobs_at_start
+ in_progress = stats['jobs-in-progress']
+ queued = stats['jobs-queued']
+
+ try:
+ percent = (100 * ((completed + in_progress / 2.))
+ / (completed + in_progress + queued))
+ except ZeroDivisionError:
+ percent = 100
+
+ update_server_object().UpdateProgress(
+ percent, completed, in_progress, queued, 0, 0, 0, key)
+
+ if in_progress == 0 and queued == 0:
+ jobs_at_start = stats['jobs-completed']
+
+ def getFeed(self, key):
+ if key == "ArchivedArticles":
+ return ArchivedArticles(self.configdir, key)
+ return Feed(self.configdir, key)
+
+ def editFeed(self, key, title, url, category=None):
+ if category:
+ self.db.execute("UPDATE feeds SET title=?, url=?, category=? WHERE id=?;", (title, url, category, key))
+ else:
+ self.db.execute("UPDATE feeds SET title=?, url=? WHERE id=?;", (title, url, key))
+ self.db.commit()
+ self.cache_invalidate('feeds')
+
+ if wc().available():
+ try:
+ wc()[key].human_readable_name = title
+ except KeyError:
+ logger.debug("Feed %s (%s) unknown." % (key, title))
+
+ def getFeedUpdateTime(self, key):
+ update_time = self.lookup('feeds', 'updateTime', key)
+
+ if not update_time:
+ return "Never"
+
+ delta = time.time() - update_time
+
+ delta_hours = delta / (60. * 60.)
+ if delta_hours < .1:
+ return "A few minutes ago"
+ if delta_hours < .75:
+ return "Less than an hour ago"
+ if delta_hours < 1.5:
+ return "About an hour ago"
+ if delta_hours < 18:
+ return "About %d hours ago" % (int(delta_hours + 0.5),)
+
+ delta_days = delta_hours / 24.
+ if delta_days < 1.5:
+ return "About a day ago"
+ if delta_days < 18:
+ return "%d days ago" % (int(delta_days + 0.5),)
+
+ delta_weeks = delta_days / 7.
+ if delta_weeks <= 8:
+ return "%d weeks ago" % int(delta_weeks + 0.5)
+
+ delta_months = delta_days / 30.
+ if delta_months <= 30:
+ return "%d months ago" % int(delta_months + 0.5)
+
+ return time.strftime("%x", time.gmtime(update_time))
+
+ def getFeedNumberOfUnreadItems(self, key):
+ return self.lookup('feeds', 'unread', key)
+
+ def getFeedTitle(self, key):
+ title = self.lookup('feeds', 'title', key)
+ if title:
+ return title
+
+ return self.getFeedUrl(key)
+
+ def getFeedUrl(self, key):
+ return self.db.execute("SELECT url FROM feeds WHERE id=?;", (key,)).fetchone()[0]
+
+ def getFeedCategory(self, key):
+ return self.db.execute("SELECT category FROM feeds WHERE id=?;", (key,)).fetchone()[0]
+
+ def getListOfFeeds(self, category=None):
+ if category:
+ rows = self.db.execute("SELECT id FROM feeds WHERE category=? ORDER BY rank;", (category, ) )
+ else:
+ rows = self.db.execute("SELECT id FROM feeds ORDER BY rank;" )
+ keys = []
+ for row in rows:
+ if row[0]:
+ keys.append(row[0])
+ return keys
+
+ def getListOfCategories(self):
+ return list(row[0] for row in self.db.execute(
+ "SELECT id FROM categories ORDER BY rank;"))
+
+ def getCategoryTitle(self, id):
+ return self.lookup('categories', 'title', id)
+
+ def getSortedListOfKeys(self, order, onlyUnread=False, category=1):
+ if order == "Most unread":
+ tmp = "ORDER BY unread DESC"
+ #keyorder = sorted(feedInfo, key = lambda k: feedInfo[k][1], reverse=True)
+ elif order == "Least unread":
+ tmp = "ORDER BY unread"
+ #keyorder = sorted(feedInfo, key = lambda k: feedInfo[k][1])
+ elif order == "Most recent":
+ tmp = "ORDER BY updateTime DESC"
+ #keyorder = sorted(feedInfo, key = lambda k: feedInfo[k][2], reverse=True)
+ elif order == "Least recent":
+ tmp = "ORDER BY updateTime"
+ #keyorder = sorted(feedInfo, key = lambda k: feedInfo[k][2])
+ else: # order == "Manual" or invalid value...
+ tmp = "ORDER BY rank"
+ #keyorder = sorted(feedInfo, key = lambda k: feedInfo[k][0])
+ if onlyUnread:
+ sql = "SELECT id FROM feeds WHERE unread>0 AND category=%s " %category + tmp
+ else:
+ sql = "SELECT id FROM feeds WHERE category=%s " %category + tmp
+ rows = self.db.execute(sql)
+ keys = []
+ for row in rows:
+ if row[0]:
+ keys.append(row[0])
+ return keys
+
+ def getFavicon(self, key):
+ filename = "%s%s.d/favicon.ico" % (self.configdir, key)
+ if isfile(filename):
+ return filename
+ else:
+ return False
+
+ def updateUnread(self, key):
+ feed = self.getFeed(key)
+ self.db.execute("UPDATE feeds SET unread=? WHERE id=?;", (feed.getNumberOfUnreadItems(), key))
+ self.db.commit()
+ self.cache_invalidate('feeds')
+
+ def addFeed(self, title, url, id=None, category=1):
+ if not id:
+ id = getId(url)
+ count = self.db.execute("SELECT count(*) FROM feeds WHERE id=?;", (id,) ).fetchone()[0]
+ if count == 0:
+ max_rank = self.db.execute("SELECT MAX(rank) FROM feeds;").fetchone()[0]
+ if max_rank == None:
+ max_rank = 0
+ values = (id, title, url, 0, 0, max_rank+1, None, "None", 1, category)
+ self.db.execute("INSERT INTO feeds (id, title, url, unread, updateTime, rank, etag, modified, widget, category) VALUES (?, ?, ? ,? ,? ,?, ?, ?, ?,?);", values)
+ self.db.commit()
+ # Ask for the feed object, it will create the necessary tables
+ self.getFeed(id)
+
+ if wc().available():
+ # Register the stream with Woodchuck. Update approximately
+ # every 6 hours.
+ wc().stream_register(stream_identifier=id,
+ human_readable_name=title,
+ freshness=6*60*60)
+
+ return True
+ else:
+ return False
+
+ def addCategory(self, title):
+ rank = self.db.execute("SELECT MAX(rank)+1 FROM categories;").fetchone()[0]
+ if rank==None:
+ rank=1
+ id = self.db.execute("SELECT MAX(id)+1 FROM categories;").fetchone()[0]
+ if id==None:
+ id=1
+ self.db.execute("INSERT INTO categories (id, title, unread, rank) VALUES (?, ?, 0, ?)", (id, title, rank))
+ self.db.commit()
+
+ def removeFeed(self, key):
+ if wc().available ():
+ try:
+ del wc()[key]
+ except KeyError:
+ logger.debug("Removing unregistered feed %s failed" % (key,))
+
+ rank = self.db.execute("SELECT rank FROM feeds WHERE id=?;", (key,) ).fetchone()[0]
+ self.db.execute("DELETE FROM feeds WHERE id=?;", (key, ))
+ self.db.execute("UPDATE feeds SET rank=rank-1 WHERE rank>?;", (rank,) )
+ self.db.commit()
+
+ if isdir(self.configdir+key+".d/"):
+ rmtree(self.configdir+key+".d/")
+
+ def removeCategory(self, key):
+ if self.db.execute("SELECT count(*) FROM categories;").fetchone()[0] > 1:
+ rank = self.db.execute("SELECT rank FROM categories WHERE id=?;", (key,) ).fetchone()[0]
+ self.db.execute("DELETE FROM categories WHERE id=?;", (key, ))
+ self.db.execute("UPDATE categories SET rank=rank-1 WHERE rank>?;", (rank,) )
+ self.db.execute("UPDATE feeds SET category=1 WHERE category=?;", (key,) )
+ self.db.commit()
+
+ #def saveConfig(self):
+ # self.listOfFeeds["feedingit-order"] = self.sortedKeys
+ # file = open(self.configdir+"feeds.pickle", "w")
+ # pickle.dump(self.listOfFeeds, file)
+ # file.close()
+
+ def moveUp(self, key):
+ rank = self.db.execute("SELECT rank FROM feeds WHERE id=?;", (key,)).fetchone()[0]
+ if rank>0:
+ self.db.execute("UPDATE feeds SET rank=? WHERE rank=?;", (rank, rank-1) )
+ self.db.execute("UPDATE feeds SET rank=? WHERE id=?;", (rank-1, key) )
+ self.db.commit()
+
+ def moveCategoryUp(self, key):
+ rank = self.db.execute("SELECT rank FROM categories WHERE id=?;", (key,)).fetchone()[0]
+ if rank>0:
+ self.db.execute("UPDATE categories SET rank=? WHERE rank=?;", (rank, rank-1) )
+ self.db.execute("UPDATE categories SET rank=? WHERE id=?;", (rank-1, key) )
+ self.db.commit()
+
+ def moveDown(self, key):
+ rank = self.db.execute("SELECT rank FROM feeds WHERE id=?;", (key,)).fetchone()[0]
+ max_rank = self.db.execute("SELECT MAX(rank) FROM feeds;").fetchone()[0]
+ if rank<max_rank:
+ self.db.execute("UPDATE feeds SET rank=? WHERE rank=?;", (rank, rank+1) )
+ self.db.execute("UPDATE feeds SET rank=? WHERE id=?;", (rank+1, key) )
+ self.db.commit()
+
+ def moveCategoryDown(self, key):
+ rank = self.db.execute("SELECT rank FROM categories WHERE id=?;", (key,)).fetchone()[0]
+ max_rank = self.db.execute("SELECT MAX(rank) FROM categories;").fetchone()[0]
+ if rank<max_rank:
+ self.db.execute("UPDATE categories SET rank=? WHERE rank=?;", (rank, rank+1) )
+ self.db.execute("UPDATE categories SET rank=? WHERE id=?;", (rank+1, key) )
+ self.db.commit()
+
+
--- /dev/null
+#!/usr/bin/env python2.5
+
+#
+# Copyright (c) 2007-2008 INdT.
+# Copyright (c) 2011 Neal H. Walfield
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+# ============================================================================
+# Name : update_feeds.py
+# Author : Yves Marcoz
+# Version : 0.6.1
+# Description : Simple RSS Reader
+# ============================================================================
+
+from rss_sqlite import Listing
+from config import Config
+from updatedbus import UpdateServerObject
+
+import os
+import traceback
+import sys
+import dbus
+
+from jobmanager import JobManager
+import mainthread
+
+import gobject
+gobject.threads_init()
+
+import logging
+logger = logging.getLogger(__name__)
+import debugging
+debugging.init(dot_directory=".feedingit", program_name="update_feeds")
+
+#CONFIGDIR="/home/user/.feedingit/"
+CONFIGDIR = os.environ.get("HOME", "/home/user") + "/.feedingit/"
+#DESKTOP_FILE = "/usr/share/applications/hildon-status-menu/feedingit_status.desktop"
+
# Give every socket a 5-second default timeout so a stalled server
# cannot hang the updater indefinitely.
from socket import setdefaulttimeout
timeout = 5
setdefaulttimeout(timeout)
del timeout
+
class FeedUpdate(UpdateServerObject):
    """
    The feed-update service: listens for DBus commands (daemon mode) or
    updates every feed once and exits (one-shot mode).
    """

    def __init__(self, bus_name):
        UpdateServerObject.__init__(self, bus_name)

        self.config = Config(self, CONFIGDIR+"config.ini")
        self.listing = Listing(self.config, CONFIGDIR)

        jm = JobManager(True)
        jm.stats_hook_register (self.job_manager_update,
                                run_in_main_thread=True)

        # Whether or not an update is in progress.
        self.am_updating = False

        # After an update has finished, we start the inactivity timer.
        # If this fires before a new job arrives, we quit.
        self.inactivity_timer = 0

        # Whether we started in daemon mode, or not.
        self.daemon = '--daemon' in sys.argv

        if self.daemon:
            logger.debug("Running in daemon mode: waiting for commands.")
            self.inactivity_timer = gobject.timeout_add(
                5 * 60 * 1000, self.inactivity_cb)
        else:
            # Update all feeds.
            logger.debug("Not running in daemon mode: updating all feeds.")
            gobject.idle_add(self.UpdateAll)

#        # If the system becomes idle
#        bus = dbus.SystemBus()
#
#        mce_request_proxy = bus.get_object(
#            'com.nokia.mce', '/com/nokia/mce/request')
#        mce_request_iface = dbus.Interface(
#            mce_request_proxy, 'com.nokia.mce.request')
#        system_idle = mce_request_iface.get_inactivity_status()
#        # Force self.system_inactivity_ind to run: ensure that a state
#        # change occurs.
#        self.system_idle = not system_idle
#        self.system_inactivity_ind(system_idle)
#
#        mce_signal_proxy = bus.get_object(
#            'com.nokia.mce', '/com/nokia/mce/signal')
#        mce_signal_iface = dbus.Interface(
#            mce_signal_proxy, 'com.nokia.mce.signal')
#        mce_signal_iface.connect_to_signal(
#            'system_inactivity_ind', self.system_inactivity_ind)

    def increase_download_parallelism(self):
        # The system has been idle for a while.  Enable parallel
        # downloads.
        logger.debug("Increasing parallelism to 4 workers.")
        JobManager().num_threads = 4
        gobject.source_remove (self.increase_download_parallelism_id)
        del self.increase_download_parallelism_id
        return False

    def system_inactivity_ind(self, idle):
        # The system's idle state changed.
        # NOTE(review): self.system_idle is only initialized by the
        # commented-out MCE code above, so this handler is effectively
        # unused unless that code is re-enabled.
        if (self.system_idle and idle) or (not self.system_idle and not idle):
            # No change.
            return

        if not idle:
            if hasattr (self, 'increase_download_parallelism_id'):
                gobject.source_remove (self.increase_download_parallelism_id)
                del self.increase_download_parallelism_id
        else:
            # Wait a minute before enabling parallel downloads, in case
            # the user comes right back.
            self.increase_download_parallelism_id = \
                gobject.timeout_add_seconds(
                    60, self.increase_download_parallelism)

        if not idle:
            logger.debug("Reducing parallelism to 1 worker.")
            JobManager().num_threads = 1

        self.system_idle = idle

    def job_manager_update(self, jm, old_stats, new_stats, updated_feed):
        """Track job statistics and emit start/finish DBus signals."""
        queued = new_stats['jobs-queued']
        in_progress = new_stats['jobs-in-progress']

        if (queued or in_progress) and not self.am_updating:
            logger.debug("new update started")
            self.am_updating = True
            self.UpdateStarted()
            self.UpdateProgress(0, 0, in_progress, queued, 0, 0, 0, "")

        if not queued and not in_progress:
            logger.debug("update finished!")
            self.am_updating = False
            self.UpdateFinished()
            self.ArticleCountUpdated()

            if self.daemon:
                # Quit if nothing new arrives within a minute.
                self.inactivity_timer = gobject.timeout_add(
                    60 * 1000, self.inactivity_cb)
            else:
                logger.debug("update finished, not running in daemon mode: "
                             "quitting")
                mainloop.quit()

        if (queued or in_progress) and self.inactivity_timer:
            # New work arrived: cancel the pending shutdown.
            gobject.source_remove(self.inactivity_timer)
            self.inactivity_timer = 0

    def inactivity_cb(self):
        """
        The updater has been inactive for a while. Quit.
        """
        assert self.inactivity_timer
        self.inactivity_timer = 0

        if not self.am_updating:
            logger.info("Nothing to do for a while. Quitting.")
            mainloop.quit()

    def StopUpdate(self):
        """
        Stop updating.
        """
        # Bug fix: the superclass method is StopUpdate (capital S);
        # calling the non-existent stopUpdate raised AttributeError.
        super(FeedUpdate, self).StopUpdate()

        JobManager().quit()

    def UpdateAll(self):
        """
        Update all feeds.
        """
        logger.info("starting update.")
        super(FeedUpdate, self).UpdateAll()

        feeds = self.listing.getListOfFeeds()
        for k in feeds:
            self.listing.updateFeed(k)
        logger.debug("Queued all feeds (%d) for update." % len(feeds))

    def Update(self, feed):
        """
        Update a particular feed.
        """
        super(FeedUpdate, self).Update(feed)

        # We got a request via dbus.  If we weren't in daemon mode
        # before, enter it now.
        self.daemon = True

        self.listing.updateFeed(feed)
+
+
import dbus.mainloop.glib
dbus.mainloop.glib.DBusGMainLoop(set_as_default=True)

mainloop = gobject.MainLoop()
mainthread.init()

# Acquire our name on the session bus.  If this doesn't work, most
# likely another update_feeds instance is already running.  In this
# case, just quit.
try:
    bus_name = dbus.service.BusName('org.marcoz.feedingit',
                                    bus=dbus.SessionBus(),
                                    do_not_queue=True)
except Exception:
    # We failed to acquire our bus name.  Die.
    try:
        # Best effort: report the pid of the instance that owns the name.
        dbus_proxy = dbus.SessionBus().get_object(
            'org.freedesktop.DBus', '/org/freedesktop/DBus')
        dbus_iface = dbus.Interface(dbus_proxy, 'org.freedesktop.DBus')
        pid = dbus_iface.GetConnectionUnixProcessID('org.marcoz.feedingit')
        logger.error("update_feeds already running: pid %d." % pid)
    except Exception, e:
        logger.error("Getting pid associated with org.marcoz.feedingit: %s"
                     % str(e))
        logger.error("update_feeds already running.")

    sys.exit(1)

# Run the updater.  Note: we run this until feed.am_updating is false.
# Only in this case have all worker threads exited.  If the main
# thread exits before all threads have exited and the process gets a
# signal, the Python interpreter is unable to handle the signal and it
# runs really slow (rescheduling after every single instruction instead
# of every few thousand).
feed = FeedUpdate(bus_name)
while True:
    try:
        mainloop.run()
    except KeyboardInterrupt:
        logger.error("Interrupted. Quitting.")
        JobManager().quit()

    if not feed.am_updating:
        break
+
--- /dev/null
+#!/usr/bin/env python2.5
+
+#
+# Copyright (c) 2007-2008 INdT.
+# Copyright (c) 2011 Neal H. Walfield
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+# ============================================================================
+# Name : FeedingIt.py
+# Author : Yves Marcoz
+# Version : 0.6.1
+# Description : Simple RSS Reader
+# ============================================================================
+
+import dbus.service
+import logging
+logger = logging.getLogger(__name__)
+
_update_server_object = None
def update_server_object():
    """Return the process-wide UpdateServerObject singleton.

    Asserts that an UpdateServerObject has already been instantiated.
    """
    obj = _update_server_object
    assert obj is not None, \
        "No UpdateServerObject instantiated!"
    return obj
+
class UpdateServerObject(dbus.service.Object):
    """
    DBus service object exported at /org/marcoz/feedingit/update.
    Subclasses override the method handlers; the signals below are
    emitted to notify the UI of update progress.
    """

    def __init__(self, bus_name):
        """
        Start listening for requests.
        """
        # Record the singleton so update_server_object() can find it.
        global _update_server_object
        assert _update_server_object is None, \
            "Attempt to instantiate multiple UpdateServerObject objects."
        _update_server_object = self

        dbus.service.Object.__init__(self, bus_name,
                                     '/org/marcoz/feedingit/update')

    @dbus.service.method('org.marcoz.feedingit')
    def StopUpdate(self):
        """Request that any in-progress update be aborted."""
        logger.debug("Stop update called.")

    @dbus.service.method('org.marcoz.feedingit')
    def UpdateAll(self):
        """Request an update of every subscribed feed."""
        logger.debug("UpdateAll called.")

    @dbus.service.method('org.marcoz.feedingit', in_signature='s')
    def Update(self, feed):
        """Request an update of a single feed (by id)."""
        logger.debug("Update(%s) called." % feed)

    # A signal that will be exported to dbus
    @dbus.service.signal('org.marcoz.feedingit', signature='')
    def ArticleCountUpdated(self):
        pass

    # A signal that will be exported to dbus
    @dbus.service.signal('org.marcoz.feedingit', signature='uuuuttus')
    def UpdateProgress(self, percent_complete,
                       feeds_downloaded, feeds_downloading, feeds_pending,
                       bytes_downloaded, bytes_uploaded, bytes_per_second,
                       updated_feed):
        pass

    # A signal that will be exported to dbus
    @dbus.service.signal('org.marcoz.feedingit', signature='')
    def UpdateStarted(self):
        pass

    # A signal that will be exported to dbus
    @dbus.service.signal('org.marcoz.feedingit', signature='')
    def UpdateFinished(self):
        pass
+
+
--- /dev/null
+# Copyright (c) 2011 Neal H. Walfield
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
import logging
logger = logging.getLogger(__name__)
import traceback

# Don't fail if the Woodchuck modules are not available. Just disable
# Woodchuck's functionality.

# Whether we imported the woodchuck modules successfully.
woodchuck_imported = True
try:
    import pywoodchuck
    from pywoodchuck import PyWoodchuck
    from pywoodchuck import woodchuck
except ImportError:
    # "except ImportError:" instead of the Python-2-only
    # "except ImportError, exception:" form; the bound exception was
    # unused anyway (the message uses traceback.format_exc()).
    # Lazy %-args defer formatting until the record is actually emitted.
    logger.info(
        "Unable to load Woodchuck modules: disabling Woodchuck support: %s",
        traceback.format_exc())
    woodchuck_imported = False

    # Minimal stub so callers can still ask "is Woodchuck available?".
    class PyWoodchuck(object):
        def available(self):
            return False

    woodchuck = None

# The default channel refresh interval: 6 hours.
refresh_interval = 6 * 60 * 60
+
class mywoodchuck(PyWoodchuck):
    """FeedingIt's glue to the Woodchuck transfer scheduler.

    Degrades gracefully: if the connection to the Woodchuck server cannot
    be established, available() is rebound to report False and the object
    is otherwise inert.
    """
    def __init__(self, listing, human_readable_name, identifier,
                 request_feedback):
        try:
            PyWoodchuck.__init__(self, human_readable_name, identifier,
                                 request_feedback)
        except Exception as e:
            # Woodchuck support is strictly optional: log and disable
            # instead of propagating the failure to the caller.
            logger.error(
                "Failed to establish a connection to the Woodchuck server: %s"
                % (str(e),))
            self.available = self.not_available
            return

        self.listing = listing

    def not_available(self):
        # Substituted for PyWoodchuck.available when the connection failed.
        return False

    # Woodchuck upcalls.
    def stream_update_cb(self, stream):
        """Upcall: Woodchuck requests a refresh of the given stream (feed)."""
        logger.debug("stream update called on %s (%s)"
                     % (stream.human_readable_name, stream.identifier,))

        # Make sure no one else is concurrently updating this
        # feed.
        try:
            self.listing.updateFeed(stream.identifier)
        except Exception:
            # Narrowed from a bare "except:" so KeyboardInterrupt and
            # SystemExit still propagate; update failures are only logged.
            logger.debug("Updating %s: %s"
                         % (stream.identifier, traceback.format_exc()))

    def object_transfer_cb(self, stream, object,
                           version, filename, quality):
        """Upcall: Woodchuck requests a transfer of an object (article)."""
        # BUG FIX: the original called the undefined name log(), which
        # raised a NameError whenever this upcall fired.
        logger.debug("object transfer called on %s (%s) in stream %s (%s)"
                     % (object.human_readable_name, object.identifier,
                        stream.human_readable_name, stream.identifier))
+
# Woodchuck singleton; populated exactly once by wc_init().
_w = None

def wc_init(listing, request_feedback=False):
    """Connect to the woodchuck server and initialize any state."""
    global _w
    assert _w is None

    _w = mywoodchuck(listing, "FeedingIt", "org.marcoz.feedingit",
                     request_feedback)

    # Equivalent branch, written positively (De Morgan of the original).
    if woodchuck_imported and _w.available():
        logger.debug("Woodchuck appears to be available.")
    else:
        logger.info("Unable to contact Woodchuck server.")

def wc():
    """Return the Woodchuck singleton."""
    assert _w is not None
    return _w
--- /dev/null
+import Qt 4.7
+import QtWebKit 1.0
+import "common" as Common
+
// Renders one article in a zoomable, vertically flickable WebView.
// NOTE(review): instantiated inside ArticleViewer's delegate -- 'flipItem',
// 'articleViewer' and 'vertPanningEnabled' resolve in that context; this
// component is not self-contained.
Rectangle {
    /*x: parent.width; height: parent.height;*/
    width: parent.width;
    height: parent.height
    // Exposed to the enclosing delegate: toggling zoomEnabled shows/hides
    // the zoom slider; value is the current zoom factor.
    property alias zoomEnabled: slider.visible;
    property alias value: slider.value;
    //anchors.top: parent.top; anchors.bottom: parent.bottom;
    color: "white";

    Flickable {
        id: flickable
        //anchors.fill: screen;
        height: parent.height;
        width: parent.width;
        // Content size tracks the scaled WebView so zooming extends the
        // flickable area accordingly.
        contentWidth: webView.width*webView.scale; //Math.max(screen.width,webView.width*webView.scale)
        contentHeight: Math.max(articleViewer.height,webView.height*webView.scale)
        //contentWidth: childrenRect.width; contentHeight: childrenRect.height
        // Panning is disabled while the horizontal article flip is in
        // progress (driven by the parent context's vertPanningEnabled).
        interactive: parent.vertPanningEnabled;

        flickDeceleration: 1500;
        flickableDirection: Flickable.VerticalFlick
        WebView {
            id: webView
            //url: flipItem.url;
            html: flipItem.html;
            preferredWidth: flickable.width
            preferredHeight: flickable.height
            //scale: 1.25;
            // Scale from the top-left corner so scrolling math in the
            // slider handler stays simple.
            transformOrigin: Item.TopLeft
            scale: slider.value;
            settings.defaultFontSize: 24
        }

//        onFlickStarted: {
//            console.log("start contentx"+contentX)
//            console.log("start contenty"+contentY)
//        }
    }

    Common.Slider {
        id: slider; visible: false
        minimum: 0.2;
        maximum: 2;
        // Previous zoom factor, used to keep the viewport centered on the
        // same content point while zooming.
        property real prevScale: 1
        anchors {
            bottom: parent.bottom; bottomMargin: 65
            left: parent.left; leftMargin: 25
            right: parent.right; rightMargin: 25
        }
        onValueChanged: {
            // Re-center the flickable so the midpoint of the viewport maps
            // to the same document position at the new scale.
            if (webView.width * value > flickable.width) {
                var xoff = (flickable.width/2 + flickable.contentX) * value / prevScale;
                flickable.contentX = xoff - flickable.width/2;
            }
            if (webView.height * value > flickable.height) {
                var yoff = (flickable.height/2 + flickable.contentY) * value / prevScale;
                flickable.contentY = yoff - flickable.height/2;
            }
            prevScale = value;
        }
        // Nudge value through 0 then 1 to force an initial layout pass.
        Component.onCompleted: {value=0; value=1; }
    }
}
--- /dev/null
+import Qt 4.7
+
Item {
    // Article browser for one feed: a vertical list of titles (the 'list'
    // part) and a horizontally flickable full-article reader (the 'flip'
    // part), both driven by the same XmlListModel through a
    // VisualDataModel Package.
    // NOTE(review): relies on the instantiating context to provide the
    // 'feedid' property and the 'controller' context object -- confirm
    // against FeedingIt.qml, which declares feedid on this element.
    id: articleViewer
    //width: 480; height: 360;
    width: parent.width; height: parent.height;
    //property string feedid: "61ac1458d761423344998dc76770e36e" //articlesItem.feedid;
    //property string hideReadArticles: "";
    property alias articleShown: articleView.visible;
    property bool zoomEnabled: false;
    property bool vertPanningEnabled: true

    function modulo(x, y) {
        // Fixes modulo for negative numbers
        return ((x % y) + y) % y;
    }

    function reload() {
        // An empty feedid yields an empty document so the model clears.
        articles.xml = articleViewer.feedid == "" ? "<?xml version=\"1.0\" encoding=\"utf-8\"?><xml></xml>" : controller.getArticlesXml(articleViewer.feedid);
        articles.reload()
    }

    function next() {
        if (articleView.visible) {
            //articleView.positionViewAtIndex(modulo(articleView.currentIndex+1, articleView.count), ListView.Contain);
            articleView.incrementCurrentIndex();
        }
    }

    function prev() {
        if (articleView.visible) {
            //articleView.positionViewAtIndex(modulo(articleView.currentIndex-1, articleView.count), ListView.Contain);
            articleView.decrementCurrentIndex();
        }
    }

    function markAllAsRead() {
        if (feedid != "") {
            controller.markAllAsRead(feedid)
            articles.reload();
        }
    }

    function viewArticle(articleid) {
        // Locate the requested article in the model, then open the reader
        // positioned on it.
        var index = 0;
        for (var i = 0; i < articleList.count; ++i) {
            // BUG FIX: the original tested articles.get(0) on every
            // iteration, so any article other than the first was never
            // found and the reader always opened at index 0.
            if (articles.get(i).articleid == articleid) {
                index = i;
            }
        }
        articleView.positionViewAtIndex(index, ListView.Contain); articleView.visible = true;
    }

    // Title listing ('list' part of the Package).
    ListView {
        id: articleList; model: visualModel.parts.list; z: 6
        width: parent.width; height: parent.height; /*x: 0;*/
        cacheBuffer: 100;
        flickDeceleration: 1500
    }

    // Full-article reader ('flip' part); one page per article, snapping
    // horizontally between articles.
    ListView {
        id: articleView; model: visualModel.parts.flip; orientation: ListView.Horizontal
        width: parent.width; height: parent.height; visible: false; z: 8
        //onCurrentIndexChanged: photosGridView.positionViewAtIndex(currentIndex, GridView.Contain)
        highlightRangeMode: ListView.StrictlyEnforceRange; snapMode: ListView.SnapOneItem
        //cacheBuffer: 5;
        // Disable vertical panning of the page while flipping sideways.
        onMovementStarted: articleViewer.vertPanningEnabled = false;
        onMovementEnded: articleViewer.vertPanningEnabled = true;
        highlightMoveDuration: 300;
    }

    // Placeholder shown while the model is empty: spinner while loading,
    // "no articles" text once loading finished with nothing to show.
    Rectangle {
        id: noArticle
        //width: parent.width; height: parent.height;
        //color: "#000000"
        anchors.centerIn: parent;
        visible: false;
        z: 8;
        Text { id: noText; color: "#ffffff"; anchors.centerIn: parent; text: qsTr("No articles available"); }
        Image { id: loadingImage; anchors.centerIn: parent; source: "common/images/loading.png";
            height: 96; width: 96;
            NumberAnimation on rotation {
                from: 0; to: 360; running: (loadingImage.visible == true); loops: Animation.Infinite; duration: 900
            }
        }

        states: [ State {
                name: "noArticle"; when: articles.count == 0 && articles.status == XmlListModel.Ready
                PropertyChanges { target: noArticle; visible: true; }
                PropertyChanges { target: loadingImage; visible: false; }
                PropertyChanges { target: noText; visible: true; }
            }, State {
                name: "loading"; when: articles.count == 0 && articles.status != XmlListModel.Ready
                PropertyChanges { target: noArticle; visible: true; }
                PropertyChanges { target: noText; visible: false; }
                PropertyChanges { target: loadingImage; visible: true; }
            }
        ]
    }

    VisualDataModel {
        id: visualModel;
        delegate: Package {
            id: packageItem
            // Reader page: loads the article lazily, only when it is the
            // current page or an immediate neighbour.
            Item { id: flipItem; Package.name: 'flip'; width: articleViewer.width; height: articleViewer.height;

                property string url: (articleView.visible && Math.abs(articleView.currentIndex-index)<2) ? path : ""; //http://localhost:8000/html/" + articleViewer.feedid + "/" + articleid : "";
                property string html: controller.getArticle(articleViewer.feedid, articleid)
                ArticleDisplay {
                    zoomEnabled: articleViewer.zoomEnabled;
                    property bool vertPanningEnabled: articleViewer.vertPanningEnabled;

                    states: [ State {
                            name: 'articleIsRead';
                            when: articleView.visible && articleView.currentIndex == index;
                            StateChangeScript {
                                name: "myScript"
                                script: {
                                    flipItem.url = path; //"http://localhost:8000/html/" + articleViewer.feedid + "/" + articleid;
                                    // Viewing an article marks it read.
                                    controller.setEntryRead(articleViewer.feedid, articleid)
                                }
                            }
                        }, State {
                            name: 'articleIsClose'; when: articleView.visible && Math.abs(articleView.currentIndex-index)<2;
                            StateChangeScript {
                                script: { flipItem.url = path; } //"http://localhost:8000/html/" + articleViewer.feedid + "/" + articleid;}
                            }
                        }
                    ]
                }
            }

            // Title row for the list part.
            Item { Package.name: 'list';
                id: wrapper; width: articleViewer.width; height: 86
                Item {
                    id: moveMe
                    Rectangle { id: backRect; color: "black"; opacity: index % 2 ? 0.2 : 0.4; height: 84; width: wrapper.width; y: 1 }
                    Text {
                        anchors.fill: backRect
                        anchors.margins: 5
                        verticalAlignment: Text.AlignVCenter; text: title; color: (unread=="True") ? "white" : "#7b97fd";
                        width: wrapper.width; wrapMode: Text.WordWrap; font.bold: false;
                    }
                }
                MouseArea { anchors.fill: wrapper;
                    onClicked: { articleView.positionViewAtIndex(index, ListView.Contain); articleView.visible = true; }
                }
            }
        }
        model: articles
    }

    XmlListModel {
        id: articles

        //source: articleViewer.feedid == "" ? "" : "http://localhost:8000/articles/" + feedid + "?onlyUnread=" + hideReadArticles
        //xml: articleViewer.feedid == "" ? "" : controller.getArticlesXml(articleViewer.feedid)
        query: "/xml/article"

        XmlRole { name: "title"; query: "title/string()" }
        XmlRole { name: "articleid"; query: "articleid/string()"; isKey: true }
        XmlRole { name: "path"; query: "path/string()" }
        XmlRole { name: "unread"; query: "unread/string()"; isKey: true }
    }


}
--- /dev/null
+import Qt 4.7
+
// Article listing backed by the local HTTP backend.
// NOTE(review): references 'articlesModel', 'hideReadArticles' and
// 'container', none of which are defined in this file -- they must come
// from the instantiating context. This component appears to be superseded
// by ArticleViewer.qml (which reads the model via the controller instead
// of HTTP); confirm before reusing it.
Item {
    //anchors.fill: parent;
    width: parent.width;
    property string feedid : ""
    property alias count: articles.count
    property alias url: articles.source

    x: parent.width; height: parent.height;
    anchors.top: parent.top; anchors.bottom: parent.bottom

    function getArticleid(index) {
        // NOTE(review): assumes XmlListModel.get() is available -- confirm
        // against the deployed Qt Quick version.
        return articles.get(index).articleid
    }

    function reload() {
        //articlesModel.reload()
    }

    ListView {
        id: articleList; model: articlesModel; delegate: articleDelegate; z: 6
        width: parent.width; height: parent.height; /*x: 0;*/
        cacheBuffer: 100;
        flickDeceleration: 1500
    }

    XmlListModel {
        id: articles

        // Empty feedid disables loading; otherwise fetch from the local
        // backend, filtered by the context's hideReadArticles flag.
        source: feedid == "" ? "" : "http://localhost:8000/articles/" + feedid + "?onlyUnread=" + hideReadArticles
        query: "/xml/article"

        XmlRole { name: "title"; query: "title/string()" }
        XmlRole { name: "articleid"; query: "articleid/string()"; isKey: true }
        XmlRole { name: "path"; query: "path/string()" }
        XmlRole { name: "unread"; query: "unread/string()"; isKey: true}
    }

    Component {
        id: articleDelegate

        Item {
            id: wrapper; width: wrapper.ListView.view.width; height: 86
            Item {
                id: moveMe
                Rectangle { id: backRect; color: "black"; opacity: index % 2 ? 0.2 : 0.4; height: 84; width: wrapper.width; y: 1 }
                Text {
                    anchors.fill: backRect
                    anchors.margins: 5
                    // NOTE(review): 'model.article.unread' -- XmlListModel
                    // roles are normally exposed directly (e.g. 'unread');
                    // verify this resolves.
                    verticalAlignment: Text.AlignVCenter; text: title; color: (model.article.unread=="True") ? "white" : "#7b97fd";
                    width: wrapper.width; wrapMode: Text.WordWrap; font.bold: false;
                }
//                Rectangle {
//                    x: 3; y: 4; width: 77; height: 77; color: "#ff0000"; smooth: true

//                }

//                Column {
//                    x: 3;

//                    width: wrapper.width - 3; y: 5; spacing: 2
//                    height: parent.height;
//                    Text { Rectangle {anchors.fill: parent; color: "white"; opacity: 0.5;}
//                    verticalAlignment: Text.AlignVCenter; text: model.article.title; color: (model.article.unread=="True") ? "white" : "#7b97fd"; width: parent.width; wrapMode: Text.WordWrap; font.bold: false; /*elide: Text.ElideRight;*/ /*style: Text.Raised;*/ styleColor: "black"; }
//                    //Text { text: feedname; width: parent.width; elide: Text.ElideLeft; color: "#cccccc"; style: Text.Raised; styleColor: "black" }
//                }
            }
            MouseArea {
                anchors.fill: wrapper;
                onClicked: {
                    container.articleClicked(model.article.articleid, index)
                }
            }
        }

    }

}
--- /dev/null
+import Qt 4.7
+
// Category listing: one row per category with edit/delete buttons in edit
// mode. Data comes from the 'controller' context object as XML.
Item {
//    anchors.fill: parent;
    width: parent.width; height: parent.height;
    //anchors.top: parent.top; anchors.bottom: parent.bottom
    property bool inEditMode: true

    function reload() {
        categories.reload();
    }

    ListView {
        id: categoryList; model: categories; delegate: categoryDelegate; z: 6;
        cacheBuffer: 100; width: parent.width; height: parent.height;
    }

    XmlListModel {
        id: categories

        xml: controller.getCategoryXml()
        query: "/xml/category"

        // The <catname> element is exposed to delegates under the role
        // name "title".
        XmlRole { name: "title"; query: "catname/string()" }
        XmlRole { name: "catid"; query: "catid/string()"; isKey: true }
    }

    Component {
        id: categoryDelegate

        Item {
            id: wrapper; width: wrapper.ListView.view.width; height: 86
            Item {
                id: moveMe
                height: parent.height
                Rectangle { color: "black"; opacity: index % 2 ? 0.2 : 0.4; height: 84; width: wrapper.width; y: 1 }
                Rectangle {
                    x: 6; y: 4; width: 77; height: parent.height - 9; color: "white"; smooth: true
                }
                Column {
                    x: 92; width: wrapper.ListView.view.width - 95; y: 15; spacing: 2
                    Text { text: title; color: "white"; width: parent.width; font.bold: true; elide: Text.ElideRight; style: Text.Raised; styleColor: "black" }
                    //Text { text: feedname; width: parent.width; elide: Text.ElideLeft; color: "#cccccc"; style: Text.Raised; styleColor: "black" }
                }
                // Edit button (only in edit mode).
                Item {
                    x: wrapper.ListView.view.width - 128; y: 12
                    height: 58; width: 58;
                    //anchors.horizontalCenter: parent.horizontalCenter;
                    Image { source: "common/images/wmEditIcon.png" }
                    MouseArea {
                        // BUG FIX: the model role is named "title" (mapped
                        // from <catname> above); the original referenced the
                        // undefined name 'catname', which raised a
                        // ReferenceError when the edit icon was clicked.
                        anchors.fill: parent; onClicked: { container.categoryEdit(title, catid); }
                    }
                    visible: inEditMode
                }
                // Delete button (only in edit mode).
                Item {
                    x: wrapper.ListView.view.width - 64; y: 12
                    height: 58; width: 58;
                    //anchors.horizontalCenter: parent.horizontalCenter;
                    Image { source: "common/images/delete.png" }
                    MouseArea {
                        anchors.fill: parent; onClicked: { container.categoryDeleted(catid); }
                    }
                    visible: inEditMode
                }
            }
            // Whole-row tap opens the category (disabled while editing).
            MouseArea { enabled: !inEditMode; anchors.fill: wrapper; onClicked: { container.categoryClicked(catid); } }
        }
    }
}
--- /dev/null
+import Qt 4.7
+import "common" as Common
+// Depends on qt4-declarative-qmlviewer
+
// Top-level UI: a stack of three sliding panes (categories -> feeds ->
// articles) under a shared toolbar, plus dialogs and a poll timer that
// tracks backend update state over the local HTTP interface.
Item {
    width: 480
    height: 640
    anchors.fill: parent
    id: screen

    Rectangle {
        id: container
        anchors.fill: parent; color: "#343434";
        anchors.centerIn: parent
        //transformOrigin: Item.Center
        property bool editMode: false
        property bool inPortrait: width < height

        // Drill down from the category list into its feed list.
        function categoryClicked(catid) {
            feedsItem.catid = catid;
            feedsItem.reload();
            categoriesItem.isShown = false;
            feedsItem.visible = true;
        }

        // Drill down from the feed list into the article viewer.
        function feedClicked(feedid, updating) {
            flipper.feedid = feedid;
            flipper.reload();
            toolBar.feedUpdating = updating;
            flipper.visible = true;
        }

        // One level up per press: article -> article list -> feed list ->
        // category list -> quit.
        function backClicked() {
            if (flipper.visible && flipper.articleShown) {
                // We're viewing an article, and going back to article listing
                flipper.articleShown = false;
                flipper.reload()
                //flipper.articleid = "";
                //flipper.value = 1;
                //articlesItem.reload()
                return;
            }
            if (flipper.visible) {
                feedsItem.reload();
                toolBar.feedUpdating = false;
                flipper.visible = false;
                flipper.feedid = "";
                flipper.reload();
                return;
            }

            if (feedsItem.visible) {
                // Viewing feeds, going back to categories
                //feedsItem.catid = "";
                feedsItem.visible = false;
                //feedsItem.reload();
                categoriesItem.isShown = true;
                return;
            }
            if (!feedsItem.visible) {
                // Viewing categories, quitting
                Qt.quit();
            }
        }

        // Deletions go through the confirmation dialog; the dialog's state
        // selects which action its OK button performs.
        function categoryDeleted(catid) {
            confirmationMessage.catid=catid;
            confirmationMessage.state="deleteCat";
        }

        // NOTE(review): callers must supply both catid and feedid, in that
        // order -- the delete URL is built from both.
        function feedDeleted(catid, feedid) {
            confirmationMessage.catid=catid;
            confirmationMessage.feedid=feedid;
            confirmationMessage.state="deleteFeed";
        }

        // NOTE(review): 'addFeed' here resolves to the addFeed() function
        // below, not to a dialog; the feed dialog's id is 'addFeedDialog'.
        // These assignments look like they target the wrong object (and
        // 'feedid' is unused) -- confirm intended behavior.
        function feedEdit(feedname, feedid, url) {
            addFeed.feedEdit = true;
            addFeed.feedName = feedname;
            addFeed.feedUrl = url;
            addFeed.visible = true;
        }

        function addCategory(categoryName) {
            controller.addCategory(categoryName)
            categoriesItem.reload();
            addCat.visible=false;
        }

        function addFeed(catid, feedName, feedURL) {
            controller.addFeed(feedName, feedURL, catid)
            // NOTE(review): 'doc' is created but never used.
            var doc = new XMLHttpRequest();
            feedsItem.reload();
            addFeedDialog.visible=false;
        }

        function updateClicked(feedid) {
            controller.updateFeed(feedid);
        }

        function updateAllClicked() {
            controller.updateAll();
        }

        // Slide-down configuration menu.
        Common.Menu {
            id: config
            z: 5
            property string hideReadFeeds : "False"
            property string hideReadArticles : "False"

            property bool isShown: false;

            //width: parent.width; height: parent.height;

            //height: 0
            states: State {
                name: "shown"; when: config.isShown == true
                PropertyChanges { target: config; y: 66 }
            }

            transitions: Transition {
                NumberAnimation { properties: "y"; duration: 300; easing.type: "InOutQuad" }
            }

        }

        // Shared confirmation dialog; its 'state' encodes the pending
        // action and action() dispatches on it when OK is pressed.
        Common.ConfirmationMessage {
            id: confirmationMessage;
            property string catid: "";
            property string feedid: "";

            function action() {
                if (state=="markAll") {
                    flipper.markAllAsRead();
                    state="hidden"
                    feedsItem.reload()
                    return;
                }
                if (state=="deleteCat") {
                    var doc = new XMLHttpRequest();
                    var url = "http://localhost:8000/deleteCat/"+catid
                    doc.open("GET", url);
                    doc.send();
                    categoriesItem.reload();
                    state="hidden";
                    return;
                }
                if (state=="deleteFeed") {
                    var doc = new XMLHttpRequest();
                    var url = "http://localhost:8000/deleteFeed/"+catid+"/"+feedid
                    doc.open("GET", url);
                    doc.send();
                    feedsItem.reload();
                    state="hidden";
                    return;
                }
            }
            visible: false
            onOkClicked: action()
            onCancelClicked: visible=false
            state: "hidden"
            states: [ State {name: "markAll";
                    PropertyChanges { target: confirmationMessage; text: qsTr("Do you want to mark all items as read?") }
                    PropertyChanges { target: confirmationMessage; visible: true; }

                }, State {name: "deleteCat";
                    PropertyChanges { target: confirmationMessage; text: qsTr("Do you want to delete this category?") }
                    PropertyChanges { target: confirmationMessage; visible: true; }
                }, State {name: "deleteFeed";
                    PropertyChanges { target: confirmationMessage; text: qsTr("Do you want to delete this feed and all its articles?") }
                    PropertyChanges { target: confirmationMessage; visible: true; }
                }, State {name: "hidden";
                    PropertyChanges { target: confirmationMessage; visible: false; }
                }
            ]

        }

        // Toolbar; which buttons are visible depends on the current pane
        // (see the states below).
        Common.ToolBar {
            id: toolBar; z: 7
            height: 66; anchors.top: parent.top; width: parent.width; opacity: 0.9
            menuLabel: qsTr("Config"); backLabel: qsTr("Back")
            nextLabel: qsTr("Next"); prevLabel: qsTr("Previous")
            markAllLabel: qsTr("Mark All As Read"); zoomLabel: qsTr("Zoom")
            taskSwitcherLabel: qsTr("Task Switch")
            onMenuClicked: config.isShown = !config.isShown;
            onBackClicked: container.backClicked()
            onPrevClicked: flipper.prev();
            onNextClicked: flipper.next();
            onMarkAllClicked: {
                confirmationMessage.state = "markAll";
            }
            onZoomClicked: { flipper.zoomEnabled = !flipper.zoomEnabled; }
            onTaskSwitcherClicked: {
                // Ask the backend to raise the system task switcher.
                var doc = new XMLHttpRequest();
                var url = "http://localhost:8000/task"
                doc.open("GET", url);
                doc.send();
            }
            onAddClicked: {
                if (feedsItem.visible) {
                    addFeedDialog.feedName="";
                    addFeedDialog.catid = feedsItem.catid;
                    addFeedDialog.visible = true;
                    return;
                }
                if (categoriesItem.visible) {
                    addCat.catName="";
                    addCat.visible=true;
                    return;
                }
            }
            onUpdateClicked: {
                if (flipper.visible) {
                    toolBar.feedUpdating = true
                    container.updateClicked(flipper.feedid);
                } else {
                    container.updateAllClicked();
                }
            }

            states: [ State {
                    name: "navButtons"; when: flipper.articleShown
                    PropertyChanges { target: toolBar; nextVisible: !container.inPortrait; }
                    PropertyChanges { target: toolBar; prevVisible: !container.inPortrait; }
                    //PropertyChanges { target: toolBar; zoomVisible: true; }
                    PropertyChanges { target: toolBar; addVisible: false; }
                },
                State {
                    name: "feedButtons"; when: (flipper.visible)&&(!flipper.articleShown)
                    PropertyChanges { target: toolBar; markAllVisible: true; }
                    PropertyChanges { target: toolBar; addVisible: false; }
                    PropertyChanges { target: toolBar; updateVisible: true; }
                },
                State {
                    name: "quitButton"; when: (!feedsItem.visible)
                    PropertyChanges { target: toolBar; quitVisible: true;}
                    PropertyChanges { target: toolBar; updateVisible: true; }
                    //PropertyChanges { target: toolBar; addVisible: true; }
                }
            ]
        }

        // Content area below the toolbar holding the three panes and the
        // add dialogs.
        Item {
            id: views
            //x: 2;
            //y:66;
            width: parent.width // - 4
            height: parent.height-toolBar.height;
            anchors.top: toolBar.bottom; anchors.bottom: parent.bottom
            y: toolBar.height;

            Common.AddCat {
                visible: false;
                id: addCat
                width: parent.width;
                height: parent.height;
                z: 10;
            }

            Common.AddFeed {
                visible: false;
                id: addFeedDialog
                width: parent.width;
                height: parent.height;
                z: 10;
            }

            // Polls the backend every 2s for update status and remote
            // commands (open feed/article, add feed).
            Timer {
                function checkUpdates() {
                    if (categoriesItem.visible && !feedsItem.visible) {
                        var doc = new XMLHttpRequest();
                        var url = "http://localhost:8000/isUpdating/"
                        doc.onreadystatechange = function() {
                            if (doc.readyState == XMLHttpRequest.DONE) {
                                var xmlDoc = doc.responseXML.documentElement;
                                //var els = xmlDoc.getElementsByTagName("updating");
                                var isUpdating = xmlDoc.firstChild.firstChild.nodeValue;

                                //console.log(isUpdating);
                                if (isUpdating=="True") {
                                    toolBar.feedUpdating = true;
                                } else {
                                    if (toolBar.feedUpdating) {
                                        // We changed from updating to not updating, so we reload the listing
                                        toolBar.feedUpdating = false;
                                        categoriesItem.reload();
                                    }
                                }
                                var commands = xmlDoc.lastChild.childNodes;
                                for (var ii = 0; ii < commands.length; ++ii) {
                                    // process the commands
                                    var command = commands[ii].attributes[0].value; //("c")
                                    //console.log(command)
                                    if (command=="openFeed") {
                                        // Open feed feed
                                        var catid = commands[ii].attributes[1].value;
                                        var feedid = commands[ii].firstChild.nodeValue;
                                        if (!flipper.visible) {
                                            container.categoryClicked(catid);
                                            container.feedClicked(feedid,false);
                                            console.log("feedid: " + feedid);
                                        }
                                    }
                                    if (command=="openArticle") {
                                        // Open feed and article
                                        var catid = commands[ii].attributes[1].value;
                                        var feedid = commands[ii].attributes[2].value; //("key");
                                        var articleid = commands[ii].firstChild.nodeValue;
                                        if (!flipper.visible) {
                                            container.categoryClicked(catid);
                                            container.feedClicked(feedid,false);
                                            flipper.viewArticle(articleid)
                                        }
                                    }
                                    if (command=="addFeed") {
                                        // Open the addFeed dialog
                                        // NOTE(review): the fetched url is
                                        // currently unused -- the dialog is
                                        // never opened here.
                                        var url = commands[ii].firstChild.nodeValue;
                                        //console.log("add: "+url)

                                    }
                                }

                            }
                        }
                        doc.open("GET", url);
                        doc.send();
                        //categoriesItem.reload()
                    }
                    if (feedsItem.visible && !flipper.visible) {
                        //feedsItem.reload()
                    }
                    if (flipper.visible) {
                        var doc = new XMLHttpRequest();
                        var url = "http://localhost:8000/isUpdating/" + flipper.feedid
                        doc.onreadystatechange = function() {
                            if (doc.readyState == XMLHttpRequest.DONE) {
                                var xmlDoc = doc.responseXML.documentElement;
                                var isUpdating = xmlDoc.firstChild.firstChild.nodeValue;
                                //console.log(isUpdating);
                                if (isUpdating=="True") {
                                    toolBar.feedUpdating = true;
                                } else {
                                    if (toolBar.feedUpdating) {
                                        // We changed from updating to not updating, so we reload the listing
                                        toolBar.feedUpdating = false;
                                        flipper.reload();
                                    }
                                }
                            }
                        }
                        doc.open("GET", url);
                        doc.send();

                        //flipper.reload()
                    }
                }
                interval: 2000; running: false; repeat: true
                onTriggered: checkUpdates();
            }

            Categories {
                // Loads the categoryList view and delegate
                id: categoriesItem
                property bool isShown: true;
                inEditMode: container.editMode;

                // Slides off-screen to the left when a category is opened.
                states: State {
                    name: "shown"; when: categoriesItem.isShown == false
                    PropertyChanges { target: categoriesItem; x: -screen.width }
                }

                transitions: Transition {
                    NumberAnimation { properties: "x"; duration: 300; easing.type: "InOutQuad" }
                }

            }

            Feeds {

                // Loads the feedList view and delegate
                id: feedsItem;
                property string hideReadFeeds: config.hideReadFeeds
                visible: false;
                inEditMode: container.editMode;

                states: [
                    State { name: "articlesShown"; when: flipper.visible; PropertyChanges { target: feedsItem; x: -parent.width } },
                    State { name: "shown"; when: feedsItem.visible; PropertyChanges { target: feedsItem; x: 0 } }
                ]

                transitions: Transition {
                    NumberAnimation { properties: "x"; duration: 300; easing.type: "InOutQuad" }
                }

            }

            ArticleViewer {
                id: flipper
                visible: false;
                property string hideReadArticles: config.hideReadArticles
                property string feedid: ""
                x: parent.width

                states: State { name: "shown"; when: flipper.visible; PropertyChanges { target: flipper; x: 0 }
                }

                transitions: Transition {
                    NumberAnimation { properties: "x"; duration: 300; easing.type: "InOutQuad" }
                }
            }
        }

//    Text {
//        x: container.width/2
//        y:container.height/2
//        text: runtime.orientation;
//    }

    }
}
--- /dev/null
+/* File generated by QtCreator */
+
+import QmlProject 1.0
+
/* QmlProject descriptor consumed by qmlviewer/QtCreator: collects the
   project's QML, JavaScript and image files for the QML runtime. */
Project {
    /* Include .qml, .js, and image files from current directory and subdirectories */
    QmlFiles {
        directory: "."
    }
    JavaScriptFiles {
        directory: "."
    }
    ImageFiles {
        directory: "."
    }
    /* List of plugin directories passed to QML runtime */
    // importPaths: [ "../exampleplugin" ]
}
--- /dev/null
<!DOCTYPE QtCreatorProject>
<!-- Machine-generated QtCreator per-user project settings (run/debug
     configuration for the QML Viewer target). Not meant to be edited by
     hand; the EnvironmentId below is specific to one developer machine. -->
<qtcreator>
 <data>
  <variable>ProjectExplorer.Project.ActiveTarget</variable>
  <value type="int">0</value>
 </data>
 <data>
  <variable>ProjectExplorer.Project.EditorSettings</variable>
  <valuemap type="QVariantMap">
   <value key="EditorConfiguration.Codec" type="QByteArray">Default</value>
  </valuemap>
 </data>
 <data>
  <variable>ProjectExplorer.Project.Target.0</variable>
  <valuemap type="QVariantMap">
   <value key="ProjectExplorer.ProjectConfiguration.DefaultDisplayName" type="QString">QML Viewer</value>
   <value key="ProjectExplorer.ProjectConfiguration.DisplayName" type="QString">QML Viewer</value>
   <value key="ProjectExplorer.ProjectConfiguration.Id" type="QString">QmlProjectManager.QmlTarget</value>
   <value key="ProjectExplorer.Target.ActiveBuildConfiguration" type="int">-1</value>
   <value key="ProjectExplorer.Target.ActiveDeployConfiguration" type="int">-1</value>
   <value key="ProjectExplorer.Target.ActiveRunConfiguration" type="int">0</value>
   <value key="ProjectExplorer.Target.BuildConfigurationCount" type="int">0</value>
   <value key="ProjectExplorer.Target.DeployConfigurationCount" type="int">0</value>
   <valuemap key="ProjectExplorer.Target.RunConfiguration.0" type="QVariantMap">
    <value key="ProjectExplorer.ProjectConfiguration.DefaultDisplayName" type="QString"></value>
    <value key="ProjectExplorer.ProjectConfiguration.DisplayName" type="QString">QML Viewer</value>
    <value key="ProjectExplorer.ProjectConfiguration.Id" type="QString">QmlProjectManager.QmlRunConfiguration</value>
    <value key="QmlProjectManager.QmlRunConfiguration.MainScript" type="QString">CurrentFile</value>
    <value key="QmlProjectManager.QmlRunConfiguration.QDeclarativeViewerArguments" type="QString"></value>
    <value key="QmlProjectManager.QmlRunConfiguration.QtVersion" type="int">4</value>
    <value key="RunConfiguration.QmlDebugServerPort" type="uint">3768</value>
    <value key="RunConfiguration.UseCppDebugger" type="bool">false</value>
    <value key="RunConfiguration.UseQmlDebugger" type="bool">true</value>
   </valuemap>
   <value key="ProjectExplorer.Target.RunConfigurationCount" type="int">1</value>
  </valuemap>
 </data>
 <data>
  <variable>ProjectExplorer.Project.TargetCount</variable>
  <value type="int">1</value>
 </data>
 <data>
  <variable>ProjectExplorer.Project.Updater.EnvironmentId</variable>
  <value type="QString">{6449687d-a4d3-4afc-95ac-89e1027ef47e}</value>
 </data>
 <data>
  <variable>ProjectExplorer.Project.Updater.FileVersion</variable>
  <value type="int">8</value>
 </data>
</qtcreator>
--- /dev/null
+import Qt 4.7
+
// Feed listing for one category: one row per feed with icon, unread count
// and a delete button in edit mode. Data comes from the 'controller'
// context object as XML.
// NOTE(review): references 'feedsItem' (the id this component carries in
// FeedingIt.qml) and 'container' -- it is not self-contained.
Item {
    //anchors.fill: parent;
    width: parent.width;
    property string catid : ""
    property bool inEditMode: true
    x: parent.width; height: parent.height;
    anchors.top: parent.top; anchors.bottom: parent.bottom

    function reload() {
        feeds.xml = catid == "" ? "" : controller.getFeedsXml(catid);
        //feeds.reload()
    }

    //Component.onCompleted: { console.log(x + " /") }

    ListView {
        id: feedList; model: feeds; delegate: feedDelegate; z: 6
        width: parent.width; height: parent.height; /*x: 0;*/
        cacheBuffer: 100;
        flickDeceleration: 1500
    }

    XmlListModel {
        id: feeds

        //source: catid == "" ? "" : "http://localhost:8000/feeds/" + catid //+ "?onlyUnread=" + parent.hideReadArticles
        //xml: catid == "" ? "" : controller.getFeedsXml(catid)
        query: "/xml/feed"

        XmlRole { name: "title"; query: "feedname/string()" }
        XmlRole { name: "feedid"; query: "feedid/string()"; isKey: true }
        XmlRole { name: "unread"; query: "unread/string()"; isKey: true }
        XmlRole { name: "updatedDate"; query: "updatedDate/string()" }
        XmlRole { name: "icon"; query: "icon/string()" }
        XmlRole { name: "updating"; query: "updating/string()"; isKey: true }
        //XmlRole { name: "url"; query: "url/string()"; }
    }

    Component {
        id: feedDelegate

        Item {
            id: wrapper; width: wrapper.ListView.view.width;
            // Fully-read feeds collapse to zero height when the "hide read
            // feeds" setting is on.
            visible: (unread == "0" && feedsItem.hideReadFeeds=="True") ? false : true
            height: (visible) ? 86 : 0

            Item {
                id: moveMe
                Rectangle { color: "black"; opacity: index % 2 ? 0.2 : 0.4; height: 84; width: wrapper.width; y: 1 }
                Rectangle {
                    x: 3; y: 4; width: 77; height: 77; color: "#000000"; smooth: true
                    // Feed icon; a spinner while the feed is updating, the
                    // app icon when the feed has none.
                    Image { width:32; height: 32; anchors.verticalCenter: parent.verticalCenter; anchors.horizontalCenter: parent.horizontalCenter;
                        source: (updating=="True")? "common/images/loading.png" : (icon == "False") ? "common/images/feedingit.png" : icon;
                        NumberAnimation on rotation {
                            from: 0; to: 360; running: (updating=="True"); loops: Animation.Infinite; duration: 900
                        }
                    }
                }

                Column {
                    x: 92; width: wrapper.ListView.view.width - 95; y: 5; spacing: 2
                    Text { text: title; color: "white"; width: parent.width; font.bold: true; elide: Text.ElideRight; style: Text.Raised; styleColor: "black" }
                    Text { text: updatedDate + " / " + qsTr("%1 unread items").arg(unread); color: (unread=="0") ? "white" : "#7b97fd"; width: parent.width; font.bold: false; elide: Text.ElideRight; style: Text.Raised; styleColor: "black" }
                    //Text { text: feedname; width: parent.width; elide: Text.ElideLeft; color: "#cccccc"; style: Text.Raised; styleColor: "black" }
                }
//                Item {
//                    x: wrapper.ListView.view.width - 128; y: 12
//                    height: 58; width: 58;
//                    //anchors.horizontalCenter: parent.horizontalCenter;
//                    Image { source: "common/images/wmEditIcon.png" }
//                    MouseArea {
//                        anchors.fill: parent; onClicked: { container.feedEdit(feedname, feedid, url); }
//                    }
//                    visible: inEditMode
//                }
                // Delete button (only in edit mode).
                Item {
                    x: wrapper.ListView.view.width - 64; y: 12
                    height: 58; width: 58;
                    //anchors.horizontalCenter: parent.horizontalCenter;
                    Image { source: "common/images/delete.png" }
                    MouseArea {
                        // BUG FIX: container.feedDeleted(catid, feedid) takes
                        // two arguments; the original passed only feedid, so
                        // feedid landed in the catid slot and the backend
                        // delete URL was built incorrectly.
                        anchors.fill: parent; onClicked: { container.feedDeleted(catid, feedid); }
                    }
                    visible: inEditMode
                }
            }
            MouseArea {
                anchors.fill: wrapper;
                onClicked: {
                    // NOTE(review): 'model.feed' -- XmlListModel roles are
                    // normally exposed directly; verify this argument
                    // resolves to what controller.feedClicked expects.
                    controller.feedClicked(model.feed)
                    container.feedClicked(feedid, updating=="True")

                }
            }
        }

    }

}
--- /dev/null
import QtQuick 1.1
import com.meego 1.0

// Demo page generated by pyside-assistant: a hidden greeting label that a
// button reveals on first click.
Page {
    id: mainPage
    tools: commonTools

    Label {
        id: greeting
        visible: false
        anchors.centerIn: parent
        text: qsTr("Hello world!")
    }

    Button {
        text: qsTr("Click here!")
        anchors {
            horizontalCenter: parent.horizontalCenter
            top: greeting.bottom
            topMargin: 10
        }
        onClicked: greeting.visible = true
    }
}
--- /dev/null
import Qt 4.7
import QtWebKit 1.0
import "common" as Common

// Test page: a zoomable WebView inside a Flickable, with a slider driving
// the zoom factor.  The slider keeps the viewport centred while zooming.
Rectangle {
    width: 380
    height: 480

    color: "white";

    property string url: "";
    Flickable {
        id: flickable
        height: parent.height;
        width: parent.width;
        // BUG FIX: the original bound contentWidth to the unqualified
        // `scale`, which resolves to the Flickable's own scale (always 1),
        // so the flickable area never tracked the zoom level.  Use
        // webView.scale, mirroring the contentHeight binding below.
        contentWidth: webView.width*webView.scale
        contentHeight: Math.max(parent.height,webView.height*webView.scale)

        WebView {
            id: webView
            url: "http://www.google.com";
            preferredWidth: flickable.width
            preferredHeight: flickable.height
            settings.defaultFontSize: 32
            // Zoom is driven directly by the slider below.
            scale: slider.value;
            onLoadFinished: { console.log(url); }
        }
    }
    Common.Slider {
        id: slider; visible: true
        minimum: 0.2;
        maximum: 2;
        value: 1
        // Previous zoom factor, used to keep the same content point centred
        // while the scale changes.
        property real prevScale: 1
        anchors {
            bottom: parent.bottom; bottomMargin: 65
            left: parent.left; leftMargin: 25
            right: parent.right; rightMargin: 25
        }
        onValueChanged: {
            if (webView.width * value > flickable.width) {
                var xoff = (flickable.width/2 + flickable.contentX) * value / prevScale;
                flickable.contentX = xoff - flickable.width/2;
            }
            if (webView.height * value > flickable.height) {
                var yoff = (flickable.height/2 + flickable.contentY) * value / prevScale;
                flickable.contentY = yoff - flickable.height/2;
            }
            prevScale = value;
        }
        // Force an initial change notification so handle position and
        // content size are consistent at startup.
        Component.onCompleted: { value=0; value=1; }
    }
}
--- /dev/null
import Qt 4.7

// Inline "add category" dialog: a single name field plus OK/Cancel.
// OK calls container.addCategory(name); Cancel just hides the dialog.
Rectangle {
    id: addCat;
    width: 200
    height: 172
    color: "white"
    property alias catName: categoryName.text
    // Swallow clicks so they do not fall through to the view underneath.
    MouseArea { anchors.fill: parent; onClicked: {} }
    Column {
        Row {
            width: addCat.width
            height: 86;
            Text { anchors.verticalCenter: parent.verticalCenter; text: qsTr("Category name:") }
            LineInput{
                id: categoryName
                // BUG FIX: anchors.centerIn is not allowed on children of a
                // Row positioner (Row manages x); anchor vertically only.
                anchors.verticalCenter: parent.verticalCenter
                width: 140
                focus: true
            }
        }
        Row {
            width: addCat.width
            Button {
                id: ok
                text: qsTr("OK")
                anchors.margins: 5; y: 3; width: 80; height: 60
                onClicked: container.addCategory(categoryName.text)
            }

            Button {
                id: cancel
                text: qsTr("Cancel")
                anchors.margins: 5; y: 3; width: 80; height: 60
                onClicked: addCat.visible=false;
            }
        }
    }

}
--- /dev/null
import Qt 4.7

// Inline "add feed" dialog: name and URL fields plus OK/Cancel.
// OK calls container.addFeed(catid, name, url); Cancel hides the dialog.
Rectangle {
    id: addFeed;
    width: 500
    height: 172
    color: "white"
    property alias feedName: feedName.text
    property string catid
    property string feedUrl: feedURL.text
    //property boolean feedEdit: false;

    // Swallow clicks so they do not fall through to the view underneath.
    MouseArea { anchors.fill: parent; onClicked: {} }
    Column {
        Row {
            width: addFeed.width
            height: 86;
            Text { anchors.verticalCenter: parent.verticalCenter; text: qsTr("Feed name:") }
            LineInput{
                id: feedName
                // BUG FIX: anchors.centerIn is not allowed on children of a
                // Row positioner (Row manages x); anchor vertically only.
                anchors.verticalCenter: parent.verticalCenter
                width: 140
                focus: true
            }
        }
        Row {
            width: addFeed.width
            height: 86;
            Text { anchors.verticalCenter: parent.verticalCenter; text: qsTr("Feed URL:") }
            LineInput{
                id: feedURL
                // BUG FIX: same Row/centerIn conflict as above.
                anchors.verticalCenter: parent.verticalCenter
                width: 140
                focus: true
                text: "http://"
            }
        }
        Row {
            width: addFeed.width
            Button {
                id: ok
                text: qsTr("OK")
                anchors.margins: 5; y: 3; width: 80; height: 60
                onClicked: container.addFeed(catid, feedName.text, feedURL.text)
            }
            Button {
                id: cancel
                text: qsTr("Cancel")
                anchors.margins: 5; y: 3; width: 80; height: 60
                onClicked: addFeed.visible=false;
            }
        }
    }
}
--- /dev/null
import Qt 4.7

// Generic toolbar button: shows a centered image when imageSource is set,
// otherwise a text label, with a second border image faded in while pressed.
Item {
    id: container

    signal clicked

    property string text
    property string imageSource: ""
    property int imageRotation: 0

    property alias iconRotation: iconImage.rotation

    BorderImage {
        id: normalBackground
        width: container.width; height: container.height
        source: "images/toolbutton.sci"
    }

    BorderImage {
        id: pressedBackground
        width: container.width; height: container.height
        source: "images/toolbutton.sci"
        // Revealed by the "Pressed" state below.
        opacity: 0
    }

    Image {
        id: iconImage
        smooth: true
        anchors.centerIn: normalBackground
        source: container.imageSource
        rotation: container.imageRotation
    }

    MouseArea {
        id: mouseRegion
        anchors.fill: normalBackground
        onClicked: container.clicked()
    }

    Text {
        // Label is only shown when there is no icon.
        visible: (container.imageSource=="")
        anchors.centerIn: normalBackground
        color: "white"
        font.bold: true
        style: Text.Raised; styleColor: "black"
        text: container.text
    }

    states: [
        State {
            name: "Pressed"
            when: mouseRegion.pressed == true
            PropertyChanges { target: pressedBackground; opacity: 1 }
        }
    ]
}
--- /dev/null
import Qt 4.7

// Modal confirmation dialog filling its parent; the caller connects to
// okClicked / cancelClicked and sets `text` to override the question.
Rectangle {
    id: confirmationMessage

    signal okClicked
    signal cancelClicked

    property alias text: question.text

    anchors.fill: parent
    z: 10;
    width: 160;
    height: 160;
    radius: 10;
    color: "white"
    border.color: "black";
    border.width : 4;

    Text {
        id: question
        text: qsTr("Are you sure?")
        width: parent.width; height: 80
        anchors.top: parent.top
        anchors.margins: 10;
        horizontalAlignment: Text.AlignHCenter
        verticalAlignment: Text.AlignVCenter
    }

    Button {
        id: ok
        text: qsTr("OK")
        y: 3; height: 60
        width: parent.width/2 - 10;
        anchors { left: parent.left; top: question.bottom; margins: 5 }
        onClicked: confirmationMessage.okClicked()
    }

    Button {
        id: cancel
        text: qsTr("Cancel")
        y: 3; height: 60
        width: parent.width/2 - 10;
        anchors { left: ok.right; right: parent.right; top: question.bottom; margins: 5 }
        onClicked: confirmationMessage.cancelClicked()
    }
}
--- /dev/null
import Qt 4.7

// Single-line text entry with a themed border image.
// `text` and `maximumLength` are forwarded from the inner TextInput.
FocusScope {
    width: 180; height: 28

    property alias text: input.text
    property alias maximumLength: input.maximumLength

    BorderImage {
        anchors.fill: parent
        source: "images/lineedit.sci"
    }

    TextInput {
        id: input
        focus: true
        anchors.centerIn: parent
        width: parent.width-16
        font.pixelSize: 16; font.bold: true
        color: "#151515"; selectionColor: "green"
    }
}
--- /dev/null
import Qt 4.7

// Category list screen.  The model (`categories`) and delegate
// (`categoryDelegate`) are supplied by the surrounding context.
Rectangle {
    width: 640
    height: 480

    ListView {
        id: categoryList
        z: 6;
        width: parent.width
        height: parent.height;
        cacheBuffer: 100;
        model: categories
        delegate: categoryDelegate
    }
}
--- /dev/null
import Qt 4.7

// Slide-down settings panel.  Reads and writes the application-level
// `config` and `container` objects (declared outside this file — confirm
// against the caller); parked above the view (y: -parent.height) until shown.
Item {
//    anchors.fill: parent;
    width: 300; //height: 0;
    //anchors.top: parent.top; anchors.bottom: parent.bottom
    y: -parent.height

    // Pull the current settings from the backend controller into `config`.
    function getConfig() {
        config.hideReadFeeds = controller.getConfig("hideReadFeeds");
        config.hideReadArticles = controller.getConfig("hideReadArticles");

    }

    // NOTE(review): settings are carried as the strings "True"/"False",
    // apparently matching the Python backend's str(bool) form — confirm.
    Switch {
        id: hideReadFeedsSwitch;
        text: qsTr("Hide Read Feeds");
        value: config.hideReadFeeds
        onClicked: config.hideReadFeeds = (config.hideReadFeeds == "False") ? "True" : "False"
    }

    Switch {
        id: hideReadArticlesSwitch;
        text: qsTr("Hide Read Articles");
        value: config.hideReadArticles
        onClicked: config.hideReadArticles = (config.hideReadArticles == "False") ? "True" : "False"
        anchors.top: hideReadFeedsSwitch.bottom
    }

    Switch {
        id: lockRotation;
        text: qsTr("Lock Rotation");
        value: container.lockRotation ? "True" : "False"
        // Locking pins the currently active orientation; unlocking returns
        // the window to automatic rotation.
        onClicked: { container.lockRotation=!container.lockRotation;
            container.selectedOrientation = (container.lockRotation) ? container.activeOrientation : Orientation.UnknownOrientation }
        anchors.top: hideReadArticlesSwitch.bottom
    }

    Switch {
        id: editMode;
        text: qsTr("Enter Edit Mode");
        value: container.editMode ? "True" : "False"
        onClicked: { container.editMode=!container.editMode; }
        anchors.top: lockRotation.bottom
    }

    // "Close" bar at the bottom of the panel; hides the whole panel.
    Rectangle {
        id: closeButton
        height: 50;
        gradient: Gradient {
            GradientStop {
                position: 0.00;
                color: "#343434";
            }
            GradientStop {
                position: 1.00;
                color: "#ffffff";
            }
        }
        radius: 10;
        width: parent.width
        anchors.top: editMode.bottom

        MouseArea {
            id: mouseRegion
            anchors.fill: closeButton
            onClicked: { config.isShown = false }
        }
    }

//    ListView {
//        id: configList; model: configs; delegate: configDelegate; z: 6;
//        cacheBuffer: 100; width: parent.width; height: parent.height;
//    }

//    XmlListModel {

//        id: configs

//        //source: "http://api.flickr.com/services/feeds/photos_public.gne?"+(tags ? "tags="+tags+"&" : "")+"format=rss2"
//        //source: "/home/ymarcoz/feedlist.xml"
//        source: "http://localhost:8000/config"
//        query: "/xml/config"
//        //namespaceDeclarations: "declare namespace media=\"http://search.yahoo.com/mrss/\";"

//        XmlRole { name: "hideReadFeeds"; query: "hideReadFeeds/string()" }
//        XmlRole { name: "hideReadArticles"; query: "hideReadArticles/string()" }
//        //XmlRole { name: "catid"; query: "catid/string()"; isKey: true }

//    }

//    Component {
//        id: configDelegate

//        Item {

//            id: wrapper; width: wrapper.ListView.view.width; height: 86
//            Item {
//                id: moveMe
//                height: parent.height
//                Rectangle { color: "black"; opacity: index % 2 ? 0.2 : 0.4; height: 84; width: wrapper.width; y: 1 }
//                Rectangle {
//                    x: 6; y: 4; width: 77; height: parent.height - 9; color: "white"; smooth: true

//                }
//                Column {
//                    x: 92; width: wrapper.ListView.view.width - 95; y: 15; spacing: 2
//                    Text { text: title; color: "white"; width: parent.width; font.bold: true; elide: Text.ElideRight; style: Text.Raised; styleColor: "black" }
//                    //Text { text: feedname; width: parent.width; elide: Text.ElideLeft; color: "#cccccc"; style: Text.Raised; styleColor: "black" }
//                }
//            }
//            MouseArea { anchors.fill: wrapper; onClicked: { container.categoryClicked(catid); } }
//        }
//    }

    // Populate the switches from the backend as soon as the panel exists.
    Component.onCompleted: getConfig();
}
--- /dev/null
import Qt 4.7

// Horizontal slider (used by the WebView zoom control).  `value` is
// read/write: dragging the handle updates it, and writing it moves the
// handle via the onValueChanged handler.
Item {
    id: slider; width: 340; height: 48

    // value is read/write.
    property real value
    // Keep the handle in sync when value is set programmatically.
    onValueChanged: { handle.x = 2 + (value - minimum) * slider.xMax / (maximum - minimum); }
    property real maximum: 1
    // NOTE(review): the defaults minimum == maximum == 1 make the binding
    // above divide by zero until a caller overrides one of them; every user
    // in this codebase does override them — confirm before relying on
    // the defaults elsewhere.
    property real minimum: 1
    // Rightmost x available to the handle (2px margin at each side).
    property int xMax: slider.width - handle.width - 4

    // Groove background.
    Rectangle {
        anchors.fill: parent
        border.color: "white"; border.width: 0; radius: 8
        gradient: Gradient {
            GradientStop { position: 0.0; color: "#66343434" }
            GradientStop { position: 1.0; color: "#66000000" }
        }
    }

    Rectangle {
        id: handle; smooth: true
        x: slider.width / 2 - handle.width / 2; y: 2; width: 30; height: slider.height-4; radius: 6
        gradient: Gradient {
            GradientStop { position: 0.0; color: "lightgray" }
            GradientStop { position: 1.0; color: "gray" }
        }

        MouseArea {
            // Dragging the handle writes the position back into `value`.
            anchors.fill: parent; drag.target: parent
            drag.axis: "XAxis"; drag.minimumX: 2; drag.maximumX: slider.xMax+2
            onPositionChanged: { value = (maximum - minimum) * (handle.x-2) / slider.xMax + minimum; }
        }
    }
}
--- /dev/null
import Qt 4.7

// Labelled on/off switch row.  `value` carries the string "True"/"False";
// clicking only emits `clicked` — the caller is responsible for toggling.
Item {
    id: container

    signal clicked

    property string text
    property string value

    width: parent.width;
    height: 86;

    // Row background.
    Rectangle {
        id: back
        width: parent.width;
        height: 82;
        radius: 10;
        color: "#343434";
        border.color: "black";
        border.width : 4;
    }

    // Small ON/OFF indicator at the right edge.
    Rectangle {
        id: valueSwitch
        width: 40;
        height: 40;
        radius: 10;
        border.color: "black";
        border.width : 4;
        color: (value=="False") ? "red" : "green";
        anchors.right: back.right;
        anchors.verticalCenter: back.verticalCenter
        anchors.margins: 10;

        Text {
            anchors.centerIn: valueSwitch; font.bold: true
            color: "white"
            style: Text.Raised; styleColor: "black"
            text: (container.value == "False") ? "OFF" : "ON";
        }
    }

    MouseArea {
        id: mouseRegion
        anchors.fill: back
        onClicked: container.clicked()
    }

    // Left-aligned label.
    Text {
        font.bold: true
        color: "white"
        anchors.left: parent.left;
        anchors.margins: 10
        anchors.verticalCenter: back.verticalCenter
        style: Text.Raised; styleColor: "black"
        text: container.text;
    }
}
--- /dev/null
import Qt 4.7

// Application toolbar: a fixed row of Buttons whose labels and visibility
// are exposed as aliases so each screen can choose which actions appear.
// Every press is re-emitted as a toolbar-level signal for the page to handle.
Item {
    id: toolbar

    // Button captions, forwarded to the individual buttons.
    property alias menuLabel: menuButton.text
    property alias backLabel: backButton.text
    property alias prevLabel: prevButton.text
    property alias nextLabel: nextButton.text
    property alias markAllLabel: markAllButton.text
    property alias zoomLabel: zoomButton.text
    property alias taskSwitcherLabel: taskSwitcherButton.text

    // Per-screen visibility switches.
    property alias nextVisible: nextButton.visible
    property alias prevVisible: prevButton.visible
    property alias markAllVisible: markAllButton.visible
    property alias zoomVisible: zoomButton.visible
    property alias quitVisible: quitButton.visible
    property alias addVisible: addButton.visible
    property alias updateVisible: updateFeedButton.visible

    // Drives the update button's spinner state below.
    property bool feedUpdating: false

    signal menuClicked
    signal backClicked
    signal prevClicked
    signal nextClicked
    signal markAllClicked
    signal zoomClicked
    signal taskSwitcherClicked
    signal addClicked
    signal updateClicked
    //signal rotateClicked

    //BorderImage { source: "images/titlebar.sci"; width: parent.width; height: parent.height + 14; y: -7 }
    Rectangle {
        anchors.fill: parent; color: "#343434";
        border.color: "black"
        gradient: Gradient {
            GradientStop {
                position: 0.00;
                color: "#343434";
            }
            GradientStop {
                position: 1.00;
                color: "#ffffff";
            }
        }

        // Buttons are laid out left-to-right by the Row positioner; the
        // commented-out anchors are remnants of the pre-Row layout.
        Row {
            anchors.fill: parent
            Button {
                id: taskSwitcherButton
                /*anchors.left: parent.left;*/ anchors.leftMargin: 5; y: 3; width: 116; height: 60
                onClicked: toolbar.taskSwitcherClicked()
                imageSource: "images/wmTaskLauncherIcon.png"
                visible: false
            }

            Button {
                id: menuButton
                /*anchors.left: taskSwitcherButton.right;*/ anchors.leftMargin: 5; y: 3; width: 60; height: 60
                onClicked: toolbar.menuClicked()
                imageSource: "images/wmEditIcon.png"
            }

            Button {
                id: addButton
                visible: true; /*anchors.left: menuButton.right;*/
                anchors.rightMargin: 5; y: 3; width: 60; height: 60
                onClicked: toolbar.addClicked()
                imageSource: "images/plus.png"

            }

            Button {
                id: updateFeedButton
                visible: false; /*anchors.left: menuButton.right;*/
                anchors.rightMargin: 5; y: 3; width: 60; height: 60
                onClicked: toolbar.updateClicked()
                //imageSource: (!feedUpdating) ? "images/rotate.png" : "images/loading.png"
                // Spin the icon for as long as a feed update is running.
                NumberAnimation on iconRotation {
                    from: 0; to: 360; running: (visible == true) && (feedUpdating); loops: Animation.Infinite; duration: 900
                }
                state: "update"
                // Two states: spinning "loading" glyph while updating,
                // static "update" arrow (rotation reset to 0) otherwise.
                states : [State {name: "loading"; when: (feedUpdating);
                        PropertyChanges {target: updateFeedButton; imageSource: "images/loading2.png" }
                    }, State { name: "update"; when: (!feedUpdating);
                        PropertyChanges {target: updateFeedButton; iconRotation: 0}
                        PropertyChanges {target: updateFeedButton; imageSource: "images/rotate.png"}
                    }
                ]
            }

            Button {
                id: markAllButton
                visible: false
                /*anchors.left: updateFeedButton.right;*/ anchors.rightMargin: 5; y: 3; width: 60; height: 60
                onClicked: toolbar.markAllClicked()
                imageSource: "images/checkmark.png"
            }

            Button {
                id: prevButton
                visible: false
                /*anchors.left: menuButton.right;*/ anchors.rightMargin: 5; y: 3; width: 120; height: 60
                onClicked: toolbar.prevClicked()
                // Shift-arrow asset rotated to point left.
                imageSource: "images/InputMethodShiftButtonNormal.png"
                imageRotation: -90;
            }

            Button {
                id: zoomButton
                visible: false
                /*anchors.right: backButton.left; */anchors.rightMargin: 5; y: 3; width: 80; height: 60
                onClicked: toolbar.zoomClicked()
                imageSource: "images/Zoom-In-icon.png"
            }

            Button {
                id: nextButton
                visible: false
                /*anchors.right: zoomButton.left;*/ anchors.rightMargin: 5; y: 3; width: 120; height: 60
                onClicked: toolbar.nextClicked()
                // Same asset as prevButton, rotated to point right.
                imageSource: "images/InputMethodShiftButtonNormal.png"
                imageRotation: 90
            }

            Button {
                id: backButton
                anchors.rightMargin: 5; y: 3; width: 116; height: 60
                anchors.right: parent.right
                onClicked: toolbar.backClicked()
                imageSource: "images/wmBackIcon.png"
                // Back and Quit share the same slot; exactly one is shown.
                visible: !quitButton.visible
            }

            Button {
                id: quitButton
                visible: false
                anchors.rightMargin: 5; y: 3; width: 116; height: 60
                anchors.right: parent.right
                // NOTE(review): quit re-uses backClicked — the page decides
                // whether "back" at the root level means quit.
                onClicked: toolbar.backClicked()
                imageSource: "images/wmCloseIcon.png"
            }
        }
    }
}
--- /dev/null
+[Dolphin]
+ShowPreview=true
+Timestamp=2010,11,7,0,33,30
--- /dev/null
+border.left: 10
+border.top: 10
+border.bottom: 10
+border.right: 10
+source: lineedit.png
--- /dev/null
+border.left: 15
+border.top: 4
+border.bottom: 4
+border.right: 15
+source: toolbutton.png
--- /dev/null
+<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE TS>
+<TS version="2.0" language="en_CA">
+<context>
+ <name>FeedingItUI2</name>
+ <message>
+ <location filename="../FeedingItUI2.qml" line="53"/>
+ <source>Back</source>
+ <translation type="unfinished">Back 2</translation>
+ </message>
+</context>
+</TS>
--- /dev/null
+<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE TS>
+<TS version="2.0" language="en_CA">
+<context>
+ <name>FeedingItUI2</name>
+ <message>
+ <location filename="../FeedingItUI2.qml" line="75"/>
+ <source>Back</source>
+ <translation>Back 2</translation>
+ </message>
+ <message>
+ <location filename="../FeedingItUI2.qml" line="75"/>
+ <source>Config</source>
+ <translation>Config</translation>
+ </message>
+</context>
+<context>
+ <name>Feeds</name>
+ <message>
+ <source>unreadItems</source>
+ <translation type="obsolete"> %1 unread items</translation>
+ </message>
+ <message>
+ <location filename="../Feeds.qml" line="55"/>
+ <source>%1 unread items</source>
+ <translation>%1 unread items</translation>
+ </message>
+</context>
+</TS>
--- /dev/null
import QtQuick 1.0
import com.nokia.meego 1.0


// Harmattan entry point: instantiate the real UI (FeedingIt.qml) inside
// the initial page once the page exists.
PageStackWindow {
    initialPage: mainPage

    Page {
        id: mainPage
        Component.onCompleted: {
            var main = Qt.createComponent("FeedingIt.qml")
            main.createObject(mainPage)
        }
    }
}
--- /dev/null
+from distutils.core import setup
+import os, sys, glob
+
def read(fname):
    """Return the contents of *fname*, resolved relative to this file's directory.

    Used below to pull the package long_description from feedingit.longdesc.
    The original left the file handle open; a context manager closes it
    deterministically.
    """
    with open(os.path.join(os.path.dirname(__file__), fname)) as f:
        return f.read()
+
+setup(name="feedingit",
+ scripts=['feedingit'],
+ version='0.1.0',
+ maintainer="Yves",
+ maintainer_email="yves@marcoz.org",
+ description="FeedingIt - RSS Reader",
+ long_description=read('feedingit.longdesc'),
+ data_files=[('share/applications',['feedingit.desktop']),
+ ('share/icons/hicolor/64x64/apps', ['feedingit.png']),
+ ('share/feedingit/qml', glob.glob('qml/*.qml')),
+ ('share/feedingit/qml/common', glob.glob('qml/common/*.qml')),
+ ('share/feedingit/qml/common/images', glob.glob('qml/common/images/*')),
+ ('share/feedingit/qml/i18n', glob.glob('qml/i18n/*')),
+ ('share/dbus-1/services', ['feedingit_status.service']),
+ ('share/feedingit', glob.glob('pysrc/*.py')) ],)
--- /dev/null
+[DEFAULT]
+XS-Python-Version: 2.6
+Package: feedingit
+Section: user/development
+Depends: python-pyside.qtgui, python-pyside.qtopengl, python-pyside.qtdeclarative, python-dbus, python-gconf
--- /dev/null
from threading import Thread
# BUG FIX: the original `import Thread` raises ImportError — Thread lives in
# the stdlib `threading` module.


class Download(Thread):
    """Background worker that updates a single feed.

    Parameters
    ----------
    listing : object with an ``updateFeed(key, expiry, ...)`` method
    key     : identifier of the feed to update
    config  : object providing ``getProxy()``, ``getExpiry()`` and
              ``getImageCache()``
    """

    def __init__(self, listing, key, config):
        Thread.__init__(self)
        self.listing = listing
        self.key = key
        self.config = config

    def run(self):
        (use_proxy, proxy) = self.config.getProxy()
        # NOTE(review): get_lock is neither defined nor imported in this
        # file — presumably supplied by a sibling module; confirm the import.
        key_lock = get_lock(self.key)
        if key_lock is not None:
            if use_proxy:
                self.listing.updateFeed(self.key, self.config.getExpiry(), proxy=proxy, imageCache=self.config.getImageCache())
            else:
                self.listing.updateFeed(self.key, self.config.getExpiry(), imageCache=self.config.getImageCache())
        # Dropping the last reference releases the lock object.
        del key_lock
\ No newline at end of file