vcs.maemo.org Git - gc-dialer/blob - src/backends/browser_emu.py

   1 """
   2 @author:          Laszlo Nagy
   3 @copyright:   (c) 2005 by Szoftver Messias Bt.
   4 @licence:        BSD style
   5
   6 Objects of the MozillaEmulator class can emulate a browser that is capable of:
   7
   8         - cookie management
   9         - configurable user agent string
  10         - GET and POST
  11         - multipart POST (send files)
  12         - receive content into file
  13
  14 I have seen many requests on the python mailing list about how to emulate a browser. I have been using this class for years without any problems. This is how you can use it:
  15
  16         1. Use firefox
  17         2. Install and open the livehttpheaders plugin
  18         3. Use the website manually with firefox
  19         4. Check the GET and POST requests in the livehttpheaders capture window
  20         5. Create an instance of the above class and send the same GET and POST requests to the server.
  21
  22 Optional steps:
  23
  24         - You can change the user agent string through the USER_AGENT class attribute (it is sent by the _build_opener method)
  25         - The "encode_multipart_formdata" function can be used alone to create POST data from a list of field values and files
  26 """
  27
  28 import urllib2
  29 import cookielib
  30 import logging
  31
  32 import socket
  33
  34
# Logger used by everything in this module; it is named after the module.
_moduleLogger = logging.getLogger(__name__)
# Process-wide default: sockets created without an explicit timeout give up
# after 45 seconds. This applies to every new socket in the process, not only
# to the ones opened through this module.
socket.setdefaulttimeout(45)
  37
  38
  39 def add_proxy(protocol, url, port):
  40         proxyInfo = "%s:%s" % (url, port)
  41         proxy = urllib2.ProxyHandler(
  42                 {protocol: proxyInfo}
  43         )
  44         opener = urllib2.build_opener(proxy)
  45         urllib2.install_opener(opener)
  46
  47
  48 class MozillaEmulator(object):
  49
  50         USER_AGENT = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.9.1.4) Gecko/20091016 Firefox/3.5.4 (.NET CLR 3.5.30729)'
  51
  52         def __init__(self, trycount = 1):
  53                 """Create a new MozillaEmulator object.
  54
  55                 @param trycount: The download() method will retry the operation if it
  56                 fails. You can specify -1 for infinite retrying.  A value of 0 means no
  57                 retrying. A value of 1 means one retry. etc."""
  58                 self.debug = False
  59                 self.trycount = trycount
  60                 self._cookies = cookielib.LWPCookieJar()
  61                 self._loadedFromCookies = False
  62
  63         def load_cookies(self, path):
  64                 assert not self._loadedFromCookies, "Load cookies only once"
  65                 if path is None:
  66                         return
  67
  68                 self._cookies.filename = path
  69                 try:
  70                         self._cookies.load()
  71                 except cookielib.LoadError:
  72                         _moduleLogger.exception("Bad cookie file")
  73                 except IOError:
  74                         _moduleLogger.exception("No cookie file")
  75                 except Exception, e:
  76                         _moduleLogger.exception("Unknown error with cookies")
  77                 self._loadedFromCookies = True
  78
  79                 return self._loadedFromCookies
  80
  81         def save_cookies(self):
  82                 if self._loadedFromCookies:
  83                         self._cookies.save()
  84
  85         def clear_cookies(self):
  86                 if self._loadedFromCookies:
  87                         self._cookies.clear()
  88
  89         def download(self, url,
  90                         postdata = None, extraheaders = None, forbidRedirect = False,
  91                         trycount = None, only_head = False,
  92                 ):
  93                 """Download an URL with GET or POST methods.
  94
  95                 @param postdata: It can be a string that will be POST-ed to the URL.
  96                         When None is given, the method will be GET instead.
  97                 @param extraheaders: You can add/modify HTTP headers with a dict here.
  98                 @param forbidRedirect: Set this flag if you do not want to handle
  99                         HTTP 301 and 302 redirects.
 100                 @param trycount: Specify the maximum number of retries here.
 101                         0 means no retry on error. Using -1 means infinite retring.
 102                         None means the default value (that is self.trycount).
 103                 @param only_head: Create the openerdirector and return it. In other
 104                         words, this will not retrieve any content except HTTP headers.
 105
 106                 @return: The raw HTML page data
 107                 """
 108                 _moduleLogger.debug("Performing download of %s" % url)
 109
 110                 if extraheaders is None:
 111                         extraheaders = {}
 112                 if trycount is None:
 113                         trycount = self.trycount
 114                 cnt = 0
 115
 116                 while True:
 117                         try:
 118                                 req, u = self._build_opener(url, postdata, extraheaders, forbidRedirect)
 119                                 openerdirector = u.open(req)
 120                                 if self.debug:
 121                                         _moduleLogger.info("%r - %r" % (req.get_method(), url))
 122                                         _moduleLogger.info("%r - %r" % (openerdirector.code, openerdirector.msg))
 123                                         _moduleLogger.info("%r" % (openerdirector.headers))
 124                                 self._cookies.extract_cookies(openerdirector, req)
 125                                 if only_head:
 126                                         return openerdirector
 127
 128                                 return self._read(openerdirector, trycount)
 129                         except urllib2.URLError, e:
 130                                 _moduleLogger.debug("%s: %s" % (e, url))
 131                                 cnt += 1
 132                                 if (-1 < trycount) and (trycount < cnt):
 133                                         raise
 134
 135                         # Retry :-)
 136                         _moduleLogger.debug("MozillaEmulator: urllib2.URLError, retrying %d" % cnt)
 137
 138         def _build_opener(self, url, postdata = None, extraheaders = None, forbidRedirect = False):
 139                 if extraheaders is None:
 140                         extraheaders = {}
 141
 142                 txheaders = {
 143                         'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png',
 144                         'Accept-Language': 'en,en-us;q=0.5',
 145                         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
 146                         'User-Agent': self.USER_AGENT,
 147                 }
 148                 for key, value in extraheaders.iteritems():
 149                         txheaders[key] = value
 150                 req = urllib2.Request(url, postdata, txheaders)
 151                 self._cookies.add_cookie_header(req)
 152                 if forbidRedirect:
 153                         redirector = HTTPNoRedirector()
 154                         #_moduleLogger.info("Redirection disabled")
 155                 else:
 156                         redirector = urllib2.HTTPRedirectHandler()
 157                         #_moduleLogger.info("Redirection enabled")
 158
 159                 http_handler = urllib2.HTTPHandler(debuglevel=self.debug)
 160                 https_handler = urllib2.HTTPSHandler(debuglevel=self.debug)
 161
 162                 u = urllib2.build_opener(
 163                         http_handler,
 164                         https_handler,
 165                         urllib2.HTTPCookieProcessor(self._cookies),
 166                         redirector
 167                 )
 168                 if not postdata is None:
 169                         req.add_data(postdata)
 170                 return (req, u)
 171
 172         def _read(self, openerdirector, trycount):
 173                 chunks = []
 174
 175                 chunk = openerdirector.read()
 176                 chunks.append(chunk)
 177                 #while chunk and cnt < trycount:
 178                 #       time.sleep(1)
 179                 #       cnt += 1
 180                 #       chunk = openerdirector.read()
 181                 #       chunks.append(chunk)
 182
 183                 data = "".join(chunks)
 184
 185                 if "Content-Length" in openerdirector.info():
 186                         assert len(data) == int(openerdirector.info()["Content-Length"]), "The packet header promised %s of data but only was able to read %s of data" % (
 187                                 openerdirector.info()["Content-Length"],
 188                                 len(data),
 189                         )
 190
 191                 return data
 192
 193
 194 class HTTPNoRedirector(urllib2.HTTPRedirectHandler):
 195         """This is a custom http redirect handler that FORBIDS redirection."""
 196
 197         def http_error_302(self, req, fp, code, msg, headers):
 198                 e = urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
 199                 if e.code in (301, 302):
 200                         if 'location' in headers:
 201                                 newurl = headers.getheaders('location')[0]
 202                         elif 'uri' in headers:
 203                                 newurl = headers.getheaders('uri')[0]
 204                         e.newurl = newurl
 205                 _moduleLogger.info("New url: %s" % e.newurl)
 206                 raise e