3 @copyright: (c) 2005 by Szoftver Messias Bt.
6 Objects of the MozillaEmulator class can emulate a browser that is capable of:
9 - configurable user agent string
11 - multipart POST (send files)
12 - receive content into file
14 I have seen many requests on the Python mailing list about how to emulate a browser. I have been using this class for years without any problems. This is how you can use it:
17 2. Install and open the livehttpheaders plugin
18 3. Use the website manually with firefox
19 4. Check the GET and POST requests in the livehttpheaders capture window
20 5. Create an instance of the MozillaEmulator class and send the same GET and POST requests to the server.
24 - You can change the user agent string in the build_opener method
25 - The "encode_multipart_formdata" function can be used alone to create POST data from a list of field values and files
# Module-level side effect: every socket created after this point without an
# explicit timeout gets a 10-second default timeout (applies process-wide).
35 socket.setdefaulttimeout(10)
# Browser emulator built on urllib2. Each instance owns one LWP cookie jar;
# build_opener() attaches stored cookies to outgoing requests and download()
# stores the cookies found in responses.
# NOTE(review): relies on Python 2-only APIs (urllib2, cookielib,
# dict.iteritems, print statements).
38 class MozillaEmulator(object):
40 def __init__(self, trycount = 1):
41 """Create a new MozillaEmulator object.
43 @param trycount: The download() method will retry the operation if it fails. You can specify -1 for infinite retrying.
44 A value of 0 means no retrying. A value of 1 means one retry. etc."""
# Cookie jar shared by every request issued through this instance.
45 self.cookies = cookielib.LWPCookieJar()
# NOTE(review): build_opener() reads self.debug, but no assignment to it is
# visible in this excerpt -- confirm it is initialised in __init__.
# Default retry budget; download() falls back to it.
47 self.trycount = trycount
# Build the urllib2 Request and the opener that download() uses.
# download() unpacks the result as `req, u`; the return statement itself is
# not visible in this excerpt.
49 def build_opener(self, url, postdata = None, extraheaders = None, forbid_redirect = False):
50 if extraheaders is None:
54 'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png',
55 'Accept-Language': 'en,en-us;q=0.5',
56 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
# Caller-supplied headers are copied into txheaders, overriding any entry
# with the same key (presumably the default header dict started above).
58 for key, value in extraheaders.iteritems():
59 txheaders[key] = value
60 req = urllib2.Request(url, postdata, txheaders)
# Attach cookies already stored in the jar to the outgoing request.
61 self.cookies.add_cookie_header(req)
# NOTE(review): the condition selecting between the two redirect handlers
# is not visible in this excerpt -- presumably forbid_redirect.
63 redirector = HTTPNoRedirector()
65 redirector = urllib2.HTTPRedirectHandler()
# self.debug is forwarded as the debuglevel of both protocol handlers.
67 http_handler = urllib2.HTTPHandler(debuglevel=self.debug)
68 https_handler = urllib2.HTTPSHandler(debuglevel=self.debug)
# Assemble the opener; the visible arguments include a cookie processor
# bound to this instance's jar.
70 u = urllib2.build_opener(
73 urllib2.HTTPCookieProcessor(self.cookies),
78 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.8) Gecko/20050511 Firefox/1.0.4'
# NOTE(review): postdata was already passed to urllib2.Request above, so this
# add_data() call looks redundant -- confirm. The idiomatic spelling of the
# test is `postdata is not None`.
80 if not postdata is None:
81 req.add_data(postdata)
# Fetch `url` and return the response body via _read(), or the opened
# response object itself on the early-return path (only_head, per the
# docstring). urllib2.URLError leads into the retry accounting below.
84 def download(self, url,
85 postdata = None, extraheaders = None, forbid_redirect = False,
86 trycount = None, only_head = False,
88 """Download an URL with GET or POST methods.
90 @param postdata: It can be a string that will be POST-ed to the URL.
91 When None is given, the method will be GET instead.
92 @param extraheaders: You can add/modify HTTP headers with a dict here.
93 @param forbid_redirect: Set this flag if you do not want to handle
94 HTTP 301 and 302 redirects.
95 @param trycount: Specify the maximum number of retries here.
96 0 means no retry on error. Using -1 means infinite retring.
97 None means the default value (that is self.trycount).
98 @param only_head: Create the openerdirector and return it. In other
99 words, this will not retrieve any content except HTTP headers.
101 @return: The raw HTML page data
103 logging.warning("Performing download of %s" % url)
# NOTE(review): the line above logs a routine progress message at WARNING
# level -- confirm that is intended.
105 if extraheaders is None:
# Fall back to the instance-wide retry budget (presumably when trycount is
# None, as the docstring states; the guard is not visible here).
108 trycount = self.trycount
# Build the request/opener pair and issue the request.
113 req, u = self.build_opener(url, postdata, extraheaders, forbid_redirect)
114 openerdirector = u.open(req)
# Diagnostic output to stdout (Python 2 print statements).
# NOTE(review): any guard around these prints is not visible in this excerpt.
116 print req.get_method(), url
117 print openerdirector.code, openerdirector.msg
118 print openerdirector.headers
# Store cookies set by the response so later requests send them back.
119 self.cookies.extract_cookies(openerdirector, req)
# Early return of the opened response without reading the body
# (presumably the only_head path).
121 return openerdirector
# Normal path: read and return the whole body.
123 return self._read(openerdirector, trycount)
124 except urllib2.URLError:
# True only for a non-negative trycount that cnt has exceeded; with
# trycount == -1 the test is never true. The action taken is not visible
# here (presumably the error is re-raised).
126 if (-1 < trycount) and (trycount < cnt):
# NOTE(review): "retryting" in the message below is a typo; it is runtime
# output, so it is left unchanged here.
131 print "MozillaEmulator: urllib2.URLError, retryting ", cnt
# Read the response body from `openerdirector`. In the visible lines,
# trycount is referenced only by the commented-out loop.
133 def _read(self, openerdirector, trycount):
# Single read() call with no size argument.
136 chunk = openerdirector.read()
# Disabled chunked-read loop (commented out).
138 #while chunk and cnt < trycount:
141 # chunk = openerdirector.read()
142 # chunks.append(chunk)
# `chunks` is presumably a list seeded with the chunk read above; its
# creation is not visible in this excerpt.
144 data = "".join(chunks)
# Sanity check against the Content-Length header, when the server sent one.
# NOTE(review): implemented with assert, so the check disappears under
# `python -O`, and a mismatch raises AssertionError, which the visible
# `except urllib2.URLError` clause in download() does not catch.
146 if "Content-Length" in openerdirector.info():
147 assert len(data) == int(openerdirector.info()["Content-Length"]), "The packet header promised %s of data but only was able to read %s of data" % (
148 openerdirector.info()["Content-Length"],
155 class HTTPNoRedirector(urllib2.HTTPRedirectHandler):
156 """This is a custom http redirect handler that FORBIDS redirection."""
158 def http_error_302(self, req, fp, code, msg, headers):
159 e = urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
160 if e.code in (301, 302):
161 if 'location' in headers:
162 newurl = headers.getheaders('location')[0]
163 elif 'uri' in headers:
164 newurl = headers.getheaders('uri')[0]