1 from BeautifulSoup import BeautifulSoup, NavigableString
3 from datetime import time, datetime
4 from textwrap import wrap
7 class ParserError(Exception):
8 def __init__(self, value='', code=0):
13 return repr(self.value)
17 STATE_START, STATE_SEARCH, STATE_RESULT = range(3)
19 def __init__(self, html):
20 self.soup = BeautifulSoup(html)
23 self._current_state = 0
26 for detail in self.details():
29 def _parse_details(self):
30 if self._current_state < 0:
31 raise ParserError('Unable to parse details while in error state')
33 trips = map(lambda x: map(lambda x: {
35 'time': map(lambda x: (time(*map(int, x.split(':')))), wrap(x.find('td', {'class': 'col_time'}).text, 5)), # black magic appears
36 'station': map(lambda x: x[2:].strip(),
37 filter(lambda x: type(x) == NavigableString, x.find('td', {'class': 'col_station'}).contents)), # filter non NaviStrings
38 'info': map(lambda x: x.strip(),
39 filter(lambda x: type(x) == NavigableString, x.find('td', {'class': 'col_info'}).contents)),
40 }, x.find('tbody').findAll('tr')),
41 self.soup.findAll('div', {'class': 'data_table tourdetail'})) # all routes
46 """returns list of trip details
47 [ [ { 'time': [datetime.time, datetime.time] if time else [],
48 'station': [u'start', u'end'] if station else [],
49 'info': [u'start station' if station else u'details for walking', u'end station' if station else u'walking duration']
50 }, ... # next trip step
51 ], ... # next trip possibility
55 self._details = self._parse_details()
59 def _parse_overview(self):
60 def get_tdtext(x, cl):
61 return x.find('td', {'class': cl}).text
64 y = get_tdtext(x, 'col_change')
71 y = get_tdtext(x, 'col_price')
73 return float(y.replace(',', '.'))
78 y = get_tdtext(x, 'col_date')
80 return datetime.strptime(y, '%d.%m.%Y').date()
85 table = self.soup.find('table', {'id': 'tbl_fahrten'})
87 # check if there is an overview table
88 if table and table.findAll('tr'):
90 rows = table.findAll('tr')[1:] # cut off headline
91 overview = map(lambda x: {
93 'time': map(lambda x: time(*map(int, x.strip().split(':'))) if x else None, # extract times or set to None if empty
94 x.find('td', {'class': 'col_time'}).text.split('-')) if x.find('td', {'class': 'col_time'}) else [],
95 'duration': time(*map(int, get_tdtext(x, 'col_duration').split(':'))), # grab duration
96 'change': get_change(x),
97 'price': get_price(x),
101 self._current_state = self.STATE_ERROR
102 raise ParserError('Unable to parse details while in error state')
115 if not self._overview:
117 self._overview = self._parse_overview()
118 except AttributeError:
119 f = open('DEBUG', 'w')
120 f.write(str(self.soup))
123 return self._overview
125 def _check_request_state(self):
126 raise NotImplementedError()
def request_state(self):
    """Return the parser's current state as stored in ``_current_state``."""
    current = self._current_state
    return current
def get_stations(self, letter):
    """Return the cached list of option values for ``letter``.

    Results are memoized in ``self._stations`` keyed by ``letter``. On a
    cache miss the page at ``settings.stations % letter`` is downloaded and
    the ``value`` attribute of every ``<option>`` inside the element
    ``<select id="letter">`` is collected into a list.

    Args:
        letter: key interpolated into ``settings.stations`` and used as the
            cache key.

    Returns:
        The list stored in ``self._stations[letter]``.

    Raises:
        AttributeError: presumably when the page has no
            ``<select id="letter">`` element (``find`` would return None)
            -- TODO confirm against the parser in use.
    """
    # ``in`` replaces the deprecated ``dict.has_key`` (removed in Python 3).
    if letter not in self._stations:
        response = urllib2.urlopen(settings.stations % letter)
        try:
            # The parser consumes the response while building the tree, so
            # the connection can be released right afterwards.
            bs = BeautifulSoup(response)
        finally:
            response.close()
        options = bs.find('select', {'id': 'letter'}).findAll('option')
        # Build a concrete list so the cached value can be reused safely.
        self._stations[letter] = [option['value'] for option in options]

    return self._stations[letter]
148 bs = BeautifulSoup(urllib2.urlopen(settings.line_overview))
150 lines = bs.findAll('table', {'class': 'linie'})
151 # cut line parameter out of href
152 self._lines = map(lambda x: map(lambda x: x['href'][x['href'].find('=') + 1:], x.findAll('a')), lines)