vcs.maemo.org Git - pywienerlinien/blob - parseHtml.py

   1 from BeautifulSoup import BeautifulSoup, NavigableString
   2 import urllib2
   3 from datetime import time, datetime
   4 from textwrap import wrap
   5 import settings
   6
   7 class ParserError(Exception):
   8      def __init__(self, value='', code=0):
   9          self.value = value
  10          self.code = code
  11
  12      def __str__(self):
  13          return repr(self.value)
  14
  15 class Parser:
  16     _overview = None
  17     _details = None
  18     STATE_ERROR = -1
  19     STATE_START = 0
  20     STATE_SEARCH = 1
  21     STATE_RESULT = 2
  22     _current_state = 0
  23
  24     def __init__(self, html):
  25         self.soup = BeautifulSoup(html)
  26
  27     def __iter__(self):
  28         for detail in self.details():
  29             yield detail
  30         raise IndexError()
  31
  32     def _parse_details(self):
  33         if self._current_state < 0:
  34             raise ParserError('Unable to parse details while in error state')
  35
  36         trips = map(lambda x: map(lambda x: {
  37                                              # TODO kick out wrap
  38                         'time': map(lambda x: (time(*map(lambda x: int(x), x.split(':')))), wrap(x.find('td', {'class': 'col_time'}).text, 5)), # black magic appears
  39                         'station': map(lambda x: x[2:].strip(),
  40                                        filter(lambda x: type(x) == NavigableString, x.find('td', {'class': 'col_station'}).contents)), # filter non NaviStrings
  41                         'info': map(lambda x: x.strip(),
  42                                     filter(lambda x: type(x) == NavigableString, x.find('td', {'class': 'col_info'}).contents)),
  43                     }, x.find('tbody').findAll('tr')),
  44                     self.soup.findAll('div', {'class': 'data_table tourdetail'})) # all routes
  45         return trips
  46
  47     @property
  48     def details(self):
  49         """returns list of trip details
  50         [ [ { 'time': [datetime.time, datetime.time] if time else [],
  51               'station': [u'start', u'end'] if station else [],
  52               'info': [u'start station' if station else u'details for walking', u'end station' if station else u'walking duration']
  53             }, ... # next trip step
  54           ], ... # next trip possibility
  55         ]
  56         """
  57         if not self._details:
  58             self._details = self._parse_details()
  59
  60         return self._details
  61
  62     def _parse_overview(self):
  63         # get overview table
  64         table = self.soup.find('table', {'id': 'tbl_fahrten'})
  65
  66         # check if there is an overview table
  67         if table and table.findAll('tr'):
  68             # get rows
  69             rows = table.findAll('tr')[1:] # cut off headline
  70             overview = map(lambda x: {
  71                                'date': datetime.strptime(x.find('td', {'class': 'col_date'}).text, '%d.%m.%Y') # grab date
  72                                            if x.find('td', {'class': 'col_date'}).text else None, # if date is empty set to None
  73                                'time': map(lambda x: time(*map(lambda x: int(x), x.strip().split(':'))) if x else None, # extract times or set to None if empty
  74                                            x.find('td', {'class': 'col_time'}).text.split('-')) if x.find('td', {'class': 'col_time'}) else [],
  75                                'duration': time(*map(lambda x: int(x), x.find('td', {'class': 'col_duration'}).text.split(':'))), # grab duration
  76                                'change': int(x.find('td', {'class': 'col_change'}).text) # grab changes
  77                                            if x.find('td', {'class': 'col_change'}).text else 0, # if change is empty set to 0
  78                                'price': float(x.find('td', {'class': 'col_price'}).text.replace(',', '.')) # grab price
  79                                            if x.find('td', {'class': 'col_price'}).text.find(',') >= 0 else 0.0, # if price is empty set to 0.0
  80                            },
  81                            rows)
  82         else:
  83             self._current_state = self.STATE_ERROR
  84             raise ParserError('Unable to parse details while in error state')
  85
  86         return overview
  87
  88     @property
  89     def overview(self):
  90         """dict containing
  91         date: datetime
  92         time: [time, time]
  93         duration: time
  94         change: int
  95         price: float
  96         """
  97         if not self._overview:
  98             try:
  99                 self._overview = self._parse_overview()
 100             except AttributeError:
 101                 f = open('DEBUG', 'w')
 102                 f.write(self.soup)
 103                 f.close()
 104
 105         return self._overview
 106
 107     def _check_request_state(self):
 108         raise NotImplementedError()
 109
 110     @property
 111     def request_state(self):
 112         return self._current_state
 113
 114
 115 class iTipParser:
 116     _stations = {}
 117     _lines = []
 118
 119     def __init__(self):
 120         pass
 121
 122     def get_stations(self, letter):
 123         if not self._stations.has_key(letter):
 124             bs = BeautifulSoup(urllib2.urlopen(settings.stations % letter).read())
 125             self._stations[letter] = map(lambda x: x['value'], bs.find('select', {'id': 'letter'}).findAll('option'))
 126
 127         return self._stations[letter]
 128
 129     def get_lines(self):
 130         if not self._lines:
 131             bs = BeautifulSoup(urllib2.urlopen(settings.line_overview).read())
 132             # get tables
 133             lines = bs.findAll('table', {'class': 'linie'})
 134             # cut line parameter out of href
 135             self._lines = map(lambda x: map(lambda x: x['href'][x['href'].find('=') + 1:], x.findAll('a')), lines)
 136
 137         return self._lines