
Added station arr/dep scraper

Added a scraper for arrivals and departures at a station
Branch: python3
Dan Cojocaru committed 3 years ago
commit 0a7e2b2568
Signed by: kbruen
GPG Key ID: 818A889458EDC937
  1. scraper/schemas.py (20 lines changed)
  2. scraper/scrape_station.py (87 lines changed)
  3. scraper/scrape_station_schema_v2.json (137 lines changed)
  4. scraper/scrape_train.py (143 lines changed)
  5. scraper/scrape_train_schema.json (134 lines changed)
  6. scraper/scrape_train_schema_v2.json (22 lines changed)
  7. scraper/scraper.py (171 lines changed)
  8. scraper/utils.py (79 lines changed)
  9. server/Pipfile (1 line changed)
  10. server/Pipfile.lock (114 lines changed)
  11. server/server/db.py (106 lines changed)
  12. server/server/flask_utils.py (29 lines changed)
  13. server/server/server.py (16 lines changed)
  14. server/server/utils.py (23 lines changed)
  15. server/server/v2/v2.py (67 lines changed)

scraper/schemas.py (20 lines changed)

@@ -0,0 +1,20 @@
from contextlib import ExitStack as _ExitStack
_es = _ExitStack()
def _load_file(name: str):
import json
from os.path import join, dirname
dir = dirname(__file__)
return json.load(_es.enter_context(open(join(dir, name))))
TRAIN_INFO_SCHEMA = {
'v1': _load_file('scrape_train_schema.json'),
'v2': _load_file('scrape_train_schema_v2.json'),
}
STATION_SCHEMA = {
'v2': _load_file('scrape_station_schema_v2.json'),
}
_es.close()
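
For illustration, a minimal sketch of how these schema dicts are consumed: they are plain dicts, so they can be passed straight to jsonschema.validate, which is what the server does with every scraped payload (see server.py and v2.py further down). The payload below is hypothetical and trimmed to the fields the v1 schema marks as required; the scraper package is assumed importable (it is an editable dependency in server/Pipfile).

from jsonschema import validate
from scraper.schemas import TRAIN_INFO_SCHEMA

# Hypothetical, trimmed payload in the shape produced by the train scraper.
payload = {
    'rank': 'IR',
    'number': '1538',
    'date': '01.09.2021',
    'operator': 'CFR Călători',
    'route': {'from': 'București Nord', 'to': 'Brașov'},
    'stations': [
        {'name': 'București Nord', 'km': 0},
        {'name': 'Brașov', 'km': 166},
    ],
}

# Raises jsonschema.exceptions.ValidationError if the payload drifts from the schema.
validate(payload, schema=TRAIN_INFO_SCHEMA['v1'])
# The 'v2' variant differs mainly in expecting full ISO date-times for scheduleTime.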

scraper/scrape_station.py (87 lines changed)

@@ -0,0 +1,87 @@
import re
from datetime import datetime, timedelta
import pytz
import requests
from bs4 import BeautifulSoup
from .utils import *
# region regex definitions
RO_LETTERS = r'A-Za-zăâîșțĂÂÎȚȘ'
STATION_INFO_REGEX = re.compile(rf'^([{RO_LETTERS} ]+) în ([0-9.]+)$')
STOPPING_TIME_REGEX = re.compile(r'^(necunoscută \(stație terminus\))|(?:([0-9]+) min \((?:începând cu|până la) ([0-9]{1,2}:[0-9]{2})\))$')
# endregion
def scrape(station_name: str):
station_name = ro_letters_to_en(station_name)
# Start scraping session
s = requests.Session()
r = s.get(build_url(
'https://mersultrenurilor.infofer.ro/ro-RO/Statie/{station}',
station=station_name.replace(' ', '-'),
))
soup = BeautifulSoup(r.text, features='html.parser')
sform = soup.find(id='form-search')
result_data = { elem['name']: elem['value'] for elem in sform('input') }
r = s.post('https://mersultrenurilor.infofer.ro/ro-RO/Stations/StationsResult', data=result_data)
soup = BeautifulSoup(r.text, features='html.parser')
scraped = {}
station_info_div, _, departures_div, arrivals_div, *_ = soup('div', recursive=False)
scraped['stationName'], scraped['date'] = STATION_INFO_REGEX.match(collapse_space(station_info_div.h2.text)).groups()
date_d, date_m, date_y = (int(comp) for comp in scraped['date'].split('.'))
date = datetime(date_y, date_m, date_d)
dt_seq = DateTimeSequencer(date.year, date.month, date.day)
tz = pytz.timezone('Europe/Bucharest')
def parse_arrdep_list(elem, end_station_field_name):
def parse_item(elem):
result = {}
try:
data_div, status_div = elem('div', recursive=False)
except ValueError:
data_div, *_ = elem('div', recursive=False)
status_div = None
data_main_div, data_details_div = data_div('div', recursive=False)
time_div, dest_div, train_div, *_ = data_main_div('div', recursive=False)
operator_div, route_div, stopping_time_div = data_details_div.div('div', recursive=False)
result['time'] = collapse_space(time_div.div.div('div', recursive=False)[1].text)
st_hr, st_min = (int(comp) for comp in result['time'].split(':'))
result['time'] = tz.localize(dt_seq(st_hr, st_min)).isoformat()
unknown_st, st, st_opposite_time = STOPPING_TIME_REGEX.match(
collapse_space(stopping_time_div.div('div', recursive=False)[1].text)
).groups()
if unknown_st:
result['stoppingTime'] = None
elif st:
result['stoppingTime'] = int(st)
result['train'] = {}
result['train']['rank'] = collapse_space(train_div.div.div('div', recursive=False)[1].span.text)
result['train']['number'] = collapse_space(train_div.div.div('div', recursive=False)[1].a.text)
result['train'][end_station_field_name] = collapse_space(dest_div.div.div('div', recursive=False)[1].text)
result['train']['operator'] = collapse_space(operator_div.div('div', recursive=False)[1].text)
result['train']['route'] = collapse_space(route_div.div('div', recursive=False)[1].text).split(' - ')
return result
return [parse_item(elem) for elem in elem.div.ul('li', recursive=False)]
scraped['departures'] = parse_arrdep_list(departures_div, 'destination')
scraped['arrivals'] = parse_arrdep_list(arrivals_div, 'origin')
return scraped
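
A quick usage sketch for the new station scraper (needs network access to mersultrenurilor.infofer.ro; the station name is just an example). The field names follow scrape_station_schema_v2.json below.

from scraper.scrape_station import scrape

data = scrape('București Nord')   # diacritics are stripped internally via ro_letters_to_en
print(data['stationName'], data['date'])

for dep in data['departures']:
    train = dep['train']
    # 'time' is an ISO 8601 timestamp localized to Europe/Bucharest
    print(dep['time'], train['rank'], train['number'],
          '->', train['destination'], '(' + train['operator'] + ')')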

scraper/scrape_station_schema_v2.json (137 lines changed)

@@ -0,0 +1,137 @@
{
"$schema": "http://json-schema.org/schema",
"title": "Train Info InfoFer Scrap Station Schema",
"description": "Results of scrapping InfoFer website for station arrival/departure info",
"definitions": {
"arrDepItem": {
"type": "object",
"properties": {
"time": {
"description": "Time of arrival/departure",
"type": "string",
"format": "date-time"
},
"train": {
"type": "object",
"properties": {
"rank": {
"type": "string",
"examples": [
"R",
"R-E",
"IR",
"IRN"
]
},
"number": {
"type": "string",
"examples": [
"74",
"15934"
]
},
"operator": {
"type": "string",
"examples": [
"CFR Călători",
"Softrans",
"Regio Călători"
]
},
"route": {
"description": "All the stations the train stops at",
"type": "array",
"items": {
"type": "string"
}
}
},
"required": [
"rank",
"number",
"operator"
]
},
"stoppingTime": {
"type": [
"integer",
"null"
],
"minimum": 1
}
},
"required": [
"time",
"train",
"stoppingTime"
]
}
},
"type": "object",
"properties": {
"arrivals": {
"type": "array",
"items": {
"allOf": [
{
"$ref": "#/definitions/arrDepItem"
},
{
"type": "object",
"properties": {
"train": {
"type": "object",
"properties": {
"origin": {
"type": "string"
}
},
"required": ["origin"]
}
},
"required": ["train"]
}
]
}
},
"departures": {
"type": "array",
"items": {
"allOf": [
{
"$ref": "#/definitions/arrDepItem"
},
{
"type": "object",
"properties": {
"train": {
"type": "object",
"properties": {
"destination": {
"type": "string"
}
},
"required": ["destination"]
}
},
"required": ["train"]
}
]
}
},
"stationName": {
"type": "string"
},
"date": {
"description": "Date for which the data is provided (likely today)",
"type": "string",
"pattern": "^[0-9]{1,2}\\.[0-9]{2}\\.[0-9]{4}$"
}
},
"required": [
"arrivals",
"departures",
"stationName",
"date"
]
}
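
To make the schema above concrete, this is roughly the smallest document it accepts; the values are invented and the scraper package is assumed to be importable.

from jsonschema import validate
from scraper.schemas import STATION_SCHEMA

# Hypothetical minimal station payload: one departure, no arrivals.
payload = {
    'stationName': 'Brașov',
    'date': '01.09.2021',
    'arrivals': [],
    'departures': [
        {
            'time': '2021-09-01T08:10:00+03:00',
            'stoppingTime': 2,
            'train': {
                'rank': 'IR',
                'number': '1538',
                'operator': 'CFR Călători',
                'destination': 'Arad',
                'route': ['Brașov', 'Sibiu', 'Arad'],
            },
        },
    ],
}

validate(payload, schema=STATION_SCHEMA['v2'])   # passes; drop 'stoppingTime' and it fails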

scraper/scrape_train.py (143 lines changed)

@@ -0,0 +1,143 @@
import re
from datetime import datetime, timedelta
import pytz
import requests
from bs4 import BeautifulSoup
from .utils import *
# region regex definitions
TRAIN_INFO_REGEX = re.compile(r'^([A-Z-]+) ([0-9]+) în ([0-9.]+)$')
OPERATOR_REGEX = re.compile(r'^Operat de (.+)$')
SL_REGEX = re.compile(r'^(?:Fără|([0-9]+) min) (întârziere|mai devreme) la (trecerea fără oprire prin|sosirea în|plecarea din) (.+)\.$')
SL_STATE_MAP = {
't': 'passing',
's': 'arrival',
'p': 'departure',
}
RO_LETTERS = r'A-Za-zăâîșțĂÂÎȚȘ'
ROUTE_REGEX = re.compile(rf'^Parcurs tren ([{RO_LETTERS} ]+)[-–]([{RO_LETTERS} ]+)$')
KM_REGEX = re.compile(r'^km ([0-9]+)$')
PLATFORM_REGEX = re.compile(r'^linia (.+)$')
STOPPING_TIME_REGEX = re.compile(r'^([0-9]+) min oprire$')
STATION_DEPARR_STATUS_REGEX = re.compile(r'^(?:(la timp)|(?:((?:\+|-)[0-9]+) min \((?:(?:întârziere)|(?:mai devreme))\)))(\*?)$')
# endregion
def scrape(train_no: int, use_yesterday=False, date_override=None):
# Start scraping session
s = requests.Session()
date = datetime.today()
if use_yesterday:
date -= timedelta(days=1)
if date_override:
date = date_override
r = s.get(build_url(
'https://mersultrenurilor.infofer.ro/ro-RO/Tren/{train_no}',
train_no=train_no,
query=[
('Date', date.strftime('%d.%m.%Y')),
],
))
soup = BeautifulSoup(r.text, features='html.parser')
sform = soup.find(id='form-search')
result_data = { elem['name']: elem['value'] for elem in sform('input') }
r = s.post('https://mersultrenurilor.infofer.ro/ro-RO/Trains/TrainsResult', data=result_data)
soup = BeautifulSoup(r.text, features='html.parser')
scraped = {}
train_info_div, _, _, results_div, *_ = soup('div', recursive=False)
train_info_div = train_info_div.div('div', recursive=False)[0]
scraped['rank'], scraped['number'], scraped['date'] = TRAIN_INFO_REGEX.match(collapse_space(train_info_div.h2.text)).groups()
date_d, date_m, date_y = (int(comp) for comp in scraped['date'].split('.'))
date = datetime(date_y, date_m, date_d)
scraped['operator'] = OPERATOR_REGEX.match(collapse_space(train_info_div.p.text)).groups()[0]
results_div = results_div.div
status_div = results_div('div', recursive=False)[0]
route_text = collapse_space(status_div.h4.text)
route_from, route_to = ROUTE_REGEX.match(route_text).groups()
scraped['route'] = {
'from': route_from,
'to': route_to,
}
try:
status_line_match = SL_REGEX.match(collapse_space(status_div.div.text))
slm_delay, slm_late, slm_arrival, slm_station = status_line_match.groups()
scraped['status'] = {
'delay': (int(slm_delay) if slm_late == 'întârziere' else -int(slm_delay)) if slm_delay else 0,
'station': slm_station,
'state': SL_STATE_MAP[slm_arrival[0]],
}
except Exception:
scraped['status'] = None
stations = status_div.ul('li', recursive=False)
scraped['stations'] = []
dt_seq = DateTimeSequencer(date.year, date.month, date.day)
tz = pytz.timezone('Europe/Bucharest')
for station in stations:
station_scraped = {}
left, middle, right = station.div('div', recursive=False)
station_scraped['name'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[0].text)
station_scraped['km'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[1].text)
station_scraped['km'] = int(KM_REGEX.match(station_scraped['km']).groups()[0])
station_scraped['stoppingTime'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[2].text)
if not station_scraped['stoppingTime']:
station_scraped['stoppingTime'] = None
else:
station_scraped['stoppingTime'] = int(STOPPING_TIME_REGEX.match(station_scraped['stoppingTime']).groups()[0])
station_scraped['platform'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[3].text)
if not station_scraped['platform']:
station_scraped['platform'] = None
else:
station_scraped['platform'] = PLATFORM_REGEX.match(station_scraped['platform']).groups()[0]
def scrape_time(elem, setter):
parts = elem.div.div('div', recursive=False)
if parts:
result = {}
time, *_ = parts
result['scheduleTime'] = collapse_space(time.text)
st_hr, st_min = (int(comp) for comp in result['scheduleTime'].split(':'))
result['scheduleTime'] = tz.localize(dt_seq(st_hr, st_min)).isoformat()
if len(parts) >= 2:
_, status, *_ = parts
result['status'] = {}
on_time, delay, approx = STATION_DEPARR_STATUS_REGEX.match(collapse_space(status.text)).groups()
result['status']['delay'] = 0 if on_time else int(delay)
result['status']['real'] = not approx
else:
result['status'] = None
setter(result)
else:
setter(None)
scrape_time(left, lambda value: station_scraped.update(arrival=value))
scrape_time(right, lambda value: station_scraped.update(departure=value))
scraped['stations'].append(station_scraped)
return scraped
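
For symmetry with the station example, a sketch of calling the train scraper that was moved here out of scraper.py; 1538 is simply the number used by main() in scraper.py, the date is arbitrary, and network access is required.

from datetime import datetime
from scraper.scrape_train import scrape

info = scrape(1538)                  # today's run
print(info['rank'], info['number'], info['operator'], info['route'])

# Trains that left yesterday and are still running can be looked up either way:
info_y = scrape(1538, use_yesterday=True)
info_d = scrape(1538, date_override=datetime(2021, 9, 1))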

scraper/scrape_train_schema.json (134 lines changed)

@@ -0,0 +1,134 @@
{
"$schema": "http://json-schema.org/schema",
"title": "Train Info InfoFer Scrap Train Schema",
"description": "Results of scrapping InfoFer website for train info",
"definitions": {
"delayType": {
"description": "Delay of the train (negative for being early)",
"type": "integer"
},
"stationArrDepTime": {
"description": "Time of arrival at/departure from station",
"type": ["object", "null"],
"properties": {
"scheduleTime": {
"description": "The time the train is scheduled to arrive/depart",
"type": "string",
"pattern": "^[0-9]{1,2}:[0-9]{2}$"
},
"status": {
"type": ["object", "null"],
"properties": {
"delay": {
"$ref": "#/definitions/delayType"
},
"real": {
"description": "Determines whether delay was actually reported or is an approximation",
"type": "boolean"
}
},
"required": ["delay", "real"]
}
},
"required": ["scheduleTime"]
}
},
"type": "object",
"properties": {
"rank": {
"description": "The rank of the train",
"type": "string",
"examples": [
"R",
"R-E",
"IR",
"IRN"
]
},
"number": {
"description": "The number of the train",
"type": "string",
"examples": [
"74",
"15934"
]
},
"date": {
"description": "Date of departure from the first station (dd.mm.yyyy)",
"type": "string",
"pattern": "^[0-9]{1,2}\\.[0-9]{2}\\.[0-9]{4}$"
},
"operator": {
"description": "Operator of the train",
"type": "string",
"examples": [
"CFR Călători",
"Softrans",
"Regio Călători"
]
},
"route": {
"description": "Route of the train",
"type": "object",
"properties": {
"from": {
"type": "string"
},
"to": {
"type": "string"
}
},
"required": ["from", "to"]
},
"status": {
"description": "Current status of the train",
"type": ["object", "null"],
"properties": {
"delay": {
"$ref": "#/definitions/delayType"
},
"station": {
"type": "string"
},
"state": {
"type": "string",
"enum": ["passing", "arrival", "departure"]
}
},
"required": ["delay", "station", "state"]
},
"stations": {
"description": "List of stations the train stops at",
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {
"type": "string"
},
"km": {
"description": "The distance the train travelled until reaching this station",
"type": "integer"
},
"stoppingTime": {
"description": "The number of minutes the train is scheduled to stop in this station",
"type": ["integer", "null"],
"minimum": 1
},
"platform": {
"description": "The platform the train stopped at",
"type": ["string", "null"]
},
"arrival": {
"$ref": "#/definitions/stationArrDepTime"
},
"departure": {
"$ref": "#/definitions/stationArrDepTime"
}
},
"required": ["name", "km"]
}
}
},
"required": ["route", "stations", "rank", "number", "date", "operator"]
}

scraper/trainInfoScrapResultSchema.json → scraper/scrape_train_schema_v2.json (22 lines changed)

@@ -1,6 +1,6 @@
{
"$schema": "http://json-schema.org/schema",
"title": "Train Info InfoFer Scrap Result Schema",
"title": "Train Info InfoFer Scrap Train Schema",
"description": "Results of scrapping InfoFer website for train info",
"definitions": {
"delayType": {
@@ -13,7 +13,8 @@
"properties": {
"scheduleTime": {
"description": "The time the train is scheduled to arrive/depart",
"type": "string"
"type": "string",
"format": "date-time"
},
"status": {
"type": ["object", "null"],
@@ -38,23 +39,24 @@
"description": "The rank of the train",
"type": "string",
"examples": [
"74",
"15934"
"R",
"R-E",
"IR",
"IRN"
]
},
"number": {
"description": "The number of the train",
"type": "string",
"examples": [
"R",
"R-E",
"IR",
"IRN"
"74",
"15934"
]
},
"date": {
"description": "Date of departure from the first station",
"type": "string"
"description": "Date of departure from the first station (dd.mm.yyyy)",
"type": "string",
"pattern": "^[0-9]{1,2}\\.[0-9]{2}\\.[0-9]{4}$"
},
"operator": {
"description": "Operator of the train",

scraper/scraper.py (171 lines changed)

@@ -1,177 +1,12 @@
#! /usr/bin/env python3
from datetime import datetime, timedelta
import re
import pytz
import requests
from bs4 import BeautifulSoup
from urllib.parse import quote, urlencode
TRAIN_INFO_REGEX = re.compile(r'^([A-Z-]+) ([0-9]+) în ([0-9.]+)$')
OPERATOR_REGEX = re.compile(r'^Operat de (.+)$')
SL_REGEX = re.compile(r'^(?:Fără|([0-9]+) min) (întârziere|mai devreme) la (trecerea fără oprire prin|sosirea în|plecarea din) (.+)\.$')
SL_STATE_MAP = {
't': 'passing',
's': 'arrival',
'p': 'departure',
}
RO_LETTERS = r'A-Za-zăâîșțĂÂÎȚȘ'
ROUTE_REGEX = re.compile(rf'^Parcurs tren ([{RO_LETTERS} ]+)[-–]([{RO_LETTERS} ]+)$')
KM_REGEX = re.compile(r'^km ([0-9]+)$')
PLATFORM_REGEX = re.compile(r'^linia (.+)$')
STOPPING_TIME_REGEX = re.compile(r'^([0-9]+) min oprire$')
STATION_DEPARR_STATUS_REGEX = re.compile(r'^(?:(la timp)|(?:((?:\+|-)[0-9]+) min \((?:(?:întârziere)|(?:mai devreme))\)))(\*?)$')
class DateTimeSequencer:
def __init__(self, year: int, month: int, day: int) -> None:
self.current = datetime(year, month, day, 0, 0, 0)
self.current -= timedelta(seconds=1)
def __call__(self, hour: int, minute: int = 0, second: int = 0) -> datetime:
potential_new_date = datetime(self.current.year, self.current.month, self.current.day, hour, minute, second)
if (self.current > potential_new_date):
potential_new_date += timedelta(days=1)
self.current = potential_new_date
return self.current
def collapse_space(string: str) -> str:
return re.sub(
rf'[{BeautifulSoup.ASCII_SPACES}]+',
' ',
string,
flags=re.MULTILINE
).strip()
def build_url(base: str, /, query: dict, **kwargs):
result = base.format(**{ k: quote(str(v)) for k, v in kwargs.items() })
if query:
result += '?'
result += urlencode(query)
return result
def scrape(train_no: int, use_yesterday=False, date_override=None):
# Start scrapping session
s = requests.Session()
date = datetime.today()
if use_yesterday:
date -= timedelta(days=1)
if date_override:
date = date_override
r = s.get(build_url(
'https://mersultrenurilor.infofer.ro/ro-RO/Tren/{train_no}',
train_no=train_no,
query=[
('Date', date.strftime('%d.%m.%Y')),
],
))
soup = BeautifulSoup(r.text, features='html.parser')
sform = soup.find(id='form-search')
result_data = { elem['name']: elem['value'] for elem in sform('input') }
r = s.post('https://mersultrenurilor.infofer.ro/ro-RO/Trains/TrainsResult', data=result_data)
soup = BeautifulSoup(r.text, features='html.parser')
scraped = {}
train_info_div, _, _, results_div, *_ = soup('div', recursive=False)
train_info_div = train_info_div.div('div', recursive=False)[0]
scraped['rank'], scraped['number'], scraped['date'] = TRAIN_INFO_REGEX.match(collapse_space(train_info_div.h2.text)).groups()
date_d, date_m, date_y = (int(comp) for comp in scraped['date'].split('.'))
date = datetime(date_y, date_m, date_d)
scraped['operator'] = OPERATOR_REGEX.match(collapse_space(train_info_div.p.text)).groups()[0]
results_div = results_div.div
status_div = results_div('div', recursive=False)[0]
route_text = collapse_space(status_div.h4.text)
route_from, route_to = ROUTE_REGEX.match(route_text).groups()
scraped['route'] = {
'from': route_from,
'to': route_to,
}
try:
status_line_match = SL_REGEX.match(collapse_space(status_div.div.text))
slm_delay, slm_late, slm_arrival, slm_station = status_line_match.groups()
scraped['status'] = {
'delay': (int(slm_delay) if slm_late == 'întârziere' else -int(slm_delay)) if slm_delay else 0,
'station': slm_station,
'state': SL_STATE_MAP[slm_arrival[0]],
}
except Exception:
scraped['status'] = None
stations = status_div.ul('li', recursive=False)
scraped['stations'] = []
dt_seq = DateTimeSequencer(date.year, date.month, date.day)
tz = pytz.timezone('Europe/Bucharest')
for station in stations:
station_scraped = {}
left, middle, right = station.div('div', recursive=False)
station_scraped['name'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[0].text)
station_scraped['km'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[1].text)
station_scraped['km'] = int(KM_REGEX.match(station_scraped['km']).groups()[0])
station_scraped['stoppingTime'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[2].text)
if not station_scraped['stoppingTime']:
station_scraped['stoppingTime'] = None
else:
station_scraped['stoppingTime'] = int(STOPPING_TIME_REGEX.match(station_scraped['stoppingTime']).groups()[0])
station_scraped['platform'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[3].text)
if not station_scraped['platform']:
station_scraped['platform'] = None
else:
station_scraped['platform'] = PLATFORM_REGEX.match(station_scraped['platform']).groups()[0]
def scrape_time(elem, setter):
parts = elem.div.div('div', recursive=False)
if parts:
result = {}
time, *_ = parts
result['scheduleTime'] = collapse_space(time.text)
st_hr, st_min = (int(comp) for comp in result['scheduleTime'].split(':'))
result['scheduleTime'] = tz.localize(dt_seq(st_hr, st_min)).isoformat()
if len(parts) >= 2:
_, status, *_ = parts
result['status'] = {}
on_time, delay, approx = STATION_DEPARR_STATUS_REGEX.match(collapse_space(status.text)).groups()
result['status']['delay'] = 0 if on_time else int(delay)
result['status']['real'] = not approx
else:
result['status'] = None
setter(result)
else:
setter(None)
scrape_time(left, lambda value: station_scraped.update(arrival=value))
scrape_time(right, lambda value: station_scraped.update(departure=value))
scraped['stations'].append(station_scraped)
return scraped
from .scrape_train import scrape as scrape_train
from .scrape_station import scrape as scrape_station
def main():
train_no = 1538
print(f'Testing package with train number {train_no}')
from pprint import pprint
# pprint(scrape('473'))
pprint(scrape(train_no))
pprint(scrape_train(train_no))
if __name__ == '__main__':
main()

scraper/utils.py (79 lines changed)

@@ -0,0 +1,79 @@
import re
from datetime import datetime, timedelta
from urllib.parse import urlencode, quote
# From: https://en.wikipedia.org/wiki/Whitespace_character#Unicode
ASCII_WHITESPACE = [
'\u0009', # HT; Character Tabulation
'\u000a', # LF
'\u000b', # VT; Line Tabulation
'\u000c', # FF; Form Feed
'\u000d', # CR
'\u0020', # Space
]
WHITESPACE = ASCII_WHITESPACE + [
'\u0085', # NEL; Next Line
'\u00a0', # No-break Space;  
'\u1680', # Ogham Space Mark
'\u2000', # En Quad
'\u2001', # Em Quad
'\u2002', # En Space
'\u2003', # Em Space
'\u2004', # Three-per-em Space
'\u2005', # Four-per-em Space
'\u2006', # Six-per-em Space
'\u2007', # Figure Space
'\u2008', # Punctuation Space
'\u2009', # Thin Space
'\u200A', # Hair Space
'\u2028', # Line Separator
'\u2029', # Paragraph Separator
'\u202f', # Narrow No-break Space
'\u205f', # MMSP; Medium Mathematical Space
'\u3000', # Ideographic Space
]
WHITESPACE_REGEX = re.compile(rf'[{"".join(WHITESPACE)}]+', flags=re.MULTILINE)
class DateTimeSequencer:
def __init__(self, year: int, month: int, day: int) -> None:
self.current = datetime(year, month, day, 0, 0, 0)
self.current -= timedelta(seconds=1)
def __call__(self, hour: int, minute: int = 0, second: int = 0) -> datetime:
potential_new_date = datetime(self.current.year, self.current.month, self.current.day, hour, minute, second)
if (self.current > potential_new_date):
potential_new_date += timedelta(days=1)
self.current = potential_new_date
return self.current
def collapse_space(string: str) -> str:
return WHITESPACE_REGEX.sub(
' ',
string,
).strip()
def build_url(base: str, /, query: dict = {}, **kwargs):
result = base.format(**{ k: quote(str(v)) for k, v in kwargs.items() })
if query:
result += '?'
result += urlencode(query)
return result
RO_TO_EN = {
'ă': 'a',
'Ă': 'A',
'â': 'a',
'Â': 'A',
'î': 'i',
'Î': 'I',
'ș': 's',
'Ș': 'S',
'ț': 't',
'Ț': 'T',
}
def ro_letters_to_en(string: str) -> str:
return ''.join((RO_TO_EN.get(letter, letter) for letter in string))
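
The helpers above are easiest to see with concrete calls; all values below are illustrative. DateTimeSequencer only moves forward in time, which is how the scrapers assign dates to bare HH:MM times in timetables that cross midnight.

from scraper.utils import DateTimeSequencer, collapse_space, build_url, ro_letters_to_en

seq = DateTimeSequencer(2021, 9, 1)
print(seq(23, 50))   # 2021-09-01 23:50:00
print(seq(0, 10))    # 2021-09-02 00:10:00 (earlier clock time, so it rolls to the next day)

print(collapse_space('  IR \u00a0 1538\n'))   # 'IR 1538' (non-ASCII whitespace collapsed too)
print(build_url('https://example.com/Tren/{no}', no=1538, query=[('Date', '01.09.2021')]))
# https://example.com/Tren/1538?Date=01.09.2021
print(ro_letters_to_en('București'))          # 'Bucuresti'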

server/Pipfile (1 line changed)

@@ -7,6 +7,7 @@ name = "pypi"
flask = "*"
gevent = "*"
scraper = { editable = true, path = '../scraper' }
jsonschema = "*"
[dev-packages]

server/Pipfile.lock (generated; 114 lines changed)

@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "9d422680ab15ce184b043276f5d0d2cac228ff60dfc66ec193b6314bdc0f6ce2"
"sha256": "3c7f09679bdd68674754a714ee39503cf1a3ae265400eea074fec83559246dff"
},
"pipfile-spec": 6,
"requires": {
@@ -16,6 +16,14 @@
]
},
"default": {
"attrs": {
"hashes": [
"sha256:149e90d6d8ac20db7a955ad60cf0e6881a3f20d37096140088356da6c716b0b1",
"sha256:ef6aaac3ca6cd92904cdd0d83f629a15f18053ec84e6432106f7a4d04ae4f5fb"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==21.2.0"
},
"beautifulsoup4": {
"hashes": [
"sha256:4c98143716ef1cb40bf7f39a8e3eec8f8b009509e74904ba3a7b315431577e35",
@@ -31,57 +39,6 @@
],
"version": "==2021.5.30"
},
"cffi": {
"hashes": [
"sha256:06c54a68935738d206570b20da5ef2b6b6d92b38ef3ec45c5422c0ebaf338d4d",
"sha256:0c0591bee64e438883b0c92a7bed78f6290d40bf02e54c5bf0978eaf36061771",
"sha256:19ca0dbdeda3b2615421d54bef8985f72af6e0c47082a8d26122adac81a95872",
"sha256:22b9c3c320171c108e903d61a3723b51e37aaa8c81255b5e7ce102775bd01e2c",
"sha256:26bb2549b72708c833f5abe62b756176022a7b9a7f689b571e74c8478ead51dc",
"sha256:33791e8a2dc2953f28b8d8d300dde42dd929ac28f974c4b4c6272cb2955cb762",
"sha256:3c8d896becff2fa653dc4438b54a5a25a971d1f4110b32bd3068db3722c80202",
"sha256:4373612d59c404baeb7cbd788a18b2b2a8331abcc84c3ba40051fcd18b17a4d5",
"sha256:487d63e1454627c8e47dd230025780e91869cfba4c753a74fda196a1f6ad6548",
"sha256:48916e459c54c4a70e52745639f1db524542140433599e13911b2f329834276a",
"sha256:4922cd707b25e623b902c86188aca466d3620892db76c0bdd7b99a3d5e61d35f",
"sha256:55af55e32ae468e9946f741a5d51f9896da6b9bf0bbdd326843fec05c730eb20",
"sha256:57e555a9feb4a8460415f1aac331a2dc833b1115284f7ded7278b54afc5bd218",
"sha256:5d4b68e216fc65e9fe4f524c177b54964af043dde734807586cf5435af84045c",
"sha256:64fda793737bc4037521d4899be780534b9aea552eb673b9833b01f945904c2e",
"sha256:6d6169cb3c6c2ad50db5b868db6491a790300ade1ed5d1da29289d73bbe40b56",
"sha256:7bcac9a2b4fdbed2c16fa5681356d7121ecabf041f18d97ed5b8e0dd38a80224",
"sha256:80b06212075346b5546b0417b9f2bf467fea3bfe7352f781ffc05a8ab24ba14a",
"sha256:818014c754cd3dba7229c0f5884396264d51ffb87ec86e927ef0be140bfdb0d2",
"sha256:8eb687582ed7cd8c4bdbff3df6c0da443eb89c3c72e6e5dcdd9c81729712791a",
"sha256:99f27fefe34c37ba9875f224a8f36e31d744d8083e00f520f133cab79ad5e819",
"sha256:9f3e33c28cd39d1b655ed1ba7247133b6f7fc16fa16887b120c0c670e35ce346",
"sha256:a8661b2ce9694ca01c529bfa204dbb144b275a31685a075ce123f12331be790b",
"sha256:a9da7010cec5a12193d1af9872a00888f396aba3dc79186604a09ea3ee7c029e",
"sha256:aedb15f0a5a5949ecb129a82b72b19df97bbbca024081ed2ef88bd5c0a610534",
"sha256:b315d709717a99f4b27b59b021e6207c64620790ca3e0bde636a6c7f14618abb",
"sha256:ba6f2b3f452e150945d58f4badd92310449876c4c954836cfb1803bdd7b422f0",
"sha256:c33d18eb6e6bc36f09d793c0dc58b0211fccc6ae5149b808da4a62660678b156",
"sha256:c9a875ce9d7fe32887784274dd533c57909b7b1dcadcc128a2ac21331a9765dd",
"sha256:c9e005e9bd57bc987764c32a1bee4364c44fdc11a3cc20a40b93b444984f2b87",
"sha256:d2ad4d668a5c0645d281dcd17aff2be3212bc109b33814bbb15c4939f44181cc",
"sha256:d950695ae4381ecd856bcaf2b1e866720e4ab9a1498cba61c602e56630ca7195",
"sha256:e22dcb48709fc51a7b58a927391b23ab37eb3737a98ac4338e2448bef8559b33",
"sha256:e8c6a99be100371dbb046880e7a282152aa5d6127ae01783e37662ef73850d8f",
"sha256:e9dc245e3ac69c92ee4c167fbdd7428ec1956d4e754223124991ef29eb57a09d",
"sha256:eb687a11f0a7a1839719edd80f41e459cc5366857ecbed383ff376c4e3cc6afd",
"sha256:eb9e2a346c5238a30a746893f23a9535e700f8192a68c07c0258e7ece6ff3728",
"sha256:ed38b924ce794e505647f7c331b22a693bee1538fdf46b0222c4717b42f744e7",
"sha256:f0010c6f9d1a4011e429109fda55a225921e3206e7f62a0c22a35344bfd13cca",
"sha256:f0c5d1acbfca6ebdd6b1e3eded8d261affb6ddcf2186205518f1428b8569bb99",
"sha256:f10afb1004f102c7868ebfe91c28f4a712227fe4cb24974350ace1f90e1febbf",
"sha256:f174135f5609428cc6e1b9090f9268f5c8935fddb1b25ccb8255a2d50de6789e",
"sha256:f3ebe6e73c319340830a9b2825d32eb6d8475c1dac020b4f0aa774ee3b898d1c",
"sha256:f627688813d0a4140153ff532537fbe4afea5a3dffce1f9deb7f91f848a832b5",
"sha256:fd4305f86f53dfd8cd3522269ed7fc34856a8ee3709a5e28b2836b2db9d4cd69"
],
"markers": "platform_python_implementation == 'CPython' and sys_platform == 'win32'",
"version": "==1.14.6"
},
"charset-normalizer": {
"hashes": [
"sha256:0c8911edd15d19223366a194a513099a302055a962bca2cec0f54b8b63175d8b",
@@ -98,14 +55,6 @@
"markers": "python_version >= '3.6'",
"version": "==8.0.1"
},
"colorama": {
"hashes": [
"sha256:5941b2b48a20143d2267e95b1c2a7603ce057ee39fd88e7329b0c292aa16869b",
"sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2"
],
"markers": "platform_system == 'Windows'",
"version": "==0.4.4"
},
"flask": {
"hashes": [
"sha256:1c4c257b1892aec1398784c63791cbaa43062f1f7aeb555c4da961b20ee68f55",
@@ -230,6 +179,14 @@
"markers": "python_version >= '3.6'",
"version": "==3.0.1"
},
"jsonschema": {
"hashes": [
"sha256:4e5b3cf8216f577bee9ce139cbe72eca3ea4f292ec60928ff24758ce626cd163",
"sha256:c8a85b28d377cc7737e46e2d9f2b4f44ee3c0e1deac6bf46ddefc7187d30797a"
],
"index": "pypi",
"version": "==3.2.0"
},
"markupsafe": {
"hashes": [
"sha256:01a9b8ea66f1658938f65b93a85ebe8bc016e6769611be228d797c9d998dd298",
@@ -290,13 +247,32 @@
"markers": "python_version >= '3.6'",
"version": "==2.0.1"
},
"pycparser": {
"pyrsistent": {
"hashes": [
"sha256:2d475327684562c3a96cc71adf7dc8c4f0565175cf86b6d7a404ff4c771f15f0",
"sha256:7582ad22678f0fcd81102833f60ef8d0e57288b6b5fb00323d101be910e35705"
"sha256:097b96f129dd36a8c9e33594e7ebb151b1515eb52cceb08474c10a5479e799f2",
"sha256:2aaf19dc8ce517a8653746d98e962ef480ff34b6bc563fc067be6401ffb457c7",
"sha256:404e1f1d254d314d55adb8d87f4f465c8693d6f902f67eb6ef5b4526dc58e6ea",
"sha256:48578680353f41dca1ca3dc48629fb77dfc745128b56fc01096b2530c13fd426",
"sha256:4916c10896721e472ee12c95cdc2891ce5890898d2f9907b1b4ae0f53588b710",
"sha256:527be2bfa8dc80f6f8ddd65242ba476a6c4fb4e3aedbf281dfbac1b1ed4165b1",
"sha256:58a70d93fb79dc585b21f9d72487b929a6fe58da0754fa4cb9f279bb92369396",
"sha256:5e4395bbf841693eaebaa5bb5c8f5cdbb1d139e07c975c682ec4e4f8126e03d2",
"sha256:6b5eed00e597b5b5773b4ca30bd48a5774ef1e96f2a45d105db5b4ebb4bca680",
"sha256:73ff61b1411e3fb0ba144b8f08d6749749775fe89688093e1efef9839d2dcc35",
"sha256:772e94c2c6864f2cd2ffbe58bb3bdefbe2a32afa0acb1a77e472aac831f83427",
"sha256:773c781216f8c2900b42a7b638d5b517bb134ae1acbebe4d1e8f1f41ea60eb4b",
"sha256:a0c772d791c38bbc77be659af29bb14c38ced151433592e326361610250c605b",
"sha256:b29b869cf58412ca5738d23691e96d8aff535e17390128a1a52717c9a109da4f",
"sha256:c1a9ff320fa699337e05edcaae79ef8c2880b52720bc031b219e5b5008ebbdef",
"sha256:cd3caef37a415fd0dae6148a1b6957a8c5f275a62cca02e18474608cb263640c",
"sha256:d5ec194c9c573aafaceebf05fc400656722793dac57f254cd4741f3c27ae57b4",
"sha256:da6e5e818d18459fa46fac0a4a4e543507fe1110e808101277c5a2b5bab0cd2d",
"sha256:e79d94ca58fcafef6395f6352383fa1a76922268fa02caa2272fff501c2fdc78",
"sha256:f3ef98d7b76da5eb19c37fda834d50262ff9167c65658d1d8f974d2e4d90676b",
"sha256:f4c8cabb46ff8e5d61f56a037974228e978f26bfefce4f61a4b1ac0ba7a2ab72"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==2.20"
"markers": "python_version >= '3.6'",
"version": "==0.18.0"
},
"pytz": {
"hashes": [
@@ -317,6 +293,14 @@
"editable": true,
"path": "../scraper"
},
"six": {
"hashes": [
"sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",
"sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==1.16.0"
},
"soupsieve": {
"hashes": [
"sha256:052774848f448cf19c7e959adf5566904d525f33a3f8b6ba6f6f8f26ec7de0cc",

server/server/db.py (106 lines changed)

@@ -1,6 +1,9 @@
# Globals
stations = []
trains = []
db_data = {
'version': 2,
}
# Examples
example_station = {
@@ -20,38 +23,100 @@ example_train = {
import json
import os
from os import path, stat
from contextlib import contextmanager
from .utils import take_while
DB_DIR = os.environ.get('DB_DIR', '') or './db'
if not path.exists(DB_DIR):
os.mkdir(DB_DIR)
DB_FILE = path.join(DB_DIR, 'db.json')
STATIONS_FILE = path.join(DB_DIR, 'stations.json')
if path.exists(STATIONS_FILE):
TRAINS_FILE = path.join(DB_DIR, 'trains.json')
def migration():
global db_data
global trains
global stations
if not path.exists(DB_FILE):
print('[Migration] Migrating DB version 1 -> 2')
if path.exists(STATIONS_FILE):
with open(STATIONS_FILE) as f:
stations = json.load(f)
for i in range(len(stations)):
stations[i]['stoppedAtBy'] = [str(num) for num in stations[i]['stoppedAtBy']]
with open(STATIONS_FILE, 'w') as f:
json.dump(stations, f)
if path.exists(TRAINS_FILE):
with open(TRAINS_FILE) as f:
trains = json.load(f)
for i in range(len(trains)):
trains[i]['number'] = trains[i]['numberString']
del trains[i]['numberString']
with open(TRAINS_FILE, 'w') as f:
json.dump(trains, f)
db_data = {
'version': 2,
}
with open(DB_FILE, 'w') as f:
json.dump(db_data, f)
migration()
else:
with open(DB_FILE) as f:
db_data = json.load(f)
if db_data['version'] == 2:
print('[Migration] DB Version: 2, noop')
TRAINS_FILE = path.join(DB_DIR, 'trains.json')
migration()
if path.exists(DB_FILE):
with open(DB_FILE) as f:
db_data = json.load(f)
else:
with open(DB_FILE, 'w') as f:
json.dump(db_data, f)
if path.exists(STATIONS_FILE):
with open(STATIONS_FILE) as f:
stations = json.load(f)
if path.exists(TRAINS_FILE):
with open(TRAINS_FILE) as f:
trains = json.load(f)
_should_commit_on_every_change = True
@contextmanager
def db_transaction():
global _should_commit_on_every_change
_should_commit_on_every_change = False
yield
with open(DB_FILE, 'w') as f:
json.dump(db_data, f)
with open(STATIONS_FILE, 'w') as f:
stations.sort(key=lambda s: len(s['stoppedAtBy']), reverse=True)
json.dump(stations, f)
with open(TRAINS_FILE, 'w') as f:
json.dump(trains, f)
_should_commit_on_every_change = True
def found_train(rank: str, number: str, company: str) -> int:
number_int = int(''.join(take_while(lambda s: str(s).isnumeric(), number)))
number = ''.join(take_while(lambda s: str(s).isnumeric(), number))
try:
next(filter(lambda tr: tr['number'] == number_int, trains))
next(filter(lambda tr: tr['number'] == number, trains))
except StopIteration:
trains.append({
'number': number_int,
'numberString': number,
'number': number,
'company': company,
'rank': rank,
})
if _should_commit_on_every_change:
with open(TRAINS_FILE, 'w') as f:
json.dump(trains, f)
return number_int
return number
def found_station(name: str):
try:
@@ -61,25 +126,46 @@ def found_station(name: str):
'name': name,
'stoppedAtBy': [],
})
if _should_commit_on_every_change:
stations.sort(key=lambda s: len(s['stoppedAtBy']), reverse=True)
with open(STATIONS_FILE, 'w') as f:
json.dump(stations, f)
def found_train_at_station(station_name: str, train_number: int):
def found_train_at_station(station_name: str, train_number: str):
train_number = ''.join(take_while(lambda s: str(s).isnumeric(), train_number))
found_station(station_name)
for i in range(len(stations)):
if stations[i]['name'] == station_name:
if train_number not in stations[i]['stoppedAtBy']:
stations[i]['stoppedAtBy'].append(train_number)
break
if _should_commit_on_every_change:
stations.sort(key=lambda s: len(s['stoppedAtBy']), reverse=True)
with open(STATIONS_FILE, 'w') as f:
json.dump(stations, f)
break
def on_train_data(train_data: dict):
with db_transaction():
train_no = found_train(train_data['rank'], train_data['number'], train_data['operator'])
for station in train_data['stations']:
found_train_at_station(station['name'], train_no)
def on_train_lookup_failure(train_no: int):
def on_train_lookup_failure(train_no: str):
pass
def on_station(station_data: dict):
station_name = station_data['stationName']
def process_train(train_data: dict):
train_number = train_data['train']['number']
train_number = found_train(train_data['train']['rank'], train_number, train_data['train']['operator'])
found_train_at_station(station_name, train_number)
if 'route' in train_data['train'] and train_data['train']['route']:
for station in train_data['train']['route']:
found_train_at_station(station, train_number)
with db_transaction():
for train in station_data['arrivals']:
process_train(train)
for train in station_data['departures']:
process_train(train)

server/server/flask_utils.py (29 lines changed)

@@ -0,0 +1,29 @@
from flask import request as _f_request
from .utils import filter_result as _filter_result
def filtered_data(fn):
def filterer(*args, **kwargs):
filters = _f_request.args.get('filters', None)
if filters:
filters_raw = [f.split(':', 1) for f in filters.split(',')]
filters = {'.': []}
for key, value in filters_raw:
def add_to(obj, key, value):
if '.' in key:
prop, key = key.split('.', 1)
if prop not in obj:
obj[prop] = {'.': []}
add_to(obj[prop], key, value)
else:
obj['.'].append({key: value})
add_to(filters, key, value)
properties = _f_request.args.get('properties', None)
if properties:
properties = properties.split(',')
data = fn(*args, **kwargs)
return _filter_result(data, properties, filters)
return filterer
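
The query-string convention is easier to see end to end; a sketch using a throw-away Flask app (endpoint and data are hypothetical, and the server package is assumed to import cleanly outside the real app).

from flask import Flask, jsonify
from server.flask_utils import filtered_data

app = Flask(__name__)

@app.get('/demo')
def demo():
    @filtered_data
    def get_data():
        return [
            {'rank': 'IR', 'number': '1538', 'company': 'CFR Călători'},
            {'rank': 'R', 'number': '3001', 'company': 'Regio Călători'},
        ]
    return jsonify(get_data())

with app.test_client() as client:
    # 'properties' projects each item onto the listed keys; 'filters' is parsed
    # into {'.': [{'rank': 'IR'}]} but, as of this commit, not yet applied
    # (the filter step in utils.filter_result is still a Todo).
    resp = client.get('/demo?properties=rank,number&filters=rank:IR')
    print(resp.get_json())
    # [{'rank': 'IR', 'number': '1538'}, {'rank': 'R', 'number': '3001'}]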

server/server/server.py (16 lines changed)

@@ -1,9 +1,13 @@
print(f'Server {__name__=}')
import datetime
from flask import Flask, json, request, jsonify
from flask import Flask, jsonify, url_for
from jsonschema import validate
from .cache import CachedData
from .scraper.schemas import TRAIN_INFO_SCHEMA
from .utils import get_hostname
app = Flask(__name__)
@@ -14,14 +18,18 @@ app.register_blueprint(v2.bp)
def root():
return 'Test'
@app.route('/train/.schema.json')
def get_train_info_schema():
return jsonify(TRAIN_INFO_SCHEMA['v1'])
train_data_cache = {}
@app.route('/train/<int:train_no>')
def get_train_info(train_no: int):
def get_data():
from .scraper.scraper import scrape
from .scraper.scraper import scrape_train
use_yesterday = False
result = scrape(train_no, use_yesterday=use_yesterday)
result = scrape_train(train_no, use_yesterday=use_yesterday)
from . import db
db.on_train_data(result)
@@ -40,6 +48,8 @@ def get_train_info(train_no: int):
if train_no not in train_data_cache:
train_data_cache[train_no] = CachedData(get_data, validity=1000 * 30)
data, fetch_time = train_data_cache[train_no]()
data['$schema'] = get_hostname() + url_for('.get_train_info_schema')
validate(data, schema=TRAIN_INFO_SCHEMA['v1'])
resp = jsonify(data)
resp.headers['X-Last-Fetched'] = fetch_time.isoformat()
return resp

server/server/utils.py (23 lines changed)

@@ -16,3 +16,26 @@ def check_yes_no(input: str, default=_NO_DEFAULT, considered_yes=None) -> bool:
considered_yes = ['y', 'yes', 't', 'true', '1']
return input in considered_yes
def get_hostname():
import os
import platform
return os.getenv('HOSTNAME', os.getenv('COMPUTERNAME', platform.node()))
def filter_result(data, properties=None, filters=None):
is_array = not hasattr(data, 'get')
result = data if is_array else [data]
if filters:
# Todo: implement filters
pass
# def f(lst, filters):
# def condition(item):
# return list(filter(condition, lst))
# result = f(result, filters)
if properties:
for i in range(len(result)):
result[i] = {p:result[i].get(p, None) for p in properties}
return result if is_array else result[0]

server/server/v2/v2.py (67 lines changed)

@@ -1,32 +1,87 @@
import json
from flask import Blueprint, jsonify, request
from flask.helpers import url_for
from jsonschema import validate
from .. import db
from ..cache import CachedData
from ..utils import check_yes_no
from ..utils import check_yes_no, get_hostname
from ..flask_utils import filtered_data
from ..scraper.utils import ro_letters_to_en
from ..scraper.schemas import STATION_SCHEMA, TRAIN_INFO_SCHEMA
bp = Blueprint('v2', __name__, url_prefix='/v2')
@bp.get('/trains')
def get_known_trains():
return jsonify(db.trains)
@filtered_data
def get_data():
return db.trains
result = get_data()
return jsonify(result)
@bp.get('/stations')
def get_known_stations():
return jsonify(db.stations)
@filtered_data
def get_data():
return db.stations
result = get_data()
return jsonify(result)
train_data_cache = {}
@bp.route('/train/.schema.json')
def get_train_info_schema():
return jsonify(TRAIN_INFO_SCHEMA['v2'])
@bp.route('/train/<int:train_no>')
def get_train_info(train_no: int):
use_yesterday = check_yes_no(request.args.get('use_yesterday', ''), default=False)
@filtered_data
def get_data():
from ..scraper.scraper import scrape
result = scrape(train_no, use_yesterday=use_yesterday)
from ..scraper.scraper import scrape_train
result = scrape_train(train_no, use_yesterday=use_yesterday)
db.on_train_data(result)
return result
if train_no not in train_data_cache:
if (train_no, use_yesterday) not in train_data_cache:
train_data_cache[(train_no, use_yesterday)] = CachedData(get_data, validity=1000 * 30)
data, fetch_time = train_data_cache[(train_no, use_yesterday)]()
data['$schema'] = get_hostname() + url_for('.get_train_info_schema')
validate(data, schema=TRAIN_INFO_SCHEMA['v2'])
resp = jsonify(data)
resp.headers['X-Last-Fetched'] = fetch_time.isoformat()
return resp
station_cache = {}
@bp.route('/station/.schema.json')
def get_station_schema():
return jsonify(STATION_SCHEMA['v2'])
@bp.route('/station/<station_name>')
def get_station(station_name: str):
station_name = ro_letters_to_en(station_name.lower().replace(' ', '-'))
def get_data():
from ..scraper.scraper import scrape_station
result = scrape_station(station_name)
db.on_station(result)
return result
if station_name not in station_cache:
station_cache[station_name] = CachedData(get_data, validity=1000 * 30)
data, fetch_time = station_cache[station_name]()
data['$schema'] = get_hostname() + url_for('.get_station_schema')
validate(data, schema=STATION_SCHEMA['v2'])
@filtered_data
def filter(data):
return data
resp = jsonify(filter(data))
resp.headers['X-Last-Fetched'] = fetch_time.isoformat()
return resp
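
Once the server is running, the new v2 endpoints can be exercised like this; host and port are assumptions, not part of this commit.

import requests

BASE = 'http://localhost:5000'

# New station endpoint: arrivals and departures plus a link to the served schema.
station = requests.get(f'{BASE}/v2/station/brasov').json()
print(station['stationName'], len(station['arrivals']), len(station['departures']))
schema = requests.get(f'{BASE}/v2/station/.schema.json').json()

# Train endpoint, now cached per (train_no, use_yesterday) pair.
resp = requests.get(f'{BASE}/v2/train/1538', params={'use_yesterday': 'no'})
print(resp.headers.get('X-Last-Fetched'), resp.json()['$schema'])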
