From 0a7e2b25687bd6f46a94b5ef507dd751d188adac Mon Sep 17 00:00:00 2001
From: Dan Cojocaru
Date: Fri, 27 Aug 2021 15:38:24 +0300
Subject: [PATCH] Added station arr/dep scraper

Added scraper for arrivals and departures at a station
---
 scraper/schemas.py                            |  20 ++
 scraper/scrape_station.py                     |  87 +++++++++
 scraper/scrape_station_schema_v2.json         | 137 ++++++++++++++
 scraper/scrape_train.py                       | 143 +++++++++++++++
 scraper/scrape_train_schema.json              | 134 ++++++++++++++
 ...chema.json => scrape_train_schema_v2.json} |  22 ++-
 scraper/scraper.py                            | 171 +-----------------
 scraper/utils.py                              |  79 ++++++++
 server/Pipfile                                |   1 +
 server/Pipfile.lock                           | 114 +++++-------
 server/server/db.py                           | 126 +++++++++++--
 server/server/flask_utils.py                  |  29 +++
 server/server/server.py                       |  16 +-
 server/server/utils.py                        |  23 +++
 server/server/v2/v2.py                        |  67 ++++++-
 15 files changed, 897 insertions(+), 272 deletions(-)
 create mode 100644 scraper/schemas.py
 create mode 100644 scraper/scrape_station.py
 create mode 100644 scraper/scrape_station_schema_v2.json
 create mode 100644 scraper/scrape_train.py
 create mode 100644 scraper/scrape_train_schema.json
 rename scraper/{trainInfoScrapResultSchema.json => scrape_train_schema_v2.json} (88%)
 create mode 100644 scraper/utils.py
 create mode 100644 server/server/flask_utils.py

diff --git a/scraper/schemas.py b/scraper/schemas.py
new file mode 100644
index 0000000..6509204
--- /dev/null
+++ b/scraper/schemas.py
@@ -0,0 +1,20 @@
+from contextlib import ExitStack as _ExitStack
+
+_es = _ExitStack()
+
+def _load_file(name: str):
+    import json
+    from os.path import join, dirname
+    dir = dirname(__file__)
+
+    return json.load(_es.enter_context(open(join(dir, name))))
+
+TRAIN_INFO_SCHEMA = {
+    'v1': _load_file('scrape_train_schema.json'),
+    'v2': _load_file('scrape_train_schema_v2.json'),
+}
+STATION_SCHEMA = {
+    'v2': _load_file('scrape_station_schema_v2.json'),
+}
+
+_es.close()
diff --git a/scraper/scrape_station.py b/scraper/scrape_station.py
new file mode 100644
index 0000000..83bf7fa
--- /dev/null
+++ b/scraper/scrape_station.py
@@ -0,0 +1,87 @@
+import re
+
+from datetime import datetime, timedelta
+
+import pytz
+import requests
+from bs4 import BeautifulSoup
+
+from .utils import *
+
+# region regex definitions
+
+RO_LETTERS = r'A-Za-zăâîșțĂÂÎȚȘ'
+
+STATION_INFO_REGEX = re.compile(rf'^([{RO_LETTERS} ]+) în ([0-9.]+)$')
+
+STOPPING_TIME_REGEX = re.compile(r'^(necunoscută \(stație terminus\))|(?:([0-9]+) min \((?:începând cu|până la) ([0-9]{1,2}:[0-9]{2})\))$')
+
+# endregion
+
+def scrape(station_name: str):
+    station_name = ro_letters_to_en(station_name)
+    # Start scraping session
+    s = requests.Session()
+
+    r = s.get(build_url(
+        'https://mersultrenurilor.infofer.ro/ro-RO/Statie/{station}',
+        station=station_name.replace(' ', '-'),
+    ))
+
+    soup = BeautifulSoup(r.text, features='html.parser')
+    sform = soup.find(id='form-search')
+    result_data = { elem['name']: elem['value'] for elem in sform('input') }
+
+    r = s.post('https://mersultrenurilor.infofer.ro/ro-RO/Stations/StationsResult', data=result_data)
+    soup = BeautifulSoup(r.text, features='html.parser')
+
+    scraped = {}
+
+    station_info_div, _, departures_div, arrivals_div, *_ = soup('div', recursive=False)
+
+    scraped['stationName'], scraped['date'] = STATION_INFO_REGEX.match(collapse_space(station_info_div.h2.text)).groups()
+    date_d, date_m, date_y = (int(comp) for comp in scraped['date'].split('.'))
+    date = datetime(date_y, date_m, date_d)
+    dt_seq = DateTimeSequencer(date.year, date.month, date.day)
+    tz = 
pytz.timezone('Europe/Bucharest') + + def parse_arrdep_list(elem, end_station_field_name): + def parse_item(elem): + result = {} + + try: + data_div, status_div = elem('div', recursive=False) + except ValueError: + data_div, *_ = elem('div', recursive=False) + status_div = None + data_main_div, data_details_div = data_div('div', recursive=False) + time_div, dest_div, train_div, *_ = data_main_div('div', recursive=False) + operator_div, route_div, stopping_time_div = data_details_div.div('div', recursive=False) + + result['time'] = collapse_space(time_div.div.div('div', recursive=False)[1].text) + st_hr, st_min = (int(comp) for comp in result['time'].split(':')) + result['time'] = tz.localize(dt_seq(st_hr, st_min)).isoformat() + + unknown_st, st, st_opposite_time = STOPPING_TIME_REGEX.match( + collapse_space(stopping_time_div.div('div', recursive=False)[1].text) + ).groups() + if unknown_st: + result['stoppingTime'] = None + elif st: + result['stoppingTime'] = int(st) + + result['train'] = {} + result['train']['rank'] = collapse_space(train_div.div.div('div', recursive=False)[1].span.text) + result['train']['number'] = collapse_space(train_div.div.div('div', recursive=False)[1].a.text) + result['train'][end_station_field_name] = collapse_space(dest_div.div.div('div', recursive=False)[1].text) + result['train']['operator'] = collapse_space(operator_div.div('div', recursive=False)[1].text) + result['train']['route'] = collapse_space(route_div.div('div', recursive=False)[1].text).split(' - ') + + return result + + return [parse_item(elem) for elem in elem.div.ul('li', recursive=False)] + + scraped['departures'] = parse_arrdep_list(departures_div, 'destination') + scraped['arrivals'] = parse_arrdep_list(arrivals_div, 'origin') + + return scraped diff --git a/scraper/scrape_station_schema_v2.json b/scraper/scrape_station_schema_v2.json new file mode 100644 index 0000000..75e36c8 --- /dev/null +++ b/scraper/scrape_station_schema_v2.json @@ -0,0 +1,137 @@ +{ + "$schema": "http://json-schema.org/schema", + "title": "Train Info InfoFer Scrap Station Schema", + "description": "Results of scrapping InfoFer website for station arrival/departure info", + "definitions": { + "arrDepItem": { + "type": "object", + "properties": { + "time": { + "description": "Time of arrival/departure", + "type": "string", + "format": "date-time" + }, + "train": { + "type": "object", + "properties": { + "rank": { + "type": "string", + "examples": [ + "R", + "R-E", + "IR", + "IRN" + ] + }, + "number": { + "type": "string", + "examples": [ + "74", + "15934" + ] + }, + "operator": { + "type": "string", + "examples": [ + "CFR Călători", + "Softrans", + "Regio Călători" + ] + }, + "route": { + "description": "All the stations the train stops at", + "type": "array", + "items": { + "type": "string" + } + } + }, + "required": [ + "rank", + "number", + "operator" + ] + }, + "stoppingTime": { + "type": [ + "integer", + "null" + ], + "minimum": 1 + } + }, + "required": [ + "time", + "train", + "stoppingTime" + ] + } + }, + "type": "object", + "properties": { + "arrivals": { + "type": "array", + "items": { + "allOf": [ + { + "$ref": "#/definitions/arrDepItem" + }, + { + "type": "object", + "properties": { + "train": { + "type": "object", + "properties": { + "origin": { + "type": "string" + } + }, + "required": ["origin"] + } + }, + "required": ["train"] + } + ] + } + }, + "departures": { + "type": "array", + "items": { + "allOf": [ + { + "$ref": "#/definitions/arrDepItem" + }, + { + "type": "object", + "properties": { + "train": { + 
"type": "object", + "properties": { + "destination": { + "type": "string" + } + }, + "required": ["destination"] + } + }, + "required": ["train"] + } + ] + } + }, + "stationName": { + "type": "string" + }, + "date": { + "description": "Date for which the data is provided (likely today)", + "type": "string", + "pattern": "^[0-9]{1,2}\\.[0-9]{2}\\.[0-9]{4}$" + } + }, + "required": [ + "arrivals", + "departures", + "stationName", + "date" + ] +} \ No newline at end of file diff --git a/scraper/scrape_train.py b/scraper/scrape_train.py new file mode 100644 index 0000000..df9c7ff --- /dev/null +++ b/scraper/scrape_train.py @@ -0,0 +1,143 @@ +import re + +from datetime import datetime, timedelta + +import pytz +import requests +from bs4 import BeautifulSoup + +from .utils import * + +# region regex definitions + +TRAIN_INFO_REGEX = re.compile(r'^([A-Z-]+) ([0-9]+) în ([0-9.]+)$') + +OPERATOR_REGEX = re.compile(r'^Operat de (.+)$') + +SL_REGEX = re.compile(r'^(?:Fără|([0-9]+) min) (întârziere|mai devreme) la (trecerea fără oprire prin|sosirea în|plecarea din) (.+)\.$') +SL_STATE_MAP = { + 't': 'passing', + 's': 'arrival', + 'p': 'departure', +} + +RO_LETTERS = r'A-Za-zăâîșțĂÂÎȚȘ' + +ROUTE_REGEX = re.compile(rf'^Parcurs tren ([{RO_LETTERS} ]+)[-–]([{RO_LETTERS} ]+)$') + +KM_REGEX = re.compile(r'^km ([0-9]+)$') + +PLATFORM_REGEX = re.compile(r'^linia (.+)$') + +STOPPING_TIME_REGEX = re.compile(r'^([0-9]+) min oprire$') + +STATION_DEPARR_STATUS_REGEX = re.compile(r'^(?:(la timp)|(?:((?:\+|-)[0-9]+) min \((?:(?:întârziere)|(?:mai devreme))\)))(\*?)$') + +# endregion + +def scrape(train_no: int, use_yesterday=False, date_override=None): + # Start scrapping session + s = requests.Session() + + date = datetime.today() + if use_yesterday: + date -= timedelta(days=1) + if date_override: + date = date_override + + r = s.get(build_url( + 'https://mersultrenurilor.infofer.ro/ro-RO/Tren/{train_no}', + train_no=train_no, + query=[ + ('Date', date.strftime('%d.%m.%Y')), + ], + )) + + soup = BeautifulSoup(r.text, features='html.parser') + sform = soup.find(id='form-search') + result_data = { elem['name']: elem['value'] for elem in sform('input') } + + r = s.post('https://mersultrenurilor.infofer.ro/ro-RO/Trains/TrainsResult', data=result_data) + soup = BeautifulSoup(r.text, features='html.parser') + + scraped = {} + + train_info_div, _, _, results_div, *_ = soup('div', recursive=False) + + train_info_div = train_info_div.div('div', recursive=False)[0] + + scraped['rank'], scraped['number'], scraped['date'] = TRAIN_INFO_REGEX.match(collapse_space(train_info_div.h2.text)).groups() + date_d, date_m, date_y = (int(comp) for comp in scraped['date'].split('.')) + date = datetime(date_y, date_m, date_d) + + scraped['operator'] = OPERATOR_REGEX.match(collapse_space(train_info_div.p.text)).groups()[0] + + results_div = results_div.div + status_div = results_div('div', recursive=False)[0] + route_text = collapse_space(status_div.h4.text) + route_from, route_to = ROUTE_REGEX.match(route_text).groups() + scraped['route'] = { + 'from': route_from, + 'to': route_to, + } + try: + status_line_match = SL_REGEX.match(collapse_space(status_div.div.text)) + slm_delay, slm_late, slm_arrival, slm_station = status_line_match.groups() + scraped['status'] = { + 'delay': (int(slm_delay) if slm_late == 'întârziere' else -int(slm_delay)) if slm_delay else 0, + 'station': slm_station, + 'state': SL_STATE_MAP[slm_arrival[0]], + } + except Exception: + scraped['status'] = None + + stations = status_div.ul('li', recursive=False) + 
scraped['stations'] = [] + dt_seq = DateTimeSequencer(date.year, date.month, date.day) + tz = pytz.timezone('Europe/Bucharest') + for station in stations: + station_scraped = {} + + left, middle, right = station.div('div', recursive=False) + station_scraped['name'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[0].text) + station_scraped['km'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[1].text) + station_scraped['km'] = int(KM_REGEX.match(station_scraped['km']).groups()[0]) + station_scraped['stoppingTime'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[2].text) + if not station_scraped['stoppingTime']: + station_scraped['stoppingTime'] = None + else: + station_scraped['stoppingTime'] = int(STOPPING_TIME_REGEX.match(station_scraped['stoppingTime']).groups()[0]) + station_scraped['platform'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[3].text) + if not station_scraped['platform']: + station_scraped['platform'] = None + else: + station_scraped['platform'] = PLATFORM_REGEX.match(station_scraped['platform']).groups()[0] + + def scrape_time(elem, setter): + parts = elem.div.div('div', recursive=False) + if parts: + result = {} + + time, *_ = parts + result['scheduleTime'] = collapse_space(time.text) + st_hr, st_min = (int(comp) for comp in result['scheduleTime'].split(':')) + result['scheduleTime'] = tz.localize(dt_seq(st_hr, st_min)).isoformat() + if len(parts) >= 2: + _, status, *_ = parts + result['status'] = {} + on_time, delay, approx = STATION_DEPARR_STATUS_REGEX.match(collapse_space(status.text)).groups() + result['status']['delay'] = 0 if on_time else int(delay) + result['status']['real'] = not approx + else: + result['status'] = None + + setter(result) + else: + setter(None) + + scrape_time(left, lambda value: station_scraped.update(arrival=value)) + scrape_time(right, lambda value: station_scraped.update(departure=value)) + + scraped['stations'].append(station_scraped) + + return scraped diff --git a/scraper/scrape_train_schema.json b/scraper/scrape_train_schema.json new file mode 100644 index 0000000..541a657 --- /dev/null +++ b/scraper/scrape_train_schema.json @@ -0,0 +1,134 @@ +{ + "$schema": "http://json-schema.org/schema", + "title": "Train Info InfoFer Scrap Train Schema", + "description": "Results of scrapping InfoFer website for train info", + "definitions": { + "delayType": { + "description": "Delay of the train (negative for being early)", + "type": "integer" + }, + "stationArrDepTime": { + "description": "Time of arrival at/departure from station", + "type": ["object", "null"], + "properties": { + "scheduleTime": { + "description": "The time the train is scheduled to arrive/depart", + "type": "string", + "pattern": "^[0-9]{1,2}:[0-9]{2}$" + }, + "status": { + "type": ["object", "null"], + "properties": { + "delay": { + "$ref": "#/definitions/delayType" + }, + "real": { + "description": "Determines whether delay was actually reported or is an approximation", + "type": "boolean" + } + }, + "required": ["delay", "real"] + } + }, + "required": ["scheduleTime"] + } + }, + "type": "object", + "properties": { + "rank": { + "description": "The rank of the train", + "type": "string", + "examples": [ + "R", + "R-E", + "IR", + "IRN" + ] + }, + "number": { + "description": "The number of the train", + "type": "string", + "examples": [ + "74", + "15934" + ] + }, + "date": { + "description": "Date of departure from the first station 
(dd.mm.yyyy)", + "type": "string", + "pattern": "^[0-9]{1,2}\\.[0-9]{2}\\.[0-9]{4}$" + }, + "operator": { + "description": "Operator of the train", + "type": "string", + "examples": [ + "CFR Călători", + "Softrans", + "Regio Călători" + ] + }, + "route": { + "description": "Route of the train", + "type": "object", + "properties": { + "from": { + "type": "string" + }, + "to": { + "type": "string" + } + }, + "required": ["from", "to"] + }, + "status": { + "description": "Current status of the train", + "type": ["object", "null"], + "properties": { + "delay": { + "$ref": "#/definitions/delayType" + }, + "station": { + "type": "string" + }, + "state": { + "type": "string", + "enum": ["passing", "arrival", "departure"] + } + }, + "required": ["delay", "station", "state"] + }, + "stations": { + "description": "List of stations the train stops at", + "type": "array", + "items": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "km": { + "description": "The distance the train travelled until reaching this station", + "type": "integer" + }, + "stoppingTime": { + "description": "The number of minutes the train is scheduled to stop in this station", + "type": ["integer", "null"], + "minimum": 1 + }, + "platform": { + "description": "The platform the train stopped at", + "type": ["string", "null"] + }, + "arrival": { + "$ref": "#/definitions/stationArrDepTime" + }, + "departure": { + "$ref": "#/definitions/stationArrDepTime" + } + }, + "required": ["name", "km"] + } + } + }, + "required": ["route", "stations", "rank", "number", "date", "operator"] +} \ No newline at end of file diff --git a/scraper/trainInfoScrapResultSchema.json b/scraper/scrape_train_schema_v2.json similarity index 88% rename from scraper/trainInfoScrapResultSchema.json rename to scraper/scrape_train_schema_v2.json index d25307d..837ea40 100644 --- a/scraper/trainInfoScrapResultSchema.json +++ b/scraper/scrape_train_schema_v2.json @@ -1,6 +1,6 @@ { "$schema": "http://json-schema.org/schema", - "title": "Train Info InfoFer Scrap Result Schema", + "title": "Train Info InfoFer Scrap Train Schema", "description": "Results of scrapping InfoFer website for train info", "definitions": { "delayType": { @@ -13,7 +13,8 @@ "properties": { "scheduleTime": { "description": "The time the train is scheduled to arrive/depart", - "type": "string" + "type": "string", + "format": "date-time" }, "status": { "type": ["object", "null"], @@ -38,23 +39,24 @@ "description": "The rank of the train", "type": "string", "examples": [ - "74", - "15934" + "R", + "R-E", + "IR", + "IRN" ] }, "number": { "description": "The number of the train", "type": "string", "examples": [ - "R", - "R-E", - "IR", - "IRN" + "74", + "15934" ] }, "date": { - "description": "Date of departure from the first station", - "type": "string" + "description": "Date of departure from the first station (dd.mm.yyyy)", + "type": "string", + "pattern": "^[0-9]{1,2}\\.[0-9]{2}\\.[0-9]{4}$" }, "operator": { "description": "Operator of the train", diff --git a/scraper/scraper.py b/scraper/scraper.py index 9545b31..8a594d9 100644 --- a/scraper/scraper.py +++ b/scraper/scraper.py @@ -1,177 +1,12 @@ #! 
/usr/bin/env python3 - -from datetime import datetime, timedelta -import re - -import pytz -import requests -from bs4 import BeautifulSoup -from urllib.parse import quote, urlencode - -TRAIN_INFO_REGEX = re.compile(r'^([A-Z-]+) ([0-9]+) în ([0-9.]+)$') - -OPERATOR_REGEX = re.compile(r'^Operat de (.+)$') - -SL_REGEX = re.compile(r'^(?:Fără|([0-9]+) min) (întârziere|mai devreme) la (trecerea fără oprire prin|sosirea în|plecarea din) (.+)\.$') -SL_STATE_MAP = { - 't': 'passing', - 's': 'arrival', - 'p': 'departure', -} - -RO_LETTERS = r'A-Za-zăâîșțĂÂÎȚȘ' - -ROUTE_REGEX = re.compile(rf'^Parcurs tren ([{RO_LETTERS} ]+)[-–]([{RO_LETTERS} ]+)$') - -KM_REGEX = re.compile(r'^km ([0-9]+)$') - -PLATFORM_REGEX = re.compile(r'^linia (.+)$') - -STOPPING_TIME_REGEX = re.compile(r'^([0-9]+) min oprire$') - -STATION_DEPARR_STATUS_REGEX = re.compile(r'^(?:(la timp)|(?:((?:\+|-)[0-9]+) min \((?:(?:întârziere)|(?:mai devreme))\)))(\*?)$') - -class DateTimeSequencer: - def __init__(self, year: int, month: int, day: int) -> None: - self.current = datetime(year, month, day, 0, 0, 0) - self.current -= timedelta(seconds=1) - - def __call__(self, hour: int, minute: int = 0, second: int = 0) -> datetime: - potential_new_date = datetime(self.current.year, self.current.month, self.current.day, hour, minute, second) - if (self.current > potential_new_date): - potential_new_date += timedelta(days=1) - self.current = potential_new_date - return self.current - -def collapse_space(string: str) -> str: - return re.sub( - rf'[{BeautifulSoup.ASCII_SPACES}]+', - ' ', - string, - flags=re.MULTILINE - ).strip() - -def build_url(base: str, /, query: dict, **kwargs): - result = base.format(**{ k: quote(str(v)) for k, v in kwargs.items() }) - if query: - result += '?' - result += urlencode(query) - return result - -def scrape(train_no: int, use_yesterday=False, date_override=None): - # Start scrapping session - s = requests.Session() - - date = datetime.today() - if use_yesterday: - date -= timedelta(days=1) - if date_override: - date = date_override - - r = s.get(build_url( - 'https://mersultrenurilor.infofer.ro/ro-RO/Tren/{train_no}', - train_no=train_no, - query=[ - ('Date', date.strftime('%d.%m.%Y')), - ], - )) - - soup = BeautifulSoup(r.text, features='html.parser') - sform = soup.find(id='form-search') - result_data = { elem['name']: elem['value'] for elem in sform('input') } - - r = s.post('https://mersultrenurilor.infofer.ro/ro-RO/Trains/TrainsResult', data=result_data) - soup = BeautifulSoup(r.text, features='html.parser') - - scraped = {} - - train_info_div, _, _, results_div, *_ = soup('div', recursive=False) - - train_info_div = train_info_div.div('div', recursive=False)[0] - - scraped['rank'], scraped['number'], scraped['date'] = TRAIN_INFO_REGEX.match(collapse_space(train_info_div.h2.text)).groups() - date_d, date_m, date_y = (int(comp) for comp in scraped['date'].split('.')) - date = datetime(date_y, date_m, date_d) - - scraped['operator'] = OPERATOR_REGEX.match(collapse_space(train_info_div.p.text)).groups()[0] - - results_div = results_div.div - status_div = results_div('div', recursive=False)[0] - route_text = collapse_space(status_div.h4.text) - route_from, route_to = ROUTE_REGEX.match(route_text).groups() - scraped['route'] = { - 'from': route_from, - 'to': route_to, - } - try: - status_line_match = SL_REGEX.match(collapse_space(status_div.div.text)) - slm_delay, slm_late, slm_arrival, slm_station = status_line_match.groups() - scraped['status'] = { - 'delay': (int(slm_delay) if slm_late == 'întârziere' else 
-int(slm_delay)) if slm_delay else 0, - 'station': slm_station, - 'state': SL_STATE_MAP[slm_arrival[0]], - } - except Exception: - scraped['status'] = None - - stations = status_div.ul('li', recursive=False) - scraped['stations'] = [] - dt_seq = DateTimeSequencer(date.year, date.month, date.day) - tz = pytz.timezone('Europe/Bucharest') - for station in stations: - station_scraped = {} - - left, middle, right = station.div('div', recursive=False) - station_scraped['name'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[0].text) - station_scraped['km'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[1].text) - station_scraped['km'] = int(KM_REGEX.match(station_scraped['km']).groups()[0]) - station_scraped['stoppingTime'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[2].text) - if not station_scraped['stoppingTime']: - station_scraped['stoppingTime'] = None - else: - station_scraped['stoppingTime'] = int(STOPPING_TIME_REGEX.match(station_scraped['stoppingTime']).groups()[0]) - station_scraped['platform'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[3].text) - if not station_scraped['platform']: - station_scraped['platform'] = None - else: - station_scraped['platform'] = PLATFORM_REGEX.match(station_scraped['platform']).groups()[0] - - def scrape_time(elem, setter): - parts = elem.div.div('div', recursive=False) - if parts: - result = {} - - time, *_ = parts - result['scheduleTime'] = collapse_space(time.text) - st_hr, st_min = (int(comp) for comp in result['scheduleTime'].split(':')) - result['scheduleTime'] = tz.localize(dt_seq(st_hr, st_min)).isoformat() - if len(parts) >= 2: - _, status, *_ = parts - result['status'] = {} - on_time, delay, approx = STATION_DEPARR_STATUS_REGEX.match(collapse_space(status.text)).groups() - result['status']['delay'] = 0 if on_time else int(delay) - result['status']['real'] = not approx - else: - result['status'] = None - - setter(result) - else: - setter(None) - - scrape_time(left, lambda value: station_scraped.update(arrival=value)) - scrape_time(right, lambda value: station_scraped.update(departure=value)) - - scraped['stations'].append(station_scraped) - - return scraped - +from .scrape_train import scrape as scrape_train +from .scrape_station import scrape as scrape_station def main(): train_no = 1538 print(f'Testing package with train number {train_no}') from pprint import pprint - # pprint(scrape('473')) - pprint(scrape(train_no)) + pprint(scrape_train(train_no)) if __name__ == '__main__': main() diff --git a/scraper/utils.py b/scraper/utils.py new file mode 100644 index 0000000..ad314be --- /dev/null +++ b/scraper/utils.py @@ -0,0 +1,79 @@ +import re + +from datetime import datetime, timedelta +from urllib.parse import urlencode, quote + +# From: https://en.wikipedia.org/wiki/Whitespace_character#Unicode +ASCII_WHITESPACE = [ + '\u0009', # HT; Character Tabulation + '\u000a', # LF + '\u000b', # VT; Line Tabulation + '\u000c', # FF; Form Feed + '\u000d', # CR + '\u0020', # Space +] + +WHITESPACE = ASCII_WHITESPACE + [ + '\u0085', # NEL; Next Line + '\u00a0', # No-break Space;   + '\u1680', # Ogham Space Mark + '\u2000', # En Quad + '\u2001', # Em Quad + '\u2002', # En Space + '\u2003', # Em Space + '\u2004', # Three-per-em Space + '\u2005', # Four-per-em Space + '\u2006', # Six-per-em Space + '\u2007', # Figure Space + '\u2008', # Punctuation Space + '\u2009', # Thin Space + '\u200A', # Hair Space + '\u2028', # 
Line Separator + '\u2029', # Paragraph Separator + '\u202f', # Narrow No-break Space + '\u205d', # Meduam Mathematical Space + '\u3000', # Ideographic Space +] + +WHITESPACE_REGEX = re.compile(rf'[{"".join(WHITESPACE)}]+', flags=re.MULTILINE) + +class DateTimeSequencer: + def __init__(self, year: int, month: int, day: int) -> None: + self.current = datetime(year, month, day, 0, 0, 0) + self.current -= timedelta(seconds=1) + + def __call__(self, hour: int, minute: int = 0, second: int = 0) -> datetime: + potential_new_date = datetime(self.current.year, self.current.month, self.current.day, hour, minute, second) + if (self.current > potential_new_date): + potential_new_date += timedelta(days=1) + self.current = potential_new_date + return self.current + +def collapse_space(string: str) -> str: + return WHITESPACE_REGEX.sub( + ' ', + string, + ).strip() + +def build_url(base: str, /, query: dict = {}, **kwargs): + result = base.format(**{ k: quote(str(v)) for k, v in kwargs.items() }) + if query: + result += '?' + result += urlencode(query) + return result + +RO_TO_EN = { + 'ă': 'a', + 'Ă': 'A', + 'â': 'a', + 'Â': 'A', + 'î': 'i', + 'Î': 'I', + 'ș': 's', + 'Ș': 'S', + 'ț': 't', + 'Ț': 'T', +} + +def ro_letters_to_en(string: str) -> str: + return ''.join((RO_TO_EN.get(letter, letter) for letter in string)) diff --git a/server/Pipfile b/server/Pipfile index 48a585d..3bcaa3b 100644 --- a/server/Pipfile +++ b/server/Pipfile @@ -7,6 +7,7 @@ name = "pypi" flask = "*" gevent = "*" scraper = { editable = true, path = '../scraper' } +jsonschema = "*" [dev-packages] diff --git a/server/Pipfile.lock b/server/Pipfile.lock index b1bfb9f..7de0a9d 100644 --- a/server/Pipfile.lock +++ b/server/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "9d422680ab15ce184b043276f5d0d2cac228ff60dfc66ec193b6314bdc0f6ce2" + "sha256": "3c7f09679bdd68674754a714ee39503cf1a3ae265400eea074fec83559246dff" }, "pipfile-spec": 6, "requires": { @@ -16,6 +16,14 @@ ] }, "default": { + "attrs": { + "hashes": [ + "sha256:149e90d6d8ac20db7a955ad60cf0e6881a3f20d37096140088356da6c716b0b1", + "sha256:ef6aaac3ca6cd92904cdd0d83f629a15f18053ec84e6432106f7a4d04ae4f5fb" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==21.2.0" + }, "beautifulsoup4": { "hashes": [ "sha256:4c98143716ef1cb40bf7f39a8e3eec8f8b009509e74904ba3a7b315431577e35", @@ -31,57 +39,6 @@ ], "version": "==2021.5.30" }, - "cffi": { - "hashes": [ - "sha256:06c54a68935738d206570b20da5ef2b6b6d92b38ef3ec45c5422c0ebaf338d4d", - "sha256:0c0591bee64e438883b0c92a7bed78f6290d40bf02e54c5bf0978eaf36061771", - "sha256:19ca0dbdeda3b2615421d54bef8985f72af6e0c47082a8d26122adac81a95872", - "sha256:22b9c3c320171c108e903d61a3723b51e37aaa8c81255b5e7ce102775bd01e2c", - "sha256:26bb2549b72708c833f5abe62b756176022a7b9a7f689b571e74c8478ead51dc", - "sha256:33791e8a2dc2953f28b8d8d300dde42dd929ac28f974c4b4c6272cb2955cb762", - "sha256:3c8d896becff2fa653dc4438b54a5a25a971d1f4110b32bd3068db3722c80202", - "sha256:4373612d59c404baeb7cbd788a18b2b2a8331abcc84c3ba40051fcd18b17a4d5", - "sha256:487d63e1454627c8e47dd230025780e91869cfba4c753a74fda196a1f6ad6548", - "sha256:48916e459c54c4a70e52745639f1db524542140433599e13911b2f329834276a", - "sha256:4922cd707b25e623b902c86188aca466d3620892db76c0bdd7b99a3d5e61d35f", - "sha256:55af55e32ae468e9946f741a5d51f9896da6b9bf0bbdd326843fec05c730eb20", - "sha256:57e555a9feb4a8460415f1aac331a2dc833b1115284f7ded7278b54afc5bd218", - "sha256:5d4b68e216fc65e9fe4f524c177b54964af043dde734807586cf5435af84045c", 
- "sha256:64fda793737bc4037521d4899be780534b9aea552eb673b9833b01f945904c2e", - "sha256:6d6169cb3c6c2ad50db5b868db6491a790300ade1ed5d1da29289d73bbe40b56", - "sha256:7bcac9a2b4fdbed2c16fa5681356d7121ecabf041f18d97ed5b8e0dd38a80224", - "sha256:80b06212075346b5546b0417b9f2bf467fea3bfe7352f781ffc05a8ab24ba14a", - "sha256:818014c754cd3dba7229c0f5884396264d51ffb87ec86e927ef0be140bfdb0d2", - "sha256:8eb687582ed7cd8c4bdbff3df6c0da443eb89c3c72e6e5dcdd9c81729712791a", - "sha256:99f27fefe34c37ba9875f224a8f36e31d744d8083e00f520f133cab79ad5e819", - "sha256:9f3e33c28cd39d1b655ed1ba7247133b6f7fc16fa16887b120c0c670e35ce346", - "sha256:a8661b2ce9694ca01c529bfa204dbb144b275a31685a075ce123f12331be790b", - "sha256:a9da7010cec5a12193d1af9872a00888f396aba3dc79186604a09ea3ee7c029e", - "sha256:aedb15f0a5a5949ecb129a82b72b19df97bbbca024081ed2ef88bd5c0a610534", - "sha256:b315d709717a99f4b27b59b021e6207c64620790ca3e0bde636a6c7f14618abb", - "sha256:ba6f2b3f452e150945d58f4badd92310449876c4c954836cfb1803bdd7b422f0", - "sha256:c33d18eb6e6bc36f09d793c0dc58b0211fccc6ae5149b808da4a62660678b156", - "sha256:c9a875ce9d7fe32887784274dd533c57909b7b1dcadcc128a2ac21331a9765dd", - "sha256:c9e005e9bd57bc987764c32a1bee4364c44fdc11a3cc20a40b93b444984f2b87", - "sha256:d2ad4d668a5c0645d281dcd17aff2be3212bc109b33814bbb15c4939f44181cc", - "sha256:d950695ae4381ecd856bcaf2b1e866720e4ab9a1498cba61c602e56630ca7195", - "sha256:e22dcb48709fc51a7b58a927391b23ab37eb3737a98ac4338e2448bef8559b33", - "sha256:e8c6a99be100371dbb046880e7a282152aa5d6127ae01783e37662ef73850d8f", - "sha256:e9dc245e3ac69c92ee4c167fbdd7428ec1956d4e754223124991ef29eb57a09d", - "sha256:eb687a11f0a7a1839719edd80f41e459cc5366857ecbed383ff376c4e3cc6afd", - "sha256:eb9e2a346c5238a30a746893f23a9535e700f8192a68c07c0258e7ece6ff3728", - "sha256:ed38b924ce794e505647f7c331b22a693bee1538fdf46b0222c4717b42f744e7", - "sha256:f0010c6f9d1a4011e429109fda55a225921e3206e7f62a0c22a35344bfd13cca", - "sha256:f0c5d1acbfca6ebdd6b1e3eded8d261affb6ddcf2186205518f1428b8569bb99", - "sha256:f10afb1004f102c7868ebfe91c28f4a712227fe4cb24974350ace1f90e1febbf", - "sha256:f174135f5609428cc6e1b9090f9268f5c8935fddb1b25ccb8255a2d50de6789e", - "sha256:f3ebe6e73c319340830a9b2825d32eb6d8475c1dac020b4f0aa774ee3b898d1c", - "sha256:f627688813d0a4140153ff532537fbe4afea5a3dffce1f9deb7f91f848a832b5", - "sha256:fd4305f86f53dfd8cd3522269ed7fc34856a8ee3709a5e28b2836b2db9d4cd69" - ], - "markers": "platform_python_implementation == 'CPython' and sys_platform == 'win32'", - "version": "==1.14.6" - }, "charset-normalizer": { "hashes": [ "sha256:0c8911edd15d19223366a194a513099a302055a962bca2cec0f54b8b63175d8b", @@ -98,14 +55,6 @@ "markers": "python_version >= '3.6'", "version": "==8.0.1" }, - "colorama": { - "hashes": [ - "sha256:5941b2b48a20143d2267e95b1c2a7603ce057ee39fd88e7329b0c292aa16869b", - "sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2" - ], - "markers": "platform_system == 'Windows'", - "version": "==0.4.4" - }, "flask": { "hashes": [ "sha256:1c4c257b1892aec1398784c63791cbaa43062f1f7aeb555c4da961b20ee68f55", @@ -230,6 +179,14 @@ "markers": "python_version >= '3.6'", "version": "==3.0.1" }, + "jsonschema": { + "hashes": [ + "sha256:4e5b3cf8216f577bee9ce139cbe72eca3ea4f292ec60928ff24758ce626cd163", + "sha256:c8a85b28d377cc7737e46e2d9f2b4f44ee3c0e1deac6bf46ddefc7187d30797a" + ], + "index": "pypi", + "version": "==3.2.0" + }, "markupsafe": { "hashes": [ "sha256:01a9b8ea66f1658938f65b93a85ebe8bc016e6769611be228d797c9d998dd298", @@ -290,13 +247,32 @@ "markers": "python_version >= '3.6'", "version": 
"==2.0.1" }, - "pycparser": { + "pyrsistent": { "hashes": [ - "sha256:2d475327684562c3a96cc71adf7dc8c4f0565175cf86b6d7a404ff4c771f15f0", - "sha256:7582ad22678f0fcd81102833f60ef8d0e57288b6b5fb00323d101be910e35705" + "sha256:097b96f129dd36a8c9e33594e7ebb151b1515eb52cceb08474c10a5479e799f2", + "sha256:2aaf19dc8ce517a8653746d98e962ef480ff34b6bc563fc067be6401ffb457c7", + "sha256:404e1f1d254d314d55adb8d87f4f465c8693d6f902f67eb6ef5b4526dc58e6ea", + "sha256:48578680353f41dca1ca3dc48629fb77dfc745128b56fc01096b2530c13fd426", + "sha256:4916c10896721e472ee12c95cdc2891ce5890898d2f9907b1b4ae0f53588b710", + "sha256:527be2bfa8dc80f6f8ddd65242ba476a6c4fb4e3aedbf281dfbac1b1ed4165b1", + "sha256:58a70d93fb79dc585b21f9d72487b929a6fe58da0754fa4cb9f279bb92369396", + "sha256:5e4395bbf841693eaebaa5bb5c8f5cdbb1d139e07c975c682ec4e4f8126e03d2", + "sha256:6b5eed00e597b5b5773b4ca30bd48a5774ef1e96f2a45d105db5b4ebb4bca680", + "sha256:73ff61b1411e3fb0ba144b8f08d6749749775fe89688093e1efef9839d2dcc35", + "sha256:772e94c2c6864f2cd2ffbe58bb3bdefbe2a32afa0acb1a77e472aac831f83427", + "sha256:773c781216f8c2900b42a7b638d5b517bb134ae1acbebe4d1e8f1f41ea60eb4b", + "sha256:a0c772d791c38bbc77be659af29bb14c38ced151433592e326361610250c605b", + "sha256:b29b869cf58412ca5738d23691e96d8aff535e17390128a1a52717c9a109da4f", + "sha256:c1a9ff320fa699337e05edcaae79ef8c2880b52720bc031b219e5b5008ebbdef", + "sha256:cd3caef37a415fd0dae6148a1b6957a8c5f275a62cca02e18474608cb263640c", + "sha256:d5ec194c9c573aafaceebf05fc400656722793dac57f254cd4741f3c27ae57b4", + "sha256:da6e5e818d18459fa46fac0a4a4e543507fe1110e808101277c5a2b5bab0cd2d", + "sha256:e79d94ca58fcafef6395f6352383fa1a76922268fa02caa2272fff501c2fdc78", + "sha256:f3ef98d7b76da5eb19c37fda834d50262ff9167c65658d1d8f974d2e4d90676b", + "sha256:f4c8cabb46ff8e5d61f56a037974228e978f26bfefce4f61a4b1ac0ba7a2ab72" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", - "version": "==2.20" + "markers": "python_version >= '3.6'", + "version": "==0.18.0" }, "pytz": { "hashes": [ @@ -317,6 +293,14 @@ "editable": true, "path": "../scraper" }, + "six": { + "hashes": [ + "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926", + "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==1.16.0" + }, "soupsieve": { "hashes": [ "sha256:052774848f448cf19c7e959adf5566904d525f33a3f8b6ba6f6f8f26ec7de0cc", diff --git a/server/server/db.py b/server/server/db.py index aa30109..05ec249 100644 --- a/server/server/db.py +++ b/server/server/db.py @@ -1,6 +1,9 @@ # Globals stations = [] trains = [] +db_data = { + 'version': 2, +} # Examples example_station = { @@ -20,38 +23,100 @@ example_train = { import json import os from os import path, stat +from contextlib import contextmanager + from .utils import take_while DB_DIR = os.environ.get('DB_DIR', '') or './db' if not path.exists(DB_DIR): os.mkdir(DB_DIR) +DB_FILE = path.join(DB_DIR, 'db.json') + STATIONS_FILE = path.join(DB_DIR, 'stations.json') +TRAINS_FILE = path.join(DB_DIR, 'trains.json') + +def migration(): + global db_data + global trains + global stations + if not path.exists(DB_FILE): + print('[Migration] Migrating DB version 1 -> 2') + if path.exists(STATIONS_FILE): + with open(STATIONS_FILE) as f: + stations = json.load(f) + for i in range(len(stations)): + stations[i]['stoppedAtBy'] = [str(num) for num in stations[i]['stoppedAtBy']] + with open(STATIONS_FILE, 'w') as f: + 
json.dump(stations, f) + if path.exists(TRAINS_FILE): + with open(TRAINS_FILE) as f: + trains = json.load(f) + for i in range(len(trains)): + trains[i]['number'] = trains[i]['numberString'] + del trains[i]['numberString'] + with open(TRAINS_FILE, 'w') as f: + json.dump(trains, f) + db_data = { + 'version': 2, + } + with open(DB_FILE, 'w') as f: + json.dump(db_data, f) + migration() + else: + with open(DB_FILE) as f: + db_data = json.load(f) + if db_data['version'] == 2: + print('[Migration] DB Version: 2, noop') + +migration() + +if path.exists(DB_FILE): + with open(DB_FILE) as f: + db_data = json.load(f) +else: + with open(DB_FILE, 'w') as f: + json.dump(db_data, f) + if path.exists(STATIONS_FILE): with open(STATIONS_FILE) as f: stations = json.load(f) -TRAINS_FILE = path.join(DB_DIR, 'trains.json') - if path.exists(TRAINS_FILE): with open(TRAINS_FILE) as f: trains = json.load(f) +_should_commit_on_every_change = True + +@contextmanager +def db_transaction(): + global _should_commit_on_every_change + _should_commit_on_every_change = False + yield + with open(DB_FILE, 'w') as f: + json.dump(db_data, f) + with open(STATIONS_FILE, 'w') as f: + stations.sort(key=lambda s: len(s['stoppedAtBy']), reverse=True) + json.dump(stations, f) + with open(TRAINS_FILE, 'w') as f: + json.dump(trains, f) + _should_commit_on_every_change = True + def found_train(rank: str, number: str, company: str) -> int: - number_int = int(''.join(take_while(lambda s: str(s).isnumeric(), number))) + number = ''.join(take_while(lambda s: str(s).isnumeric(), number)) try: - next(filter(lambda tr: tr['number'] == number_int, trains)) + next(filter(lambda tr: tr['number'] == number, trains)) except StopIteration: trains.append({ - 'number': number_int, - 'numberString': number, + 'number': number, 'company': company, 'rank': rank, }) - with open(TRAINS_FILE, 'w') as f: - json.dump(trains, f) - return number_int + if _should_commit_on_every_change: + with open(TRAINS_FILE, 'w') as f: + json.dump(trains, f) + return number def found_station(name: str): try: @@ -61,25 +126,46 @@ def found_station(name: str): 'name': name, 'stoppedAtBy': [], }) - stations.sort(key=lambda s: len(s['stoppedAtBy']), reverse=True) - with open(STATIONS_FILE, 'w') as f: - json.dump(stations, f) + if _should_commit_on_every_change: + stations.sort(key=lambda s: len(s['stoppedAtBy']), reverse=True) + with open(STATIONS_FILE, 'w') as f: + json.dump(stations, f) -def found_train_at_station(station_name: str, train_number: int): +def found_train_at_station(station_name: str, train_number: str): + train_number = ''.join(take_while(lambda s: str(s).isnumeric(), train_number)) found_station(station_name) for i in range(len(stations)): if stations[i]['name'] == station_name: if train_number not in stations[i]['stoppedAtBy']: stations[i]['stoppedAtBy'].append(train_number) - stations.sort(key=lambda s: len(s['stoppedAtBy']), reverse=True) - with open(STATIONS_FILE, 'w') as f: - json.dump(stations, f) break + if _should_commit_on_every_change: + stations.sort(key=lambda s: len(s['stoppedAtBy']), reverse=True) + with open(STATIONS_FILE, 'w') as f: + json.dump(stations, f) def on_train_data(train_data: dict): - train_no = found_train(train_data['rank'], train_data['number'], train_data['operator']) - for station in train_data['stations']: - found_train_at_station(station['name'], train_no) + with db_transaction(): + train_no = found_train(train_data['rank'], train_data['number'], train_data['operator']) + for station in train_data['stations']: + 
found_train_at_station(station['name'], train_no) -def on_train_lookup_failure(train_no: int): +def on_train_lookup_failure(train_no: str): pass + +def on_station(station_data: dict): + station_name = station_data['stationName'] + + def process_train(train_data: dict): + train_number = train_data['train']['number'] + train_number = found_train(train_data['train']['rank'], train_number, train_data['train']['operator']) + found_train_at_station(station_name, train_number) + if 'route' in train_data['train'] and train_data['train']['route']: + for station in train_data['train']['route']: + found_train_at_station(station, train_number) + + with db_transaction(): + for train in station_data['arrivals']: + process_train(train) + for train in station_data['departures']: + process_train(train) diff --git a/server/server/flask_utils.py b/server/server/flask_utils.py new file mode 100644 index 0000000..c43f00c --- /dev/null +++ b/server/server/flask_utils.py @@ -0,0 +1,29 @@ +from flask import request as _f_request + +from .utils import filter_result as _filter_result + +def filtered_data(fn): + def filterer(*args, **kwargs): + filters = _f_request.args.get('filters', None) + if filters: + filters_raw = [f.split(':', 1) for f in filters.split(',')] + filters = {'.': []} + for key, value in filters_raw: + def add_to(obj, key, value): + if '.' in key: + prop, key = key.split('.', 1) + if prop not in filters: + obj[prop] = {'.': []} + add_to(obj[prop], key, value) + else: + obj['.'].append({key: value}) + add_to(filters, key, value) + properties = _f_request.args.get('properties', None) + if properties: + properties = properties.split(',') + + data = fn(*args, **kwargs) + + return _filter_result(data, properties, filters) + + return filterer diff --git a/server/server/server.py b/server/server/server.py index aaf5a46..3380fcb 100644 --- a/server/server/server.py +++ b/server/server/server.py @@ -1,9 +1,13 @@ print(f'Server {__name__=}') import datetime -from flask import Flask, json, request, jsonify + +from flask import Flask, jsonify, url_for +from jsonschema import validate from .cache import CachedData +from .scraper.schemas import TRAIN_INFO_SCHEMA +from .utils import get_hostname app = Flask(__name__) @@ -14,14 +18,18 @@ app.register_blueprint(v2.bp) def root(): return 'Test' +@app.route('/train/.schema.json') +def get_train_info_schema(): + return jsonify(TRAIN_INFO_SCHEMA['v1']) + train_data_cache = {} @app.route('/train/') def get_train_info(train_no: int): def get_data(): - from .scraper.scraper import scrape + from .scraper.scraper import scrape_train use_yesterday = False - result = scrape(train_no, use_yesterday=use_yesterday) + result = scrape_train(train_no, use_yesterday=use_yesterday) from . 
import db db.on_train_data(result) @@ -40,6 +48,8 @@ def get_train_info(train_no: int): if train_no not in train_data_cache: train_data_cache[train_no] = CachedData(get_data, validity=1000 * 30) data, fetch_time = train_data_cache[train_no]() + data['$schema'] = get_hostname() + url_for('.get_train_info_schema') + validate(data, schema=TRAIN_INFO_SCHEMA['v1']) resp = jsonify(data) resp.headers['X-Last-Fetched'] = fetch_time.isoformat() return resp diff --git a/server/server/utils.py b/server/server/utils.py index 81fcb30..8ebc85d 100644 --- a/server/server/utils.py +++ b/server/server/utils.py @@ -16,3 +16,26 @@ def check_yes_no(input: str, default=_NO_DEFAULT, considered_yes=None) -> bool: considered_yes = ['y', 'yes', 't', 'true', '1'] return input in considered_yes +def get_hostname(): + import os + import platform + return os.getenv('HOSTNAME', os.getenv('COMPUTERNAME', platform.node())) + +def filter_result(data, properties=None, filters=None): + is_array = not hasattr(data, 'get') + result = data if is_array else [data] + + if filters: + # Todo: implement filters + pass + # def f(lst, filters): + # def condition(item): + + # return list(filter(condition, lst)) + # result = f(result, filters) + + if properties: + for i in range(len(result)): + result[i] = {p:result[i].get(p, None) for p in properties} + + return result if is_array else result[0] diff --git a/server/server/v2/v2.py b/server/server/v2/v2.py index d9a3fd1..2261fa4 100644 --- a/server/server/v2/v2.py +++ b/server/server/v2/v2.py @@ -1,32 +1,87 @@ +import json from flask import Blueprint, jsonify, request +from flask.helpers import url_for +from jsonschema import validate from .. import db from ..cache import CachedData -from ..utils import check_yes_no +from ..utils import check_yes_no, get_hostname +from ..flask_utils import filtered_data +from ..scraper.utils import ro_letters_to_en +from ..scraper.schemas import STATION_SCHEMA, TRAIN_INFO_SCHEMA bp = Blueprint('v2', __name__, url_prefix='/v2') @bp.get('/trains') def get_known_trains(): - return jsonify(db.trains) + @filtered_data + def get_data(): + return db.trains + + result = get_data() + + return jsonify(result) @bp.get('/stations') def get_known_stations(): - return jsonify(db.stations) + @filtered_data + def get_data(): + return db.stations + + result = get_data() + + return jsonify(result) train_data_cache = {} +@bp.route('/train/.schema.json') +def get_train_info_schema(): + return jsonify(TRAIN_INFO_SCHEMA['v2']) + @bp.route('/train/') def get_train_info(train_no: int): use_yesterday = check_yes_no(request.args.get('use_yesterday', ''), default=False) + @filtered_data def get_data(): - from ..scraper.scraper import scrape - result = scrape(train_no, use_yesterday=use_yesterday) + from ..scraper.scraper import scrape_train + result = scrape_train(train_no, use_yesterday=use_yesterday) db.on_train_data(result) return result - if train_no not in train_data_cache: + if (train_no, use_yesterday) not in train_data_cache: train_data_cache[(train_no, use_yesterday)] = CachedData(get_data, validity=1000 * 30) data, fetch_time = train_data_cache[(train_no, use_yesterday)]() + data['$schema'] = get_hostname() + url_for('.get_train_info_schema') + validate(data, schema=TRAIN_INFO_SCHEMA['v2']) resp = jsonify(data) resp.headers['X-Last-Fetched'] = fetch_time.isoformat() return resp + +station_cache = {} + +@bp.route('/station/.schema.json') +def get_station_schema(): + return jsonify(STATION_SCHEMA['v2']) + +@bp.route('/station/') +def get_station(station_name: str): + 
station_name = ro_letters_to_en(station_name.lower().replace(' ', '-'))
+
+    def get_data():
+        from ..scraper.scraper import scrape_station
+        result = scrape_station(station_name)
+        db.on_station(result)
+        return result
+    if station_name not in station_cache:
+        station_cache[station_name] = CachedData(get_data, validity=1000 * 30)
+    data, fetch_time = station_cache[station_name]()
+    data['$schema'] = get_hostname() + url_for('.get_station_schema')
+    validate(data, schema=STATION_SCHEMA['v2'])
+
+    @filtered_data
+    def filter(data):
+        return data
+
+    resp = jsonify(filter(data))
+    resp.headers['X-Last-Fetched'] = fetch_time.isoformat()
+    return resp
+
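
For reviewers who want to exercise the new station scraper outside Flask, a minimal sketch follows. Assumptions: the scraper package added by this patch is installed editable (as the server's Pipfile does) and is importable as `scraper`; the station name is an arbitrary example; the call performs live requests against mersultrenurilor.infofer.ro.

# Minimal usage sketch, assuming the package is importable as `scraper`.
from pprint import pprint

from scraper.scrape_station import scrape as scrape_station

# Example station; scrape() strips diacritics itself via ro_letters_to_en(),
# so 'București Nord' and 'Bucuresti Nord' resolve to the same page.
data = scrape_station('Bucuresti Nord')

# Top-level keys per scrape_station_schema_v2.json:
# stationName, date, arrivals, departures.
print(data['stationName'], data['date'])

# Each departure carries a localised ISO 8601 time plus train details.
for dep in data['departures'][:3]:
    train = dep['train']
    print(dep['time'], train['rank'], train['number'], '->', train['destination'])

pprint(data['arrivals'][:1])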