Dan Cojocaru
3 years ago
15 changed files with 897 additions and 272 deletions
@@ -0,0 +1,20 @@
from contextlib import ExitStack as _ExitStack

_es = _ExitStack()

def _load_file(name: str):
    import json
    from os.path import join, dirname
    dir = dirname(__file__)

    return json.load(_es.enter_context(open(join(dir, name))))

TRAIN_INFO_SCHEMA = {
    'v1': _load_file('scrape_train_schema.json'),
    'v2': _load_file('scrape_train_schema_v2.json'),
}
STATION_SCHEMA = {
    'v2': _load_file('scrape_station_schema_v2.json'),
}

_es.close()
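
Once loaded, the schemas are plain dicts, so they can be handed straight to jsonschema. A minimal usage sketch (the `scraper` package name is an assumption; `jsonschema` must be installed):

    from jsonschema import validate, ValidationError

    from scraper.schemas import TRAIN_INFO_SCHEMA

    def check_train_payload(payload: dict) -> bool:
        # True when the payload matches the v2 train schema.
        try:
            validate(payload, schema=TRAIN_INFO_SCHEMA['v2'])
            return True
        except ValidationError:
            return False
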
@@ -0,0 +1,87 @@
import re

from datetime import datetime, timedelta

import pytz
import requests
from bs4 import BeautifulSoup

from .utils import *

# region regex definitions

RO_LETTERS = r'A-Za-zăâîșțĂÂÎȚȘ'

STATION_INFO_REGEX = re.compile(rf'^([{RO_LETTERS} ]+) în ([0-9.]+)$')

STOPPING_TIME_REGEX = re.compile(r'^(necunoscută \(stație terminus\))|(?:([0-9]+) min \((?:începând cu|până la) ([0-9]{1,2}:[0-9]{2})\))$')

# endregion

def scrape(station_name: str):
    station_name = ro_letters_to_en(station_name)
    # Start scraping session
    s = requests.Session()

    r = s.get(build_url(
        'https://mersultrenurilor.infofer.ro/ro-RO/Statie/{station}',
        station=station_name.replace(' ', '-'),
    ))

    soup = BeautifulSoup(r.text, features='html.parser')
    sform = soup.find(id='form-search')
    result_data = { elem['name']: elem['value'] for elem in sform('input') }

    r = s.post('https://mersultrenurilor.infofer.ro/ro-RO/Stations/StationsResult', data=result_data)
    soup = BeautifulSoup(r.text, features='html.parser')

    scraped = {}

    station_info_div, _, departures_div, arrivals_div, *_ = soup('div', recursive=False)

    scraped['stationName'], scraped['date'] = STATION_INFO_REGEX.match(collapse_space(station_info_div.h2.text)).groups()
    date_d, date_m, date_y = (int(comp) for comp in scraped['date'].split('.'))
    date = datetime(date_y, date_m, date_d)
    dt_seq = DateTimeSequencer(date.year, date.month, date.day)
    tz = pytz.timezone('Europe/Bucharest')

    def parse_arrdep_list(elem, end_station_field_name):
        def parse_item(elem):
            result = {}

            try:
                data_div, status_div = elem('div', recursive=False)
            except ValueError:
                data_div, *_ = elem('div', recursive=False)
                status_div = None
            data_main_div, data_details_div = data_div('div', recursive=False)
            time_div, dest_div, train_div, *_ = data_main_div('div', recursive=False)
            operator_div, route_div, stopping_time_div = data_details_div.div('div', recursive=False)

            result['time'] = collapse_space(time_div.div.div('div', recursive=False)[1].text)
            st_hr, st_min = (int(comp) for comp in result['time'].split(':'))
            result['time'] = tz.localize(dt_seq(st_hr, st_min)).isoformat()

            unknown_st, st, st_opposite_time = STOPPING_TIME_REGEX.match(
                collapse_space(stopping_time_div.div('div', recursive=False)[1].text)
            ).groups()
            if unknown_st:
                result['stoppingTime'] = None
            elif st:
                result['stoppingTime'] = int(st)

            result['train'] = {}
            result['train']['rank'] = collapse_space(train_div.div.div('div', recursive=False)[1].span.text)
            result['train']['number'] = collapse_space(train_div.div.div('div', recursive=False)[1].a.text)
            result['train'][end_station_field_name] = collapse_space(dest_div.div.div('div', recursive=False)[1].text)
            result['train']['operator'] = collapse_space(operator_div.div('div', recursive=False)[1].text)
            result['train']['route'] = collapse_space(route_div.div('div', recursive=False)[1].text).split(' - ')

            return result

        return [parse_item(elem) for elem in elem.div.ul('li', recursive=False)]

    scraped['departures'] = parse_arrdep_list(departures_div, 'destination')
    scraped['arrivals'] = parse_arrdep_list(arrivals_div, 'origin')

    return scraped
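
A quick usage sketch for the station scraper (the module path and station name are illustrative; diacritics are folded by ro_letters_to_en before the request is built):

    from scraper.scrape_station import scrape

    board = scrape('București Nord')
    print(board['stationName'], board['date'])
    for dep in board['departures'][:5]:
        print(dep['time'], dep['train']['rank'], dep['train']['number'], dep['train']['destination'])
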
@@ -0,0 +1,137 @@
{
  "$schema": "http://json-schema.org/schema",
  "title": "Train Info InfoFer Scrape Station Schema",
  "description": "Results of scraping the InfoFer website for station arrival/departure info",
  "definitions": {
    "arrDepItem": {
      "type": "object",
      "properties": {
        "time": {
          "description": "Time of arrival/departure",
          "type": "string",
          "format": "date-time"
        },
        "train": {
          "type": "object",
          "properties": {
            "rank": {
              "type": "string",
              "examples": [
                "R",
                "R-E",
                "IR",
                "IRN"
              ]
            },
            "number": {
              "type": "string",
              "examples": [
                "74",
                "15934"
              ]
            },
            "operator": {
              "type": "string",
              "examples": [
                "CFR Călători",
                "Softrans",
                "Regio Călători"
              ]
            },
            "route": {
              "description": "All the stations the train stops at",
              "type": "array",
              "items": {
                "type": "string"
              }
            }
          },
          "required": [
            "rank",
            "number",
            "operator"
          ]
        },
        "stoppingTime": {
          "type": [
            "integer",
            "null"
          ],
          "minimum": 1
        }
      },
      "required": [
        "time",
        "train",
        "stoppingTime"
      ]
    }
  },
  "type": "object",
  "properties": {
    "arrivals": {
      "type": "array",
      "items": {
        "allOf": [
          {
            "$ref": "#/definitions/arrDepItem"
          },
          {
            "type": "object",
            "properties": {
              "train": {
                "type": "object",
                "properties": {
                  "origin": {
                    "type": "string"
                  }
                },
                "required": ["origin"]
              }
            },
            "required": ["train"]
          }
        ]
      }
    },
    "departures": {
      "type": "array",
      "items": {
        "allOf": [
          {
            "$ref": "#/definitions/arrDepItem"
          },
          {
            "type": "object",
            "properties": {
              "train": {
                "type": "object",
                "properties": {
                  "destination": {
                    "type": "string"
                  }
                },
                "required": ["destination"]
              }
            },
            "required": ["train"]
          }
        ]
      }
    },
    "stationName": {
      "type": "string"
    },
    "date": {
      "description": "Date for which the data is provided (likely today)",
      "type": "string",
      "pattern": "^[0-9]{1,2}\\.[0-9]{2}\\.[0-9]{4}$"
    }
  },
  "required": [
    "arrivals",
    "departures",
    "stationName",
    "date"
  ]
}
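
To make the allOf composition concrete, here is an illustrative (made-up) document that validates against this schema: each arrivals item must satisfy both the shared arrDepItem shape and the extra origin requirement, while departures items need destination instead.

    {
      "stationName": "Brasov",
      "date": "15.08.2021",
      "arrivals": [
        {
          "time": "2021-08-15T11:03:00+03:00",
          "stoppingTime": 2,
          "train": {
            "rank": "IR",
            "number": "1538",
            "operator": "CFR Călători",
            "origin": "București Nord"
          }
        }
      ],
      "departures": []
    }
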
@@ -0,0 +1,143 @@
import re

from datetime import datetime, timedelta

import pytz
import requests
from bs4 import BeautifulSoup

from .utils import *

# region regex definitions

TRAIN_INFO_REGEX = re.compile(r'^([A-Z-]+) ([0-9]+) în ([0-9.]+)$')

OPERATOR_REGEX = re.compile(r'^Operat de (.+)$')

SL_REGEX = re.compile(r'^(?:Fără|([0-9]+) min) (întârziere|mai devreme) la (trecerea fără oprire prin|sosirea în|plecarea din) (.+)\.$')
SL_STATE_MAP = {
    't': 'passing',
    's': 'arrival',
    'p': 'departure',
}

RO_LETTERS = r'A-Za-zăâîșțĂÂÎȚȘ'

ROUTE_REGEX = re.compile(rf'^Parcurs tren ([{RO_LETTERS} ]+)[-–]([{RO_LETTERS} ]+)$')

KM_REGEX = re.compile(r'^km ([0-9]+)$')

PLATFORM_REGEX = re.compile(r'^linia (.+)$')

STOPPING_TIME_REGEX = re.compile(r'^([0-9]+) min oprire$')

STATION_DEPARR_STATUS_REGEX = re.compile(r'^(?:(la timp)|(?:((?:\+|-)[0-9]+) min \((?:(?:întârziere)|(?:mai devreme))\)))(\*?)$')

# endregion

def scrape(train_no: int, use_yesterday=False, date_override=None):
    # Start scraping session
    s = requests.Session()

    date = datetime.today()
    if use_yesterday:
        date -= timedelta(days=1)
    if date_override:
        date = date_override

    r = s.get(build_url(
        'https://mersultrenurilor.infofer.ro/ro-RO/Tren/{train_no}',
        train_no=train_no,
        query=[
            ('Date', date.strftime('%d.%m.%Y')),
        ],
    ))

    soup = BeautifulSoup(r.text, features='html.parser')
    sform = soup.find(id='form-search')
    result_data = { elem['name']: elem['value'] for elem in sform('input') }

    r = s.post('https://mersultrenurilor.infofer.ro/ro-RO/Trains/TrainsResult', data=result_data)
    soup = BeautifulSoup(r.text, features='html.parser')

    scraped = {}

    train_info_div, _, _, results_div, *_ = soup('div', recursive=False)

    train_info_div = train_info_div.div('div', recursive=False)[0]

    scraped['rank'], scraped['number'], scraped['date'] = TRAIN_INFO_REGEX.match(collapse_space(train_info_div.h2.text)).groups()
    date_d, date_m, date_y = (int(comp) for comp in scraped['date'].split('.'))
    date = datetime(date_y, date_m, date_d)

    scraped['operator'] = OPERATOR_REGEX.match(collapse_space(train_info_div.p.text)).groups()[0]

    results_div = results_div.div
    status_div = results_div('div', recursive=False)[0]
    route_text = collapse_space(status_div.h4.text)
    route_from, route_to = ROUTE_REGEX.match(route_text).groups()
    scraped['route'] = {
        'from': route_from,
        'to': route_to,
    }
    try:
        status_line_match = SL_REGEX.match(collapse_space(status_div.div.text))
        slm_delay, slm_late, slm_arrival, slm_station = status_line_match.groups()
        scraped['status'] = {
            'delay': (int(slm_delay) if slm_late == 'întârziere' else -int(slm_delay)) if slm_delay else 0,
            'station': slm_station,
            'state': SL_STATE_MAP[slm_arrival[0]],
        }
    except Exception:
        scraped['status'] = None

    stations = status_div.ul('li', recursive=False)
    scraped['stations'] = []
    dt_seq = DateTimeSequencer(date.year, date.month, date.day)
    tz = pytz.timezone('Europe/Bucharest')
    for station in stations:
        station_scraped = {}

        left, middle, right = station.div('div', recursive=False)
        station_scraped['name'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[0].text)
        station_scraped['km'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[1].text)
        station_scraped['km'] = int(KM_REGEX.match(station_scraped['km']).groups()[0])
        station_scraped['stoppingTime'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[2].text)
        if not station_scraped['stoppingTime']:
            station_scraped['stoppingTime'] = None
        else:
            station_scraped['stoppingTime'] = int(STOPPING_TIME_REGEX.match(station_scraped['stoppingTime']).groups()[0])
        station_scraped['platform'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[3].text)
        if not station_scraped['platform']:
            station_scraped['platform'] = None
        else:
            station_scraped['platform'] = PLATFORM_REGEX.match(station_scraped['platform']).groups()[0]

        def scrape_time(elem, setter):
            parts = elem.div.div('div', recursive=False)
            if parts:
                result = {}

                time, *_ = parts
                result['scheduleTime'] = collapse_space(time.text)
                st_hr, st_min = (int(comp) for comp in result['scheduleTime'].split(':'))
                result['scheduleTime'] = tz.localize(dt_seq(st_hr, st_min)).isoformat()
                if len(parts) >= 2:
                    _, status, *_ = parts
                    result['status'] = {}
                    on_time, delay, approx = STATION_DEPARR_STATUS_REGEX.match(collapse_space(status.text)).groups()
                    result['status']['delay'] = 0 if on_time else int(delay)
                    result['status']['real'] = not approx
                else:
                    result['status'] = None

                setter(result)
            else:
                setter(None)

        scrape_time(left, lambda value: station_scraped.update(arrival=value))
        scrape_time(right, lambda value: station_scraped.update(departure=value))

        scraped['stations'].append(station_scraped)

    return scraped
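
A usage sketch for the train scraper (the module path is an assumption; the train number matches the smoke test kept in scraper.py below):

    from scraper.scrape_train import scrape

    info = scrape(1538)
    print(info['rank'], info['number'], 'operated by', info['operator'])
    for st in info['stations']:
        print(st['km'], st['name'], st['platform'])
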
@@ -0,0 +1,134 @@
{
  "$schema": "http://json-schema.org/schema",
  "title": "Train Info InfoFer Scrape Train Schema",
  "description": "Results of scraping the InfoFer website for train info",
  "definitions": {
    "delayType": {
      "description": "Delay of the train (negative for being early)",
      "type": "integer"
    },
    "stationArrDepTime": {
      "description": "Time of arrival at/departure from station",
      "type": ["object", "null"],
      "properties": {
        "scheduleTime": {
          "description": "The time the train is scheduled to arrive/depart",
          "type": "string",
          "pattern": "^[0-9]{1,2}:[0-9]{2}$"
        },
        "status": {
          "type": ["object", "null"],
          "properties": {
            "delay": {
              "$ref": "#/definitions/delayType"
            },
            "real": {
              "description": "Determines whether delay was actually reported or is an approximation",
              "type": "boolean"
            }
          },
          "required": ["delay", "real"]
        }
      },
      "required": ["scheduleTime"]
    }
  },
  "type": "object",
  "properties": {
    "rank": {
      "description": "The rank of the train",
      "type": "string",
      "examples": [
        "R",
        "R-E",
        "IR",
        "IRN"
      ]
    },
    "number": {
      "description": "The number of the train",
      "type": "string",
      "examples": [
        "74",
        "15934"
      ]
    },
    "date": {
      "description": "Date of departure from the first station (dd.mm.yyyy)",
      "type": "string",
      "pattern": "^[0-9]{1,2}\\.[0-9]{2}\\.[0-9]{4}$"
    },
    "operator": {
      "description": "Operator of the train",
      "type": "string",
      "examples": [
        "CFR Călători",
        "Softrans",
        "Regio Călători"
      ]
    },
    "route": {
      "description": "Route of the train",
      "type": "object",
      "properties": {
        "from": {
          "type": "string"
        },
        "to": {
          "type": "string"
        }
      },
      "required": ["from", "to"]
    },
    "status": {
      "description": "Current status of the train",
      "type": ["object", "null"],
      "properties": {
        "delay": {
          "$ref": "#/definitions/delayType"
        },
        "station": {
          "type": "string"
        },
        "state": {
          "type": "string",
          "enum": ["passing", "arrival", "departure"]
        }
      },
      "required": ["delay", "station", "state"]
    },
    "stations": {
      "description": "List of stations the train stops at",
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "name": {
            "type": "string"
          },
          "km": {
            "description": "The distance the train travelled until reaching this station",
            "type": "integer"
          },
          "stoppingTime": {
            "description": "The number of minutes the train is scheduled to stop in this station",
            "type": ["integer", "null"],
            "minimum": 1
          },
          "platform": {
            "description": "The platform the train stopped at",
            "type": ["string", "null"]
          },
          "arrival": {
            "$ref": "#/definitions/stationArrDepTime"
          },
          "departure": {
            "$ref": "#/definitions/stationArrDepTime"
          }
        },
        "required": ["name", "km"]
      }
    }
  },
  "required": ["route", "stations", "rank", "number", "date", "operator"]
}
@@ -1,177 +1,12 @@
#! /usr/bin/env python3

from datetime import datetime, timedelta
import re

import pytz
import requests
from bs4 import BeautifulSoup
from urllib.parse import quote, urlencode

TRAIN_INFO_REGEX = re.compile(r'^([A-Z-]+) ([0-9]+) în ([0-9.]+)$')

OPERATOR_REGEX = re.compile(r'^Operat de (.+)$')

SL_REGEX = re.compile(r'^(?:Fără|([0-9]+) min) (întârziere|mai devreme) la (trecerea fără oprire prin|sosirea în|plecarea din) (.+)\.$')
SL_STATE_MAP = {
    't': 'passing',
    's': 'arrival',
    'p': 'departure',
}

RO_LETTERS = r'A-Za-zăâîșțĂÂÎȚȘ'

ROUTE_REGEX = re.compile(rf'^Parcurs tren ([{RO_LETTERS} ]+)[-–]([{RO_LETTERS} ]+)$')

KM_REGEX = re.compile(r'^km ([0-9]+)$')

PLATFORM_REGEX = re.compile(r'^linia (.+)$')

STOPPING_TIME_REGEX = re.compile(r'^([0-9]+) min oprire$')

STATION_DEPARR_STATUS_REGEX = re.compile(r'^(?:(la timp)|(?:((?:\+|-)[0-9]+) min \((?:(?:întârziere)|(?:mai devreme))\)))(\*?)$')

class DateTimeSequencer:
    def __init__(self, year: int, month: int, day: int) -> None:
        self.current = datetime(year, month, day, 0, 0, 0)
        self.current -= timedelta(seconds=1)

    def __call__(self, hour: int, minute: int = 0, second: int = 0) -> datetime:
        potential_new_date = datetime(self.current.year, self.current.month, self.current.day, hour, minute, second)
        if (self.current > potential_new_date):
            potential_new_date += timedelta(days=1)
        self.current = potential_new_date
        return self.current

def collapse_space(string: str) -> str:
    return re.sub(
        rf'[{BeautifulSoup.ASCII_SPACES}]+',
        ' ',
        string,
        flags=re.MULTILINE
    ).strip()

def build_url(base: str, /, query: dict, **kwargs):
    result = base.format(**{ k: quote(str(v)) for k, v in kwargs.items() })
    if query:
        result += '?'
        result += urlencode(query)
    return result

def scrape(train_no: int, use_yesterday=False, date_override=None):
    # Start scraping session
    s = requests.Session()

    date = datetime.today()
    if use_yesterday:
        date -= timedelta(days=1)
    if date_override:
        date = date_override

    r = s.get(build_url(
        'https://mersultrenurilor.infofer.ro/ro-RO/Tren/{train_no}',
        train_no=train_no,
        query=[
            ('Date', date.strftime('%d.%m.%Y')),
        ],
    ))

    soup = BeautifulSoup(r.text, features='html.parser')
    sform = soup.find(id='form-search')
    result_data = { elem['name']: elem['value'] for elem in sform('input') }

    r = s.post('https://mersultrenurilor.infofer.ro/ro-RO/Trains/TrainsResult', data=result_data)
    soup = BeautifulSoup(r.text, features='html.parser')

    scraped = {}

    train_info_div, _, _, results_div, *_ = soup('div', recursive=False)

    train_info_div = train_info_div.div('div', recursive=False)[0]

    scraped['rank'], scraped['number'], scraped['date'] = TRAIN_INFO_REGEX.match(collapse_space(train_info_div.h2.text)).groups()
    date_d, date_m, date_y = (int(comp) for comp in scraped['date'].split('.'))
    date = datetime(date_y, date_m, date_d)

    scraped['operator'] = OPERATOR_REGEX.match(collapse_space(train_info_div.p.text)).groups()[0]

    results_div = results_div.div
    status_div = results_div('div', recursive=False)[0]
    route_text = collapse_space(status_div.h4.text)
    route_from, route_to = ROUTE_REGEX.match(route_text).groups()
    scraped['route'] = {
        'from': route_from,
        'to': route_to,
    }
    try:
        status_line_match = SL_REGEX.match(collapse_space(status_div.div.text))
        slm_delay, slm_late, slm_arrival, slm_station = status_line_match.groups()
        scraped['status'] = {
            'delay': (int(slm_delay) if slm_late == 'întârziere' else -int(slm_delay)) if slm_delay else 0,
            'station': slm_station,
            'state': SL_STATE_MAP[slm_arrival[0]],
        }
    except Exception:
        scraped['status'] = None

    stations = status_div.ul('li', recursive=False)
    scraped['stations'] = []
    dt_seq = DateTimeSequencer(date.year, date.month, date.day)
    tz = pytz.timezone('Europe/Bucharest')
    for station in stations:
        station_scraped = {}

        left, middle, right = station.div('div', recursive=False)
        station_scraped['name'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[0].text)
        station_scraped['km'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[1].text)
        station_scraped['km'] = int(KM_REGEX.match(station_scraped['km']).groups()[0])
        station_scraped['stoppingTime'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[2].text)
        if not station_scraped['stoppingTime']:
            station_scraped['stoppingTime'] = None
        else:
            station_scraped['stoppingTime'] = int(STOPPING_TIME_REGEX.match(station_scraped['stoppingTime']).groups()[0])
        station_scraped['platform'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[3].text)
        if not station_scraped['platform']:
            station_scraped['platform'] = None
        else:
            station_scraped['platform'] = PLATFORM_REGEX.match(station_scraped['platform']).groups()[0]

        def scrape_time(elem, setter):
            parts = elem.div.div('div', recursive=False)
            if parts:
                result = {}

                time, *_ = parts
                result['scheduleTime'] = collapse_space(time.text)
                st_hr, st_min = (int(comp) for comp in result['scheduleTime'].split(':'))
                result['scheduleTime'] = tz.localize(dt_seq(st_hr, st_min)).isoformat()
                if len(parts) >= 2:
                    _, status, *_ = parts
                    result['status'] = {}
                    on_time, delay, approx = STATION_DEPARR_STATUS_REGEX.match(collapse_space(status.text)).groups()
                    result['status']['delay'] = 0 if on_time else int(delay)
                    result['status']['real'] = not approx
                else:
                    result['status'] = None

                setter(result)
            else:
                setter(None)

        scrape_time(left, lambda value: station_scraped.update(arrival=value))
        scrape_time(right, lambda value: station_scraped.update(departure=value))

        scraped['stations'].append(station_scraped)

    return scraped

from .scrape_train import scrape as scrape_train
from .scrape_station import scrape as scrape_station

def main():
    train_no = 1538
    print(f'Testing package with train number {train_no}')
    from pprint import pprint
    # pprint(scrape('473'))
    pprint(scrape(train_no))
    pprint(scrape_train(train_no))

if __name__ == '__main__':
    main()
@@ -0,0 +1,79 @@
import re

from datetime import datetime, timedelta
from urllib.parse import urlencode, quote

# From: https://en.wikipedia.org/wiki/Whitespace_character#Unicode
ASCII_WHITESPACE = [
    '\u0009', # HT; Character Tabulation
    '\u000a', # LF
    '\u000b', # VT; Line Tabulation
    '\u000c', # FF; Form Feed
    '\u000d', # CR
    '\u0020', # Space
]

WHITESPACE = ASCII_WHITESPACE + [
    '\u0085', # NEL; Next Line
    '\u00a0', # No-break Space
    '\u1680', # Ogham Space Mark
    '\u2000', # En Quad
    '\u2001', # Em Quad
    '\u2002', # En Space
    '\u2003', # Em Space
    '\u2004', # Three-per-em Space
    '\u2005', # Four-per-em Space
    '\u2006', # Six-per-em Space
    '\u2007', # Figure Space
    '\u2008', # Punctuation Space
    '\u2009', # Thin Space
    '\u200A', # Hair Space
    '\u2028', # Line Separator
    '\u2029', # Paragraph Separator
    '\u202f', # Narrow No-break Space
    '\u205f', # Medium Mathematical Space
    '\u3000', # Ideographic Space
]

WHITESPACE_REGEX = re.compile(rf'[{"".join(WHITESPACE)}]+', flags=re.MULTILINE)

class DateTimeSequencer:
    def __init__(self, year: int, month: int, day: int) -> None:
        self.current = datetime(year, month, day, 0, 0, 0)
        self.current -= timedelta(seconds=1)

    def __call__(self, hour: int, minute: int = 0, second: int = 0) -> datetime:
        potential_new_date = datetime(self.current.year, self.current.month, self.current.day, hour, minute, second)
        if (self.current > potential_new_date):
            potential_new_date += timedelta(days=1)
        self.current = potential_new_date
        return self.current

def collapse_space(string: str) -> str:
    return WHITESPACE_REGEX.sub(
        ' ',
        string,
    ).strip()

def build_url(base: str, /, query: dict = {}, **kwargs):
    result = base.format(**{ k: quote(str(v)) for k, v in kwargs.items() })
    if query:
        result += '?'
        result += urlencode(query)
    return result

RO_TO_EN = {
    'ă': 'a',
    'Ă': 'A',
    'â': 'a',
    'Â': 'A',
    'î': 'i',
    'Î': 'I',
    'ș': 's',
    'Ș': 'S',
    'ț': 't',
    'Ț': 'T',
}

def ro_letters_to_en(string: str) -> str:
    return ''.join((RO_TO_EN.get(letter, letter) for letter in string))
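
DateTimeSequencer assumes times arrive in chronological order and bumps the date whenever a time is smaller than the previous one, which is how overnight trains end up on the right day. A small illustration:

    seq = DateTimeSequencer(2021, 8, 1)
    print(seq(23, 50))  # 2021-08-01 23:50:00
    print(seq(0, 12))   # 2021-08-02 00:12:00 (rolled over past midnight)
    print(seq(6, 0))    # 2021-08-02 06:00:00
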
@@ -0,0 +1,29 @@
from flask import request as _f_request

from .utils import filter_result as _filter_result

def filtered_data(fn):
    def filterer(*args, **kwargs):
        filters = _f_request.args.get('filters', None)
        if filters:
            filters_raw = [f.split(':', 1) for f in filters.split(',')]
            filters = {'.': []}
            for key, value in filters_raw:
                def add_to(obj, key, value):
                    if '.' in key:
                        prop, key = key.split('.', 1)
                        if prop not in obj:
                            obj[prop] = {'.': []}
                        add_to(obj[prop], key, value)
                    else:
                        obj['.'].append({key: value})
                add_to(filters, key, value)
        properties = _f_request.args.get('properties', None)
        if properties:
            properties = properties.split(',')

        data = fn(*args, **kwargs)

        return _filter_result(data, properties, filters)

    return filterer
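
For reference, this is the shape the decorator builds from the query string: a request like ?filters=train.rank:IR,stationName:Cluj is parsed into the nested dict below before being handed to filter_result (the filter values are illustrative):

    {
        '.': [{'stationName': 'Cluj'}],
        'train': {
            '.': [{'rank': 'IR'}],
        },
    }
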
@@ -1,32 +1,87 @@
import json
from flask import Blueprint, jsonify, request
from flask.helpers import url_for
from jsonschema import validate

from .. import db
from ..cache import CachedData
from ..utils import check_yes_no
from ..utils import check_yes_no, get_hostname
from ..flask_utils import filtered_data
from ..scraper.utils import ro_letters_to_en
from ..scraper.schemas import STATION_SCHEMA, TRAIN_INFO_SCHEMA

bp = Blueprint('v2', __name__, url_prefix='/v2')

@bp.get('/trains')
def get_known_trains():
    return jsonify(db.trains)
    @filtered_data
    def get_data():
        return db.trains

    result = get_data()

    return jsonify(result)

@bp.get('/stations')
def get_known_stations():
    return jsonify(db.stations)
    @filtered_data
    def get_data():
        return db.stations

    result = get_data()

    return jsonify(result)

train_data_cache = {}

@bp.route('/train/.schema.json')
def get_train_info_schema():
    return jsonify(TRAIN_INFO_SCHEMA['v2'])

@bp.route('/train/<int:train_no>')
def get_train_info(train_no: int):
    use_yesterday = check_yes_no(request.args.get('use_yesterday', ''), default=False)
    @filtered_data
    def get_data():
        from ..scraper.scraper import scrape
        result = scrape(train_no, use_yesterday=use_yesterday)
        from ..scraper.scraper import scrape_train
        result = scrape_train(train_no, use_yesterday=use_yesterday)
        db.on_train_data(result)
        return result
    if train_no not in train_data_cache:
    if (train_no, use_yesterday) not in train_data_cache:
        train_data_cache[(train_no, use_yesterday)] = CachedData(get_data, validity=1000 * 30)
    data, fetch_time = train_data_cache[(train_no, use_yesterday)]()
    data['$schema'] = get_hostname() + url_for('.get_train_info_schema')
    validate(data, schema=TRAIN_INFO_SCHEMA['v2'])
    resp = jsonify(data)
    resp.headers['X-Last-Fetched'] = fetch_time.isoformat()
    return resp

station_cache = {}

@bp.route('/station/.schema.json')
def get_station_schema():
    return jsonify(STATION_SCHEMA['v2'])

@bp.route('/station/<station_name>')
def get_station(station_name: str):
    station_name = ro_letters_to_en(station_name.lower().replace(' ', '-'))

    def get_data():
        from ..scraper.scraper import scrape_station
        result = scrape_station(station_name)
        db.on_station(result)
        return result
    if station_name not in station_cache:
        station_cache[station_name] = CachedData(get_data, validity=1000 * 30)
    data, fetch_time = station_cache[station_name]()
    data['$schema'] = get_hostname() + url_for('.get_station_schema')
    validate(data, schema=STATION_SCHEMA['v2'])

    @filtered_data
    def filter(data):
        return data

    resp = jsonify(filter(data))
    resp.headers['X-Last-Fetched'] = fetch_time.isoformat()
    return resp
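
A sketch of how the new v2 endpoints could be exercised over HTTP (the hostname and port are assumptions, and the properties parameter is assumed to make filter_result keep only the listed top-level keys):

    import requests

    BASE = 'http://localhost:5000/v2'

    # Full train info, validated server-side against the published schema.
    info = requests.get(f'{BASE}/train/1538').json()

    # Same endpoint, trimmed down via the filtered_data query parameters.
    slim = requests.get(f'{BASE}/train/1538', params={
        'properties': 'rank,number,stations',
    }).json()
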