Dan Cojocaru
3 years ago
15 changed files with 897 additions and 272 deletions
@@ -0,0 +1,20 @@
from contextlib import ExitStack as _ExitStack

_es = _ExitStack()


def _load_file(name: str):
    import json
    from os.path import join, dirname
    dir = dirname(__file__)

    return json.load(_es.enter_context(open(join(dir, name))))


TRAIN_INFO_SCHEMA = {
    'v1': _load_file('scrape_train_schema.json'),
    'v2': _load_file('scrape_train_schema_v2.json'),
}
STATION_SCHEMA = {
    'v2': _load_file('scrape_station_schema_v2.json'),
}

_es.close()
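Note on the pattern above: a single ExitStack collects every file handle opened while the schemas load, and the one _es.close() at the end of the module releases them all once the JSON has been parsed. A minimal sketch of the same idiom (the file names here are placeholders, not from the repository):

from contextlib import ExitStack

with ExitStack() as es:
    # every open() is registered on the stack; all handles are
    # closed together when the with-block exits
    handles = [es.enter_context(open(name)) for name in ['a.json', 'b.json']]
    data = [h.read() for h in handles]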
@@ -0,0 +1,87 @@
import re

from datetime import datetime, timedelta

import pytz
import requests
from bs4 import BeautifulSoup

from .utils import *

# region regex definitions

RO_LETTERS = r'A-Za-zăâîșțĂÂÎȚȘ'

STATION_INFO_REGEX = re.compile(rf'^([{RO_LETTERS} ]+) în ([0-9.]+)$')

STOPPING_TIME_REGEX = re.compile(r'^(necunoscută \(stație terminus\))|(?:([0-9]+) min \((?:începând cu|până la) ([0-9]{1,2}:[0-9]{2})\))$')

# endregion


def scrape(station_name: str):
    station_name = ro_letters_to_en(station_name)
    # Start scraping session
    s = requests.Session()

    r = s.get(build_url(
        'https://mersultrenurilor.infofer.ro/ro-RO/Statie/{station}',
        station=station_name.replace(' ', '-'),
    ))

    soup = BeautifulSoup(r.text, features='html.parser')
    sform = soup.find(id='form-search')
    result_data = { elem['name']: elem['value'] for elem in sform('input') }

    r = s.post('https://mersultrenurilor.infofer.ro/ro-RO/Stations/StationsResult', data=result_data)
    soup = BeautifulSoup(r.text, features='html.parser')

    scraped = {}

    station_info_div, _, departures_div, arrivals_div, *_ = soup('div', recursive=False)

    scraped['stationName'], scraped['date'] = STATION_INFO_REGEX.match(collapse_space(station_info_div.h2.text)).groups()
    date_d, date_m, date_y = (int(comp) for comp in scraped['date'].split('.'))
    date = datetime(date_y, date_m, date_d)
    dt_seq = DateTimeSequencer(date.year, date.month, date.day)
    tz = pytz.timezone('Europe/Bucharest')

    def parse_arrdep_list(elem, end_station_field_name):
        def parse_item(elem):
            result = {}

            try:
                data_div, status_div = elem('div', recursive=False)
            except ValueError:
                data_div, *_ = elem('div', recursive=False)
                status_div = None
            data_main_div, data_details_div = data_div('div', recursive=False)
            time_div, dest_div, train_div, *_ = data_main_div('div', recursive=False)
            operator_div, route_div, stopping_time_div = data_details_div.div('div', recursive=False)

            result['time'] = collapse_space(time_div.div.div('div', recursive=False)[1].text)
            st_hr, st_min = (int(comp) for comp in result['time'].split(':'))
            result['time'] = tz.localize(dt_seq(st_hr, st_min)).isoformat()

            unknown_st, st, st_opposite_time = STOPPING_TIME_REGEX.match(
                collapse_space(stopping_time_div.div('div', recursive=False)[1].text)
            ).groups()
            if unknown_st:
                result['stoppingTime'] = None
            elif st:
                result['stoppingTime'] = int(st)

            result['train'] = {}
            result['train']['rank'] = collapse_space(train_div.div.div('div', recursive=False)[1].span.text)
            result['train']['number'] = collapse_space(train_div.div.div('div', recursive=False)[1].a.text)
            result['train'][end_station_field_name] = collapse_space(dest_div.div.div('div', recursive=False)[1].text)
            result['train']['operator'] = collapse_space(operator_div.div('div', recursive=False)[1].text)
            result['train']['route'] = collapse_space(route_div.div('div', recursive=False)[1].text).split(' - ')

            return result

        return [parse_item(elem) for elem in elem.div.ul('li', recursive=False)]

    scraped['departures'] = parse_arrdep_list(departures_div, 'destination')
    scraped['arrivals'] = parse_arrdep_list(arrivals_div, 'origin')

    return scraped
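A hypothetical smoke test for the station scraper (the import path depends on the package name, which this diff does not show, and the call requires network access to mersultrenurilor.infofer.ro):

from scraper.scrape_station import scrape  # assumed import path

data = scrape('București Nord')  # diacritics are transliterated by ro_letters_to_en
print(data['stationName'], data['date'])
for dep in data['departures'][:3]:
    t = dep['train']
    print(dep['time'], t['rank'], t['number'], '->', t['destination'])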
@@ -0,0 +1,137 @@
{
    "$schema": "http://json-schema.org/schema",
    "title": "Train Info InfoFer Scrape Station Schema",
    "description": "Results of scraping the InfoFer website for station arrival/departure info",
    "definitions": {
        "arrDepItem": {
            "type": "object",
            "properties": {
                "time": {
                    "description": "Time of arrival/departure",
                    "type": "string",
                    "format": "date-time"
                },
                "train": {
                    "type": "object",
                    "properties": {
                        "rank": {
                            "type": "string",
                            "examples": ["R", "R-E", "IR", "IRN"]
                        },
                        "number": {
                            "type": "string",
                            "examples": ["74", "15934"]
                        },
                        "operator": {
                            "type": "string",
                            "examples": ["CFR Călători", "Softrans", "Regio Călători"]
                        },
                        "route": {
                            "description": "All the stations the train stops at",
                            "type": "array",
                            "items": {
                                "type": "string"
                            }
                        }
                    },
                    "required": ["rank", "number", "operator"]
                },
                "stoppingTime": {
                    "type": ["integer", "null"],
                    "minimum": 1
                }
            },
            "required": ["time", "train", "stoppingTime"]
        }
    },
    "type": "object",
    "properties": {
        "arrivals": {
            "type": "array",
            "items": {
                "allOf": [
                    { "$ref": "#/definitions/arrDepItem" },
                    {
                        "type": "object",
                        "properties": {
                            "train": {
                                "type": "object",
                                "properties": {
                                    "origin": { "type": "string" }
                                },
                                "required": ["origin"]
                            }
                        },
                        "required": ["train"]
                    }
                ]
            }
        },
        "departures": {
            "type": "array",
            "items": {
                "allOf": [
                    { "$ref": "#/definitions/arrDepItem" },
                    {
                        "type": "object",
                        "properties": {
                            "train": {
                                "type": "object",
                                "properties": {
                                    "destination": { "type": "string" }
                                },
                                "required": ["destination"]
                            }
                        },
                        "required": ["train"]
                    }
                ]
            }
        },
        "stationName": {
            "type": "string"
        },
        "date": {
            "description": "Date for which the data is provided (likely today)",
            "type": "string",
            "pattern": "^[0-9]{1,2}\\.[0-9]{2}\\.[0-9]{4}$"
        }
    },
    "required": ["arrivals", "departures", "stationName", "date"]
}
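The schema above can be exercised directly with the jsonschema package. A minimal check against a hand-built payload, assuming this file is the one loaded as STATION_SCHEMA['v2'] in the schemas module (all values invented for illustration):

from jsonschema import validate

sample = {
    'stationName': 'Exemplu',
    'date': '1.01.2021',
    'arrivals': [],
    'departures': [{
        'time': '2021-01-01T12:34:00+02:00',
        'stoppingTime': 2,
        'train': {
            'rank': 'IR',
            'number': '74',
            'operator': 'CFR Călători',
            'destination': 'Exemplu',
        },
    }],
}

# STATION_SCHEMA comes from the schemas module in the first hunk
validate(sample, schema=STATION_SCHEMA['v2'])  # raises ValidationError on mismatch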
@@ -0,0 +1,143 @@
import re

from datetime import datetime, timedelta

import pytz
import requests
from bs4 import BeautifulSoup

from .utils import *

# region regex definitions

TRAIN_INFO_REGEX = re.compile(r'^([A-Z-]+) ([0-9]+) în ([0-9.]+)$')

OPERATOR_REGEX = re.compile(r'^Operat de (.+)$')

SL_REGEX = re.compile(r'^(?:Fără|([0-9]+) min) (întârziere|mai devreme) la (trecerea fără oprire prin|sosirea în|plecarea din) (.+)\.$')
SL_STATE_MAP = {
    't': 'passing',
    's': 'arrival',
    'p': 'departure',
}

RO_LETTERS = r'A-Za-zăâîșțĂÂÎȚȘ'

ROUTE_REGEX = re.compile(rf'^Parcurs tren ([{RO_LETTERS} ]+)[-–]([{RO_LETTERS} ]+)$')

KM_REGEX = re.compile(r'^km ([0-9]+)$')

PLATFORM_REGEX = re.compile(r'^linia (.+)$')

STOPPING_TIME_REGEX = re.compile(r'^([0-9]+) min oprire$')

STATION_DEPARR_STATUS_REGEX = re.compile(r'^(?:(la timp)|(?:((?:\+|-)[0-9]+) min \((?:(?:întârziere)|(?:mai devreme))\)))(\*?)$')

# endregion


def scrape(train_no: int, use_yesterday=False, date_override=None):
    # Start scraping session
    s = requests.Session()

    date = datetime.today()
    if use_yesterday:
        date -= timedelta(days=1)
    if date_override:
        date = date_override

    r = s.get(build_url(
        'https://mersultrenurilor.infofer.ro/ro-RO/Tren/{train_no}',
        train_no=train_no,
        query=[
            ('Date', date.strftime('%d.%m.%Y')),
        ],
    ))

    soup = BeautifulSoup(r.text, features='html.parser')
    sform = soup.find(id='form-search')
    result_data = { elem['name']: elem['value'] for elem in sform('input') }

    r = s.post('https://mersultrenurilor.infofer.ro/ro-RO/Trains/TrainsResult', data=result_data)
    soup = BeautifulSoup(r.text, features='html.parser')

    scraped = {}

    train_info_div, _, _, results_div, *_ = soup('div', recursive=False)

    train_info_div = train_info_div.div('div', recursive=False)[0]

    scraped['rank'], scraped['number'], scraped['date'] = TRAIN_INFO_REGEX.match(collapse_space(train_info_div.h2.text)).groups()
    date_d, date_m, date_y = (int(comp) for comp in scraped['date'].split('.'))
    date = datetime(date_y, date_m, date_d)

    scraped['operator'] = OPERATOR_REGEX.match(collapse_space(train_info_div.p.text)).groups()[0]

    results_div = results_div.div
    status_div = results_div('div', recursive=False)[0]
    route_text = collapse_space(status_div.h4.text)
    route_from, route_to = ROUTE_REGEX.match(route_text).groups()
    scraped['route'] = {
        'from': route_from,
        'to': route_to,
    }
    try:
        status_line_match = SL_REGEX.match(collapse_space(status_div.div.text))
        slm_delay, slm_late, slm_arrival, slm_station = status_line_match.groups()
        scraped['status'] = {
            'delay': (int(slm_delay) if slm_late == 'întârziere' else -int(slm_delay)) if slm_delay else 0,
            'station': slm_station,
            'state': SL_STATE_MAP[slm_arrival[0]],
        }
    except Exception:
        scraped['status'] = None

    stations = status_div.ul('li', recursive=False)
    scraped['stations'] = []
    dt_seq = DateTimeSequencer(date.year, date.month, date.day)
    tz = pytz.timezone('Europe/Bucharest')
    for station in stations:
        station_scraped = {}

        left, middle, right = station.div('div', recursive=False)
        station_scraped['name'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[0].text)
        station_scraped['km'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[1].text)
        station_scraped['km'] = int(KM_REGEX.match(station_scraped['km']).groups()[0])
        station_scraped['stoppingTime'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[2].text)
        if not station_scraped['stoppingTime']:
            station_scraped['stoppingTime'] = None
        else:
            station_scraped['stoppingTime'] = int(STOPPING_TIME_REGEX.match(station_scraped['stoppingTime']).groups()[0])
        station_scraped['platform'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[3].text)
        if not station_scraped['platform']:
            station_scraped['platform'] = None
        else:
            station_scraped['platform'] = PLATFORM_REGEX.match(station_scraped['platform']).groups()[0]

        def scrape_time(elem, setter):
            parts = elem.div.div('div', recursive=False)
            if parts:
                result = {}

                time, *_ = parts
                result['scheduleTime'] = collapse_space(time.text)
                st_hr, st_min = (int(comp) for comp in result['scheduleTime'].split(':'))
                result['scheduleTime'] = tz.localize(dt_seq(st_hr, st_min)).isoformat()
                if len(parts) >= 2:
                    _, status, *_ = parts
                    result['status'] = {}
                    on_time, delay, approx = STATION_DEPARR_STATUS_REGEX.match(collapse_space(status.text)).groups()
                    result['status']['delay'] = 0 if on_time else int(delay)
                    result['status']['real'] = not approx
                else:
                    result['status'] = None

                setter(result)
            else:
                setter(None)

        scrape_time(left, lambda value: station_scraped.update(arrival=value))
        scrape_time(right, lambda value: station_scraped.update(departure=value))

        scraped['stations'].append(station_scraped)

    return scraped
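A usage sketch for the train scraper (import path assumed, as above; every call hits the live site):

from datetime import datetime
from scraper.scrape_train import scrape  # assumed import path

info = scrape(1538)                                       # today's run
info_y = scrape(1538, use_yesterday=True)                 # the run that departed yesterday
info_d = scrape(1538, date_override=datetime(2021, 8, 1)) # a specific departure date
print(info['rank'], info['number'], info['route']['from'], '->', info['route']['to'])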
@@ -0,0 +1,134 @@
{
    "$schema": "http://json-schema.org/schema",
    "title": "Train Info InfoFer Scrape Train Schema",
    "description": "Results of scraping the InfoFer website for train info",
    "definitions": {
        "delayType": {
            "description": "Delay of the train (negative for being early)",
            "type": "integer"
        },
        "stationArrDepTime": {
            "description": "Time of arrival at/departure from station",
            "type": ["object", "null"],
            "properties": {
                "scheduleTime": {
                    "description": "The time the train is scheduled to arrive/depart",
                    "type": "string",
                    "pattern": "^[0-9]{1,2}:[0-9]{2}$"
                },
                "status": {
                    "type": ["object", "null"],
                    "properties": {
                        "delay": {
                            "$ref": "#/definitions/delayType"
                        },
                        "real": {
                            "description": "Determines whether delay was actually reported or is an approximation",
                            "type": "boolean"
                        }
                    },
                    "required": ["delay", "real"]
                }
            },
            "required": ["scheduleTime"]
        }
    },
    "type": "object",
    "properties": {
        "rank": {
            "description": "The rank of the train",
            "type": "string",
            "examples": ["R", "R-E", "IR", "IRN"]
        },
        "number": {
            "description": "The number of the train",
            "type": "string",
            "examples": ["74", "15934"]
        },
        "date": {
            "description": "Date of departure from the first station (dd.mm.yyyy)",
            "type": "string",
            "pattern": "^[0-9]{1,2}\\.[0-9]{2}\\.[0-9]{4}$"
        },
        "operator": {
            "description": "Operator of the train",
            "type": "string",
            "examples": ["CFR Călători", "Softrans", "Regio Călători"]
        },
        "route": {
            "description": "Route of the train",
            "type": "object",
            "properties": {
                "from": { "type": "string" },
                "to": { "type": "string" }
            },
            "required": ["from", "to"]
        },
        "status": {
            "description": "Current status of the train",
            "type": ["object", "null"],
            "properties": {
                "delay": {
                    "$ref": "#/definitions/delayType"
                },
                "station": {
                    "type": "string"
                },
                "state": {
                    "type": "string",
                    "enum": ["passing", "arrival", "departure"]
                }
            },
            "required": ["delay", "station", "state"]
        },
        "stations": {
            "description": "List of stations the train stops at",
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "name": {
                        "type": "string"
                    },
                    "km": {
                        "description": "The distance the train travelled until reaching this station",
                        "type": "integer"
                    },
                    "stoppingTime": {
                        "description": "The number of minutes the train is scheduled to stop in this station",
                        "type": ["integer", "null"],
                        "minimum": 1
                    },
                    "platform": {
                        "description": "The platform the train stopped at",
                        "type": ["string", "null"]
                    },
                    "arrival": {
                        "$ref": "#/definitions/stationArrDepTime"
                    },
                    "departure": {
                        "$ref": "#/definitions/stationArrDepTime"
                    }
                },
                "required": ["name", "km"]
            }
        }
    },
    "required": ["route", "stations", "rank", "number", "date", "operator"]
}
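One way to consume the per-station objects this schema describes is to combine scheduleTime with status.delay into an estimated real time. A sketch, assuming the ISO-8601 timestamps the scraper actually emits (the HH:MM pattern above appears to describe an earlier format):

from datetime import datetime, timedelta

def estimated_time(arrdep):
    # arrdep is an 'arrival'/'departure' object per the schema, or None
    if not arrdep or not arrdep.get('status'):
        return None
    scheduled = datetime.fromisoformat(arrdep['scheduleTime'])
    return scheduled + timedelta(minutes=arrdep['status']['delay'])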
@@ -1,177 +1,12 @@
 #! /usr/bin/env python3
-from datetime import datetime, timedelta
-import re
-
-
-import pytz
-import requests
-from bs4 import BeautifulSoup
-from urllib.parse import quote, urlencode
-
-
-TRAIN_INFO_REGEX = re.compile(r'^([A-Z-]+) ([0-9]+) în ([0-9.]+)$')
-
-
-OPERATOR_REGEX = re.compile(r'^Operat de (.+)$')
-
-
-SL_REGEX = re.compile(r'^(?:Fără|([0-9]+) min) (întârziere|mai devreme) la (trecerea fără oprire prin|sosirea în|plecarea din) (.+)\.$')
-SL_STATE_MAP = {
-    't': 'passing',
-    's': 'arrival',
-    'p': 'departure',
-}
-
-
-RO_LETTERS = r'A-Za-zăâîșțĂÂÎȚȘ'
-
-
-ROUTE_REGEX = re.compile(rf'^Parcurs tren ([{RO_LETTERS} ]+)[-–]([{RO_LETTERS} ]+)$')
-
-
-KM_REGEX = re.compile(r'^km ([0-9]+)$')
-
-
-PLATFORM_REGEX = re.compile(r'^linia (.+)$')
-
-
-STOPPING_TIME_REGEX = re.compile(r'^([0-9]+) min oprire$')
-
-
-STATION_DEPARR_STATUS_REGEX = re.compile(r'^(?:(la timp)|(?:((?:\+|-)[0-9]+) min \((?:(?:întârziere)|(?:mai devreme))\)))(\*?)$')
-
-
-class DateTimeSequencer:
-    def __init__(self, year: int, month: int, day: int) -> None:
-        self.current = datetime(year, month, day, 0, 0, 0)
-        self.current -= timedelta(seconds=1)
-
-    def __call__(self, hour: int, minute: int = 0, second: int = 0) -> datetime:
-        potential_new_date = datetime(self.current.year, self.current.month, self.current.day, hour, minute, second)
-        if (self.current > potential_new_date):
-            potential_new_date += timedelta(days=1)
-        self.current = potential_new_date
-        return self.current
-
-
-def collapse_space(string: str) -> str:
-    return re.sub(
-        rf'[{BeautifulSoup.ASCII_SPACES}]+',
-        ' ',
-        string,
-        flags=re.MULTILINE
-    ).strip()
-
-
-def build_url(base: str, /, query: dict, **kwargs):
-    result = base.format(**{ k: quote(str(v)) for k, v in kwargs.items() })
-    if query:
-        result += '?'
-        result += urlencode(query)
-    return result
-
-
-def scrape(train_no: int, use_yesterday=False, date_override=None):
-    # Start scrapping session
-    s = requests.Session()
-
-    date = datetime.today()
-    if use_yesterday:
-        date -= timedelta(days=1)
-    if date_override:
-        date = date_override
-
-    r = s.get(build_url(
-        'https://mersultrenurilor.infofer.ro/ro-RO/Tren/{train_no}',
-        train_no=train_no,
-        query=[
-            ('Date', date.strftime('%d.%m.%Y')),
-        ],
-    ))
-
-    soup = BeautifulSoup(r.text, features='html.parser')
-    sform = soup.find(id='form-search')
-    result_data = { elem['name']: elem['value'] for elem in sform('input') }
-
-    r = s.post('https://mersultrenurilor.infofer.ro/ro-RO/Trains/TrainsResult', data=result_data)
-    soup = BeautifulSoup(r.text, features='html.parser')
-
-    scraped = {}
-
-    train_info_div, _, _, results_div, *_ = soup('div', recursive=False)
-
-    train_info_div = train_info_div.div('div', recursive=False)[0]
-
-    scraped['rank'], scraped['number'], scraped['date'] = TRAIN_INFO_REGEX.match(collapse_space(train_info_div.h2.text)).groups()
-    date_d, date_m, date_y = (int(comp) for comp in scraped['date'].split('.'))
-    date = datetime(date_y, date_m, date_d)
-
-    scraped['operator'] = OPERATOR_REGEX.match(collapse_space(train_info_div.p.text)).groups()[0]
-
-    results_div = results_div.div
-    status_div = results_div('div', recursive=False)[0]
-    route_text = collapse_space(status_div.h4.text)
-    route_from, route_to = ROUTE_REGEX.match(route_text).groups()
-    scraped['route'] = {
-        'from': route_from,
-        'to': route_to,
-    }
-    try:
-        status_line_match = SL_REGEX.match(collapse_space(status_div.div.text))
-        slm_delay, slm_late, slm_arrival, slm_station = status_line_match.groups()
-        scraped['status'] = {
-            'delay': (int(slm_delay) if slm_late == 'întârziere' else -int(slm_delay)) if slm_delay else 0,
-            'station': slm_station,
-            'state': SL_STATE_MAP[slm_arrival[0]],
-        }
-    except Exception:
-        scraped['status'] = None
-
-    stations = status_div.ul('li', recursive=False)
-    scraped['stations'] = []
-    dt_seq = DateTimeSequencer(date.year, date.month, date.day)
-    tz = pytz.timezone('Europe/Bucharest')
-    for station in stations:
-        station_scraped = {}
-
-        left, middle, right = station.div('div', recursive=False)
-        station_scraped['name'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[0].text)
-        station_scraped['km'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[1].text)
-        station_scraped['km'] = int(KM_REGEX.match(station_scraped['km']).groups()[0])
-        station_scraped['stoppingTime'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[2].text)
-        if not station_scraped['stoppingTime']:
-            station_scraped['stoppingTime'] = None
-        else:
-            station_scraped['stoppingTime'] = int(STOPPING_TIME_REGEX.match(station_scraped['stoppingTime']).groups()[0])
-        station_scraped['platform'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[3].text)
-        if not station_scraped['platform']:
-            station_scraped['platform'] = None
-        else:
-            station_scraped['platform'] = PLATFORM_REGEX.match(station_scraped['platform']).groups()[0]
-
-        def scrape_time(elem, setter):
-            parts = elem.div.div('div', recursive=False)
-            if parts:
-                result = {}
-
-                time, *_ = parts
-                result['scheduleTime'] = collapse_space(time.text)
-                st_hr, st_min = (int(comp) for comp in result['scheduleTime'].split(':'))
-                result['scheduleTime'] = tz.localize(dt_seq(st_hr, st_min)).isoformat()
-                if len(parts) >= 2:
-                    _, status, *_ = parts
-                    result['status'] = {}
-                    on_time, delay, approx = STATION_DEPARR_STATUS_REGEX.match(collapse_space(status.text)).groups()
-                    result['status']['delay'] = 0 if on_time else int(delay)
-                    result['status']['real'] = not approx
-                else:
-                    result['status'] = None
-
-                setter(result)
-            else:
-                setter(None)
-
-        scrape_time(left, lambda value: station_scraped.update(arrival=value))
-        scrape_time(right, lambda value: station_scraped.update(departure=value))
-
-        scraped['stations'].append(station_scraped)
-
-    return scraped
+from .scrape_train import scrape as scrape_train
+from .scrape_station import scrape as scrape_station


 def main():
     train_no = 1538
     print(f'Testing package with train number {train_no}')
     from pprint import pprint
-    # pprint(scrape('473'))
-    pprint(scrape(train_no))
+    pprint(scrape_train(train_no))


 if __name__ == '__main__':
     main()
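With the module reduced to a facade, both scrapers remain importable from the old location, which is what the blueprint further down relies on (package prefix assumed):

# both names are re-exported by the facade
from scraper.scraper import scrape_train, scrape_station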
@@ -0,0 +1,79 @@
import re

from datetime import datetime, timedelta
from urllib.parse import urlencode, quote

# From: https://en.wikipedia.org/wiki/Whitespace_character#Unicode
ASCII_WHITESPACE = [
    '\u0009',  # HT; Character Tabulation
    '\u000a',  # LF
    '\u000b',  # VT; Line Tabulation
    '\u000c',  # FF; Form Feed
    '\u000d',  # CR
    '\u0020',  # Space
]

WHITESPACE = ASCII_WHITESPACE + [
    '\u0085',  # NEL; Next Line
    '\u00a0',  # No-break Space
    '\u1680',  # Ogham Space Mark
    '\u2000',  # En Quad
    '\u2001',  # Em Quad
    '\u2002',  # En Space
    '\u2003',  # Em Space
    '\u2004',  # Three-per-em Space
    '\u2005',  # Four-per-em Space
    '\u2006',  # Six-per-em Space
    '\u2007',  # Figure Space
    '\u2008',  # Punctuation Space
    '\u2009',  # Thin Space
    '\u200A',  # Hair Space
    '\u2028',  # Line Separator
    '\u2029',  # Paragraph Separator
    '\u202f',  # Narrow No-break Space
    '\u205f',  # MMSP; Medium Mathematical Space
    '\u3000',  # Ideographic Space
]

WHITESPACE_REGEX = re.compile(rf'[{"".join(WHITESPACE)}]+', flags=re.MULTILINE)


class DateTimeSequencer:
    def __init__(self, year: int, month: int, day: int) -> None:
        self.current = datetime(year, month, day, 0, 0, 0)
        self.current -= timedelta(seconds=1)

    def __call__(self, hour: int, minute: int = 0, second: int = 0) -> datetime:
        potential_new_date = datetime(self.current.year, self.current.month, self.current.day, hour, minute, second)
        if (self.current > potential_new_date):
            potential_new_date += timedelta(days=1)
        self.current = potential_new_date
        return self.current


def collapse_space(string: str) -> str:
    return WHITESPACE_REGEX.sub(
        ' ',
        string,
    ).strip()


def build_url(base: str, /, query: dict = {}, **kwargs):
    result = base.format(**{ k: quote(str(v)) for k, v in kwargs.items() })
    if query:
        result += '?'
        result += urlencode(query)
    return result


RO_TO_EN = {
    'ă': 'a',
    'Ă': 'A',
    'â': 'a',
    'Â': 'A',
    'î': 'i',
    'Î': 'I',
    'ș': 's',
    'Ș': 'S',
    'ț': 't',
    'Ț': 'T',
}


def ro_letters_to_en(string: str) -> str:
    return ''.join((RO_TO_EN.get(letter, letter) for letter in string))
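Two behaviors worth pinning down in the helpers above: DateTimeSequencer turns a sequence of HH:MM timetable entries into monotonically increasing datetimes, rolling over to the next day whenever the clock time goes backwards, and build_url percent-encodes path parameters before appending a query string. A small demonstration (dates invented):

seq = DateTimeSequencer(2021, 8, 1)
print(seq(23, 50))  # 2021-08-01 23:50:00
print(seq(0, 5))    # 2021-08-02 00:05:00 -- earlier clock time, so the day advances
print(seq(0, 5))    # 2021-08-02 00:05:00 -- an equal time does not advance again

print(build_url('https://example.com/{a}', a='x y', query=[('k', 'v')]))
# https://example.com/x%20y?k=v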
@@ -0,0 +1,29 @@
from flask import request as _f_request

from .utils import filter_result as _filter_result


def filtered_data(fn):
    def filterer(*args, **kwargs):
        filters = _f_request.args.get('filters', None)
        if filters:
            filters_raw = [f.split(':', 1) for f in filters.split(',')]
            filters = {'.': []}
            for key, value in filters_raw:
                def add_to(obj, key, value):
                    if '.' in key:
                        prop, key = key.split('.', 1)
                        if prop not in obj:
                            obj[prop] = {'.': []}
                        add_to(obj[prop], key, value)
                    else:
                        obj['.'].append({key: value})
                add_to(filters, key, value)
        properties = _f_request.args.get('properties', None)
        if properties:
            properties = properties.split(',')

        data = fn(*args, **kwargs)

        return _filter_result(data, properties, filters)

    return filterer
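The decorator reads two query parameters: properties, a comma-separated list, and filters, comma-separated key:value pairs where dots in the key express nesting. Under that reading, a hypothetical ?filters=train.rank:IR,date:1.08.2021 is parsed into the nested structure handed to filter_result:

# filters='train.rank:IR,date:1.08.2021' becomes:
{
    '.': [{'date': '1.08.2021'}],
    'train': {'.': [{'rank': 'IR'}]},
}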
@@ -1,32 +1,87 @@
-import json
 from flask import Blueprint, jsonify, request
+from flask.helpers import url_for
+from jsonschema import validate
+
 from .. import db
 from ..cache import CachedData
-from ..utils import check_yes_no
+from ..utils import check_yes_no, get_hostname
+from ..flask_utils import filtered_data
+from ..scraper.utils import ro_letters_to_en
+from ..scraper.schemas import STATION_SCHEMA, TRAIN_INFO_SCHEMA

 bp = Blueprint('v2', __name__, url_prefix='/v2')

 @bp.get('/trains')
 def get_known_trains():
-    return jsonify(db.trains)
+    @filtered_data
+    def get_data():
+        return db.trains
+
+    result = get_data()
+
+    return jsonify(result)

 @bp.get('/stations')
 def get_known_stations():
-    return jsonify(db.stations)
+    @filtered_data
+    def get_data():
+        return db.stations
+
+    result = get_data()
+
+    return jsonify(result)

 train_data_cache = {}

+@bp.route('/train/.schema.json')
+def get_train_info_schema():
+    return jsonify(TRAIN_INFO_SCHEMA['v2'])
+
 @bp.route('/train/<int:train_no>')
 def get_train_info(train_no: int):
     use_yesterday = check_yes_no(request.args.get('use_yesterday', ''), default=False)
+    @filtered_data
     def get_data():
-        from ..scraper.scraper import scrape
-        result = scrape(train_no, use_yesterday=use_yesterday)
+        from ..scraper.scraper import scrape_train
+        result = scrape_train(train_no, use_yesterday=use_yesterday)
         db.on_train_data(result)
         return result
-    if train_no not in train_data_cache:
+    if (train_no, use_yesterday) not in train_data_cache:
         train_data_cache[(train_no, use_yesterday)] = CachedData(get_data, validity=1000 * 30)
     data, fetch_time = train_data_cache[(train_no, use_yesterday)]()
+    data['$schema'] = get_hostname() + url_for('.get_train_info_schema')
+    validate(data, schema=TRAIN_INFO_SCHEMA['v2'])
     resp = jsonify(data)
     resp.headers['X-Last-Fetched'] = fetch_time.isoformat()
     return resp
+
+station_cache = {}
+
+@bp.route('/station/.schema.json')
+def get_station_schema():
+    return jsonify(STATION_SCHEMA['v2'])
+
+@bp.route('/station/<station_name>')
+def get_station(station_name: str):
+    station_name = ro_letters_to_en(station_name.lower().replace(' ', '-'))
+
+    def get_data():
+        from ..scraper.scraper import scrape_station
+        result = scrape_station(station_name)
+        db.on_station(result)
+        return result
+    if station_name not in station_cache:
+        station_cache[station_name] = CachedData(get_data, validity=1000 * 30)
+    data, fetch_time = station_cache[station_name]()
+    data['$schema'] = get_hostname() + url_for('.get_station_schema')
+    validate(data, schema=STATION_SCHEMA['v2'])
+
+    @filtered_data
+    def filter(data):
+        return data
+
+    resp = jsonify(filter(data))
+    resp.headers['X-Last-Fetched'] = fetch_time.isoformat()
+    return resp
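A hypothetical client call against a local instance of this blueprint (host and port assumed):

import requests

resp = requests.get(
    'http://localhost:5000/v2/train/1538',
    params={'use_yesterday': 'n', 'properties': 'rank,number,route'},
)
print(resp.headers.get('X-Last-Fetched'))
print(resp.json())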