Kenneth Bruen
3 years ago
commit
89cefc3fb3
7 changed files with 396 additions and 0 deletions
@ -0,0 +1,5 @@
|
||||
# CPython compiler output |
||||
*.pyc |
||||
|
||||
# VS Code |
||||
.vscode |
@ -0,0 +1,13 @@
|
||||
[[source]] |
||||
url = "https://pypi.org/simple" |
||||
verify_ssl = true |
||||
name = "pypi" |
||||
|
||||
[packages] |
||||
beautifulsoup4 = "*" |
||||
requests = "*" |
||||
|
||||
[dev-packages] |
||||
|
||||
[requires] |
||||
python_version = "3.9" |
@ -0,0 +1,77 @@
|
||||
{ |
||||
"_meta": { |
||||
"hash": { |
||||
"sha256": "e7c5f7eab5a8f9202caaaa3bdca8e911579596b8dd25319c2f50e84794eb9fa8" |
||||
}, |
||||
"pipfile-spec": 6, |
||||
"requires": { |
||||
"python_version": "3.9" |
||||
}, |
||||
"sources": [ |
||||
{ |
||||
"name": "pypi", |
||||
"url": "https://pypi.org/simple", |
||||
"verify_ssl": true |
||||
} |
||||
] |
||||
}, |
||||
"default": { |
||||
"beautifulsoup4": { |
||||
"hashes": [ |
||||
"sha256:4c98143716ef1cb40bf7f39a8e3eec8f8b009509e74904ba3a7b315431577e35", |
||||
"sha256:84729e322ad1d5b4d25f805bfa05b902dd96450f43842c4e99067d5e1369eb25", |
||||
"sha256:fff47e031e34ec82bf17e00da8f592fe7de69aeea38be00523c04623c04fb666" |
||||
], |
||||
"index": "pypi", |
||||
"version": "==4.9.3" |
||||
}, |
||||
"certifi": { |
||||
"hashes": [ |
||||
"sha256:2bbf76fd432960138b3ef6dda3dde0544f27cbf8546c458e60baf371917ba9ee", |
||||
"sha256:50b1e4f8446b06f41be7dd6338db18e0990601dce795c2b1686458aa7e8fa7d8" |
||||
], |
||||
"version": "==2021.5.30" |
||||
}, |
||||
"charset-normalizer": { |
||||
"hashes": [ |
||||
"sha256:0c8911edd15d19223366a194a513099a302055a962bca2cec0f54b8b63175d8b", |
||||
"sha256:f23667ebe1084be45f6ae0538e4a5a865206544097e4e8bbcacf42cd02a348f3" |
||||
], |
||||
"markers": "python_version >= '3'", |
||||
"version": "==2.0.4" |
||||
}, |
||||
"idna": { |
||||
"hashes": [ |
||||
"sha256:14475042e284991034cb48e06f6851428fb14c4dc953acd9be9a5e95c7b6dd7a", |
||||
"sha256:467fbad99067910785144ce333826c71fb0e63a425657295239737f7ecd125f3" |
||||
], |
||||
"markers": "python_version >= '3'", |
||||
"version": "==3.2" |
||||
}, |
||||
"requests": { |
||||
"hashes": [ |
||||
"sha256:6c1246513ecd5ecd4528a0906f910e8f0f9c6b8ec72030dc9fd154dc1a6efd24", |
||||
"sha256:b8aa58f8cf793ffd8782d3d8cb19e66ef36f7aba4353eec859e74678b01b07a7" |
||||
], |
||||
"index": "pypi", |
||||
"version": "==2.26.0" |
||||
}, |
||||
"soupsieve": { |
||||
"hashes": [ |
||||
"sha256:052774848f448cf19c7e959adf5566904d525f33a3f8b6ba6f6f8f26ec7de0cc", |
||||
"sha256:c2c1c2d44f158cdbddab7824a9af8c4f83c76b1e23e049479aa432feb6c4c23b" |
||||
], |
||||
"markers": "python_version >= '3'", |
||||
"version": "==2.2.1" |
||||
}, |
||||
"urllib3": { |
||||
"hashes": [ |
||||
"sha256:39fb8672126159acb139a7718dd10806104dec1e2f0f6c88aab05d17df10c8d4", |
||||
"sha256:f57b4c16c62fa2760b7e3d97c35b255512fb6b59a259730f36ba32ce9f8e342f" |
||||
], |
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'", |
||||
"version": "==1.26.6" |
||||
} |
||||
}, |
||||
"develop": {} |
||||
} |
@ -0,0 +1,44 @@
|
||||
from scraper import scrape |
||||
|
||||
# Sentinel distinguishing "no default supplied" from a legitimate default
# of None/False.
_NO_DEFAULT = object()


def check_yes_no(input: str, default=_NO_DEFAULT, considered_yes=None) -> bool:
    """Interpret a free-form answer string as a yes/no boolean.

    Args:
        input: Raw answer; coerced to str, stripped and lowercased.
        default: Value returned for empty input. When omitted, empty
            input raises instead of silently picking an answer.
        considered_yes: Strings treated as affirmative. Falsy means use
            the built-in list ['y', 'yes', 't', 'true', '1'].

    Returns:
        True when the normalized input is one of the affirmative strings.

    Raises:
        ValueError: The input is empty and no default was supplied.
    """
    input = str(input).strip().lower()
    if not input:
        # `is`, not `==`: the sentinel check must be identity-based so an
        # argument with a custom __eq__ cannot be mistaken for "no default".
        if default is _NO_DEFAULT:
            raise ValueError('Empty input with no default')
        return default
    if not considered_yes:
        considered_yes = ['y', 'yes', 't', 'true', '1']
    return input in considered_yes
||||
|
||||
def main():
    """Prompt for a train number and print its route, status and timetable."""
    train_no = int(input('Train number: '))
    yesterday_answer = input('Train departed yesterday? [y/N] ')
    data = scrape(train_no, use_yesterday=check_yes_no(yesterday_answer, default=False))

    route = data['route']
    print(f'Train {train_no}\t{route["from"]}\t{route["to"]}')
    print()

    status = data.get('status')
    if status:
        delay = status['delay']
        delay_text = 'on time' if delay == 0 else f'{delay} min'
        print(f'Status: {delay_text}\t{status["state"]}\t{status["station"]}')
        print()

    # One tab-separated row per stop: arrival time, name, departure time.
    for stop in data['stations']:
        arrival = stop.get('arrival')
        departure = stop.get('departure')
        print(arrival['scheduleTime'] if arrival else '', end='\t')
        print(stop['name'], end='\t')
        print(departure['scheduleTime'] if departure else '', end='\t')
        print()


if __name__ == '__main__':
    main()
@ -0,0 +1,157 @@
|
||||
#! /usr/bin/env python3 |
||||
|
||||
from datetime import datetime, timedelta |
||||
import re |
||||
|
||||
import requests |
||||
from bs4 import BeautifulSoup |
||||
from urllib.parse import quote, urlencode |
||||
|
||||
# Parses the human-readable status line, e.g. "5 min întârziere la sosirea în X."
# or "Fără întârziere la plecarea din Y.". Groups: (minutes or None for "Fără",
# "întârziere"/"mai devreme", the event phrase, the station name).
SL_REGEX = re.compile(r'^(?:Fără|([0-9]+) min) (întârziere|mai devreme) la (trecerea fără oprire prin|sosirea în|plecarea din) (.+)\.$')
# Maps the first letter of the matched event phrase to a state name:
# 't'(recerea...) -> passing, 's'(osirea...) -> arrival, 'p'(lecarea...) -> departure.
SL_STATE_MAP = {
    't': 'passing',
    's': 'arrival',
    'p': 'departure',
}

# Character-class body: Latin letters plus Romanian diacritics, used when
# matching station names in the route heading.
RO_LETTERS = r'A-Za-zăâîșțĂÂÎȚȘ'

# "Parcurs tren <from>-<to>" heading; captures origin and destination
# (separator may be a hyphen or an en dash).
ROUTE_REGEX = re.compile(rf'^Parcurs tren ([{RO_LETTERS} ]+)[-–]([{RO_LETTERS} ]+)$')

# "km <n>" — distance travelled up to a station.
KM_REGEX = re.compile(r'^km ([0-9]+)$')

# "linia <platform>" — platform label.
PLATFORM_REGEX = re.compile(r'^linia (.+)$')

# "<n> min oprire" — scheduled stopping time in minutes.
STOPPING_TIME_REGEX = re.compile(r'^([0-9]+) min oprire$')

# Per-station arrival/departure status: either "la timp" (on time) or a signed
# "+N/-N min (...)" delay; a trailing '*' marks the value as an estimate.
STATION_DEPARR_STATUS_REGEX = re.compile(r'^(?:(la timp)|(?:((?:\+|-)[0-9]+) min \((?:(?:întârziere)|(?:mai devreme))\)))(\*?)$')
||||
|
||||
def collapse_space(string: str) -> str:
    """Collapse every run of ASCII whitespace into one space and trim the ends."""
    whitespace_run = rf'[{BeautifulSoup.ASCII_SPACES}]+'
    collapsed = re.sub(whitespace_run, ' ', string, flags=re.MULTILINE)
    return collapsed.strip()
||||
|
||||
def build_url(base: str, /, query=None, **kwargs):
    """Build a URL by quoting values into *base* and appending a query string.

    Args:
        base: Format string with ``{placeholder}`` fields for path segments.
        query: Query parameters — a mapping or a sequence of (key, value)
            pairs, anything ``urllib.parse.urlencode`` accepts (the original
            ``dict`` annotation was wrong: callers also pass lists of pairs).
            Falsy (None/empty) appends no query string.
        **kwargs: Values substituted into *base*; each is str()-ed and
            percent-quoted, so separators inside values cannot break the path.

    Returns:
        The formatted URL as a string.
    """
    result = base.format(**{ k: quote(str(v)) for k, v in kwargs.items() })
    if query:
        result += '?' + urlencode(query)
    return result
||||
|
||||
def scrape(train_no: int, use_yesterday=False, date_override=None):
    """Scrape mersultrenurilor.infofer.ro for live info about one train.

    Args:
        train_no: The train's running number.
        use_yesterday: Look up the run that departed the previous day.
        date_override: Date to use instead of today/yesterday; takes
            precedence over use_yesterday when set.

    Returns:
        dict with 'route' (from/to names), 'status' (delay/station/state,
        or None when no status line could be parsed) and 'stations'
        (one dict per stop: name, km, stoppingTime, platform,
        arrival/departure times with optional delay status).

    NOTE(review): all extraction below relies on the exact HTML layout of
    the InfoFer result page (positional div indexing, Romanian label
    text) — any site redesign will break it silently.
    """
    # Start scraping session — the result page must be requested through
    # a form POST carrying this session's hidden fields.
    s = requests.Session()

    date = datetime.today()
    if use_yesterday:
        date -= timedelta(days=1)
    if date_override:
        date = date_override

    # First GET renders the search form for the requested train and date.
    r = s.get(build_url(
        'https://mersultrenurilor.infofer.ro/ro-RO/Tren/{train_no}',
        train_no=train_no,
        query=[
            ('Date', date.strftime('%d.%m.%Y')),
        ],
    ))

    soup = BeautifulSoup(r.text, features='html.parser')
    sform = soup.find(id='form-search')
    # required_fields = [
    #     'Date',
    #     'TrainRunningNumber',
    #     'SelectedBranchCode',
    #     'ReCaptcha',
    #     'ConfirmationKey',
    #     'IsSearchWanted',
    #     'IsReCaptchaFailed',
    #     '__RequestVerificationToken',
    # ]
    # result_data = { field: sform.find('input', attrs={'name': field})['value'] for field in required_fields }
    # Echo back every hidden input of the search form (incl. the
    # anti-forgery token) rather than a hand-picked field list.
    result_data = { elem['name']: elem['value'] for elem in sform('input') }

    r = s.post('https://mersultrenurilor.infofer.ro/ro-RO/Trains/TrainsResult', data=result_data)
    soup = BeautifulSoup(r.text, features='html.parser')

    scraped = {}

    # Positional navigation: the 4th top-level <div> holds the results.
    results_div = soup('div', recursive=False)[3].div
    status_div = results_div('div', recursive=False)[0]
    route_text = collapse_space(status_div.h4.text)
    route_from, route_to = ROUTE_REGEX.match(route_text).groups()
    scraped['route'] = {
        'from': route_from,
        'to': route_to,
    }
    try:
        status_line_match = SL_REGEX.match(collapse_space(status_div.div.text))
        slm_delay, slm_late, slm_arrival, slm_station = status_line_match.groups()
        scraped['status'] = {
            # "Fără ..." yields no minute group -> delay 0; "mai devreme"
            # (early) is encoded as a negative delay.
            'delay': (int(slm_delay) if slm_late == 'întârziere' else -int(slm_delay)) if slm_delay else 0,
            'station': slm_station,
            # First letter of the event phrase selects passing/arrival/departure.
            'state': SL_STATE_MAP[slm_arrival[0]],
        }
    except Exception:
        # Best-effort: pages without a parsable status line give status=None.
        scraped['status'] = None

    stations = status_div.ul('li', recursive=False)
    scraped['stations'] = []
    for station in stations:
        station_scraped = {}

        # Each <li> row has three columns: arrival cell, station info cell,
        # departure cell (see scrape_time calls below).
        left, middle, right = station.div('div', recursive=False)
        station_scraped['name'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[0].text)
        station_scraped['km'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[1].text)
        station_scraped['km'] = int(KM_REGEX.match(station_scraped['km']).groups()[0])
        station_scraped['stoppingTime'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[2].text)
        # Empty cell -> no scheduled stop time; otherwise "<n> min oprire".
        if not station_scraped['stoppingTime']:
            station_scraped['stoppingTime'] = None
        else:
            station_scraped['stoppingTime'] = int(STOPPING_TIME_REGEX.match(station_scraped['stoppingTime']).groups()[0])
        station_scraped['platform'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[3].text)
        # Empty cell -> platform unknown; otherwise "linia <platform>".
        if not station_scraped['platform']:
            station_scraped['platform'] = None
        else:
            station_scraped['platform'] = PLATFORM_REGEX.match(station_scraped['platform']).groups()[0]

        def scrape_time(elem, setter):
            # Parse one arrival/departure cell; calls setter(None) when the
            # cell is empty (e.g. no arrival at the origin station).
            parts = elem.div.div('div', recursive=False)
            if parts:
                result = {}

                time, *_ = parts
                result['scheduleTime'] = collapse_space(time.text)
                if len(parts) >= 2:
                    # A second sub-div carries the delay status text.
                    _, status, *_ = parts
                    result['status'] = {}
                    on_time, delay, approx = STATION_DEPARR_STATUS_REGEX.match(collapse_space(status.text)).groups()
                    result['status']['delay'] = 0 if on_time else int(delay)
                    # Trailing '*' in the status marks an estimated delay,
                    # so 'real' is False for starred values.
                    result['status']['real'] = not approx
                else:
                    result['status'] = None

                setter(result)
            else:
                setter(None)

        scrape_time(left, lambda value: station_scraped.update(arrival=value))
        scrape_time(right, lambda value: station_scraped.update(departure=value))

        scraped['stations'].append(station_scraped)

    return scraped
||||
|
||||
|
||||
def main():
    """Ad-hoc smoke test: scrape a known train and pretty-print the result."""
    from pprint import pprint

    train_no = 1538
    print(f'Testing package with train number {train_no}')
    pprint(scrape(train_no))


if __name__ == '__main__':
    main()
@ -0,0 +1,99 @@
|
||||
{ |
||||
"$schema": "http://json-schema.org/schema", |
||||
"title": "Train Info InfoFer Scrap Result Schema", |
||||
"description": "Results of scrapping InfoFer website for train info", |
||||
"definitions": { |
||||
"delayType": { |
||||
"description": "Delay of the train (negative for being early)", |
||||
"type": "number" |
||||
}, |
||||
"stationArrDepTime": { |
||||
"description": "Time of arrival at/departure from station", |
||||
"type": ["object", "null"], |
||||
"properties": { |
||||
"scheduleTime": { |
||||
"description": "The time the train is scheduled to arrive/depart", |
||||
"type": "string" |
||||
}, |
||||
"status": { |
||||
"type": ["object", "null"], |
||||
"properties": { |
||||
"delay": { |
||||
"$ref": "#/definitions/delayType" |
||||
}, |
||||
"real": { |
||||
"description": "Determines whether delay was actually reported or is an approximation", |
||||
"type": "boolean" |
||||
} |
||||
}, |
||||
"required": ["delay", "real"] |
||||
} |
||||
}, |
||||
"required": ["scheduleTime"] |
||||
} |
||||
}, |
||||
"type": "object", |
||||
"properties": { |
||||
"route": { |
||||
"description": "Route of the train", |
||||
"type": "object", |
||||
"properties": { |
||||
"from": { |
||||
"type": "string" |
||||
}, |
||||
"to": { |
||||
"type": "string" |
||||
} |
||||
}, |
||||
"required": ["from", "to"] |
||||
}, |
||||
"status": { |
||||
"description": "Current status of the train", |
||||
"type": ["object", "null"], |
||||
"properties": { |
||||
"delay": { |
||||
"$ref": "#/definitions/delayType" |
||||
}, |
||||
"station": { |
||||
"type": "string" |
||||
}, |
||||
"state": { |
||||
"type": "string", |
||||
"enum": ["passing", "arrival", "departure"] |
||||
} |
||||
} |
||||
}, |
||||
"stations": { |
||||
"description": "List of stations the train stops at", |
||||
"type": "array", |
||||
"items": { |
||||
"type": "object", |
||||
"properties": { |
||||
"name": { |
||||
"type": "string" |
||||
}, |
||||
"km": { |
||||
"description": "The distance the train travelled until reaching this station", |
||||
"type": "number" |
||||
}, |
||||
"stoppingTime": { |
||||
"description": "The number of minutes the train is scheduled to stop in this station", |
||||
"type": ["number", "null"] |
||||
}, |
||||
"platform": { |
||||
"description": "The platform the train stopped at", |
||||
"type": ["string", "null"] |
||||
}, |
||||
"arrival": { |
||||
"$ref": "#/definitions/stationArrDepTime" |
||||
}, |
||||
"departure": { |
||||
"$ref": "#/definitions/stationArrDepTime" |
||||
} |
||||
}, |
||||
"required": ["name", "km"] |
||||
} |
||||
} |
||||
}, |
||||
"required": ["route", "stations"] |
||||
} |
Loading…
Reference in new issue