From 89cefc3fb3724d39e7b4585297eae9b61a6616bc Mon Sep 17 00:00:00 2001
From: Dan Cojocaru
Date: Sun, 22 Aug 2021 05:55:02 +0300
Subject: [PATCH] Initial commit

---
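Usage sketch (illustrative; 1538 is just an example train number and must
exist in the InfoFer timetable for the requested date):

    $ cd scrapper
    $ pipenv install
    $ pipenv run python main.py

or, from Python, with the scrapper/ directory on the path:

    from scraper import scrape
    data = scrape(1538)                      # today's timetable
    data = scrape(1538, use_yesterday=True)  # train that departed yesterday
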
'4'", + "version": "==1.26.6" + } + }, + "develop": {} +} diff --git a/scrapper/__init__.py b/scrapper/__init__.py new file mode 100644 index 0000000..448bf8b --- /dev/null +++ b/scrapper/__init__.py @@ -0,0 +1 @@ +__all__ = ['scrapper'] diff --git a/scrapper/main.py b/scrapper/main.py new file mode 100644 index 0000000..7de5a40 --- /dev/null +++ b/scrapper/main.py @@ -0,0 +1,44 @@ +from scraper import scrape + +_NO_DEFAULT = object() + +def check_yes_no(input: str, default=_NO_DEFAULT, considered_yes=None) -> bool: + input = str(input).strip().lower() + if not input: + if default == _NO_DEFAULT: + raise Exception('Empty input with no default') + return default + if not considered_yes: + considered_yes = ['y', 'yes', 't', 'true', '1'] + return input in considered_yes + +def main(): + train_no = int(input('Train number: ')) + use_yesterday = input('Train departed yesterday? [y/N] ') + data = scrape(train_no, use_yesterday=check_yes_no(use_yesterday, default=False)) + print(f'Train {train_no}\t{data["route"]["from"]}\t{data["route"]["to"]}') + print() + if 'status' in data and data['status']: + delay = data['status']['delay'] + if delay == 0: + delay = 'on time' + else: + delay = f'{delay} min' + state = data['status']['state'] + station = data['status']['station'] + print(f'Status: {delay}\t{state}\t{station}') + print() + for station in data['stations']: + if 'arrival' in station and station['arrival']: + print(station['arrival']['scheduleTime'], end='\t') + else: + print(end='\t') + print(station['name'], end='\t') + if 'departure' in station and station['departure']: + print(station['departure']['scheduleTime'], end='\t') + else: + print(end='\t') + print() + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/scrapper/scraper.py b/scrapper/scraper.py new file mode 100644 index 0000000..5834036 --- /dev/null +++ b/scrapper/scraper.py @@ -0,0 +1,157 @@ +#! /usr/bin/env python3 + +from datetime import datetime, timedelta +import re + +import requests +from bs4 import BeautifulSoup +from urllib.parse import quote, urlencode + +SL_REGEX = re.compile(r'^(?:Fără|([0-9]+) min) (întârziere|mai devreme) la (trecerea fără oprire prin|sosirea în|plecarea din) (.+)\.$') +SL_STATE_MAP = { + 't': 'passing', + 's': 'arrival', + 'p': 'departure', +} + +RO_LETTERS = r'A-Za-zăâîșțĂÂÎȚȘ' + +ROUTE_REGEX = re.compile(rf'^Parcurs tren ([{RO_LETTERS} ]+)[-–]([{RO_LETTERS} ]+)$') + +KM_REGEX = re.compile(r'^km ([0-9]+)$') + +PLATFORM_REGEX = re.compile(r'^linia (.+)$') + +STOPPING_TIME_REGEX = re.compile(r'^([0-9]+) min oprire$') + +STATION_DEPARR_STATUS_REGEX = re.compile(r'^(?:(la timp)|(?:((?:\+|-)[0-9]+) min \((?:(?:întârziere)|(?:mai devreme))\)))(\*?)$') + +def collapse_space(string: str) -> str: + return re.sub( + rf'[{BeautifulSoup.ASCII_SPACES}]+', + ' ', + string, + flags=re.MULTILINE + ).strip() + +def build_url(base: str, /, query: dict, **kwargs): + result = base.format(**{ k: quote(str(v)) for k, v in kwargs.items() }) + if query: + result += '?' 
+def build_url(base: str, /, query, **kwargs):
+    result = base.format(**{ k: quote(str(v)) for k, v in kwargs.items() })
+    if query:
+        result += '?'
+        result += urlencode(query)
+    return result
+
+def scrape(train_no: int, use_yesterday=False, date_override=None):
+    # Start a scraping session; a Session keeps cookies between the two requests below
+    s = requests.Session()
+
+    date = datetime.today()
+    if use_yesterday:
+        date -= timedelta(days=1)
+    if date_override:
+        date = date_override
+
+    r = s.get(build_url(
+        'https://mersultrenurilor.infofer.ro/ro-RO/Tren/{train_no}',
+        train_no=train_no,
+        query=[
+            ('Date', date.strftime('%d.%m.%Y')),
+        ],
+    ))
+
+    soup = BeautifulSoup(r.text, features='html.parser')
+    sform = soup.find(id='form-search')
+    # required_fields = [
+    #     'Date',
+    #     'TrainRunningNumber',
+    #     'SelectedBranchCode',
+    #     'ReCaptcha',
+    #     'ConfirmationKey',
+    #     'IsSearchWanted',
+    #     'IsReCaptchaFailed',
+    #     '__RequestVerificationToken',
+    # ]
+    # result_data = { field: sform.find('input', attrs={'name': field})['value'] for field in required_fields }
+    result_data = { elem['name']: elem['value'] for elem in sform('input') }
+
+    r = s.post('https://mersultrenurilor.infofer.ro/ro-RO/Trains/TrainsResult', data=result_data)
+    soup = BeautifulSoup(r.text, features='html.parser')
+
+    scraped = {}
+
+    results_div = soup('div', recursive=False)[3].div
+    status_div = results_div('div', recursive=False)[0]
+    route_text = collapse_space(status_div.h4.text)
+    route_from, route_to = ROUTE_REGEX.match(route_text).groups()
+    scraped['route'] = {
+        'from': route_from,
+        'to': route_to,
+    }
+    try:
+        status_line_match = SL_REGEX.match(collapse_space(status_div.div.text))
+        slm_delay, slm_late, slm_arrival, slm_station = status_line_match.groups()
+        scraped['status'] = {
+            'delay': (int(slm_delay) if slm_late == 'întârziere' else -int(slm_delay)) if slm_delay else 0,
+            'station': slm_station,
+            'state': SL_STATE_MAP[slm_arrival[0]],
+        }
+    except Exception:
+        scraped['status'] = None
+
+    stations = status_div.ul('li', recursive=False)
+    scraped['stations'] = []
+    for station in stations:
+        station_scraped = {}
+
+        left, middle, right = station.div('div', recursive=False)
+        station_scraped['name'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[0].text)
+        station_scraped['km'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[1].text)
+        station_scraped['km'] = int(KM_REGEX.match(station_scraped['km']).groups()[0])
+        station_scraped['stoppingTime'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[2].text)
+        if not station_scraped['stoppingTime']:
+            station_scraped['stoppingTime'] = None
+        else:
+            station_scraped['stoppingTime'] = int(STOPPING_TIME_REGEX.match(station_scraped['stoppingTime']).groups()[0])
+        station_scraped['platform'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[3].text)
+        if not station_scraped['platform']:
+            station_scraped['platform'] = None
+        else:
+            station_scraped['platform'] = PLATFORM_REGEX.match(station_scraped['platform']).groups()[0]
+
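+        # Each arrival/departure cell holds one or two stacked divs: the
+        # scheduled time, optionally followed by a delay annotation such as
+        # "la timp" or "+5 min (întârziere)"; a trailing "*" marks the delay
+        # as estimated rather than actually reported.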
+        def scrape_time(elem, setter):
+            parts = elem.div.div('div', recursive=False)
+            if parts:
+                result = {}
+
+                time, *_ = parts
+                result['scheduleTime'] = collapse_space(time.text)
+                if len(parts) >= 2:
+                    _, status, *_ = parts
+                    result['status'] = {}
+                    on_time, delay, approx = STATION_DEPARR_STATUS_REGEX.match(collapse_space(status.text)).groups()
+                    result['status']['delay'] = 0 if on_time else int(delay)
+                    result['status']['real'] = not approx
+                else:
+                    result['status'] = None
+
+                setter(result)
+            else:
+                setter(None)
+
+        scrape_time(left, lambda value: station_scraped.update(arrival=value))
+        scrape_time(right, lambda value: station_scraped.update(departure=value))
+
+        scraped['stations'].append(station_scraped)
+
+    return scraped
+
+
+def main():
+    train_no = 1538
+    print(f'Testing package with train number {train_no}')
+    from pprint import pprint
+    # pprint(scrape('473'))
+    pprint(scrape(train_no))
+
+if __name__ == '__main__':
+    main()
diff --git a/scrapper/trainInfoScrapResultSchema.json b/scrapper/trainInfoScrapResultSchema.json
new file mode 100644
index 0000000..ec624f2
--- /dev/null
+++ b/scrapper/trainInfoScrapResultSchema.json
@@ -0,0 +1,99 @@
+{
+    "$schema": "http://json-schema.org/schema",
+    "title": "Train Info InfoFer Scrape Result Schema",
+    "description": "Results of scraping the InfoFer website for train info",
+    "definitions": {
+        "delayType": {
+            "description": "Delay of the train (negative for being early)",
+            "type": "number"
+        },
+        "stationArrDepTime": {
+            "description": "Time of arrival at/departure from station",
+            "type": ["object", "null"],
+            "properties": {
+                "scheduleTime": {
+                    "description": "The time the train is scheduled to arrive/depart",
+                    "type": "string"
+                },
+                "status": {
+                    "type": ["object", "null"],
+                    "properties": {
+                        "delay": {
+                            "$ref": "#/definitions/delayType"
+                        },
+                        "real": {
+                            "description": "Whether the delay was actually reported or is an approximation",
+                            "type": "boolean"
+                        }
+                    },
+                    "required": ["delay", "real"]
+                }
+            },
+            "required": ["scheduleTime"]
+        }
+    },
+    "type": "object",
+    "properties": {
+        "route": {
+            "description": "Route of the train",
+            "type": "object",
+            "properties": {
+                "from": {
+                    "type": "string"
+                },
+                "to": {
+                    "type": "string"
+                }
+            },
+            "required": ["from", "to"]
+        },
+        "status": {
+            "description": "Current status of the train",
+            "type": ["object", "null"],
+            "properties": {
+                "delay": {
+                    "$ref": "#/definitions/delayType"
+                },
+                "station": {
+                    "type": "string"
+                },
+                "state": {
+                    "type": "string",
+                    "enum": ["passing", "arrival", "departure"]
+                }
+            }
+        },
+        "stations": {
+            "description": "List of stations along the train's route",
+            "type": "array",
+            "items": {
+                "type": "object",
+                "properties": {
+                    "name": {
+                        "type": "string"
+                    },
+                    "km": {
+                        "description": "The distance the train travelled until reaching this station",
+                        "type": "number"
+                    },
+                    "stoppingTime": {
+                        "description": "The number of minutes the train is scheduled to stop at this station",
+                        "type": ["number", "null"]
+                    },
+                    "platform": {
+                        "description": "The platform the train stopped at",
+                        "type": ["string", "null"]
+                    },
+                    "arrival": {
+                        "$ref": "#/definitions/stationArrDepTime"
+                    },
+                    "departure": {
+                        "$ref": "#/definitions/stationArrDepTime"
+                    }
+                },
+                "required": ["name", "km"]
+            }
+        }
+    },
+    "required": ["route", "stations"]
+}
\ No newline at end of file
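
For reference, a minimal scrape result that validates against the schema
above (all station names, times, and values are invented for illustration):

    {
        "route": {"from": "A", "to": "B"},
        "status": {"delay": 0, "station": "A", "state": "departure"},
        "stations": [
            {
                "name": "A",
                "km": 0,
                "stoppingTime": null,
                "platform": "1",
                "arrival": null,
                "departure": {"scheduleTime": "08:00", "status": {"delay": 0, "real": true}}
            }
        ]
    }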