From 89cefc3fb3724d39e7b4585297eae9b61a6616bc Mon Sep 17 00:00:00 2001
From: Dan Cojocaru
Date: Sun, 22 Aug 2021 05:55:02 +0300
Subject: [PATCH] Initial commit

---
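Usage sketch (illustrative; 1538 is just an example train number and must
exist in the InfoFer timetable for the requested date):

    $ cd scrapper
    $ pipenv install
    $ pipenv run python main.py

or, from Python, with the scrapper/ directory on the path:

    from scraper import scrape
    data = scrape(1538)                      # today's timetable
    data = scrape(1538, use_yesterday=True)  # train that departed yesterday
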
'4'", + "version": "==1.26.6" + } + }, + "develop": {} +} diff --git a/scrapper/__init__.py b/scrapper/__init__.py new file mode 100644 index 0000000..448bf8b --- /dev/null +++ b/scrapper/__init__.py @@ -0,0 +1 @@ +__all__ = ['scrapper'] diff --git a/scrapper/main.py b/scrapper/main.py new file mode 100644 index 0000000..7de5a40 --- /dev/null +++ b/scrapper/main.py @@ -0,0 +1,44 @@ +from scraper import scrape + +_NO_DEFAULT = object() + +def check_yes_no(input: str, default=_NO_DEFAULT, considered_yes=None) -> bool: + input = str(input).strip().lower() + if not input: + if default == _NO_DEFAULT: + raise Exception('Empty input with no default') + return default + if not considered_yes: + considered_yes = ['y', 'yes', 't', 'true', '1'] + return input in considered_yes + +def main(): + train_no = int(input('Train number: ')) + use_yesterday = input('Train departed yesterday? [y/N] ') + data = scrape(train_no, use_yesterday=check_yes_no(use_yesterday, default=False)) + print(f'Train {train_no}\t{data["route"]["from"]}\t{data["route"]["to"]}') + print() + if 'status' in data and data['status']: + delay = data['status']['delay'] + if delay == 0: + delay = 'on time' + else: + delay = f'{delay} min' + state = data['status']['state'] + station = data['status']['station'] + print(f'Status: {delay}\t{state}\t{station}') + print() + for station in data['stations']: + if 'arrival' in station and station['arrival']: + print(station['arrival']['scheduleTime'], end='\t') + else: + print(end='\t') + print(station['name'], end='\t') + if 'departure' in station and station['departure']: + print(station['departure']['scheduleTime'], end='\t') + else: + print(end='\t') + print() + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/scrapper/scraper.py b/scrapper/scraper.py new file mode 100644 index 0000000..5834036 --- /dev/null +++ b/scrapper/scraper.py @@ -0,0 +1,157 @@ +#! /usr/bin/env python3 + +from datetime import datetime, timedelta +import re + +import requests +from bs4 import BeautifulSoup +from urllib.parse import quote, urlencode + +SL_REGEX = re.compile(r'^(?:Fără|([0-9]+) min) (întârziere|mai devreme) la (trecerea fără oprire prin|sosirea în|plecarea din) (.+)\.$') +SL_STATE_MAP = { + 't': 'passing', + 's': 'arrival', + 'p': 'departure', +} + +RO_LETTERS = r'A-Za-zăâîșțĂÂÎȚȘ' + +ROUTE_REGEX = re.compile(rf'^Parcurs tren ([{RO_LETTERS} ]+)[-–]([{RO_LETTERS} ]+)$') + +KM_REGEX = re.compile(r'^km ([0-9]+)$') + +PLATFORM_REGEX = re.compile(r'^linia (.+)$') + +STOPPING_TIME_REGEX = re.compile(r'^([0-9]+) min oprire$') + +STATION_DEPARR_STATUS_REGEX = re.compile(r'^(?:(la timp)|(?:((?:\+|-)[0-9]+) min \((?:(?:întârziere)|(?:mai devreme))\)))(\*?)$') + +def collapse_space(string: str) -> str: + return re.sub( + rf'[{BeautifulSoup.ASCII_SPACES}]+', + ' ', + string, + flags=re.MULTILINE + ).strip() + +def build_url(base: str, /, query: dict, **kwargs): + result = base.format(**{ k: quote(str(v)) for k, v in kwargs.items() }) + if query: + result += '?' 
+def build_url(base: str, /, query, **kwargs):
+    result = base.format(**{ k: quote(str(v)) for k, v in kwargs.items() })
+    if query:
+        result += '?'
+        result += urlencode(query)
+    return result
+
+def scrape(train_no: int, use_yesterday=False, date_override=None):
+    # Start a scraping session; a Session keeps cookies between the two requests below
+    s = requests.Session()
+
+    date = datetime.today()
+    if use_yesterday:
+        date -= timedelta(days=1)
+    if date_override:
+        date = date_override
+
+    r = s.get(build_url(
+        'https://mersultrenurilor.infofer.ro/ro-RO/Tren/{train_no}',
+        train_no=train_no,
+        query=[
+            ('Date', date.strftime('%d.%m.%Y')),
+        ],
+    ))
+
+    soup = BeautifulSoup(r.text, features='html.parser')
+    sform = soup.find(id='form-search')
+    # required_fields = [
+    #     'Date',
+    #     'TrainRunningNumber',
+    #     'SelectedBranchCode',
+    #     'ReCaptcha',
+    #     'ConfirmationKey',
+    #     'IsSearchWanted',
+    #     'IsReCaptchaFailed',
+    #     '__RequestVerificationToken',
+    # ]
+    # result_data = { field: sform.find('input', attrs={'name': field})['value'] for field in required_fields }
+    result_data = { elem['name']: elem['value'] for elem in sform('input') }
+
+    r = s.post('https://mersultrenurilor.infofer.ro/ro-RO/Trains/TrainsResult', data=result_data)
+    soup = BeautifulSoup(r.text, features='html.parser')
+
+    scraped = {}
+
+    results_div = soup('div', recursive=False)[3].div
+    status_div = results_div('div', recursive=False)[0]
+    route_text = collapse_space(status_div.h4.text)
+    route_from, route_to = ROUTE_REGEX.match(route_text).groups()
+    scraped['route'] = {
+        'from': route_from,
+        'to': route_to,
+    }
+    try:
+        status_line_match = SL_REGEX.match(collapse_space(status_div.div.text))
+        slm_delay, slm_late, slm_arrival, slm_station = status_line_match.groups()
+        scraped['status'] = {
+            'delay': (int(slm_delay) if slm_late == 'întârziere' else -int(slm_delay)) if slm_delay else 0,
+            'station': slm_station,
+            'state': SL_STATE_MAP[slm_arrival[0]],
+        }
+    except Exception:
+        scraped['status'] = None
+
+    stations = status_div.ul('li', recursive=False)
+    scraped['stations'] = []
+    for station in stations:
+        station_scraped = {}
+
+        left, middle, right = station.div('div', recursive=False)
+        station_scraped['name'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[0].text)
+        station_scraped['km'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[1].text)
+        station_scraped['km'] = int(KM_REGEX.match(station_scraped['km']).groups()[0])
+        station_scraped['stoppingTime'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[2].text)
+        if not station_scraped['stoppingTime']:
+            station_scraped['stoppingTime'] = None
+        else:
+            station_scraped['stoppingTime'] = int(STOPPING_TIME_REGEX.match(station_scraped['stoppingTime']).groups()[0])
+        station_scraped['platform'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[3].text)
+        if not station_scraped['platform']:
+            station_scraped['platform'] = None
+        else:
+            station_scraped['platform'] = PLATFORM_REGEX.match(station_scraped['platform']).groups()[0]
+
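+        # Each arrival/departure cell holds one or two stacked divs: the
+        # scheduled time, optionally followed by a delay annotation such as
+        # "la timp" or "+5 min (întârziere)"; a trailing "*" marks the delay
+        # as estimated rather than actually reported.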
+        def scrape_time(elem, setter):
+            parts = elem.div.div('div', recursive=False)
+            if parts:
+                result = {}
+
+                time, *_ = parts
+                result['scheduleTime'] = collapse_space(time.text)
+                if len(parts) >= 2:
+                    _, status, *_ = parts
+                    result['status'] = {}
+                    on_time, delay, approx = STATION_DEPARR_STATUS_REGEX.match(collapse_space(status.text)).groups()
+                    result['status']['delay'] = 0 if on_time else int(delay)
+                    result['status']['real'] = not approx
+                else:
+                    result['status'] = None
+
+                setter(result)
+            else:
+                setter(None)
+
+        scrape_time(left, lambda value: station_scraped.update(arrival=value))
+        scrape_time(right, lambda value: station_scraped.update(departure=value))
+
+        scraped['stations'].append(station_scraped)
+
+    return scraped
+
+
+def main():
+    train_no = 1538
+    print(f'Testing package with train number {train_no}')
+    from pprint import pprint
+    # pprint(scrape('473'))
+    pprint(scrape(train_no))
+
+if __name__ == '__main__':
+    main()
diff --git a/scrapper/trainInfoScrapResultSchema.json b/scrapper/trainInfoScrapResultSchema.json
new file mode 100644
index 0000000..ec624f2
--- /dev/null
+++ b/scrapper/trainInfoScrapResultSchema.json
@@ -0,0 +1,99 @@
+{
+    "$schema": "http://json-schema.org/schema",
+    "title": "Train Info InfoFer Scrape Result Schema",
+    "description": "Results of scraping the InfoFer website for train info",
+    "definitions": {
+        "delayType": {
+            "description": "Delay of the train (negative for being early)",
+            "type": "number"
+        },
+        "stationArrDepTime": {
+            "description": "Time of arrival at/departure from station",
+            "type": ["object", "null"],
+            "properties": {
+                "scheduleTime": {
+                    "description": "The time the train is scheduled to arrive/depart",
+                    "type": "string"
+                },
+                "status": {
+                    "type": ["object", "null"],
+                    "properties": {
+                        "delay": {
+                            "$ref": "#/definitions/delayType"
+                        },
+                        "real": {
+                            "description": "Whether the delay was actually reported or is an approximation",
+                            "type": "boolean"
+                        }
+                    },
+                    "required": ["delay", "real"]
+                }
+            },
+            "required": ["scheduleTime"]
+        }
+    },
+    "type": "object",
+    "properties": {
+        "route": {
+            "description": "Route of the train",
+            "type": "object",
+            "properties": {
+                "from": {
+                    "type": "string"
+                },
+                "to": {
+                    "type": "string"
+                }
+            },
+            "required": ["from", "to"]
+        },
+        "status": {
+            "description": "Current status of the train",
+            "type": ["object", "null"],
+            "properties": {
+                "delay": {
+                    "$ref": "#/definitions/delayType"
+                },
+                "station": {
+                    "type": "string"
+                },
+                "state": {
+                    "type": "string",
+                    "enum": ["passing", "arrival", "departure"]
+                }
+            }
+        },
+        "stations": {
+            "description": "List of stations along the train's route",
+            "type": "array",
+            "items": {
+                "type": "object",
+                "properties": {
+                    "name": {
+                        "type": "string"
+                    },
+                    "km": {
+                        "description": "The distance the train travelled until reaching this station",
+                        "type": "number"
+                    },
+                    "stoppingTime": {
+                        "description": "The number of minutes the train is scheduled to stop at this station",
+                        "type": ["number", "null"]
+                    },
+                    "platform": {
+                        "description": "The platform the train stopped at",
+                        "type": ["string", "null"]
+                    },
+                    "arrival": {
+                        "$ref": "#/definitions/stationArrDepTime"
+                    },
+                    "departure": {
+                        "$ref": "#/definitions/stationArrDepTime"
+                    }
+                },
+                "required": ["name", "km"]
+            }
+        }
+    },
+    "required": ["route", "stations"]
+}
\ No newline at end of file
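
For reference, a minimal scrape result that validates against the schema
above (all station names, times, and values are invented for illustration):

    {
        "route": {"from": "A", "to": "B"},
        "status": {"delay": 0, "station": "A", "state": "departure"},
        "stations": [
            {
                "name": "A",
                "km": 0,
                "stoppingTime": null,
                "platform": "1",
                "arrival": null,
                "departure": {"scheduleTime": "08:00", "status": {"delay": 0, "real": true}}
            }
        ]
    }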