From de78a094a1a2ddc8e5dc85a40a332a1cae7a4f1c Mon Sep 17 00:00:00 2001 From: Dan Cojocaru Date: Mon, 23 Aug 2021 20:12:10 +0300 Subject: [PATCH] Added v2 API --- Dockerfile | 6 +-- scraper/Pipfile | 1 + scraper/Pipfile.lock | 10 +++- scraper/scraper.py | 19 ++++++++ scraper/setup.py | 2 +- server/Pipfile.lock | 7 +++ server/main.py | 2 +- server/server.py | 33 ------------- server/{ => server}/__init__.py | 0 server/{ => server}/cache.py | 0 server/server/db.py | 85 +++++++++++++++++++++++++++++++++ server/{ => server}/scraper | 0 server/server/server.py | 53 ++++++++++++++++++++ server/server/utils.py | 18 +++++++ server/server/v2/__init__.py | 1 + server/server/v2/v2.py | 32 +++++++++++++ 16 files changed, 230 insertions(+), 39 deletions(-) delete mode 100644 server/server.py rename server/{ => server}/__init__.py (100%) rename server/{ => server}/cache.py (100%) create mode 100644 server/server/db.py rename server/{ => server}/scraper (100%) create mode 100644 server/server/server.py create mode 100644 server/server/utils.py create mode 100644 server/server/v2/__init__.py create mode 100644 server/server/v2/v2.py diff --git a/Dockerfile b/Dockerfile index 1c25b40..364d64a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,16 +6,16 @@ WORKDIR /var/app/scraper COPY scraper/Pipfil* ./ COPY scraper/setup.py ./ WORKDIR /var/app/server -RUN ln -s /var/app/scraper scraper COPY server/Pipfil* ./ RUN pipenv install +RUN pipenv graph WORKDIR /var/app/scraper COPY scraper . WORKDIR /var/app/server COPY server . 
-RUN rm scraper -RUN ln -s /var/app/scraper scraper +RUN rm server/scraper +RUN ln -s /var/app/scraper ./server/scraper ENV PORT 5000 EXPOSE ${PORT} diff --git a/scraper/Pipfile b/scraper/Pipfile index 798e84f..864234d 100644 --- a/scraper/Pipfile +++ b/scraper/Pipfile @@ -6,6 +6,7 @@ name = "pypi" [packages] beautifulsoup4 = "*" requests = "*" +pytz = "*" [dev-packages] diff --git a/scraper/Pipfile.lock b/scraper/Pipfile.lock index 8f06250..1bb4905 100644 --- a/scraper/Pipfile.lock +++ b/scraper/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "e7c5f7eab5a8f9202caaaa3bdca8e911579596b8dd25319c2f50e84794eb9fa8" + "sha256": "d7e3ebca9807b4f0c9dcac014554e9d1c9cb3a0c30b5c71b0b7cd4ccdc4934e1" }, "pipfile-spec": 6, "requires": { @@ -48,6 +48,14 @@ "markers": "python_version >= '3'", "version": "==3.2" }, + "pytz": { + "hashes": [ + "sha256:83a4a90894bf38e243cf052c8b58f381bfe9a7a483f6a9cab140bc7f702ac4da", + "sha256:eb10ce3e7736052ed3623d49975ce333bcd712c7bb19a58b9e2089d4057d0798" + ], + "index": "pypi", + "version": "==2021.1" + }, "requests": { "hashes": [ "sha256:6c1246513ecd5ecd4528a0906f910e8f0f9c6b8ec72030dc9fd154dc1a6efd24", diff --git a/scraper/scraper.py b/scraper/scraper.py index b8c149f..9545b31 100644 --- a/scraper/scraper.py +++ b/scraper/scraper.py @@ -3,6 +3,7 @@ from datetime import datetime, timedelta import re +import pytz import requests from bs4 import BeautifulSoup from urllib.parse import quote, urlencode @@ -30,6 +31,18 @@ STOPPING_TIME_REGEX = re.compile(r'^([0-9]+) min oprire$') STATION_DEPARR_STATUS_REGEX = re.compile(r'^(?:(la timp)|(?:((?:\+|-)[0-9]+) min \((?:(?:întârziere)|(?:mai devreme))\)))(\*?)$') +class DateTimeSequencer: + def __init__(self, year: int, month: int, day: int) -> None: + self.current = datetime(year, month, day, 0, 0, 0) + self.current -= timedelta(seconds=1) + + def __call__(self, hour: int, minute: int = 0, second: int = 0) -> datetime: + potential_new_date = datetime(self.current.year, self.current.month, 
self.current.day, hour, minute, second) + if (self.current > potential_new_date): + potential_new_date += timedelta(days=1) + self.current = potential_new_date + return self.current + def collapse_space(string: str) -> str: return re.sub( rf'[{BeautifulSoup.ASCII_SPACES}]+', @@ -77,6 +90,8 @@ def scrape(train_no: int, use_yesterday=False, date_override=None): train_info_div = train_info_div.div('div', recursive=False)[0] scraped['rank'], scraped['number'], scraped['date'] = TRAIN_INFO_REGEX.match(collapse_space(train_info_div.h2.text)).groups() + date_d, date_m, date_y = (int(comp) for comp in scraped['date'].split('.')) + date = datetime(date_y, date_m, date_d) scraped['operator'] = OPERATOR_REGEX.match(collapse_space(train_info_div.p.text)).groups()[0] @@ -101,6 +116,8 @@ def scrape(train_no: int, use_yesterday=False, date_override=None): stations = status_div.ul('li', recursive=False) scraped['stations'] = [] + dt_seq = DateTimeSequencer(date.year, date.month, date.day) + tz = pytz.timezone('Europe/Bucharest') for station in stations: station_scraped = {} @@ -126,6 +143,8 @@ def scrape(train_no: int, use_yesterday=False, date_override=None): time, *_ = parts result['scheduleTime'] = collapse_space(time.text) + st_hr, st_min = (int(comp) for comp in result['scheduleTime'].split(':')) + result['scheduleTime'] = tz.localize(dt_seq(st_hr, st_min)).isoformat() if len(parts) >= 2: _, status, *_ = parts result['status'] = {} diff --git a/scraper/setup.py b/scraper/setup.py index e4c6932..ee96682 100644 --- a/scraper/setup.py +++ b/scraper/setup.py @@ -4,5 +4,5 @@ setup( name='InfoFer_Scraper', version='0.1', author='Dan Cojocaru', - install_requires=['beautifulsoup4', 'requests'] + install_requires=['beautifulsoup4', 'requests', 'pytz'] ) \ No newline at end of file diff --git a/server/Pipfile.lock b/server/Pipfile.lock index e5e4153..b1bfb9f 100644 --- a/server/Pipfile.lock +++ b/server/Pipfile.lock @@ -298,6 +298,13 @@ "markers": "python_version >= '2.7' and 
python_version not in '3.0, 3.1, 3.2, 3.3'", "version": "==2.20" }, + "pytz": { + "hashes": [ + "sha256:83a4a90894bf38e243cf052c8b58f381bfe9a7a483f6a9cab140bc7f702ac4da", + "sha256:eb10ce3e7736052ed3623d49975ce333bcd712c7bb19a58b9e2089d4057d0798" + ], + "version": "==2021.1" + }, "requests": { "hashes": [ "sha256:6c1246513ecd5ecd4528a0906f910e8f0f9c6b8ec72030dc9fd154dc1a6efd24", diff --git a/server/main.py b/server/main.py index d427140..7f1074b 100644 --- a/server/main.py +++ b/server/main.py @@ -1,5 +1,5 @@ from gevent.pywsgi import WSGIServer -from server import app +from server.server import app def main(): port = 5000 diff --git a/server/server.py b/server/server.py deleted file mode 100644 index 8fccdd3..0000000 --- a/server/server.py +++ /dev/null @@ -1,33 +0,0 @@ -from flask import Flask, json, request, jsonify - -from cache import CachedData - -app = Flask(__name__) - -@app.route('/') -def root(): - return 'Test' - -train_data_cache = {} - -@app.route('/train/<int:train_no>') -def get_train_info(train_no: int): - def get_data(): - print(f'Cache miss for {train_no}') - from scraper.scraper import scrape - use_yesterday = False - return scrape(train_no, use_yesterday=use_yesterday) - if train_no not in train_data_cache: - train_data_cache[train_no] = CachedData(get_data, validity=1000 * 30) - data, fetch_time = train_data_cache[train_no]() - resp = jsonify(data) - resp.headers['X-Last-Fetched'] = fetch_time.isoformat() - return resp - -@app.route('/trains') -def get_trains(): - return jsonify(list(train_data_cache.keys())) - -if __name__ == '__main__': - print('Starting debug server on port 5001') - app.run(port=5000) diff --git a/server/__init__.py b/server/server/__init__.py similarity index 100% rename from server/__init__.py rename to server/server/__init__.py diff --git a/server/cache.py b/server/server/cache.py similarity index 100% rename from server/cache.py rename to server/server/cache.py diff --git a/server/server/db.py b/server/server/db.py new file mode
100644 index 0000000..aa30109 --- /dev/null +++ b/server/server/db.py @@ -0,0 +1,85 @@ +# Globals +stations = [] +trains = [] + +# Examples +example_station = { + 'name': 'Gară', + 'stoppedAtBy': [123, 456] +} + +example_train = { + 'rank': 'IR', + 'numberString': '74', + 'number': 74, + 'company': 'CFR Călători' +} + +# Init + +import json +import os +from os import path, stat +from .utils import take_while + +DB_DIR = os.environ.get('DB_DIR', '') or './db' +if not path.exists(DB_DIR): + os.mkdir(DB_DIR) + +STATIONS_FILE = path.join(DB_DIR, 'stations.json') + +if path.exists(STATIONS_FILE): + with open(STATIONS_FILE) as f: + stations = json.load(f) + +TRAINS_FILE = path.join(DB_DIR, 'trains.json') + +if path.exists(TRAINS_FILE): + with open(TRAINS_FILE) as f: + trains = json.load(f) + +def found_train(rank: str, number: str, company: str) -> int: + number_int = int(''.join(take_while(lambda s: str(s).isnumeric(), number))) + try: + next(filter(lambda tr: tr['number'] == number_int, trains)) + except StopIteration: + trains.append({ + 'number': number_int, + 'numberString': number, + 'company': company, + 'rank': rank, + }) + with open(TRAINS_FILE, 'w') as f: + json.dump(trains, f) + return number_int + +def found_station(name: str): + try: + next(filter(lambda s: s['name'] == name, stations)) + except StopIteration: + stations.append({ + 'name': name, + 'stoppedAtBy': [], + }) + stations.sort(key=lambda s: len(s['stoppedAtBy']), reverse=True) + with open(STATIONS_FILE, 'w') as f: + json.dump(stations, f) + +def found_train_at_station(station_name: str, train_number: int): + found_station(station_name) + for i in range(len(stations)): + if stations[i]['name'] == station_name: + if train_number not in stations[i]['stoppedAtBy']: + stations[i]['stoppedAtBy'].append(train_number) + stations.sort(key=lambda s: len(s['stoppedAtBy']), reverse=True) + with open(STATIONS_FILE, 'w') as f: + json.dump(stations, f) + break + +def on_train_data(train_data: dict): + train_no = 
found_train(train_data['rank'], train_data['number'], train_data['operator']) + for station in train_data['stations']: + found_train_at_station(station['name'], train_no) + +def on_train_lookup_failure(train_no: int): + pass diff --git a/server/scraper b/server/server/scraper similarity index 100% rename from server/scraper rename to server/server/scraper diff --git a/server/server/server.py b/server/server/server.py new file mode 100644 index 0000000..aaf5a46 --- /dev/null +++ b/server/server/server.py @@ -0,0 +1,53 @@ +print(f'Server {__name__=}') + +import datetime +from flask import Flask, json, request, jsonify + +from .cache import CachedData + +app = Flask(__name__) + +from .v2 import v2 +app.register_blueprint(v2.bp) + +@app.route('/') +def root(): + return 'Test' + +train_data_cache = {} + +@app.route('/train/<int:train_no>') +def get_train_info(train_no: int): + def get_data(): + from .scraper.scraper import scrape + use_yesterday = False + result = scrape(train_no, use_yesterday=use_yesterday) + + from . 
import db + db.on_train_data(result) + + # Convert to v1 + # datetime ISO string to hh:mm + for i in range(len(result['stations'])): + if result['stations'][i]['arrival']: + date = datetime.datetime.fromisoformat(result['stations'][i]['arrival']['scheduleTime']) + result['stations'][i]['arrival']['scheduleTime'] = f'{date.hour}:{date.minute:02}' + if result['stations'][i]['departure']: + date = datetime.datetime.fromisoformat(result['stations'][i]['departure']['scheduleTime']) + result['stations'][i]['departure']['scheduleTime'] = f'{date.hour}:{date.minute:02}' + + return result + if train_no not in train_data_cache: + train_data_cache[train_no] = CachedData(get_data, validity=1000 * 30) + data, fetch_time = train_data_cache[train_no]() + resp = jsonify(data) + resp.headers['X-Last-Fetched'] = fetch_time.isoformat() + return resp + +@app.route('/trains') +def get_trains(): + return jsonify(list(train_data_cache.keys())) + +if __name__ == '__main__': + print('Starting debug server on port 5001') + app.run(port=5000) diff --git a/server/server/utils.py b/server/server/utils.py new file mode 100644 index 0000000..81fcb30 --- /dev/null +++ b/server/server/utils.py @@ -0,0 +1,18 @@ +def take_while(predicate, input): + for element in input: + if not predicate(element): + break + yield element + +_NO_DEFAULT = object() + +def check_yes_no(input: str, default=_NO_DEFAULT, considered_yes=None) -> bool: + input = str(input).strip().lower() + if not input: + if default == _NO_DEFAULT: + raise Exception('Empty input with no default') + return default + if not considered_yes: + considered_yes = ['y', 'yes', 't', 'true', '1'] + return input in considered_yes + diff --git a/server/server/v2/__init__.py b/server/server/v2/__init__.py new file mode 100644 index 0000000..03c5bab --- /dev/null +++ b/server/server/v2/__init__.py @@ -0,0 +1 @@ +__all__ = ['v2'] \ No newline at end of file diff --git a/server/server/v2/v2.py b/server/server/v2/v2.py new file mode 100644 index 
0000000..d9a3fd1 --- /dev/null +++ b/server/server/v2/v2.py @@ -0,0 +1,32 @@ +from flask import Blueprint, jsonify, request + +from .. import db +from ..cache import CachedData +from ..utils import check_yes_no + +bp = Blueprint('v2', __name__, url_prefix='/v2') + +@bp.get('/trains') +def get_known_trains(): + return jsonify(db.trains) + +@bp.get('/stations') +def get_known_stations(): + return jsonify(db.stations) + +train_data_cache = {} + +@bp.route('/train/<int:train_no>') +def get_train_info(train_no: int): + use_yesterday = check_yes_no(request.args.get('use_yesterday', ''), default=False) + def get_data(): + from ..scraper.scraper import scrape + result = scrape(train_no, use_yesterday=use_yesterday) + db.on_train_data(result) + return result + if train_no not in train_data_cache: + train_data_cache[(train_no, use_yesterday)] = CachedData(get_data, validity=1000 * 30) + data, fetch_time = train_data_cache[(train_no, use_yesterday)]() + resp = jsonify(data) + resp.headers['X-Last-Fetched'] = fetch_time.isoformat() + return resp