
Added v2 API

Branch: python3
Kenneth Bruen, 3 years ago
commit de78a094a1
Signed by: kbruen (GPG Key ID: CB77B9FE7F902176)
16 changed files (changed line counts in parentheses):

  1. Dockerfile (6)
  2. scraper/Pipfile (1)
  3. scraper/Pipfile.lock (10)
  4. scraper/scraper.py (19)
  5. scraper/setup.py (2)
  6. server/Pipfile.lock (7)
  7. server/main.py (2)
  8. server/server.py (33)
  9. server/server/__init__.py (0)
  10. server/server/cache.py (0)
  11. server/server/db.py (85)
  12. server/server/scraper (0)
  13. server/server/server.py (53)
  14. server/server/utils.py (18)
  15. server/server/v2/__init__.py (1)
  16. server/server/v2/v2.py (32)

Dockerfile (6 changed lines)

@@ -6,16 +6,16 @@ WORKDIR /var/app/scraper
 COPY scraper/Pipfil* ./
 COPY scraper/setup.py ./
 WORKDIR /var/app/server
-RUN ln -s /var/app/scraper scraper
 COPY server/Pipfil* ./
 RUN pipenv install
+RUN pipenv graph
 WORKDIR /var/app/scraper
 COPY scraper .
 WORKDIR /var/app/server
 COPY server .
-RUN rm scraper
-RUN ln -s /var/app/scraper scraper
+RUN rm server/scraper
+RUN ln -s /var/app/scraper ./server/scraper
 ENV PORT 5000
 EXPOSE ${PORT}
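
The net effect of the symlink rework is that the shared scraper checkout at /var/app/scraper is now linked into the server package directory (server/server/scraper) instead of the server root, matching the repo-side rename and the new relative imports. A rough Python equivalent of the two rewritten RUN lines, for illustration only (paths as in the Dockerfile, cwd /var/app/server):

from pathlib import Path

app = Path('/var/app')
link = app / 'server' / 'server' / 'scraper'  # 'server/scraper' relative to /var/app/server
if link.is_symlink() or link.exists():
    link.unlink()                             # RUN rm server/scraper
link.symlink_to(app / 'scraper')              # RUN ln -s /var/app/scraper ./server/scraper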

scraper/Pipfile (1 changed line)

@@ -6,6 +6,7 @@ name = "pypi"
 [packages]
 beautifulsoup4 = "*"
 requests = "*"
+pytz = "*"
 
 [dev-packages]

scraper/Pipfile.lock (generated, 10 changed lines)

@@ -1,7 +1,7 @@
 {
     "_meta": {
         "hash": {
-            "sha256": "e7c5f7eab5a8f9202caaaa3bdca8e911579596b8dd25319c2f50e84794eb9fa8"
+            "sha256": "d7e3ebca9807b4f0c9dcac014554e9d1c9cb3a0c30b5c71b0b7cd4ccdc4934e1"
         },
         "pipfile-spec": 6,
         "requires": {
@@ -48,6 +48,14 @@
             "markers": "python_version >= '3'",
             "version": "==3.2"
         },
+        "pytz": {
+            "hashes": [
+                "sha256:83a4a90894bf38e243cf052c8b58f381bfe9a7a483f6a9cab140bc7f702ac4da",
+                "sha256:eb10ce3e7736052ed3623d49975ce333bcd712c7bb19a58b9e2089d4057d0798"
+            ],
+            "index": "pypi",
+            "version": "==2021.1"
+        },
         "requests": {
             "hashes": [
                 "sha256:6c1246513ecd5ecd4528a0906f910e8f0f9c6b8ec72030dc9fd154dc1a6efd24",

scraper/scraper.py (19 changed lines)

@@ -3,6 +3,7 @@
 from datetime import datetime, timedelta
 import re
 
+import pytz
 import requests
 from bs4 import BeautifulSoup
 from urllib.parse import quote, urlencode
@@ -30,6 +31,18 @@ STOPPING_TIME_REGEX = re.compile(r'^([0-9]+) min oprire$')
 STATION_DEPARR_STATUS_REGEX = re.compile(r'^(?:(la timp)|(?:((?:\+|-)[0-9]+) min \((?:(?:întârziere)|(?:mai devreme))\)))(\*?)$')
 
+class DateTimeSequencer:
+    def __init__(self, year: int, month: int, day: int) -> None:
+        self.current = datetime(year, month, day, 0, 0, 0)
+        self.current -= timedelta(seconds=1)
+
+    def __call__(self, hour: int, minute: int = 0, second: int = 0) -> datetime:
+        potential_new_date = datetime(self.current.year, self.current.month, self.current.day, hour, minute, second)
+        if self.current > potential_new_date:
+            potential_new_date += timedelta(days=1)
+        self.current = potential_new_date
+        return self.current
+
 def collapse_space(string: str) -> str:
     return re.sub(
         rf'[{BeautifulSoup.ASCII_SPACES}]+',
@@ -77,6 +90,8 @@ def scrape(train_no: int, use_yesterday=False, date_override=None):
     train_info_div = train_info_div.div('div', recursive=False)[0]
     scraped['rank'], scraped['number'], scraped['date'] = TRAIN_INFO_REGEX.match(collapse_space(train_info_div.h2.text)).groups()
+    date_d, date_m, date_y = (int(comp) for comp in scraped['date'].split('.'))
+    date = datetime(date_y, date_m, date_d)
     scraped['operator'] = OPERATOR_REGEX.match(collapse_space(train_info_div.p.text)).groups()[0]
@@ -101,6 +116,8 @@ def scrape(train_no: int, use_yesterday=False, date_override=None):
     stations = status_div.ul('li', recursive=False)
     scraped['stations'] = []
+    dt_seq = DateTimeSequencer(date.year, date.month, date.day)
+    tz = pytz.timezone('Europe/Bucharest')
     for station in stations:
         station_scraped = {}
@@ -126,6 +143,8 @@ def scrape(train_no: int, use_yesterday=False, date_override=None):
             time, *_ = parts
             result['scheduleTime'] = collapse_space(time.text)
+            st_hr, st_min = (int(comp) for comp in result['scheduleTime'].split(':'))
+            result['scheduleTime'] = tz.localize(dt_seq(st_hr, st_min)).isoformat()
             if len(parts) >= 2:
                 _, status, *_ = parts
                 result['status'] = {}
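
The new DateTimeSequencer turns the bare hh:mm strings scraped from each station row into full timestamps: times are assumed to increase monotonically along the route, so whenever a clock time sorts before the previous one, the sequencer advances to the next calendar day; tz.localize then attaches the Europe/Bucharest offset before isoformat(). A quick illustration (class condensed from the diff above):

from datetime import datetime, timedelta

class DateTimeSequencer:  # as added in scraper/scraper.py above
    def __init__(self, year: int, month: int, day: int) -> None:
        # Start one second before midnight so the first call lands on the given day
        self.current = datetime(year, month, day) - timedelta(seconds=1)

    def __call__(self, hour: int, minute: int = 0, second: int = 0) -> datetime:
        potential_new_date = datetime(self.current.year, self.current.month, self.current.day, hour, minute, second)
        if self.current > potential_new_date:
            potential_new_date += timedelta(days=1)  # clock went backwards: next day
        self.current = potential_new_date
        return self.current

seq = DateTimeSequencer(2021, 8, 21)
print(seq(23, 50))  # 2021-08-21 23:50:00
print(seq(0, 10))   # 2021-08-22 00:10:00 -- 00:10 sorts before 23:50, so the day rolls over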

scraper/setup.py (2 changed lines)

@@ -4,5 +4,5 @@ setup(
     name='InfoFer_Scraper',
     version='0.1',
     author='Dan Cojocaru',
-    install_requires=['beautifulsoup4', 'requests']
+    install_requires=['beautifulsoup4', 'requests', 'pytz']
 )

server/Pipfile.lock (generated, 7 changed lines)

@@ -298,6 +298,13 @@
             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
             "version": "==2.20"
         },
+        "pytz": {
+            "hashes": [
+                "sha256:83a4a90894bf38e243cf052c8b58f381bfe9a7a483f6a9cab140bc7f702ac4da",
+                "sha256:eb10ce3e7736052ed3623d49975ce333bcd712c7bb19a58b9e2089d4057d0798"
+            ],
+            "version": "==2021.1"
+        },
         "requests": {
             "hashes": [
                 "sha256:6c1246513ecd5ecd4528a0906f910e8f0f9c6b8ec72030dc9fd154dc1a6efd24",

server/main.py (2 changed lines)

@@ -1,5 +1,5 @@
 from gevent.pywsgi import WSGIServer
-from server import app
+from server.server import app
 
 def main():
     port = 5000

server/server.py (deleted, 33 lines)

@@ -1,33 +0,0 @@
-from flask import Flask, json, request, jsonify
-
-from cache import CachedData
-
-app = Flask(__name__)
-
-@app.route('/')
-def root():
-    return 'Test'
-
-train_data_cache = {}
-
-@app.route('/train/<int:train_no>')
-def get_train_info(train_no: int):
-    def get_data():
-        print(f'Cache miss for {train_no}')
-        from scraper.scraper import scrape
-        use_yesterday = False
-        return scrape(train_no, use_yesterday=use_yesterday)
-    if train_no not in train_data_cache:
-        train_data_cache[train_no] = CachedData(get_data, validity=1000 * 30)
-    data, fetch_time = train_data_cache[train_no]()
-    resp = jsonify(data)
-    resp.headers['X-Last-Fetched'] = fetch_time.isoformat()
-    return resp
-
-@app.route('/trains')
-def get_trains():
-    return jsonify(list(train_data_cache.keys()))
-
-if __name__ == '__main__':
-    print('Starting debug server on port 5001')
-    app.run(port=5000)

server/__init__.py → server/server/__init__.py (renamed, no changes)

server/cache.py → server/server/cache.py (renamed, no changes)

server/server/db.py (new file, 85 lines)

@@ -0,0 +1,85 @@
+# Globals
+stations = []
+trains = []
+
+# Examples
+example_station = {
+    'name': 'Gară',
+    'stoppedAtBy': [123, 456]
+}
+example_train = {
+    'rank': 'IR',
+    'numberString': '74',
+    'number': 74,
+    'company': 'CFR Călători'
+}
+
+# Init
+import json
+import os
+from os import path, stat
+
+from .utils import take_while
+
+DB_DIR = os.environ.get('DB_DIR', '') or './db'
+if not path.exists(DB_DIR):
+    os.mkdir(DB_DIR)
+
+STATIONS_FILE = path.join(DB_DIR, 'stations.json')
+if path.exists(STATIONS_FILE):
+    with open(STATIONS_FILE) as f:
+        stations = json.load(f)
+
+TRAINS_FILE = path.join(DB_DIR, 'trains.json')
+if path.exists(TRAINS_FILE):
+    with open(TRAINS_FILE) as f:
+        trains = json.load(f)
+
+def found_train(rank: str, number: str, company: str) -> int:
+    number_int = int(''.join(take_while(lambda s: str(s).isnumeric(), number)))
+    try:
+        next(filter(lambda tr: tr['number'] == number_int, trains))
+    except StopIteration:
+        trains.append({
+            'number': number_int,
+            'numberString': number,
+            'company': company,
+            'rank': rank,
+        })
+        with open(TRAINS_FILE, 'w') as f:
+            json.dump(trains, f)
+    return number_int
+
+def found_station(name: str):
+    try:
+        next(filter(lambda s: s['name'] == name, stations))
+    except StopIteration:
+        stations.append({
+            'name': name,
+            'stoppedAtBy': [],
+        })
+        stations.sort(key=lambda s: len(s['stoppedAtBy']), reverse=True)
+        with open(STATIONS_FILE, 'w') as f:
+            json.dump(stations, f)
+
+def found_train_at_station(station_name: str, train_number: int):
+    found_station(station_name)
+    for i in range(len(stations)):
+        if stations[i]['name'] == station_name:
+            if train_number not in stations[i]['stoppedAtBy']:
+                stations[i]['stoppedAtBy'].append(train_number)
+                stations.sort(key=lambda s: len(s['stoppedAtBy']), reverse=True)
+                with open(STATIONS_FILE, 'w') as f:
+                    json.dump(stations, f)
+            break
+
+def on_train_data(train_data: dict):
+    train_no = found_train(train_data['rank'], train_data['number'], train_data['operator'])
+    for station in train_data['stations']:
+        found_train_at_station(station['name'], train_no)
+
+def on_train_lookup_failure(train_no: int):
+    pass
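
found_train derives the numeric key by keeping only the leading digits of the scraped number string, so a decorated number still maps onto one train record. For instance (take_while as defined in server/server/utils.py below; '401-2' is a made-up example string):

def take_while(predicate, input):  # copy of the helper from utils.py
    for element in input:
        if not predicate(element):
            break
        yield element

print(int(''.join(take_while(lambda s: str(s).isnumeric(), '74'))))     # 74
print(int(''.join(take_while(lambda s: str(s).isnumeric(), '401-2'))))  # 401: stops at '-'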

server/scraper → server/server/scraper (renamed, no changes)

server/server/server.py (new file, 53 lines)

@@ -0,0 +1,53 @@
+print(f'Server {__name__=}')
+
+import datetime
+
+from flask import Flask, json, request, jsonify
+
+from .cache import CachedData
+
+app = Flask(__name__)
+
+from .v2 import v2
+app.register_blueprint(v2.bp)
+
+@app.route('/')
+def root():
+    return 'Test'
+
+train_data_cache = {}
+
+@app.route('/train/<int:train_no>')
+def get_train_info(train_no: int):
+    def get_data():
+        from .scraper.scraper import scrape
+        use_yesterday = False
+        result = scrape(train_no, use_yesterday=use_yesterday)
+        from . import db
+        db.on_train_data(result)
+        # Convert to v1
+        # datetime ISO string to hh:mm
+        for i in range(len(result['stations'])):
+            if result['stations'][i]['arrival']:
+                date = datetime.datetime.fromisoformat(result['stations'][i]['arrival']['scheduleTime'])
+                result['stations'][i]['arrival']['scheduleTime'] = f'{date.hour}:{date.minute:02}'
+            if result['stations'][i]['departure']:
+                date = datetime.datetime.fromisoformat(result['stations'][i]['departure']['scheduleTime'])
+                result['stations'][i]['departure']['scheduleTime'] = f'{date.hour}:{date.minute:02}'
+        return result
+    if train_no not in train_data_cache:
+        train_data_cache[train_no] = CachedData(get_data, validity=1000 * 30)
+    data, fetch_time = train_data_cache[train_no]()
+    resp = jsonify(data)
+    resp.headers['X-Last-Fetched'] = fetch_time.isoformat()
+    return resp
+
+@app.route('/trains')
+def get_trains():
+    return jsonify(list(train_data_cache.keys()))
+
+if __name__ == '__main__':
+    print('Starting debug server on port 5001')
+    app.run(port=5000)
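
server/server/cache.py moves over unchanged, so its body is not part of this diff. Judging purely from the call sites, CachedData wraps a zero-argument fetch function and, when called, returns a (data, fetch_time) pair, re-running the fetcher once the validity window (in milliseconds) has lapsed. A minimal sketch under those assumptions; the real implementation may differ:

import time
from datetime import datetime

class CachedData:
    # Hypothetical reconstruction based on how server.py and v2.py use it
    def __init__(self, getter, validity=30_000):  # validity in milliseconds
        self.getter = getter
        self.validity = validity
        self.data = None
        self.fetch_time = None

    def __call__(self):
        expired = (
            self.fetch_time is None
            or (time.time() - self.fetch_time.timestamp()) * 1000 > self.validity
        )
        if expired:
            self.data = self.getter()
            self.fetch_time = datetime.now()
        return self.data, self.fetch_time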

server/server/utils.py (new file, 18 lines)

@@ -0,0 +1,18 @@
+def take_while(predicate, input):
+    for element in input:
+        if not predicate(element):
+            break
+        yield element
+
+_NO_DEFAULT = object()
+
+def check_yes_no(input: str, default=_NO_DEFAULT, considered_yes=None) -> bool:
+    input = str(input).strip().lower()
+    if not input:
+        if default == _NO_DEFAULT:
+            raise Exception('Empty input with no default')
+        return default
+    if not considered_yes:
+        considered_yes = ['y', 'yes', 't', 'true', '1']
+    return input in considered_yes
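
check_yes_no is what lets the v2 endpoint below accept ?use_yesterday=1 (or y/yes/t/true) as a boolean while treating a missing parameter as its default. Given the function above:

print(check_yes_no('yes'))              # True
print(check_yes_no('0'))                # False: not in the considered_yes list
print(check_yes_no('', default=False))  # False: empty input falls back to the default
# check_yes_no('') with no default raises Exception('Empty input with no default')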

server/server/v2/__init__.py (new file, 1 line)

@@ -0,0 +1 @@
+__all__ = ['v2']

server/server/v2/v2.py (new file, 32 lines)

@@ -0,0 +1,32 @@
+from flask import Blueprint, jsonify, request
+
+from .. import db
+from ..cache import CachedData
+from ..utils import check_yes_no
+
+bp = Blueprint('v2', __name__, url_prefix='/v2')
+
+@bp.get('/trains')
+def get_known_trains():
+    return jsonify(db.trains)
+
+@bp.get('/stations')
+def get_known_stations():
+    return jsonify(db.stations)
+
+train_data_cache = {}
+
+@bp.route('/train/<int:train_no>')
+def get_train_info(train_no: int):
+    use_yesterday = check_yes_no(request.args.get('use_yesterday', ''), default=False)
+    def get_data():
+        from ..scraper.scraper import scrape
+        result = scrape(train_no, use_yesterday=use_yesterday)
+        db.on_train_data(result)
+        return result
+    # Cache is keyed on (train_no, use_yesterday), so the membership test must use the tuple
+    if (train_no, use_yesterday) not in train_data_cache:
+        train_data_cache[(train_no, use_yesterday)] = CachedData(get_data, validity=1000 * 30)
+    data, fetch_time = train_data_cache[(train_no, use_yesterday)]()
+    resp = jsonify(data)
+    resp.headers['X-Last-Fetched'] = fetch_time.isoformat()
+    return resp
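
Putting the v2 surface together, a client session might look like this (hypothetical host, port, and train number; uses the requests library):

import requests

BASE = 'http://localhost:5000/v2'  # assumed deployment address

# Cached, timezone-aware lookup; use_yesterday is parsed by check_yes_no
r = requests.get(f'{BASE}/train/1538', params={'use_yesterday': 'no'})
print(r.headers['X-Last-Fetched'])       # ISO timestamp of the backing scrape
print(r.json()['stations'][0]['name'])   # first station on the route

# Accumulated knowledge from previous lookups, served from the JSON-file db
print(requests.get(f'{BASE}/trains').json())    # e.g. [{'rank': 'IR', 'number': 74, ...}]
print(requests.get(f'{BASE}/stations').json())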