
Added v2 API

Branch: python3
Kenneth Bruen committed 3 years ago
Parent commit: de78a094a1
Signed by: kbruen
GPG Key ID: CB77B9FE7F902176
16 changed files (lines changed in parentheses):
  1. Dockerfile (6)
  2. scraper/Pipfile (1)
  3. scraper/Pipfile.lock (10)
  4. scraper/scraper.py (19)
  5. scraper/setup.py (2)
  6. server/Pipfile.lock (7)
  7. server/main.py (2)
  8. server/server.py (33)
  9. server/server/__init__.py (0)
  10. server/server/cache.py (0)
  11. server/server/db.py (85)
  12. server/server/scraper (0)
  13. server/server/server.py (53)
  14. server/server/utils.py (18)
  15. server/server/v2/__init__.py (1)
  16. server/server/v2/v2.py (32)

Dockerfile (6)

@@ -6,16 +6,16 @@ WORKDIR /var/app/scraper
COPY scraper/Pipfil* ./
COPY scraper/setup.py ./
WORKDIR /var/app/server
RUN ln -s /var/app/scraper scraper
COPY server/Pipfil* ./
RUN pipenv install
RUN pipenv graph
WORKDIR /var/app/scraper
COPY scraper .
WORKDIR /var/app/server
COPY server .
-RUN rm scraper
-RUN ln -s /var/app/scraper scraper
+RUN rm server/scraper
+RUN ln -s /var/app/scraper ./server/scraper
ENV PORT 5000
EXPOSE ${PORT}

scraper/Pipfile (1)

@@ -6,6 +6,7 @@ name = "pypi"
[packages]
beautifulsoup4 = "*"
requests = "*"
+pytz = "*"
[dev-packages]

scraper/Pipfile.lock (10, generated)

@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "e7c5f7eab5a8f9202caaaa3bdca8e911579596b8dd25319c2f50e84794eb9fa8"
"sha256": "d7e3ebca9807b4f0c9dcac014554e9d1c9cb3a0c30b5c71b0b7cd4ccdc4934e1"
},
"pipfile-spec": 6,
"requires": {
@@ -48,6 +48,14 @@
"markers": "python_version >= '3'",
"version": "==3.2"
},
"pytz": {
"hashes": [
"sha256:83a4a90894bf38e243cf052c8b58f381bfe9a7a483f6a9cab140bc7f702ac4da",
"sha256:eb10ce3e7736052ed3623d49975ce333bcd712c7bb19a58b9e2089d4057d0798"
],
"index": "pypi",
"version": "==2021.1"
},
"requests": {
"hashes": [
"sha256:6c1246513ecd5ecd4528a0906f910e8f0f9c6b8ec72030dc9fd154dc1a6efd24",

scraper/scraper.py (19)

@@ -3,6 +3,7 @@
from datetime import datetime, timedelta
import re
+import pytz
import requests
from bs4 import BeautifulSoup
from urllib.parse import quote, urlencode
@@ -30,6 +31,18 @@ STOPPING_TIME_REGEX = re.compile(r'^([0-9]+) min oprire$')
STATION_DEPARR_STATUS_REGEX = re.compile(r'^(?:(la timp)|(?:((?:\+|-)[0-9]+) min \((?:(?:întârziere)|(?:mai devreme))\)))(\*?)$')
+class DateTimeSequencer:
+    def __init__(self, year: int, month: int, day: int) -> None:
+        self.current = datetime(year, month, day, 0, 0, 0)
+        self.current -= timedelta(seconds=1)
+
+    def __call__(self, hour: int, minute: int = 0, second: int = 0) -> datetime:
+        potential_new_date = datetime(self.current.year, self.current.month, self.current.day, hour, minute, second)
+        if (self.current > potential_new_date):
+            potential_new_date += timedelta(days=1)
+        self.current = potential_new_date
+        return self.current
def collapse_space(string: str) -> str:
    return re.sub(
        rf'[{BeautifulSoup.ASCII_SPACES}]+',
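The sequencer's contract, sketched with hypothetical timetable values: each call re-anchors on the previous result, so a time that reads earlier than the last one is taken to belong to the next day:

seq = DateTimeSequencer(2021, 8, 1)
seq(23, 50)  # datetime(2021, 8, 1, 23, 50)
seq(23, 59)  # datetime(2021, 8, 1, 23, 59)
seq(0, 12)   # datetime(2021, 8, 2, 0, 12) -- rolled past midnight
seq(6, 30)   # datetime(2021, 8, 2, 6, 30)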
@@ -77,6 +90,8 @@ def scrape(train_no: int, use_yesterday=False, date_override=None):
    train_info_div = train_info_div.div('div', recursive=False)[0]
    scraped['rank'], scraped['number'], scraped['date'] = TRAIN_INFO_REGEX.match(collapse_space(train_info_div.h2.text)).groups()
+    date_d, date_m, date_y = (int(comp) for comp in scraped['date'].split('.'))
+    date = datetime(date_y, date_m, date_d)
    scraped['operator'] = OPERATOR_REGEX.match(collapse_space(train_info_div.p.text)).groups()[0]
@@ -101,6 +116,8 @@ def scrape(train_no: int, use_yesterday=False, date_override=None):
    stations = status_div.ul('li', recursive=False)
    scraped['stations'] = []
+    dt_seq = DateTimeSequencer(date.year, date.month, date.day)
+    tz = pytz.timezone('Europe/Bucharest')
    for station in stations:
        station_scraped = {}
@@ -126,6 +143,8 @@ def scrape(train_no: int, use_yesterday=False, date_override=None):
        time, *_ = parts
        result['scheduleTime'] = collapse_space(time.text)
+        st_hr, st_min = (int(comp) for comp in result['scheduleTime'].split(':'))
+        result['scheduleTime'] = tz.localize(dt_seq(st_hr, st_min)).isoformat()
        if len(parts) >= 2:
            _, status, *_ = parts
            result['status'] = {}
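Net effect of the two added lines: the naive datetime from the sequencer becomes an ISO-8601 string carrying the Europe/Bucharest offset. A minimal sketch with a made-up time:

import pytz
from datetime import datetime

tz = pytz.timezone('Europe/Bucharest')
print(tz.localize(datetime(2021, 8, 1, 13, 45)).isoformat())
# 2021-08-01T13:45:00+03:00 (EEST in summer)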

scraper/setup.py (2)

@@ -4,5 +4,5 @@ setup(
    name='InfoFer_Scraper',
    version='0.1',
    author='Dan Cojocaru',
-    install_requires=['beautifulsoup4', 'requests']
+    install_requires=['beautifulsoup4', 'requests', 'pytz']
)

server/Pipfile.lock (7, generated)

@@ -298,6 +298,13 @@
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==2.20"
},
"pytz": {
"hashes": [
"sha256:83a4a90894bf38e243cf052c8b58f381bfe9a7a483f6a9cab140bc7f702ac4da",
"sha256:eb10ce3e7736052ed3623d49975ce333bcd712c7bb19a58b9e2089d4057d0798"
],
"version": "==2021.1"
},
"requests": {
"hashes": [
"sha256:6c1246513ecd5ecd4528a0906f910e8f0f9c6b8ec72030dc9fd154dc1a6efd24",

server/main.py (2)

@@ -1,5 +1,5 @@
from gevent.pywsgi import WSGIServer
-from server import app
+from server.server import app

def main():
    port = 5000
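The rest of main() falls outside this hunk; a plausible sketch of the gevent wiring, assuming the conventional WSGIServer setup (the serve_forever() call is an assumption, not shown in the diff):

def main():
    port = 5000
    http_server = WSGIServer(('', port), app)  # assumed wiring, matching the import above
    http_server.serve_forever()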

server/server.py (33)

@@ -1,33 +0,0 @@
from flask import Flask, json, request, jsonify
from cache import CachedData

app = Flask(__name__)

@app.route('/')
def root():
    return 'Test'

train_data_cache = {}

@app.route('/train/<int:train_no>')
def get_train_info(train_no: int):
    def get_data():
        print(f'Cache miss for {train_no}')
        from scraper.scraper import scrape
        use_yesterday = False
        return scrape(train_no, use_yesterday=use_yesterday)
    if train_no not in train_data_cache:
        train_data_cache[train_no] = CachedData(get_data, validity=1000 * 30)
    data, fetch_time = train_data_cache[train_no]()
    resp = jsonify(data)
    resp.headers['X-Last-Fetched'] = fetch_time.isoformat()
    return resp

@app.route('/trains')
def get_trains():
    return jsonify(list(train_data_cache.keys()))

if __name__ == '__main__':
    print('Starting debug server on port 5001')
    app.run(port=5000)

server/__init__.py → server/server/__init__.py (0)

server/cache.py → server/server/cache.py (0)
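cache.py moves into the package unchanged, so CachedData itself does not appear in this diff. Judging from its call sites (a callable returning a (data, fetch_time) pair, with validity apparently in milliseconds), a minimal compatible sketch could look like this; the attribute names are assumptions:

from datetime import datetime, timedelta

class CachedData:
    def __init__(self, fetcher, validity=1000 * 30):
        self.fetcher = fetcher  # zero-argument callable producing fresh data
        self.validity = timedelta(milliseconds=validity)  # assumed unit: ms
        self.data = None
        self.fetch_time = None

    def __call__(self):
        now = datetime.now()
        if self.fetch_time is None or now - self.fetch_time > self.validity:
            self.data = self.fetcher()
            self.fetch_time = now
        return self.data, self.fetch_time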

server/server/db.py (85)

@@ -0,0 +1,85 @@
# Globals
stations = []
trains = []

# Examples
example_station = {
    'name': 'Gară',
    'stoppedAtBy': [123, 456]
}
example_train = {
    'rank': 'IR',
    'numberString': '74',
    'number': 74,
    'company': 'CFR Călători'
}

# Init
import json
import os
from os import path, stat
from .utils import take_while

DB_DIR = os.environ.get('DB_DIR', '') or './db'
if not path.exists(DB_DIR):
    os.mkdir(DB_DIR)

STATIONS_FILE = path.join(DB_DIR, 'stations.json')
if path.exists(STATIONS_FILE):
    with open(STATIONS_FILE) as f:
        stations = json.load(f)

TRAINS_FILE = path.join(DB_DIR, 'trains.json')
if path.exists(TRAINS_FILE):
    with open(TRAINS_FILE) as f:
        trains = json.load(f)

def found_train(rank: str, number: str, company: str) -> int:
    number_int = int(''.join(take_while(lambda s: str(s).isnumeric(), number)))
    try:
        next(filter(lambda tr: tr['number'] == number_int, trains))
    except StopIteration:
        trains.append({
            'number': number_int,
            'numberString': number,
            'company': company,
            'rank': rank,
        })
        with open(TRAINS_FILE, 'w') as f:
            json.dump(trains, f)
    return number_int

def found_station(name: str):
    try:
        next(filter(lambda s: s['name'] == name, stations))
    except StopIteration:
        stations.append({
            'name': name,
            'stoppedAtBy': [],
        })
        stations.sort(key=lambda s: len(s['stoppedAtBy']), reverse=True)
        with open(STATIONS_FILE, 'w') as f:
            json.dump(stations, f)

def found_train_at_station(station_name: str, train_number: int):
    found_station(station_name)
    for i in range(len(stations)):
        if stations[i]['name'] == station_name:
            if train_number not in stations[i]['stoppedAtBy']:
                stations[i]['stoppedAtBy'].append(train_number)
                stations.sort(key=lambda s: len(s['stoppedAtBy']), reverse=True)
                with open(STATIONS_FILE, 'w') as f:
                    json.dump(stations, f)
            break

def on_train_data(train_data: dict):
    train_no = found_train(train_data['rank'], train_data['number'], train_data['operator'])
    for station in train_data['stations']:
        found_train_at_station(station['name'], train_no)

def on_train_lookup_failure(train_no: int):
    pass
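A hypothetical session showing how on_train_data() feeds the two JSON-backed lists (all field values made up; the dict shape mirrors the scraper output):

on_train_data({
    'rank': 'IR',
    'number': '74-2',  # non-numeric tail is trimmed, stored as 74
    'operator': 'CFR Călători',
    'stations': [{'name': 'București Nord'}, {'name': 'Brașov'}],
})
print(trains)    # [{'number': 74, 'numberString': '74-2', 'company': 'CFR Călători', 'rank': 'IR'}]
print(stations)  # both stations now list 74 in their 'stoppedAtBy'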

server/scraper → server/server/scraper (0)

server/server/server.py (53)

@@ -0,0 +1,53 @@
print(f'Server {__name__=}')

import datetime
from flask import Flask, json, request, jsonify
from .cache import CachedData

app = Flask(__name__)

from .v2 import v2
app.register_blueprint(v2.bp)

@app.route('/')
def root():
    return 'Test'

train_data_cache = {}

@app.route('/train/<int:train_no>')
def get_train_info(train_no: int):
    def get_data():
        from .scraper.scraper import scrape
        use_yesterday = False
        result = scrape(train_no, use_yesterday=use_yesterday)
        from . import db
        db.on_train_data(result)
        # Convert to v1: datetime ISO string to hh:mm
        for i in range(len(result['stations'])):
            if result['stations'][i]['arrival']:
                date = datetime.datetime.fromisoformat(result['stations'][i]['arrival']['scheduleTime'])
                result['stations'][i]['arrival']['scheduleTime'] = f'{date.hour}:{date.minute:02}'
            if result['stations'][i]['departure']:
                date = datetime.datetime.fromisoformat(result['stations'][i]['departure']['scheduleTime'])
                result['stations'][i]['departure']['scheduleTime'] = f'{date.hour}:{date.minute:02}'
        return result
    if train_no not in train_data_cache:
        train_data_cache[train_no] = CachedData(get_data, validity=1000 * 30)
    data, fetch_time = train_data_cache[train_no]()
    resp = jsonify(data)
    resp.headers['X-Last-Fetched'] = fetch_time.isoformat()
    return resp

@app.route('/trains')
def get_trains():
    return jsonify(list(train_data_cache.keys()))

if __name__ == '__main__':
    print('Starting debug server on port 5001')
    app.run(port=5001)  # match the port announced above
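The v1-compatibility loop above reduces the new ISO timestamps back to the old hh:mm strings; for a made-up timestamp:

import datetime

date = datetime.datetime.fromisoformat('2021-08-01T09:05:00+03:00')
print(f'{date.hour}:{date.minute:02}')  # 9:05 -- minute zero-padded, hour not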

server/server/utils.py (18)

@@ -0,0 +1,18 @@
def take_while(predicate, input):
    for element in input:
        if not predicate(element):
            break
        yield element

_NO_DEFAULT = object()

def check_yes_no(input: str, default=_NO_DEFAULT, considered_yes=None) -> bool:
    input = str(input).strip().lower()
    if not input:
        if default is _NO_DEFAULT:
            raise Exception('Empty input with no default')
        return default
    if not considered_yes:
        considered_yes = ['y', 'yes', 't', 'true', '1']
    return input in considered_yes
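The two helpers in action (values hypothetical):

print(list(take_while(str.isnumeric, '123abc')))   # ['1', '2', '3']
print(check_yes_no('YES'))                         # True
print(check_yes_no('', default=False))             # False
print(check_yes_no('da', considered_yes=['da']))   # True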

server/server/v2/__init__.py (1)

@@ -0,0 +1 @@
__all__ = ['v2']

server/server/v2/v2.py (32)

@@ -0,0 +1,32 @@
from flask import Blueprint, jsonify, request

from .. import db
from ..cache import CachedData
from ..utils import check_yes_no

bp = Blueprint('v2', __name__, url_prefix='/v2')

@bp.get('/trains')
def get_known_trains():
    return jsonify(db.trains)

@bp.get('/stations')
def get_known_stations():
    return jsonify(db.stations)

train_data_cache = {}

@bp.route('/train/<int:train_no>')
def get_train_info(train_no: int):
    use_yesterday = check_yes_no(request.args.get('use_yesterday', ''), default=False)
    def get_data():
        from ..scraper.scraper import scrape
        result = scrape(train_no, use_yesterday=use_yesterday)
        db.on_train_data(result)
        return result
    # The cache is keyed by (train_no, use_yesterday); the membership test
    # must use the same tuple key, or every request misses the cache.
    if (train_no, use_yesterday) not in train_data_cache:
        train_data_cache[(train_no, use_yesterday)] = CachedData(get_data, validity=1000 * 30)
    data, fetch_time = train_data_cache[(train_no, use_yesterday)]()
    resp = jsonify(data)
    resp.headers['X-Last-Fetched'] = fetch_time.isoformat()
    return resp
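A quick smoke test of the new blueprint through Flask's test client (the train number is made up, and a live run would hit the upstream InfoFer scraper):

from server.server import app

with app.test_client() as client:
    print(client.get('/v2/stations').json)            # known stations from db
    resp = client.get('/v2/train/74?use_yesterday=1')
    print(resp.headers.get('X-Last-Fetched'), resp.json)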