commit 43c61c02471a0b4901c596a7f66cb20e36ca2925 Author: Dan Cojocaru Date: Mon Feb 6 05:58:39 2023 +0100 Initial commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..1b22ffa --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +# macOS +.DS_Store + +outputs \ No newline at end of file diff --git a/.ruby-version b/.ruby-version new file mode 100644 index 0000000..944880f --- /dev/null +++ b/.ruby-version @@ -0,0 +1 @@ +3.2.0 diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..3ee3fcd --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,14 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Debug Local File", + "type": "Ruby", + "request": "launch", + "program": "${workspaceRoot}/archival.rb" + } + ] +} \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..d6a0714 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,7 @@ +FROM ruby:3.2-alpine + +RUN apk add curl + +COPY archival.rb /app/archival.rb +WORKDIR /output +ENTRYPOINT [ "/app/archival.rb" ] diff --git a/Gemfile b/Gemfile new file mode 100644 index 0000000..d04eece --- /dev/null +++ b/Gemfile @@ -0,0 +1,5 @@ +# frozen_string_literal: true + +source "https://rubygems.org" + +gem "solargraph", :group => :development diff --git a/Gemfile.lock b/Gemfile.lock new file mode 100644 index 0000000..1da5df1 --- /dev/null +++ b/Gemfile.lock @@ -0,0 +1,68 @@ +GEM + remote: https://rubygems.org/ + specs: + ast (2.4.2) + backport (1.2.0) + benchmark (0.2.1) + diff-lcs (1.5.0) + e2mmap (0.1.0) + jaro_winkler (1.5.4) + json (2.6.3) + kramdown (2.4.0) + rexml + kramdown-parser-gfm (1.1.0) + kramdown (~> 2.0) + nokogiri (1.14.1-arm64-darwin) + racc (~> 1.4) + parallel (1.22.1) + parser (3.2.0.0) + ast (~> 2.4.1) + racc (1.6.2) + rainbow (3.1.1) + regexp_parser (2.6.2) + reverse_markdown (2.1.1) + nokogiri + rexml (3.2.5) + rubocop (1.44.1) + json (~> 2.3) + parallel (~> 1.10) + parser (>= 3.2.0.0) + rainbow (>= 2.2.2, < 4.0) + regexp_parser (>= 1.8, < 3.0) + rexml (>= 3.2.5, < 4.0) + rubocop-ast (>= 1.24.1, < 2.0) + ruby-progressbar (~> 1.7) + unicode-display_width (>= 2.4.0, < 3.0) + rubocop-ast (1.24.1) + parser (>= 3.1.1.0) + ruby-progressbar (1.11.0) + solargraph (0.48.0) + backport (~> 1.2) + benchmark + bundler (>= 1.17.2) + diff-lcs (~> 1.4) + e2mmap + jaro_winkler (~> 1.5) + kramdown (~> 2.3) + kramdown-parser-gfm (~> 1.1) + parser (~> 3.0) + reverse_markdown (>= 1.0.5, < 3) + rubocop (>= 0.52) + thor (~> 1.0) + tilt (~> 2.0) + yard (~> 0.9, >= 0.9.24) + thor (1.2.1) + tilt (2.0.11) + unicode-display_width (2.4.2) + webrick (1.7.0) + yard (0.9.28) + webrick (~> 1.7.0) + +PLATFORMS + arm64-darwin-21 + +DEPENDENCIES + solargraph + +BUNDLED WITH + 2.4.1 diff --git a/analysis.rb b/analysis.rb new file mode 100755 index 0000000..9b149aa --- /dev/null +++ b/analysis.rb @@ -0,0 +1,170 @@ +#! /usr/bin/env ruby + +require 'json' + +$datestr = ARGV[0] || $stdin.gets.strip + +# @param [String] station_name +def normalize_station station_name + station_name + .downcase + .gsub(/\s/, ' ') + .gsub('ă', 'a') + .gsub('â', 'a') + .gsub('î', 'i') + .gsub('ș', 's') + .gsub('ț', 't') +end + +def get_station station_name + # Try getting from file + begin + JSON.parse(File.read(File.join($datestr, 'stations', "#{normalize_station(station_name)}.json")), symbolize_names: true) + rescue + nil + end +end + +# @param [String] train_number +def get_train train_number + # Try getting from file + begin + JSON.parse(File.read(File.join($datestr, 'trains', "#{train_number}.json")), symbolize_names: true) + rescue + nil + end +end + +visited_stations = Dir.entries(File.join $datestr, 'stations') + .filter { |entry| entry != '.' and entry != '..' } + .map { |name| if name.end_with? ".json" then name[0...-5] else name end } + .map { |name| get_station(name)[:stationName] } +visited_trains = Dir.entries(File.join $datestr, 'trains') + .filter { |entry| entry != '.' and entry != '..' } + .map { |name| if name.end_with? ".json" then name[0...-5] else name end } + +trains = visited_trains.map { |train_number| get_train train_number } +r_trains = trains.filter { |train| train[:rank] == 'R' } +re_trains = trains.filter { |train| train[:rank] == 'R-E' } +ir_trains = trains.filter { |train| train[:rank] == 'IR' } +irn_trains = trains.filter { |train| train[:rank] == 'IRN' } +ic_trains = trains.filter { |train| train[:rank] == 'IC' } + +def filter_by_arrival_delay trains + trains.filter do |train| + arrival_station = train[:groups][0][:stations][-1] + arrival = arrival_station[:arrival] + if arrival[:status] and arrival[:status][:real] + yield arrival[:status][:delay] + else + false + end + end +end + +def get_early_arrival trains + filter_by_arrival_delay(trains) { |delay| delay < 0 } +end + +def get_on_time_arrival trains + filter_by_arrival_delay(trains) { |delay| delay == 0 } +end + +def get_late_arrival trains + filter_by_arrival_delay(trains) { |delay| delay > 0 } +end + +def get_late_gt5_arrival trains + filter_by_arrival_delay(trains) { |delay| delay > 5 } +end + +on_time_departure = trains.filter do |train| + departure_station = train[:groups][0][:stations][0] + departure = departure_station[:departure] + departure[:status] and departure[:status][:real] and departure[:status][:delay] == 0 +end +late_departure = trains.filter do |train| + departure_station = train[:groups][0][:stations][0] + departure = departure_station[:departure] + departure[:status] and departure[:status][:real] and departure[:status][:delay] > 0 +end + +early_arrival = get_early_arrival trains +on_time_arrival = get_on_time_arrival trains +late_arrival = get_late_arrival trains +late_arrival_gt5 = get_late_gt5_arrival trains +late_gt5_arrival = late_arrival_gt5 + +gained_delay = [] +recovered_delay = [] + +early_on_route = [] + +ongoing = trains.filter do |train| + arrival_station = train[:groups][0][:stations][-1] + arrival_station[:arrival][:status] and not arrival_station[:arrival][:status][:real] +end + +arrival_delays = {} +trains.each do |train| + arrival_station = train[:groups][0][:stations][-1] + arrival = arrival_station[:arrival] + if arrival[:status] and arrival[:status][:real] + delay = arrival[:status][:delay] + arr = arrival_delays.fetch(delay, []) + arr << train + arrival_delays[delay] = arr + end +end + +puts "#{on_time_departure.count} trains departed on time" +puts "#{late_departure.count} trains departed late" +puts +printf "%d trains arrived early\n", early_arrival.count +# printf " %4d (%3d%%) R, %4d (%3d%%) R-E, %4d (%3d%%) IR, %4d (%3d%%) IRN, %4d (%3d%%) IC\n", get_early_arrival(r_trains).count, (get_early_arrival(r_trains).count * 100 / early_arrival.count), get_early_arrival(re_trains).count, (get_early_arrival(re_trains).count * 100 / early_arrival.count), get_early_arrival(ir_trains).count, (get_early_arrival(ir_trains).count * 100 / early_arrival.count), get_early_arrival(irn_trains).count, (get_early_arrival(irn_trains).count * 100 / early_arrival.count), get_early_arrival(ic_trains).count, (get_early_arrival(ic_trains).count * 100 / early_arrival.count) +printf "%d trains arrived on time\n", on_time_arrival.count +# printf " %4d (%3d%%) R, %4d (%3d%%) R-E, %4d (%3d%%) IR, %4d (%3d%%) IRN, %4d (%3d%%) IC\n", get_on_time_arrival(r_trains).count, (get_on_time_arrival(r_trains).count * 100 / on_time_arrival.count), get_on_time_arrival(re_trains).count, (get_on_time_arrival(re_trains).count * 100 / on_time_arrival.count), get_on_time_arrival(ir_trains).count, (get_on_time_arrival(ir_trains).count * 100 / on_time_arrival.count), get_on_time_arrival(irn_trains).count, (get_on_time_arrival(irn_trains).count * 100 / on_time_arrival.count), get_on_time_arrival(ic_trains).count, (get_on_time_arrival(ic_trains).count * 100 / on_time_arrival.count) +printf "%d trains arrived late\n", late_arrival.count +# printf " %4d (%3d%%) R, %4d (%3d%%) R-E, %4d (%3d%%) IR, %4d (%3d%%) IRN, %4d (%3d%%) IC\n", get_late_arrival(r_trains).count, (get_late_arrival(r_trains).count * 100 / late_arrival.count), get_late_arrival(re_trains).count, (get_late_arrival(re_trains).count * 100 / late_arrival.count), get_late_arrival(ir_trains).count, (get_late_arrival(ir_trains).count * 100 / late_arrival.count), get_late_arrival(irn_trains).count, (get_late_arrival(irn_trains).count * 100 / late_arrival.count), get_late_arrival(ic_trains).count, (get_late_arrival(ic_trains).count * 100 / late_arrival.count) +printf "%d trains arrived with a delay greater than 5 minutes\n", late_arrival_gt5.count +# printf " %4d (%3d%%) R, %4d (%3d%%) R-E, %4d (%3d%%) IR, %4d (%3d%%) IRN, %4d (%3d%%) IC\n", get_late_gt5_arrival(r_trains).count, (get_late_gt5_arrival(r_trains).count * 100 / late_gt5_arrival.count), get_late_gt5_arrival(re_trains).count, (get_late_gt5_arrival(re_trains).count * 100 / late_gt5_arrival.count), get_late_gt5_arrival(ir_trains).count, (get_late_gt5_arrival(ir_trains).count * 100 / late_gt5_arrival.count), get_late_gt5_arrival(irn_trains).count, (get_late_gt5_arrival(irn_trains).count * 100 / late_gt5_arrival.count), get_late_gt5_arrival(ic_trains).count, (get_late_gt5_arrival(ic_trains).count * 100 / late_gt5_arrival.count) +puts +puts "Arrivals at destination by rank:" +printf " R: %4d (%3d%%) early, %4d (%3d%%) on time, %4d (%3d%%) late [%4d (%3d%%) >5min late], %4d total\n", get_early_arrival( r_trains).count, (get_early_arrival( r_trains).count * 100 / r_trains.count), get_on_time_arrival( r_trains).count, (get_on_time_arrival( r_trains).count * 100 / r_trains.count), get_late_arrival( r_trains).count, (get_late_arrival( r_trains).count * 100 / r_trains.count), get_late_gt5_arrival( r_trains).count, (get_late_gt5_arrival( r_trains).count * 100 / r_trains.count), r_trains.count +printf "R-E: %4d (%3d%%) early, %4d (%3d%%) on time, %4d (%3d%%) late [%4d (%3d%%) >5min late], %4d total\n", get_early_arrival( re_trains).count, (get_early_arrival( re_trains).count * 100 / re_trains.count), get_on_time_arrival( re_trains).count, (get_on_time_arrival( re_trains).count * 100 / re_trains.count), get_late_arrival( re_trains).count, (get_late_arrival( re_trains).count * 100 / re_trains.count), get_late_gt5_arrival( re_trains).count, (get_late_gt5_arrival( re_trains).count * 100 / re_trains.count), re_trains.count +printf " IR: %4d (%3d%%) early, %4d (%3d%%) on time, %4d (%3d%%) late [%4d (%3d%%) >5min late], %4d total\n", get_early_arrival( ir_trains).count, (get_early_arrival( ir_trains).count * 100 / ir_trains.count), get_on_time_arrival( ir_trains).count, (get_on_time_arrival( ir_trains).count * 100 / ir_trains.count), get_late_arrival( ir_trains).count, (get_late_arrival( ir_trains).count * 100 / ir_trains.count), get_late_gt5_arrival( ir_trains).count, (get_late_gt5_arrival( ir_trains).count * 100 / ir_trains.count), ir_trains.count +printf "IRN: %4d (%3d%%) early, %4d (%3d%%) on time, %4d (%3d%%) late [%4d (%3d%%) >5min late], %4d total\n", get_early_arrival(irn_trains).count, (get_early_arrival(irn_trains).count * 100 / irn_trains.count), get_on_time_arrival(irn_trains).count, (get_on_time_arrival(irn_trains).count * 100 / irn_trains.count), get_late_arrival(irn_trains).count, (get_late_arrival(irn_trains).count * 100 / irn_trains.count), get_late_gt5_arrival(irn_trains).count, (get_late_gt5_arrival(irn_trains).count * 100 / irn_trains.count), irn_trains.count +printf " IC: %4d (%3d%%) early, %4d (%3d%%) on time, %4d (%3d%%) late [%4d (%3d%%) >5min late], %4d total\n", get_early_arrival( ic_trains).count, (get_early_arrival( ic_trains).count * 100 / ic_trains.count), get_on_time_arrival( ic_trains).count, (get_on_time_arrival( ic_trains).count * 100 / ic_trains.count), get_late_arrival( ic_trains).count, (get_late_arrival( ic_trains).count * 100 / ic_trains.count), get_late_gt5_arrival( ic_trains).count, (get_late_gt5_arrival( ic_trains).count * 100 / ic_trains.count), ic_trains.count +puts +puts "Delays at arrival:" +arrival_delays.to_a.sort {|a, b| b[1].count <=> a[1].count}.each do |delay, trains| + printf "%3d min: %4d train%s", delay, trains.count, if trains.count == 1 then " " else "s" end + if trains.count <= 5 + printf " (%s)", trains.map { |train| + sprintf "%3s %5s", train[:rank], train[:number] + }.join(", ") + end + puts +end +puts +puts "#{ongoing.count} trains still travelling" + +puts +puts "#{on_time_departure.count} trenuri plecate la timp" +puts "#{late_departure.count} trenuri plecate cu întârziere" +puts +puts "#{early_arrival.count} trenuri sosite mai devreme" +puts "#{on_time_arrival.count} trenuri sosite la timp" +puts "#{late_arrival.count} trenuri sosite cu întârziere" +puts "#{late_arrival_gt5.count} trenuri sosite cu întârziere mai mare de 5 minute" +puts +puts "Întârzieri la destinație:" +arrival_delays.to_a.sort {|a, b| b[1].count <=> a[1].count}.each do |delay, trains| + printf "%3d min: %4d tren%s", delay, trains.count, if trains.count == 1 then " " else "uri" end + if trains.count <= 5 + printf " (%s)", trains.map { |train| + sprintf "%3s %5s", train[:rank], train[:number] + }.join(", ") + end + puts +# end diff --git a/archival.rb b/archival.rb new file mode 100755 index 0000000..b3fd137 --- /dev/null +++ b/archival.rb @@ -0,0 +1,307 @@ +#! /usr/bin/env ruby + +require 'erb' +require 'json' +require 'open3' +require 'set' + +$datestr = (Time.now - 86400).strftime '%F' + +class Object + def also + yield self + self + end + + def let + yield self + end +end + +# @param [String] station_name +def normalize_station station_name + station_name + .downcase + .gsub(/\s/, ' ') + .gsub('ă', 'a') + .gsub('â', 'a') + .gsub('î', 'i') + .gsub('ș', 's') + .gsub('ț', 't') +end + +$station_link_name_cache = {} + +# @param [String] station_name +def get_station station_name + # Try getting from file + begin + return JSON.parse(File.read(File.join($datestr, 'stations', "#{normalize_station(station_name)}.json")), symbolize_names: true) + rescue + end + + station_name = normalize_station station_name + station_name = $station_link_name_cache.fetch(station_name, station_name) + station_name = ERB::Util.url_encode(station_name) + stdout, status = Open3.capture2('curl', '--silent', '--fail', '--show-error', "https://scraper.infotren.dcdev.ro/v3/stations/#{station_name}?date=#{$datestr}") + if status != 0 + nil + else + JSON.parse stdout, symbolize_names: true + end +end + +# @param [String] train_number +def get_train train_number + # Try getting from file + begin + return JSON.parse(File.read(File.join($datestr, 'trains', "#{train_number}.json")), symbolize_names: true) + rescue + end + + train_number = ERB::Util.url_encode(train_number) + stdout, status = Open3.capture2('curl', '--silent', '--fail', '--show-error', "https://scraper.infotren.dcdev.ro/v3/trains/#{train_number}?date=#{$datestr}") + if status != 0 + nil + else + JSON.parse stdout, symbolize_names: true + end +end + +def populate_link_name_cache train + train[:groups].each do |g| + g[:stations].each do |s| + $station_link_name_cache[s[:name]] = s[:linkName] + end + end +end + +def get_stations_from_station station + Set.new.also do |stations| + station[:arrivals].each do |train| + train[:train][:route].each do |r| + stations << r + end + end + station[:departures].each do |train| + train[:train][:route].each do |r| + stations << r + end + end + end +end + +# @param [Integer] times +def reattempt times + times.times do |n| + result = yield + unless result.nil? + return result + end + end + nil +end + +def get_trains_from_station station + Set.new.also do |trains| + station[:arrivals].each do |train| + trains << train[:train][:number] + end + station[:departures].each do |train| + trains << train[:train][:number] + end + end +end + +def get_stations_from_train train + train[:groups].flat_map do |group| + group[:stations].map do |station| + station[:name] + end + end +end + +class Log + def initialize + @to_erase = 0 + end + + def add message + print "\e[1A\e[2K" * @to_erase + @to_erase = 0 + puts message + end + + def temporary_add message + if $stdout.isatty + puts message + @to_erase += 1 + end + end +end + +def main + log = Log.new + + log.add "Creating archive for yesterday, #{$datestr}" + + existing = if Dir.exist? $datestr + log.add 'Archive already exists. Merging.' + true + else + Dir.mkdir $datestr + Dir.mkdir(File.join $datestr, 'stations') + Dir.mkdir(File.join $datestr, 'trains') + false + end + + # @type [Array] + roots = ['București Nord', 'Brașov', 'Iași', 'Titan Sud', 'Oltenița'] + # @type [Array] + visited_stations = Dir.entries(File.join $datestr, 'stations') + .filter { |entry| entry != '.' and entry != '..' } + .map { |name| if name.end_with? ".json" then name[0...-5] else name end } + .map { |name| get_station(name)[:stationName] } + if existing + log.add "#{visited_stations.count} visited stations" + end + # @type [Array] + unvisited_stations = [] + # @type [Set] + failed_stations = Set.new + + # @type [Array] + visited_trains = Dir.entries(File.join $datestr, 'trains') + .filter { |entry| entry != '.' and entry != '..' } + .map { |name| if name.end_with? ".json" then name[0...-5] else name end } + if existing + log.add "#{visited_trains.count} visited trains" + end + # @type [Array] + unvisited_trains = [] + # @type [Set] + failed_trains = Set.new + + roots.each do |station| + unvisited_stations.push station unless visited_stations.include? station + end + + # Get unvisited from visited + visited_trains.each do |train_number| + train = get_train train_number + populate_link_name_cache train + + get_stations_from_train(train).each do |station| + unvisited_stations << station unless visited_stations.include? station or unvisited_stations.include? station + end + end + visited_stations.each do |station_name| + station = get_station station_name + + get_stations_from_station(station).each do |station| + unvisited_stations << station unless visited_stations.include? station or unvisited_stations.include? station + end + get_trains_from_station(station).each do |train| + unvisited_trains << train unless visited_trains.include? train or unvisited_trains.include? train + end + end + unless unvisited_stations.empty? + log.add "#{unvisited_stations.count} unvisited stations" + end + unless unvisited_trains.empty? + log.add "#{unvisited_trains.count} unvisited trains" + end + + start_time = Time.now + File.write(File.join($datestr, 'start_time.txt'), start_time.strftime('%FT%T%:z')) + + until unvisited_stations.empty? and unvisited_trains.empty? + # First visit all trains + if not unvisited_trains.empty? + # @type [String] + train_number = unvisited_trains.shift + log.temporary_add "Getting train #{train_number}" + train = reattempt 3 do get_train train_number end + if train.nil? + # Failed to get + failed_trains << train_number + log.add "Failed to get train #{train_number}" + else + populate_link_name_cache train + visited_trains << train_number + File.write(File.join($datestr, 'trains', "#{train_number}.json"), JSON.dump(train)) + log.add "Got train #{train_number}" + old_ustations_cnt = unvisited_stations.count + get_stations_from_train(train).each do |station| + unvisited_stations << station unless visited_stations.include? station or unvisited_stations.include? station + end + unless unvisited_stations.count == old_ustations_cnt + log.add "Found #{unvisited_stations.count - old_ustations_cnt} new stations" + end + end + log.temporary_add "Stations: #{visited_stations.count} visited, #{failed_stations.count} failed, #{unvisited_stations.count} remaining" + log.temporary_add "Trains: #{visited_trains.count} visited, #{failed_trains.count} failed, #{unvisited_trains.count} remaining" + # Then visit stations + elsif not unvisited_stations.empty? + # @type [String] + station_name = unvisited_stations.shift + log.temporary_add "Getting station #{station_name}" + station = reattempt 3 do get_station station_name end + if station.nil? + # Failed to get + failed_stations << station_name + log.add "Failed to get station #{station_name}" + else + visited_stations << station_name + filename = normalize_station(station_name) + filename = $station_link_name_cache.fetch(filename, filename) + File.write(File.join($datestr, 'stations', "#{filename}.json"), JSON.dump(station)) + log.add "Got station #{station_name}" + old_ustations_cnt = unvisited_stations.count + old_utrains_cnt = unvisited_trains.count + get_stations_from_station(station).each do |station| + unvisited_stations << station unless visited_stations.include? station or unvisited_stations.include? station + end + get_trains_from_station(station).each do |train| + unvisited_trains << train unless visited_trains.include? train or unvisited_trains.include? train + end + unless unvisited_stations.count == old_ustations_cnt + log.add "Found #{unvisited_stations.count - old_ustations_cnt} new stations" + end + unless unvisited_trains.count == old_utrains_cnt + log.add "Found #{unvisited_trains.count - old_utrains_cnt} new trains" + end + end + log.temporary_add "Stations: #{visited_stations.count} visited, #{failed_stations.count} failed, #{unvisited_stations.count} remaining" + log.temporary_add "Trains: #{visited_trains.count} visited, #{failed_trains.count} failed, #{unvisited_trains.count} remaining" + end + end + + log.add "Stations: #{visited_stations.count} visited, #{failed_stations.count} failed, #{unvisited_stations.count} remaining" + log.add "Trains: #{visited_trains.count} visited, #{failed_trains.count} failed, #{unvisited_trains.count} remaining" + + end_time = Time.now + File.write(File.join($datestr, 'end_time.txt'), end_time.strftime('%FT%T%:z')) + timespan = (end_time - start_time).to_i + log.add "Finished in #{timespan}s (#{timespan / 3600}h #{timespan / 60 % 60}m #{timespan % 60}s)" + + # Finally, write failures log + File.open(File.join($datestr, 'failed_stations.txt'), "w") do |f| + failed_stations.each do |s| + f.puts s + end + end + File.open(File.join($datestr, 'failed_trains.txt'), "w") do |f| + failed_trains.each do |s| + f.puts s + end + end + File.write(File.join($datestr, 'failures.json'), JSON.dump({ + stations: failed_stations, + trains: failed_trains, + })) +end + +if __FILE__ == $0 + main +end