Browse Source

Add LinkName to stations when querying a train

master
Kenneth Bruen 2 years ago
parent
commit
5785e4a98b
Signed by: kbruen
GPG Key ID: C1980A470C3EE5B1
  1. 634
      scraper/src/Models/Train.cs
  2. 483
      scraper/src/Scrapers/Train.cs

634
scraper/src/Models/Train.cs

@ -1,316 +1,318 @@
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
using InfoferScraper.Models.Status;
using InfoferScraper.Models.Train.JsonConverters;
namespace InfoferScraper.Models.Train {
#region Interfaces
/// <summary>
/// Read-only view of everything scraped for one train.
/// </summary>
public interface ITrainScrapeResult {
    /// <summary>Rank of the train, as scraped.</summary>
    string Rank { get; }

    /// <summary>Number of the train, as scraped.</summary>
    string Number { get; }

    /// <summary>
    /// Date in the DD.MM.YYYY format.
    /// The value is copied unchanged from the scraped result.
    /// </summary>
    string Date { get; }

    /// <summary>Name of the operator of the train, as scraped.</summary>
    string Operator { get; }

    /// <summary>Groups that make up the train, each with its own route and stops.</summary>
    IReadOnlyList<ITrainGroup> Groups { get; }
}
/// <summary>
/// Read-only view of one group of a train: its route, its status (when known) and its stops.
/// </summary>
public interface ITrainGroup {
    /// <summary>Route covered by this group.</summary>
    ITrainRoute Route { get; }

    /// <summary>Status of this group, or <c>null</c> when none was set.</summary>
    ITrainStatus? Status { get; }

    /// <summary>Stops of this group, in the order they were added.</summary>
    IReadOnlyList<ITrainStopDescription> Stations { get; }
}
/// <summary>
/// Read-only view of a route, described by its two end points.
/// </summary>
public interface ITrainRoute {
    /// <summary>Name of the place the route starts from.</summary>
    string From { get; }

    /// <summary>Name of the place the route ends at.</summary>
    string To { get; }
}
/// <summary>
/// Read-only view of the status reported for a train group.
/// </summary>
public interface ITrainStatus {
    /// <summary>Reported delay. NOTE(review): unit and sign convention are set by whoever fills the implementation; confirm against the scraper.</summary>
    int Delay { get; }

    /// <summary>Name of the station the status refers to.</summary>
    string Station { get; }

    /// <summary>Kind of event (passing, arrival or departure) the status refers to.</summary>
    StatusKind State { get; }
}
/// <summary>
/// Read-only view of one stop on a train's route.
/// </summary>
public interface ITrainStopDescription {
    /// <summary>Name of the station.</summary>
    string Name { get; }

    /// <summary>Kilometre value associated with the stop.</summary>
    int Km { get; }

    /// <summary>
    /// The time the train waits in the station, in seconds; <c>null</c> when not set.
    /// </summary>
    int? StoppingTime { get; }

    /// <summary>Platform of the stop, or <c>null</c> when not set.</summary>
    string? Platform { get; }

    /// <summary>Arrival details, or <c>null</c> when not set.</summary>
    ITrainStopArrDep? Arrival { get; }

    /// <summary>Departure details, or <c>null</c> when not set.</summary>
    ITrainStopArrDep? Departure { get; }

    /// <summary>Notes attached to the stop, exposed as plain objects.</summary>
    IReadOnlyList<object> Notes { get; }
}
/// <summary>
/// Common shape of every note attached to a stop.
/// </summary>
public interface ITrainStopNote {
    /// <summary>Discriminator telling which kind of note this is.</summary>
    NoteKind Kind { get; }
}
/// <summary>
/// Note carrying the rank and number a train changes to at a stop.
/// </summary>
public interface ITrainStopTrainNumberChangeNote : ITrainStopNote {
    /// <summary>Rank carried by the note.</summary>
    string Rank { get; }

    /// <summary>Train number carried by the note.</summary>
    string Number { get; }
}
/// <summary>
/// Note carrying the rank, number and date a train departs as from a stop.
/// </summary>
public interface ITrainStopDepartsAsNote : ITrainStopNote {
    /// <summary>Rank carried by the note.</summary>
    string Rank { get; }

    /// <summary>Train number carried by the note.</summary>
    string Number { get; }

    /// <summary>Departure date carried by the note.</summary>
    DateTimeOffset DepartureDate { get; }
}
/// <summary>
/// Note naming the station that wagons are detached for.
/// </summary>
public interface ITrainStopDetachingWagonsNote : ITrainStopNote {
    /// <summary>Station named by the note.</summary>
    string Station { get; }
}
/// <summary>
/// Note naming the station that wagons are received from.
/// </summary>
public interface ITrainStopReceivingWagonsNote : ITrainStopNote {
    /// <summary>Station named by the note.</summary>
    string Station { get; }
}
/// <summary>
/// Read-only view of an arrival or a departure at a stop.
/// </summary>
public interface ITrainStopArrDep {
    /// <summary>Scheduled time of the arrival or departure.</summary>
    DateTimeOffset ScheduleTime { get; }

    /// <summary>Status of the arrival or departure, or <c>null</c> when none was set.</summary>
    IStatus? Status { get; }
}
#endregion
/// <summary>
/// Kind of event a train status refers to.
/// Converted to and from JSON strings by <see cref="StatusKindConverter"/>.
/// </summary>
[JsonConverter(typeof(StatusKindConverter))]
public enum StatusKind {
    Passing = 0,
    Arrival = 1,
    Departure = 2,
}
/// <summary>
/// Discriminator for the notes that can be attached to a stop.
/// Converted to and from JSON strings by <see cref="NoteKindConverter"/>.
/// </summary>
[JsonConverter(typeof(NoteKindConverter))]
public enum NoteKind {
    TrainNumberChange = 0,
    DetachingWagons = 1,
    ReceivingWagons = 2,
    DepartsAs = 3,
}
#region Implementations
/// <summary>
/// Mutable implementation of <see cref="ITrainScrapeResult"/> that is filled in while scraping.
/// </summary>
internal record TrainScrapeResult : ITrainScrapeResult {
    // Backing list for Groups; it is only appended to by AddTrainGroup.
    private List<ITrainGroup> ModifyableGroups { get; set; } = new();

    public string Rank { get; set; } = "";
    public string Number { get; set; } = "";
    public string Date { get; set; } = "";
    public string Operator { get; set; } = "";

    /// <summary>Read-only wrapper around the groups added so far.</summary>
    public IReadOnlyList<ITrainGroup> Groups => ModifyableGroups.AsReadOnly();

    /// <summary>
    /// Creates an empty <see cref="TrainGroup"/>, hands it to <paramref name="configurator"/>
    /// to be filled in, and then appends it to <see cref="Groups"/>.
    /// </summary>
    /// <param name="configurator">Callback that populates the new group.</param>
    internal void AddTrainGroup(Action<TrainGroup> configurator) {
        var created = new TrainGroup();
        configurator(created);
        // The group is appended only after the callback returned without throwing.
        ModifyableGroups.Add(created);
    }
}
/// <summary>
/// Mutable implementation of <see cref="ITrainGroup"/> that is filled in while scraping.
/// </summary>
internal record TrainGroup : ITrainGroup {
    // Backing list for Stations; it is only appended to by AddStopDescription.
    private List<ITrainStopDescription> ModifyableStations { get; set; } = new();

    public ITrainRoute Route { get; init; } = new TrainRoute();
    public ITrainStatus? Status { get; private set; }

    /// <summary>Read-only wrapper around the stops added so far.</summary>
    public IReadOnlyList<ITrainStopDescription> Stations => ModifyableStations.AsReadOnly();

    /// <summary>
    /// Creates an empty <see cref="TrainStopDescription"/>, hands it to
    /// <paramref name="configurator"/> and then appends it to <see cref="Stations"/>.
    /// </summary>
    internal void AddStopDescription(Action<TrainStopDescription> configurator) {
        var stop = new TrainStopDescription();
        configurator(stop);
        // The stop is appended only after the callback returned without throwing.
        ModifyableStations.Add(stop);
    }

    /// <summary>
    /// Passes the current <see cref="Route"/>, cast to <see cref="TrainRoute"/>, to
    /// <paramref name="configurator"/> so it can be modified in place.
    /// </summary>
    internal void ConfigureRoute(Action<TrainRoute> configurator) => configurator((TrainRoute)Route);

    /// <summary>
    /// Creates an empty <see cref="TrainStatus"/>, hands it to <paramref name="configurator"/>
    /// and then stores it in <see cref="Status"/>.
    /// </summary>
    internal void MakeStatus(Action<TrainStatus> configurator) {
        var status = new TrainStatus();
        configurator(status);
        // Status is replaced only after the callback returned without throwing.
        Status = status;
    }
}
/// <summary>
/// Mutable implementation of <see cref="ITrainRoute"/>; both ends start out as empty strings.
/// </summary>
internal record TrainRoute : ITrainRoute {
    public string From { get; set; } = "";
    public string To { get; set; } = "";
}
/// <summary>
/// Mutable implementation of <see cref="ITrainStatus"/>.
/// </summary>
internal record TrainStatus : ITrainStatus {
/// <summary>Reported delay. NOTE(review): unit and sign are decided by the code that fills this in; confirm there.</summary>
public int Delay { get; set; }
/// <summary>Name of the station the status refers to; starts out as an empty string.</summary>
public string Station { get; set; } = "";
/// <summary>Kind of event the status refers to.</summary>
public StatusKind State { get; set; }
}
/// <summary>
/// Mutable implementation of <see cref="ITrainStopDescription"/> that is filled in while scraping.
/// </summary>
internal record TrainStopDescription : ITrainStopDescription {
    // Backing list for Notes; it is only appended to by the Add*Note methods below.
    private List<ITrainStopNote> ModifyableNotes { get; } = new();

    public string Name { get; set; } = "";
    public int Km { get; set; }
    public int? StoppingTime { get; set; }
    public string? Platform { get; set; }
    public ITrainStopArrDep? Arrival { get; private set; }
    public ITrainStopArrDep? Departure { get; private set; }

    /// <summary>Read-only wrapper around the notes added so far.</summary>
    public IReadOnlyList<object> Notes => ModifyableNotes.AsReadOnly();

    /// <summary>
    /// Creates an empty <see cref="TrainStopArrDep"/>, hands it to <paramref name="configurator"/>
    /// and then stores it in <see cref="Arrival"/>. When the callback throws,
    /// <see cref="Arrival"/> is left unchanged.
    /// </summary>
    internal void MakeArrival(Action<TrainStopArrDep> configurator) {
        TrainStopArrDep newArrival = new();
        configurator(newArrival);
        Arrival = newArrival;
    }

    /// <summary>
    /// Creates an empty <see cref="TrainStopArrDep"/>, hands it to <paramref name="configurator"/>
    /// and then stores it in <see cref="Departure"/>. When the callback throws,
    /// <see cref="Departure"/> is left unchanged.
    /// </summary>
    internal void MakeDeparture(Action<TrainStopArrDep> configurator) {
        TrainStopArrDep newDeparture = new();
        configurator(newDeparture);
        Departure = newDeparture;
    }

    private sealed class DepartsAsNote : ITrainStopDepartsAsNote {
        public NoteKind Kind => NoteKind.DepartsAs;
        public string Rank { get; set; } = "";
        public string Number { get; set; } = "";
        public DateTimeOffset DepartureDate { get; set; }
    }

    private sealed class TrainNumberChangeNote : ITrainStopTrainNumberChangeNote {
        public NoteKind Kind => NoteKind.TrainNumberChange;
        public string Rank { get; set; } = "";
        public string Number { get; set; } = "";
    }

    private sealed class ReceivingWagonsNote : ITrainStopReceivingWagonsNote {
        public NoteKind Kind => NoteKind.ReceivingWagons;
        public string Station { get; set; } = "";
    }

    // Implements the detaching-wagons interface so that type checks on the note agree with
    // its Kind (it previously implemented ITrainStopReceivingWagonsNote by mistake).
    private sealed class DetachingWagonsNote : ITrainStopDetachingWagonsNote {
        public NoteKind Kind => NoteKind.DetachingWagons;
        public string Station { get; set; } = "";
    }

    /// <summary>Appends a <see cref="NoteKind.DepartsAs"/> note to <see cref="Notes"/>.</summary>
    internal void AddDepartsAsNote(string rank, string number, DateTimeOffset departureDate) {
        ModifyableNotes.Add(new DepartsAsNote { Rank = rank, Number = number, DepartureDate = departureDate });
    }

    /// <summary>Appends a <see cref="NoteKind.TrainNumberChange"/> note to <see cref="Notes"/>.</summary>
    internal void AddTrainNumberChangeNote(string rank, string number) {
        ModifyableNotes.Add(new TrainNumberChangeNote { Rank = rank, Number = number });
    }

    /// <summary>Appends a <see cref="NoteKind.ReceivingWagons"/> note to <see cref="Notes"/>.</summary>
    internal void AddReceivingWagonsNote(string station) {
        ModifyableNotes.Add(new ReceivingWagonsNote { Station = station });
    }

    /// <summary>Appends a <see cref="NoteKind.DetachingWagons"/> note to <see cref="Notes"/>.</summary>
    internal void AddDetachingWagonsNote(string station) {
        ModifyableNotes.Add(new DetachingWagonsNote { Station = station });
    }
}
/// <summary>
/// Mutable implementation of <see cref="ITrainStopArrDep"/>.
/// </summary>
public record TrainStopArrDep : ITrainStopArrDep {
    public DateTimeOffset ScheduleTime { get; set; }
    public IStatus? Status { get; private set; }

    /// <summary>
    /// Creates an empty status object, hands it to <paramref name="configurator"/> and then
    /// stores it in <see cref="Status"/>.
    /// </summary>
    internal void MakeStatus(Action<Status.Status> configurator) {
        var created = new Status.Status();
        configurator(created);
        // Status is replaced only after the callback returned without throwing.
        Status = created;
    }
}
#endregion
#region JSON Converters
namespace JsonConverters {
/// <summary>
/// Converter factory that maps <see cref="StatusKind"/> to and from the JSON strings
/// "passing", "arrival" and "departure".
/// </summary>
internal class StatusKindConverter : JsonConverterFactory {
    public override bool CanConvert(Type typeToConvert) => typeToConvert == typeof(StatusKind);

    public override JsonConverter? CreateConverter(Type typeToConvert, JsonSerializerOptions options) =>
        new Converter();

    private class Converter : JsonConverter<StatusKind> {
        /// <summary>Reads a JSON string and maps it to a <see cref="StatusKind"/>.</summary>
        /// <exception cref="NotImplementedException">The string is not one of the known names.</exception>
        public override StatusKind Read(
            ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options
        ) {
            var text = reader.GetString();
            switch (text) {
                case "arrival":
                    return StatusKind.Arrival;
                case "departure":
                    return StatusKind.Departure;
                case "passing":
                    return StatusKind.Passing;
                default:
                    throw new NotImplementedException();
            }
        }

        /// <summary>Writes the JSON string that corresponds to <paramref name="value"/>.</summary>
        /// <exception cref="NotImplementedException">The value is not a named member of the enum.</exception>
        public override void Write(Utf8JsonWriter writer, StatusKind value, JsonSerializerOptions options) {
            string text;
            switch (value) {
                case StatusKind.Passing:
                    text = "passing";
                    break;
                case StatusKind.Arrival:
                    text = "arrival";
                    break;
                case StatusKind.Departure:
                    text = "departure";
                    break;
                default:
                    throw new NotImplementedException();
            }
            writer.WriteStringValue(text);
        }
    }
}
/// <summary>
/// Converter factory that maps <see cref="NoteKind"/> to and from the JSON strings
/// "departsAs", "trainNumberChange", "receivingWagons" and "detachingWagons".
/// </summary>
internal class NoteKindConverter : JsonConverterFactory {
    public override bool CanConvert(Type typeToConvert) => typeToConvert == typeof(NoteKind);

    public override JsonConverter? CreateConverter(Type typeToConvert, JsonSerializerOptions options) =>
        new Converter();

    private class Converter : JsonConverter<NoteKind> {
        /// <summary>Reads a JSON string and maps it to a <see cref="NoteKind"/>.</summary>
        /// <exception cref="NotImplementedException">The string is not one of the known names.</exception>
        public override NoteKind Read(
            ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options
        ) {
            var text = reader.GetString();
            switch (text) {
                case "departsAs":
                    return NoteKind.DepartsAs;
                case "trainNumberChange":
                    return NoteKind.TrainNumberChange;
                case "receivingWagons":
                    return NoteKind.ReceivingWagons;
                case "detachingWagons":
                    return NoteKind.DetachingWagons;
                default:
                    throw new NotImplementedException();
            }
        }

        /// <summary>Writes the JSON string that corresponds to <paramref name="value"/>.</summary>
        /// <exception cref="NotImplementedException">The value is not a named member of the enum.</exception>
        public override void Write(Utf8JsonWriter writer, NoteKind value, JsonSerializerOptions options) {
            string text;
            switch (value) {
                case NoteKind.DepartsAs:
                    text = "departsAs";
                    break;
                case NoteKind.TrainNumberChange:
                    text = "trainNumberChange";
                    break;
                case NoteKind.DetachingWagons:
                    text = "detachingWagons";
                    break;
                case NoteKind.ReceivingWagons:
                    text = "receivingWagons";
                    break;
                default:
                    throw new NotImplementedException();
            }
            writer.WriteStringValue(text);
        }
    }
}
}
#endregion
}
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;
using InfoferScraper.Models.Status;
using InfoferScraper.Models.Train.JsonConverters;
namespace InfoferScraper.Models.Train {
#region Interfaces
/// <summary>
/// Read-only view of everything scraped for one train.
/// </summary>
public interface ITrainScrapeResult {
    /// <summary>Rank of the train, as scraped.</summary>
    string Rank { get; }

    /// <summary>Number of the train, as scraped.</summary>
    string Number { get; }

    /// <summary>
    /// Date in the DD.MM.YYYY format.
    /// The value is copied unchanged from the scraped result.
    /// </summary>
    string Date { get; }

    /// <summary>Name of the operator of the train, as scraped.</summary>
    string Operator { get; }

    /// <summary>Groups that make up the train, each with its own route and stops.</summary>
    IReadOnlyList<ITrainGroup> Groups { get; }
}
/// <summary>
/// Read-only view of one group of a train: its route, its status (when known) and its stops.
/// </summary>
public interface ITrainGroup {
    /// <summary>Route covered by this group.</summary>
    ITrainRoute Route { get; }

    /// <summary>Status of this group, or <c>null</c> when none was set.</summary>
    ITrainStatus? Status { get; }

    /// <summary>Stops of this group, in the order they were added.</summary>
    IReadOnlyList<ITrainStopDescription> Stations { get; }
}
/// <summary>
/// Read-only view of a route, described by its two end points.
/// </summary>
public interface ITrainRoute {
    /// <summary>Name of the place the route starts from.</summary>
    string From { get; }

    /// <summary>Name of the place the route ends at.</summary>
    string To { get; }
}
/// <summary>
/// Read-only view of the status reported for a train group.
/// </summary>
public interface ITrainStatus {
    /// <summary>Reported delay. NOTE(review): unit and sign convention are set by whoever fills the implementation; confirm against the scraper.</summary>
    int Delay { get; }

    /// <summary>Name of the station the status refers to.</summary>
    string Station { get; }

    /// <summary>Kind of event (passing, arrival or departure) the status refers to.</summary>
    StatusKind State { get; }
}
/// <summary>
/// Read-only view of one stop on a train's route.
/// </summary>
public interface ITrainStopDescription {
    /// <summary>Name of the station.</summary>
    string Name { get; }

    /// <summary>Link name of the station. NOTE(review): presumably the identifier used in the station's URL; confirm against the scraper.</summary>
    string LinkName { get; }

    /// <summary>Kilometre value associated with the stop.</summary>
    int Km { get; }

    /// <summary>
    /// The time the train waits in the station, in seconds; <c>null</c> when not set.
    /// </summary>
    int? StoppingTime { get; }

    /// <summary>Platform of the stop, or <c>null</c> when not set.</summary>
    string? Platform { get; }

    /// <summary>Arrival details, or <c>null</c> when not set.</summary>
    ITrainStopArrDep? Arrival { get; }

    /// <summary>Departure details, or <c>null</c> when not set.</summary>
    ITrainStopArrDep? Departure { get; }

    /// <summary>Notes attached to the stop, exposed as plain objects.</summary>
    IReadOnlyList<object> Notes { get; }
}
/// <summary>
/// Common shape of every note attached to a stop.
/// </summary>
public interface ITrainStopNote {
    /// <summary>Discriminator telling which kind of note this is.</summary>
    NoteKind Kind { get; }
}
/// <summary>
/// Note carrying the rank and number a train changes to at a stop.
/// </summary>
public interface ITrainStopTrainNumberChangeNote : ITrainStopNote {
    /// <summary>Rank carried by the note.</summary>
    string Rank { get; }

    /// <summary>Train number carried by the note.</summary>
    string Number { get; }
}
/// <summary>
/// Note carrying the rank, number and date a train departs as from a stop.
/// </summary>
public interface ITrainStopDepartsAsNote : ITrainStopNote {
    /// <summary>Rank carried by the note.</summary>
    string Rank { get; }

    /// <summary>Train number carried by the note.</summary>
    string Number { get; }

    /// <summary>Departure date carried by the note.</summary>
    DateTimeOffset DepartureDate { get; }
}
/// <summary>
/// Note naming the station that wagons are detached for.
/// </summary>
public interface ITrainStopDetachingWagonsNote : ITrainStopNote {
    /// <summary>Station named by the note.</summary>
    string Station { get; }
}
/// <summary>
/// Note naming the station that wagons are received from.
/// </summary>
public interface ITrainStopReceivingWagonsNote : ITrainStopNote {
    /// <summary>Station named by the note.</summary>
    string Station { get; }
}
/// <summary>
/// Read-only view of an arrival or a departure at a stop.
/// </summary>
public interface ITrainStopArrDep {
    /// <summary>Scheduled time of the arrival or departure.</summary>
    DateTimeOffset ScheduleTime { get; }

    /// <summary>Status of the arrival or departure, or <c>null</c> when none was set.</summary>
    IStatus? Status { get; }
}
#endregion
/// <summary>
/// Kind of event a train status refers to.
/// Converted to and from JSON strings by <see cref="StatusKindConverter"/>.
/// </summary>
[JsonConverter(typeof(StatusKindConverter))]
public enum StatusKind {
    Passing = 0,
    Arrival = 1,
    Departure = 2,
}
/// <summary>
/// Discriminator for the notes that can be attached to a stop.
/// Converted to and from JSON strings by <see cref="NoteKindConverter"/>.
/// </summary>
[JsonConverter(typeof(NoteKindConverter))]
public enum NoteKind {
    TrainNumberChange = 0,
    DetachingWagons = 1,
    ReceivingWagons = 2,
    DepartsAs = 3,
}
#region Implementations
/// <summary>
/// Mutable implementation of <see cref="ITrainScrapeResult"/> that is filled in while scraping.
/// </summary>
internal record TrainScrapeResult : ITrainScrapeResult {
    // Backing list for Groups; it is only appended to by AddTrainGroup.
    private List<ITrainGroup> ModifyableGroups { get; set; } = new();

    public string Rank { get; set; } = "";
    public string Number { get; set; } = "";
    public string Date { get; set; } = "";
    public string Operator { get; set; } = "";

    /// <summary>Read-only wrapper around the groups added so far.</summary>
    public IReadOnlyList<ITrainGroup> Groups => ModifyableGroups.AsReadOnly();

    /// <summary>
    /// Creates an empty <see cref="TrainGroup"/>, hands it to <paramref name="configurator"/>
    /// to be filled in, and then appends it to <see cref="Groups"/>.
    /// </summary>
    /// <param name="configurator">Callback that populates the new group.</param>
    internal void AddTrainGroup(Action<TrainGroup> configurator) {
        var created = new TrainGroup();
        configurator(created);
        // The group is appended only after the callback returned without throwing.
        ModifyableGroups.Add(created);
    }
}
/// <summary>
/// Mutable implementation of <see cref="ITrainGroup"/> that is filled in while scraping.
/// </summary>
internal record TrainGroup : ITrainGroup {
    // Backing list for Stations; it is only appended to by AddStopDescription.
    private List<ITrainStopDescription> ModifyableStations { get; set; } = new();

    public ITrainRoute Route { get; init; } = new TrainRoute();
    public ITrainStatus? Status { get; private set; }

    /// <summary>Read-only wrapper around the stops added so far.</summary>
    public IReadOnlyList<ITrainStopDescription> Stations => ModifyableStations.AsReadOnly();

    /// <summary>
    /// Creates an empty <see cref="TrainStopDescription"/>, hands it to
    /// <paramref name="configurator"/> and then appends it to <see cref="Stations"/>.
    /// </summary>
    internal void AddStopDescription(Action<TrainStopDescription> configurator) {
        var stop = new TrainStopDescription();
        configurator(stop);
        // The stop is appended only after the callback returned without throwing.
        ModifyableStations.Add(stop);
    }

    /// <summary>
    /// Passes the current <see cref="Route"/>, cast to <see cref="TrainRoute"/>, to
    /// <paramref name="configurator"/> so it can be modified in place.
    /// </summary>
    internal void ConfigureRoute(Action<TrainRoute> configurator) => configurator((TrainRoute)Route);

    /// <summary>
    /// Creates an empty <see cref="TrainStatus"/>, hands it to <paramref name="configurator"/>
    /// and then stores it in <see cref="Status"/>.
    /// </summary>
    internal void MakeStatus(Action<TrainStatus> configurator) {
        var status = new TrainStatus();
        configurator(status);
        // Status is replaced only after the callback returned without throwing.
        Status = status;
    }
}
/// <summary>
/// Mutable implementation of <see cref="ITrainRoute"/>; both ends start out as empty strings.
/// </summary>
internal record TrainRoute : ITrainRoute {
    public string From { get; set; } = "";
    public string To { get; set; } = "";
}
/// <summary>
/// Mutable implementation of <see cref="ITrainStatus"/>.
/// </summary>
internal record TrainStatus : ITrainStatus {
/// <summary>Reported delay. NOTE(review): unit and sign are decided by the code that fills this in; confirm there.</summary>
public int Delay { get; set; }
/// <summary>Name of the station the status refers to; starts out as an empty string.</summary>
public string Station { get; set; } = "";
/// <summary>Kind of event the status refers to.</summary>
public StatusKind State { get; set; }
}
/// <summary>
/// Mutable implementation of <see cref="ITrainStopDescription"/> that is filled in while scraping.
/// </summary>
internal record TrainStopDescription : ITrainStopDescription {
    // Backing list for Notes; it is only appended to by the Add*Note methods below.
    private List<ITrainStopNote> ModifyableNotes { get; } = new();

    public string Name { get; set; } = "";
    public string LinkName { get; set; } = "";
    public int Km { get; set; }
    public int? StoppingTime { get; set; }
    public string? Platform { get; set; }
    public ITrainStopArrDep? Arrival { get; private set; }
    public ITrainStopArrDep? Departure { get; private set; }

    /// <summary>Read-only wrapper around the notes added so far.</summary>
    public IReadOnlyList<object> Notes => ModifyableNotes.AsReadOnly();

    /// <summary>
    /// Creates an empty <see cref="TrainStopArrDep"/>, hands it to <paramref name="configurator"/>
    /// and then stores it in <see cref="Arrival"/>. When the callback throws,
    /// <see cref="Arrival"/> is left unchanged.
    /// </summary>
    internal void MakeArrival(Action<TrainStopArrDep> configurator) {
        TrainStopArrDep newArrival = new();
        configurator(newArrival);
        Arrival = newArrival;
    }

    /// <summary>
    /// Creates an empty <see cref="TrainStopArrDep"/>, hands it to <paramref name="configurator"/>
    /// and then stores it in <see cref="Departure"/>. When the callback throws,
    /// <see cref="Departure"/> is left unchanged.
    /// </summary>
    internal void MakeDeparture(Action<TrainStopArrDep> configurator) {
        TrainStopArrDep newDeparture = new();
        configurator(newDeparture);
        Departure = newDeparture;
    }

    private sealed class DepartsAsNote : ITrainStopDepartsAsNote {
        public NoteKind Kind => NoteKind.DepartsAs;
        public string Rank { get; set; } = "";
        public string Number { get; set; } = "";
        public DateTimeOffset DepartureDate { get; set; }
    }

    private sealed class TrainNumberChangeNote : ITrainStopTrainNumberChangeNote {
        public NoteKind Kind => NoteKind.TrainNumberChange;
        public string Rank { get; set; } = "";
        public string Number { get; set; } = "";
    }

    private sealed class ReceivingWagonsNote : ITrainStopReceivingWagonsNote {
        public NoteKind Kind => NoteKind.ReceivingWagons;
        public string Station { get; set; } = "";
    }

    // Implements the detaching-wagons interface so that type checks on the note agree with
    // its Kind (it previously implemented ITrainStopReceivingWagonsNote by mistake).
    private sealed class DetachingWagonsNote : ITrainStopDetachingWagonsNote {
        public NoteKind Kind => NoteKind.DetachingWagons;
        public string Station { get; set; } = "";
    }

    /// <summary>Appends a <see cref="NoteKind.DepartsAs"/> note to <see cref="Notes"/>.</summary>
    internal void AddDepartsAsNote(string rank, string number, DateTimeOffset departureDate) {
        ModifyableNotes.Add(new DepartsAsNote { Rank = rank, Number = number, DepartureDate = departureDate });
    }

    /// <summary>Appends a <see cref="NoteKind.TrainNumberChange"/> note to <see cref="Notes"/>.</summary>
    internal void AddTrainNumberChangeNote(string rank, string number) {
        ModifyableNotes.Add(new TrainNumberChangeNote { Rank = rank, Number = number });
    }

    /// <summary>Appends a <see cref="NoteKind.ReceivingWagons"/> note to <see cref="Notes"/>.</summary>
    internal void AddReceivingWagonsNote(string station) {
        ModifyableNotes.Add(new ReceivingWagonsNote { Station = station });
    }

    /// <summary>Appends a <see cref="NoteKind.DetachingWagons"/> note to <see cref="Notes"/>.</summary>
    internal void AddDetachingWagonsNote(string station) {
        ModifyableNotes.Add(new DetachingWagonsNote { Station = station });
    }
}
/// <summary>
/// Mutable implementation of <see cref="ITrainStopArrDep"/>.
/// </summary>
public record TrainStopArrDep : ITrainStopArrDep {
    public DateTimeOffset ScheduleTime { get; set; }
    public IStatus? Status { get; private set; }

    /// <summary>
    /// Creates an empty status object, hands it to <paramref name="configurator"/> and then
    /// stores it in <see cref="Status"/>.
    /// </summary>
    internal void MakeStatus(Action<Status.Status> configurator) {
        var created = new Status.Status();
        configurator(created);
        // Status is replaced only after the callback returned without throwing.
        Status = created;
    }
}
#endregion
#region JSON Converters
namespace JsonConverters {
/// <summary>
/// Converter factory that maps <see cref="StatusKind"/> to and from the JSON strings
/// "passing", "arrival" and "departure".
/// </summary>
internal class StatusKindConverter : JsonConverterFactory {
    public override bool CanConvert(Type typeToConvert) => typeToConvert == typeof(StatusKind);

    public override JsonConverter? CreateConverter(Type typeToConvert, JsonSerializerOptions options) =>
        new Converter();

    private class Converter : JsonConverter<StatusKind> {
        /// <summary>Reads a JSON string and maps it to a <see cref="StatusKind"/>.</summary>
        /// <exception cref="NotImplementedException">The string is not one of the known names.</exception>
        public override StatusKind Read(
            ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options
        ) {
            var text = reader.GetString();
            switch (text) {
                case "arrival":
                    return StatusKind.Arrival;
                case "departure":
                    return StatusKind.Departure;
                case "passing":
                    return StatusKind.Passing;
                default:
                    throw new NotImplementedException();
            }
        }

        /// <summary>Writes the JSON string that corresponds to <paramref name="value"/>.</summary>
        /// <exception cref="NotImplementedException">The value is not a named member of the enum.</exception>
        public override void Write(Utf8JsonWriter writer, StatusKind value, JsonSerializerOptions options) {
            string text;
            switch (value) {
                case StatusKind.Passing:
                    text = "passing";
                    break;
                case StatusKind.Arrival:
                    text = "arrival";
                    break;
                case StatusKind.Departure:
                    text = "departure";
                    break;
                default:
                    throw new NotImplementedException();
            }
            writer.WriteStringValue(text);
        }
    }
}
/// <summary>
/// Converter factory that maps <see cref="NoteKind"/> to and from the JSON strings
/// "departsAs", "trainNumberChange", "receivingWagons" and "detachingWagons".
/// </summary>
internal class NoteKindConverter : JsonConverterFactory {
    public override bool CanConvert(Type typeToConvert) => typeToConvert == typeof(NoteKind);

    public override JsonConverter? CreateConverter(Type typeToConvert, JsonSerializerOptions options) =>
        new Converter();

    private class Converter : JsonConverter<NoteKind> {
        /// <summary>Reads a JSON string and maps it to a <see cref="NoteKind"/>.</summary>
        /// <exception cref="NotImplementedException">The string is not one of the known names.</exception>
        public override NoteKind Read(
            ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options
        ) {
            var text = reader.GetString();
            switch (text) {
                case "departsAs":
                    return NoteKind.DepartsAs;
                case "trainNumberChange":
                    return NoteKind.TrainNumberChange;
                case "receivingWagons":
                    return NoteKind.ReceivingWagons;
                case "detachingWagons":
                    return NoteKind.DetachingWagons;
                default:
                    throw new NotImplementedException();
            }
        }

        /// <summary>Writes the JSON string that corresponds to <paramref name="value"/>.</summary>
        /// <exception cref="NotImplementedException">The value is not a named member of the enum.</exception>
        public override void Write(Utf8JsonWriter writer, NoteKind value, JsonSerializerOptions options) {
            string text;
            switch (value) {
                case NoteKind.DepartsAs:
                    text = "departsAs";
                    break;
                case NoteKind.TrainNumberChange:
                    text = "trainNumberChange";
                    break;
                case NoteKind.DetachingWagons:
                    text = "detachingWagons";
                    break;
                case NoteKind.ReceivingWagons:
                    text = "receivingWagons";
                    break;
                default:
                    throw new NotImplementedException();
            }
            writer.WriteStringValue(text);
        }
    }
}
}
#endregion
}

483
scraper/src/Scrapers/Train.cs

@ -1,239 +1,244 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Net.Http;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using AngleSharp;
using AngleSharp.Dom;
using AngleSharp.Html.Dom;
using Flurl;
using InfoferScraper.Models.Train;
using NodaTime;
using NodaTime.Extensions;
using scraper.Exceptions;
namespace InfoferScraper.Scrapers {
/// <summary>
/// Scrapes the details of a train (rank, number, date, operator, groups, status and stops)
/// from the website at <c>BaseUrl</c>.
/// </summary>
public static class TrainScraper {
    private const string BaseUrl = "https://mersultrenurilor.infofer.ro/ro-RO/";

    // Heading of the result page; groups: rank, number, date.
    private static readonly Regex TrainInfoRegex = new(@"^([A-Z-]+)\s([0-9]+)\sîn\s([0-9.]+)$");
    private static readonly Regex OperatorRegex = new(@"^Operat\sde\s(.+)$");
    private static readonly Regex RouteRegex =
        new(@$"^Parcurs\stren\s([{Utils.RoLetters} ]+)[-–]([{Utils.RoLetters}\s]+)$");

    // Status line of a group; groups: delay in minutes (empty for "Fără"), late/early wording,
    // event wording, station.
    private static readonly Regex SlRegex =
        new(
            @"^(?:Fără|([0-9]+)\smin)\s(întârziere|mai\sdevreme)\sla\s(trecerea\sfără\soprire\sprin|sosirea\sîn|plecarea\sdin)\s(.+)\.$");

    // Keyed by the first letter of the event wording captured by SlRegex.
    private static readonly Dictionary<char, StatusKind> SlStateMap = new() {
        { 't', StatusKind.Passing },
        { 's', StatusKind.Arrival },
        { 'p', StatusKind.Departure },
    };

    private static readonly Regex KmRegex = new(@"^km\s([0-9]+)$");
    private static readonly Regex StoppingTimeRegex = new(@"^([0-9]+)\s(min|sec)\soprire$");
    private static readonly Regex PlatformRegex = new(@"^linia\s(.+)$");

    // Per-stop arrival/departure status; groups: "la timp", signed delay, trailing "*".
    private static readonly Regex StationArrdepStatusRegex =
        new(@"^(?:(la timp)|(?:((?:\+|-)[0-9]+) min \((?:(?:întârziere)|(?:mai devreme))\)))(\*?)$");

    private static readonly Regex TrainNumberChangeNoteRegex =
        new(@"^Trenul își schimbă numărul în\s([A-Z-]+)\s([0-9]+)$");

    // The date separators are escaped so that only a literal '.' is accepted between the parts.
    private static readonly Regex DepartsAsNoteRegex =
        new(@"^Trenul pleacă cu numărul\s([A-Z-]+)\s([0-9]+)\sîn\s([0-9]{2})\.([0-9]{2})\.([0-9]{4})$");

    private static readonly Regex ReceivingWagonsNoteRegex =
        new(@"^Trenul primește vagoane de la\s(.+)\.$");
    private static readonly Regex DetachingWagonsNoteRegex =
        new(@"^Trenul detașează vagoane pentru stația\s(.+)\.$");

    private static readonly DateTimeZone BucharestTz = DateTimeZoneProviders.Tzdb["Europe/Bucharest"];

    private static readonly CookieContainer CookieContainer = new();

    // Single shared client; cookies are kept in CookieContainer across requests.
    private static readonly HttpClient HttpClient = new(new HttpClientHandler {
        CookieContainer = CookieContainer,
        UseCookies = true,
    }) {
        BaseAddress = new Uri(BaseUrl),
        DefaultRequestVersion = new Version(2, 0),
    };

    /// <summary>
    /// Downloads and parses the page of the given train.
    /// </summary>
    /// <param name="trainNumber">Train number, appended to the "Tren" path.</param>
    /// <param name="dateOverride">
    /// Optional date; it is converted to the Europe/Bucharest zone and sent as the
    /// <c>Date</c> query parameter. When <c>null</c>, no date is sent.
    /// </param>
    /// <returns>
    /// The scraped result, or <c>null</c> when the response has no top-level <c>div</c>
    /// holding the train information.
    /// </returns>
    /// <exception cref="TrainNotThisDayException">
    /// The response has the train information but no results <c>div</c>.
    /// </exception>
    public static async Task<ITrainScrapeResult?> Scrape(string trainNumber, DateTimeOffset? dateOverride = null) {
        var dateOverrideInstant = dateOverride?.ToInstant().InZone(BucharestTz);
        dateOverride = dateOverrideInstant?.ToDateTimeOffset();
        TrainScrapeResult result = new();
        var asConfig = Configuration.Default;
        var asContext = BrowsingContext.New(asConfig);

        var firstUrl = "Tren"
            .AppendPathSegment(trainNumber);
        if (dateOverride != null) {
            // Formatted with the invariant culture so that the query parameter does not depend
            // on the calendar or digits of the machine's current culture.
            firstUrl = firstUrl.SetQueryParam(
                "Date",
                dateOverride.Value.ToString("d.MM.yyyy", System.Globalization.CultureInfo.InvariantCulture)
            );
        }

        // First request: fetch the search form and collect all of its named inputs.
        var firstResponse = await HttpClient.GetStringAsync(firstUrl);
        var firstDocument = await asContext.OpenAsync(req => req.Content(firstResponse));
        var firstForm = firstDocument.GetElementById("form-search")!;
        var firstResult = firstForm
            .QuerySelectorAll<IHtmlInputElement>("input")
            .Where(elem => elem.Name != null)
            .ToDictionary(elem => elem.Name!, elem => elem.Value);

        // Second request: post the collected inputs back to get the actual results.
        var secondUrl = "".AppendPathSegments("Trains", "TrainsResult");
        using var secondResponse = await HttpClient.PostAsync(
            secondUrl,
#pragma warning disable CS8620
            new FormUrlEncodedContent(firstResult)
#pragma warning restore CS8620
        );
        var secondResponseContent = await secondResponse.Content.ReadAsStringAsync();
        var secondDocument = await asContext.OpenAsync(
            req => req.Content(secondResponseContent)
        );

        // The first top-level div holds the train information, the fourth one the results.
        var (trainInfoDiv, (_, (_, (resultsDiv, _)))) = secondDocument
            .QuerySelectorAll("body > div");
        if (trainInfoDiv == null) {
            return null;
        }
        if (resultsDiv == null) {
            throw new TrainNotThisDayException();
        }

        trainInfoDiv = trainInfoDiv.QuerySelectorAll(":scope > div > div").First();
        (result.Rank, (result.Number, (result.Date, _))) = (TrainInfoRegex.Match(
            trainInfoDiv.QuerySelector(":scope > h2")!.Text().WithCollapsedSpaces()
        ).Groups as IEnumerable<Group>).Select(g => g.Value).Skip(1);
        var (scrapedDateD, (scrapedDateM, (scrapedDateY, _))) = result.Date
            .Split('.')
            .Select(int.Parse);
        var date = new DateTime(scrapedDateY, scrapedDateM, scrapedDateD);
        result.Operator = (OperatorRegex.Match(
            trainInfoDiv.QuerySelector(":scope > p")!.Text().WithCollapsedSpaces()
        ).Groups as IEnumerable<Group>).Skip(1).First().Value;

        foreach (var groupDiv in resultsDiv.QuerySelectorAll(":scope > div")) {
            result.AddTrainGroup(group => {
                var statusDiv = groupDiv.QuerySelectorAll(":scope > div").First();
                var routeText = statusDiv.QuerySelector(":scope > h4")!.Text().WithCollapsedSpaces();
                group.ConfigureRoute(route => {
                    (route.From, (route.To, _)) = (RouteRegex.Match(routeText).Groups as IEnumerable<Group>).Skip(1)
                        .Select(g => g.Value);
                });

                try {
                    var statusLineMatch =
                        SlRegex.Match(statusDiv.QuerySelector(":scope > div")!.Text().WithCollapsedSpaces());
                    var (slmDelay, (slmLate, (slmArrival, (slmStation, _)))) =
                        (statusLineMatch.Groups as IEnumerable<Group>).Skip(1).Select(g => g.Value);
                    group.MakeStatus(status => {
                        // No captured minutes means no delay; "mai devreme" is stored as negative.
                        status.Delay = string.IsNullOrEmpty(slmDelay) ? 0 :
                            slmLate == "întârziere" ? int.Parse(slmDelay) : -int.Parse(slmDelay);
                        status.Station = slmStation;
                        status.State = SlStateMap[slmArrival[0]];
                    });
                }
                catch {
                    // Deliberately best-effort: when the status line is missing or cannot be
                    // parsed, the group is left without a status.
                }

                Utils.DateTimeSequencer dtSeq = new(date.Year, date.Month, date.Day);
                var stations = statusDiv.QuerySelectorAll(":scope > ul > li");
                foreach (var station in stations) {
                    group.AddStopDescription(stopDescription => {
                        var (left, (middle, (right, _))) = station
                            .QuerySelectorAll(":scope > div > div");
                        var (stopDetails, (stopNotes, _)) = middle
                            .QuerySelectorAll(":scope > div > div > div");

                        // Queried once; by index: 0 = name, 1 = km, 2 = stopping time, 3 = platform.
                        var detailDivs = stopDetails.QuerySelectorAll(":scope > div");

                        stopDescription.Name = detailDivs[0]
                            .Text()
                            .WithCollapsedSpaces();

                        var scrapedKm = detailDivs[1]
                            .Text()
                            .WithCollapsedSpaces();
                        stopDescription.Km = int.Parse(
                            (KmRegex.Match(scrapedKm).Groups as IEnumerable<Group>).Skip(1).First().Value
                        );

                        var scrapedStoppingTime = detailDivs[2]
                            .Text()
                            .WithCollapsedSpaces();
                        if (!string.IsNullOrEmpty(scrapedStoppingTime)) {
                            var (stValue, (stMinsec, _)) =
                                (StoppingTimeRegex.Match(scrapedStoppingTime).Groups as IEnumerable<Group>)
                                .Skip(1)
                                .Select(g => g.Value);
                            stopDescription.StoppingTime = int.Parse(stValue);
                            // StoppingTime is stored in seconds.
                            if (stMinsec == "min") stopDescription.StoppingTime *= 60;
                        }

                        var scrapedPlatform = detailDivs[3]
                            .Text()
                            .WithCollapsedSpaces();
                        if (!string.IsNullOrEmpty(scrapedPlatform))
                            stopDescription.Platform = PlatformRegex.Match(scrapedPlatform).Groups[1].Value;

                        // Fills arrDep from one side cell of the stop. Throws
                        // OperationCanceledException when the cell holds no time, which the
                        // callers below catch to skip that arrival or departure.
                        void ScrapeTime(IElement element, ref TrainStopArrDep arrDep) {
                            var parts = element.QuerySelectorAll(":scope > div > div > div");
                            if (parts.Length == 0) throw new OperationCanceledException();
                            var time = parts[0];
                            var scrapedTime = time.Text().WithCollapsedSpaces();
                            var (stHour, (stMin, _)) = scrapedTime.Split(':').Select(int.Parse);
                            arrDep.ScheduleTime = BucharestTz.AtLeniently(dtSeq.Next(stHour, stMin).ToLocalDateTime())
                                .ToDateTimeOffset();
                            if (parts.Length < 2) return;
                            var statusElement = parts[1];
                            var (onTime, (delay, (approx, _))) = (StationArrdepStatusRegex.Match(
                                statusElement.Text().WithCollapsedSpaces(replaceWith: " ")
                            ).Groups as IEnumerable<Group>).Skip(1).Select(g => g.Value);
                            arrDep.MakeStatus(status => {
                                status.Delay = string.IsNullOrEmpty(onTime) ? int.Parse(delay) : 0;
                                // Real is false when the status text ends with "*".
                                status.Real = string.IsNullOrEmpty(approx);
                            });
                        }

                        try {
                            stopDescription.MakeArrival(arrival => { ScrapeTime(left, ref arrival); });
                        }
                        catch (OperationCanceledException) { }
                        try {
                            stopDescription.MakeDeparture(departure => { ScrapeTime(right, ref departure); });
                        }
                        catch (OperationCanceledException) { }

                        // Notes that match none of the known patterns are skipped.
                        foreach (var noteDiv in stopNotes.QuerySelectorAll(":scope > div > div")) {
                            var noteText = noteDiv.Text().WithCollapsedSpaces();
                            Match trainNumberChangeMatch, departsAsMatch, detachingWagons, receivingWagons;
                            if ((trainNumberChangeMatch = TrainNumberChangeNoteRegex.Match(noteText)).Success) {
                                stopDescription.AddTrainNumberChangeNote(trainNumberChangeMatch.Groups[1].Value, trainNumberChangeMatch.Groups[2].Value);
                            }
                            else if ((departsAsMatch = DepartsAsNoteRegex.Match(noteText)).Success) {
                                var groups = departsAsMatch.Groups;
                                // Groups 3, 4 and 5 are day, month and year.
                                var departureDate = BucharestTz.AtStrictly(new(int.Parse(groups[5].Value), int.Parse(groups[4].Value), int.Parse(groups[3].Value), 0, 0));
                                stopDescription.AddDepartsAsNote(groups[1].Value, groups[2].Value, departureDate.ToDateTimeOffset());
                            }
                            else if ((detachingWagons = DetachingWagonsNoteRegex.Match(noteText)).Success) {
                                stopDescription.AddDetachingWagonsNote(detachingWagons.Groups[1].Value);
                            }
                            else if ((receivingWagons = ReceivingWagonsNoteRegex.Match(noteText)).Success) {
                                stopDescription.AddReceivingWagonsNote(receivingWagons.Groups[1].Value);
                            }
                        }
                    });
                }
            });
        }

        return result;
    }
}
} // namespace
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Net.Http;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using AngleSharp;
using AngleSharp.Dom;
using AngleSharp.Html.Dom;
using Flurl;
using InfoferScraper.Models.Train;
using NodaTime;
using NodaTime.Extensions;
using scraper.Exceptions;
namespace InfoferScraper.Scrapers {
/// <summary>
/// Scrapes the details of a train (rank, number, date, operator, groups, status and stops)
/// from the website at <c>BaseUrl</c>.
/// </summary>
public static class TrainScraper {
    private const string BaseUrl = "https://mersultrenurilor.infofer.ro/ro-RO/";

    // Heading of the result page; groups: rank, number, date.
    private static readonly Regex TrainInfoRegex = new(@"^([A-Z-]+)\s([0-9]+)\sîn\s([0-9.]+)$");
    private static readonly Regex OperatorRegex = new(@"^Operat\sde\s(.+)$");
    private static readonly Regex RouteRegex =
        new(@$"^Parcurs\stren\s([{Utils.RoLetters} ]+)[-–]([{Utils.RoLetters}\s]+)$");

    // Status line of a group; groups: delay in minutes (empty for "Fără"), late/early wording,
    // event wording, station.
    private static readonly Regex SlRegex =
        new(
            @"^(?:Fără|([0-9]+)\smin)\s(întârziere|mai\sdevreme)\sla\s(trecerea\sfără\soprire\sprin|sosirea\sîn|plecarea\sdin)\s(.+)\.$");

    // Keyed by the first letter of the event wording captured by SlRegex.
    private static readonly Dictionary<char, StatusKind> SlStateMap = new() {
        { 't', StatusKind.Passing },
        { 's', StatusKind.Arrival },
        { 'p', StatusKind.Departure },
    };

    private static readonly Regex KmRegex = new(@"^km\s([0-9]+)$");
    private static readonly Regex StoppingTimeRegex = new(@"^([0-9]+)\s(min|sec)\soprire$");
    private static readonly Regex PlatformRegex = new(@"^linia\s(.+)$");

    // Per-stop arrival/departure status; groups: "la timp", signed delay, trailing "*".
    private static readonly Regex StationArrdepStatusRegex =
        new(@"^(?:(la timp)|(?:((?:\+|-)[0-9]+) min \((?:(?:întârziere)|(?:mai devreme))\)))(\*?)$");

    private static readonly Regex TrainNumberChangeNoteRegex =
        new(@"^Trenul își schimbă numărul în\s([A-Z-]+)\s([0-9]+)$");

    // The date separators are escaped so that only a literal '.' is accepted between the parts.
    private static readonly Regex DepartsAsNoteRegex =
        new(@"^Trenul pleacă cu numărul\s([A-Z-]+)\s([0-9]+)\sîn\s([0-9]{2})\.([0-9]{2})\.([0-9]{4})$");

    private static readonly Regex ReceivingWagonsNoteRegex =
        new(@"^Trenul primește vagoane de la\s(.+)\.$");
    private static readonly Regex DetachingWagonsNoteRegex =
        new(@"^Trenul detașează vagoane pentru stația\s(.+)\.$");

    private static readonly DateTimeZone BucharestTz = DateTimeZoneProviders.Tzdb["Europe/Bucharest"];

    private static readonly CookieContainer CookieContainer = new();

    // Single shared client; cookies are kept in CookieContainer across requests.
    private static readonly HttpClient HttpClient = new(new HttpClientHandler {
        CookieContainer = CookieContainer,
        UseCookies = true,
    }) {
        BaseAddress = new Uri(BaseUrl),
        DefaultRequestVersion = new Version(2, 0),
    };

    /// <summary>
    /// Downloads and parses the page of the given train.
    /// </summary>
    /// <param name="trainNumber">Train number, appended to the "Tren" path.</param>
    /// <param name="dateOverride">
    /// Optional date; it is converted to the Europe/Bucharest zone and sent as the
    /// <c>Date</c> query parameter. When <c>null</c>, no date is sent.
    /// </param>
    /// <returns>
    /// The scraped result, or <c>null</c> when the response has no top-level <c>div</c>
    /// holding the train information.
    /// </returns>
    /// <exception cref="TrainNotThisDayException">
    /// The response has the train information but no results <c>div</c>.
    /// </exception>
    public static async Task<ITrainScrapeResult?> Scrape(string trainNumber, DateTimeOffset? dateOverride = null) {
        var dateOverrideInstant = dateOverride?.ToInstant().InZone(BucharestTz);
        dateOverride = dateOverrideInstant?.ToDateTimeOffset();
        TrainScrapeResult result = new();
        var asConfig = Configuration.Default;
        var asContext = BrowsingContext.New(asConfig);

        var firstUrl = "Tren"
            .AppendPathSegment(trainNumber);
        if (dateOverride != null) {
            // Formatted with the invariant culture so that the query parameter does not depend
            // on the calendar or digits of the machine's current culture.
            firstUrl = firstUrl.SetQueryParam(
                "Date",
                dateOverride.Value.ToString("d.MM.yyyy", System.Globalization.CultureInfo.InvariantCulture)
            );
        }

        // First request: fetch the search form and collect all of its named inputs.
        var firstResponse = await HttpClient.GetStringAsync(firstUrl);
        var firstDocument = await asContext.OpenAsync(req => req.Content(firstResponse));
        var firstForm = firstDocument.GetElementById("form-search")!;
        var firstResult = firstForm
            .QuerySelectorAll<IHtmlInputElement>("input")
            .Where(elem => elem.Name != null)
            .ToDictionary(elem => elem.Name!, elem => elem.Value);

        // Second request: post the collected inputs back to get the actual results.
        var secondUrl = "".AppendPathSegments("Trains", "TrainsResult");
        using var secondResponse = await HttpClient.PostAsync(
            secondUrl,
#pragma warning disable CS8620
            new FormUrlEncodedContent(firstResult)
#pragma warning restore CS8620
        );
        var secondResponseContent = await secondResponse.Content.ReadAsStringAsync();
        var secondDocument = await asContext.OpenAsync(
            req => req.Content(secondResponseContent)
        );

        // The first top-level div holds the train information, the fourth one the results.
        var (trainInfoDiv, (_, (_, (resultsDiv, _)))) = secondDocument
            .QuerySelectorAll("body > div");
        if (trainInfoDiv == null) {
            return null;
        }
        if (resultsDiv == null) {
            throw new TrainNotThisDayException();
        }

        trainInfoDiv = trainInfoDiv.QuerySelectorAll(":scope > div > div").First();
        (result.Rank, (result.Number, (result.Date, _))) = (TrainInfoRegex.Match(
            trainInfoDiv.QuerySelector(":scope > h2")!.Text().WithCollapsedSpaces()
        ).Groups as IEnumerable<Group>).Select(g => g.Value).Skip(1);
        var (scrapedDateD, (scrapedDateM, (scrapedDateY, _))) = result.Date
            .Split('.')
            .Select(int.Parse);
        var date = new DateTime(scrapedDateY, scrapedDateM, scrapedDateD);
        result.Operator = (OperatorRegex.Match(
            trainInfoDiv.QuerySelector(":scope > p")!.Text().WithCollapsedSpaces()
        ).Groups as IEnumerable<Group>).Skip(1).First().Value;

        foreach (var groupDiv in resultsDiv.QuerySelectorAll(":scope > div")) {
            result.AddTrainGroup(group => {
                var statusDiv = groupDiv.QuerySelectorAll(":scope > div").First();
                var routeText = statusDiv.QuerySelector(":scope > h4")!.Text().WithCollapsedSpaces();
                group.ConfigureRoute(route => {
                    (route.From, (route.To, _)) = (RouteRegex.Match(routeText).Groups as IEnumerable<Group>).Skip(1)
                        .Select(g => g.Value);
                });

                try {
                    var statusLineMatch =
                        SlRegex.Match(statusDiv.QuerySelector(":scope > div")!.Text().WithCollapsedSpaces());
                    var (slmDelay, (slmLate, (slmArrival, (slmStation, _)))) =
                        (statusLineMatch.Groups as IEnumerable<Group>).Skip(1).Select(g => g.Value);
                    group.MakeStatus(status => {
                        // No captured minutes means no delay; "mai devreme" is stored as negative.
                        status.Delay = string.IsNullOrEmpty(slmDelay) ? 0 :
                            slmLate == "întârziere" ? int.Parse(slmDelay) : -int.Parse(slmDelay);
                        status.Station = slmStation;
                        status.State = SlStateMap[slmArrival[0]];
                    });
                }
                catch {
                    // Deliberately best-effort: when the status line is missing or cannot be
                    // parsed, the group is left without a status.
                }

                Utils.DateTimeSequencer dtSeq = new(date.Year, date.Month, date.Day);
                var stations = statusDiv.QuerySelectorAll(":scope > ul > li");
                foreach (var station in stations) {
                    group.AddStopDescription(stopDescription => {
                        var (left, (middle, (right, _))) = station
                            .QuerySelectorAll(":scope > div > div");
                        var (stopDetails, (stopNotes, _)) = middle
                            .QuerySelectorAll(":scope > div > div > div");

                        // Queried once; by index: 0 = name, 1 = km, 2 = stopping time, 3 = platform.
                        var detailDivs = stopDetails.QuerySelectorAll(":scope > div");

                        stopDescription.Name = detailDivs[0]
                            .Text()
                            .WithCollapsedSpaces();

                        // LinkName is the last path segment of the link inside the name cell.
                        // A stop without a link, without an href or with an empty path keeps
                        // the default empty LinkName instead of aborting the whole scrape.
                        var stationHref = detailDivs[0]
                            .QuerySelector(":scope a")
                            ?.Attributes["href"]
                            ?.Value;
                        if (!string.IsNullOrEmpty(stationHref)) {
                            stopDescription.LinkName =
                                new Flurl.Url(stationHref).PathSegments.LastOrDefault() ?? "";
                        }

                        var scrapedKm = detailDivs[1]
                            .Text()
                            .WithCollapsedSpaces();
                        stopDescription.Km = int.Parse(
                            (KmRegex.Match(scrapedKm).Groups as IEnumerable<Group>).Skip(1).First().Value
                        );

                        var scrapedStoppingTime = detailDivs[2]
                            .Text()
                            .WithCollapsedSpaces();
                        if (!string.IsNullOrEmpty(scrapedStoppingTime)) {
                            var (stValue, (stMinsec, _)) =
                                (StoppingTimeRegex.Match(scrapedStoppingTime).Groups as IEnumerable<Group>)
                                .Skip(1)
                                .Select(g => g.Value);
                            stopDescription.StoppingTime = int.Parse(stValue);
                            // StoppingTime is stored in seconds.
                            if (stMinsec == "min") stopDescription.StoppingTime *= 60;
                        }

                        var scrapedPlatform = detailDivs[3]
                            .Text()
                            .WithCollapsedSpaces();
                        if (!string.IsNullOrEmpty(scrapedPlatform))
                            stopDescription.Platform = PlatformRegex.Match(scrapedPlatform).Groups[1].Value;

                        // Fills arrDep from one side cell of the stop. Throws
                        // OperationCanceledException when the cell holds no time, which the
                        // callers below catch to skip that arrival or departure.
                        void ScrapeTime(IElement element, ref TrainStopArrDep arrDep) {
                            var parts = element.QuerySelectorAll(":scope > div > div > div");
                            if (parts.Length == 0) throw new OperationCanceledException();
                            var time = parts[0];
                            var scrapedTime = time.Text().WithCollapsedSpaces();
                            var (stHour, (stMin, _)) = scrapedTime.Split(':').Select(int.Parse);
                            arrDep.ScheduleTime = BucharestTz.AtLeniently(dtSeq.Next(stHour, stMin).ToLocalDateTime())
                                .ToDateTimeOffset();
                            if (parts.Length < 2) return;
                            var statusElement = parts[1];
                            var (onTime, (delay, (approx, _))) = (StationArrdepStatusRegex.Match(
                                statusElement.Text().WithCollapsedSpaces(replaceWith: " ")
                            ).Groups as IEnumerable<Group>).Skip(1).Select(g => g.Value);
                            arrDep.MakeStatus(status => {
                                status.Delay = string.IsNullOrEmpty(onTime) ? int.Parse(delay) : 0;
                                // Real is false when the status text ends with "*".
                                status.Real = string.IsNullOrEmpty(approx);
                            });
                        }

                        try {
                            stopDescription.MakeArrival(arrival => { ScrapeTime(left, ref arrival); });
                        }
                        catch (OperationCanceledException) { }
                        try {
                            stopDescription.MakeDeparture(departure => { ScrapeTime(right, ref departure); });
                        }
                        catch (OperationCanceledException) { }

                        // Notes that match none of the known patterns are skipped.
                        foreach (var noteDiv in stopNotes.QuerySelectorAll(":scope > div > div")) {
                            var noteText = noteDiv.Text().WithCollapsedSpaces();
                            Match trainNumberChangeMatch, departsAsMatch, detachingWagons, receivingWagons;
                            if ((trainNumberChangeMatch = TrainNumberChangeNoteRegex.Match(noteText)).Success) {
                                stopDescription.AddTrainNumberChangeNote(trainNumberChangeMatch.Groups[1].Value, trainNumberChangeMatch.Groups[2].Value);
                            }
                            else if ((departsAsMatch = DepartsAsNoteRegex.Match(noteText)).Success) {
                                var groups = departsAsMatch.Groups;
                                // Groups 3, 4 and 5 are day, month and year.
                                var departureDate = BucharestTz.AtStrictly(new(int.Parse(groups[5].Value), int.Parse(groups[4].Value), int.Parse(groups[3].Value), 0, 0));
                                stopDescription.AddDepartsAsNote(groups[1].Value, groups[2].Value, departureDate.ToDateTimeOffset());
                            }
                            else if ((detachingWagons = DetachingWagonsNoteRegex.Match(noteText)).Success) {
                                stopDescription.AddDetachingWagonsNote(detachingWagons.Groups[1].Value);
                            }
                            else if ((receivingWagons = ReceivingWagonsNoteRegex.Match(noteText)).Success) {
                                stopDescription.AddReceivingWagonsNote(receivingWagons.Groups[1].Value);
                            }
                        }
                    });
                }
            });
        }

        return result;
    }
}
} // namespace

Loading…
Cancel
Save