From 5785e4a98b4c411df656585f80472606a78d2538 Mon Sep 17 00:00:00 2001 From: Dan Cojocaru Date: Wed, 1 Feb 2023 05:24:24 +0100 Subject: [PATCH] Add LinkName to stations when querying a train --- scraper/src/Models/Train.cs | 634 +++++++++++++++++----------------- scraper/src/Scrapers/Train.cs | 483 +++++++++++++------------- 2 files changed, 562 insertions(+), 555 deletions(-) diff --git a/scraper/src/Models/Train.cs b/scraper/src/Models/Train.cs index 9ebdeb7..aa34507 100644 --- a/scraper/src/Models/Train.cs +++ b/scraper/src/Models/Train.cs @@ -1,316 +1,318 @@ -using System; -using System.Collections.Generic; -using System.Text.Json; -using System.Text.Json.Serialization; -using InfoferScraper.Models.Status; -using InfoferScraper.Models.Train.JsonConverters; - -namespace InfoferScraper.Models.Train { - #region Interfaces - - public interface ITrainScrapeResult { - public string Rank { get; } - - public string Number { get; } - - /// - /// Date in the DD.MM.YYYY format - /// This date is taken as-is from the result. - /// - public string Date { get; } - - public string Operator { get; } - - public IReadOnlyList Groups { get; } - } - - public interface ITrainGroup { - public ITrainRoute Route { get; } - - public ITrainStatus? Status { get; } - public IReadOnlyList Stations { get; } - } - - public interface ITrainRoute { - public string From { get; } - public string To { get; } - } - - public interface ITrainStatus { - public int Delay { get; } - public string Station { get; } - public StatusKind State { get; } - } - - public interface ITrainStopDescription { - public string Name { get; } - public int Km { get; } - - /// - /// The time the train waits in the station in seconds - /// - public int? StoppingTime { get; } - - public string? Platform { get; } - public ITrainStopArrDep? Arrival { get; } - public ITrainStopArrDep? Departure { get; } - - public IReadOnlyList Notes { get; } - } - - public interface ITrainStopNote { - public NoteKind Kind { get; } - } - - public interface ITrainStopTrainNumberChangeNote : ITrainStopNote { - public string Rank { get; } - public string Number { get; } - } - - public interface ITrainStopDepartsAsNote : ITrainStopNote { - public string Rank { get; } - public string Number { get; } - public DateTimeOffset DepartureDate { get; } - } - - public interface ITrainStopDetachingWagonsNote : ITrainStopNote { - public string Station { get; } - } - - public interface ITrainStopReceivingWagonsNote : ITrainStopNote { - public string Station { get; } - } - - public interface ITrainStopArrDep { - public DateTimeOffset ScheduleTime { get; } - public IStatus? Status { get; } - } - - #endregion - - [JsonConverter(typeof(StatusKindConverter))] - public enum StatusKind { - Passing, - Arrival, - Departure, - } - - [JsonConverter(typeof(NoteKindConverter))] - public enum NoteKind { - TrainNumberChange, - DetachingWagons, - ReceivingWagons, - DepartsAs, - } - - #region Implementations - - internal record TrainScrapeResult : ITrainScrapeResult { - private List ModifyableGroups { get; set; } = new(); - public string Rank { get; set; } = ""; - public string Number { get; set; } = ""; - public string Date { get; set; } = ""; - public string Operator { get; set; } = ""; - public IReadOnlyList Groups => ModifyableGroups.AsReadOnly(); - - private void AddTrainGroup(ITrainGroup trainGroup) { - ModifyableGroups.Add(trainGroup); - } - - internal void AddTrainGroup(Action configurator) { - TrainGroup newTrainGroup = new(); - configurator(newTrainGroup); - AddTrainGroup(newTrainGroup); - } - } - - internal record TrainGroup : ITrainGroup { - private List ModifyableStations { get; set; } = new(); - public ITrainRoute Route { get; init; } = new TrainRoute(); - public ITrainStatus? Status { get; private set; } - public IReadOnlyList Stations => ModifyableStations.AsReadOnly(); - - private void AddStopDescription(ITrainStopDescription stopDescription) { - ModifyableStations.Add(stopDescription); - } - - internal void AddStopDescription(Action configurator) { - TrainStopDescription newStopDescription = new(); - configurator(newStopDescription); - AddStopDescription(newStopDescription); - } - - internal void ConfigureRoute(Action configurator) { - configurator((TrainRoute)Route); - } - - internal void MakeStatus(Action configurator) { - TrainStatus newStatus = new(); - configurator(newStatus); - Status = newStatus; - } - } - - internal record TrainRoute : ITrainRoute { - public TrainRoute() { - From = ""; - To = ""; - } - - public string From { get; set; } - public string To { get; set; } - } - - internal record TrainStatus : ITrainStatus { - public int Delay { get; set; } - public string Station { get; set; } = ""; - public StatusKind State { get; set; } - } - - internal record TrainStopDescription : ITrainStopDescription { - private List ModifyableNotes { get; } = new(); - public string Name { get; set; } = ""; - public int Km { get; set; } - public int? StoppingTime { get; set; } - public string? Platform { get; set; } - public ITrainStopArrDep? Arrival { get; private set; } - public ITrainStopArrDep? Departure { get; private set; } - public IReadOnlyList Notes => ModifyableNotes.AsReadOnly(); - - internal void MakeArrival(Action configurator) { - TrainStopArrDep newArrival = new(); - configurator(newArrival); - Arrival = newArrival; - } - - internal void MakeDeparture(Action configurator) { - TrainStopArrDep newDeparture = new(); - configurator(newDeparture); - Departure = newDeparture; - } - - class DepartsAsNote : ITrainStopDepartsAsNote { - public NoteKind Kind => NoteKind.DepartsAs; - public string Rank { get; set; } = ""; - public string Number { get; set; } = ""; - public DateTimeOffset DepartureDate { get; set; } - } - - class TrainNumberChangeNote : ITrainStopTrainNumberChangeNote { - public NoteKind Kind => NoteKind.TrainNumberChange; - public string Rank { get; set; } = ""; - public string Number { get; set; } = ""; - } - - class ReceivingWagonsNote : ITrainStopReceivingWagonsNote { - public NoteKind Kind => NoteKind.ReceivingWagons; - public string Station { get; set; } = ""; - } - - class DetachingWagonsNote : ITrainStopReceivingWagonsNote { - public NoteKind Kind => NoteKind.DetachingWagons; - public string Station { get; set; } = ""; - } - - internal void AddDepartsAsNote(string rank, string number, DateTimeOffset departureDate) { - ModifyableNotes.Add(new DepartsAsNote { Rank = rank, Number = number, DepartureDate = departureDate }); - } - - internal void AddTrainNumberChangeNote(string rank, string number) { - ModifyableNotes.Add(new TrainNumberChangeNote { Rank = rank, Number = number }); - } - - internal void AddReceivingWagonsNote(string station) { - ModifyableNotes.Add(new ReceivingWagonsNote { Station = station }); - } - - internal void AddDetachingWagonsNote(string station) { - ModifyableNotes.Add(new DetachingWagonsNote { Station = station }); - } - } - - public record TrainStopArrDep : ITrainStopArrDep { - public DateTimeOffset ScheduleTime { get; set; } - public IStatus? Status { get; private set; } - - internal void MakeStatus(Action configurator) { - Status.Status newStatus = new(); - configurator(newStatus); - Status = newStatus; - } - } - - #endregion - - #region JSON Converters - - namespace JsonConverters { - internal class StatusKindConverter : JsonConverterFactory { - public override bool CanConvert(Type typeToConvert) { - return typeToConvert == typeof(StatusKind); - } - - public override JsonConverter? CreateConverter(Type typeToConvert, JsonSerializerOptions options) { - return new Converter(); - } - - private class Converter : JsonConverter { - public override StatusKind Read( - ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options - ) { - return reader.GetString() switch { - "arrival" => StatusKind.Arrival, - "departure" => StatusKind.Departure, - "passing" => StatusKind.Passing, - _ => throw new NotImplementedException() - }; - } - - public override void Write(Utf8JsonWriter writer, StatusKind value, JsonSerializerOptions options) { - writer.WriteStringValue(value switch { - StatusKind.Passing => "passing", - StatusKind.Arrival => "arrival", - StatusKind.Departure => "departure", - _ => throw new NotImplementedException() - }); - } - } - } - - internal class NoteKindConverter : JsonConverterFactory { - public override bool CanConvert(Type typeToConvert) { - return typeToConvert == typeof(NoteKind); - } - - public override JsonConverter? CreateConverter(Type typeToConvert, JsonSerializerOptions options) { - return new Converter(); - } - - private class Converter : JsonConverter { - public override NoteKind Read( - ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options - ) { - return reader.GetString() switch { - "departsAs" => NoteKind.DepartsAs, - "trainNumberChange" => NoteKind.TrainNumberChange, - "receivingWagons" => NoteKind.ReceivingWagons, - "detachingWagons" => NoteKind.DetachingWagons, - _ => throw new NotImplementedException() - }; - } - - public override void Write(Utf8JsonWriter writer, NoteKind value, JsonSerializerOptions options) { - writer.WriteStringValue(value switch { - NoteKind.DepartsAs => "departsAs", - NoteKind.TrainNumberChange => "trainNumberChange", - NoteKind.DetachingWagons => "detachingWagons", - NoteKind.ReceivingWagons => "receivingWagons", - _ => throw new NotImplementedException() - }); - } - } - } - } - - #endregion -} +using System; +using System.Collections.Generic; +using System.Text.Json; +using System.Text.Json.Serialization; +using InfoferScraper.Models.Status; +using InfoferScraper.Models.Train.JsonConverters; + +namespace InfoferScraper.Models.Train { + #region Interfaces + + public interface ITrainScrapeResult { + public string Rank { get; } + + public string Number { get; } + + /// + /// Date in the DD.MM.YYYY format + /// This date is taken as-is from the result. + /// + public string Date { get; } + + public string Operator { get; } + + public IReadOnlyList Groups { get; } + } + + public interface ITrainGroup { + public ITrainRoute Route { get; } + + public ITrainStatus? Status { get; } + public IReadOnlyList Stations { get; } + } + + public interface ITrainRoute { + public string From { get; } + public string To { get; } + } + + public interface ITrainStatus { + public int Delay { get; } + public string Station { get; } + public StatusKind State { get; } + } + + public interface ITrainStopDescription { + public string Name { get; } + public string LinkName { get; } + public int Km { get; } + + /// + /// The time the train waits in the station in seconds + /// + public int? StoppingTime { get; } + + public string? Platform { get; } + public ITrainStopArrDep? Arrival { get; } + public ITrainStopArrDep? Departure { get; } + + public IReadOnlyList Notes { get; } + } + + public interface ITrainStopNote { + public NoteKind Kind { get; } + } + + public interface ITrainStopTrainNumberChangeNote : ITrainStopNote { + public string Rank { get; } + public string Number { get; } + } + + public interface ITrainStopDepartsAsNote : ITrainStopNote { + public string Rank { get; } + public string Number { get; } + public DateTimeOffset DepartureDate { get; } + } + + public interface ITrainStopDetachingWagonsNote : ITrainStopNote { + public string Station { get; } + } + + public interface ITrainStopReceivingWagonsNote : ITrainStopNote { + public string Station { get; } + } + + public interface ITrainStopArrDep { + public DateTimeOffset ScheduleTime { get; } + public IStatus? Status { get; } + } + + #endregion + + [JsonConverter(typeof(StatusKindConverter))] + public enum StatusKind { + Passing, + Arrival, + Departure, + } + + [JsonConverter(typeof(NoteKindConverter))] + public enum NoteKind { + TrainNumberChange, + DetachingWagons, + ReceivingWagons, + DepartsAs, + } + + #region Implementations + + internal record TrainScrapeResult : ITrainScrapeResult { + private List ModifyableGroups { get; set; } = new(); + public string Rank { get; set; } = ""; + public string Number { get; set; } = ""; + public string Date { get; set; } = ""; + public string Operator { get; set; } = ""; + public IReadOnlyList Groups => ModifyableGroups.AsReadOnly(); + + private void AddTrainGroup(ITrainGroup trainGroup) { + ModifyableGroups.Add(trainGroup); + } + + internal void AddTrainGroup(Action configurator) { + TrainGroup newTrainGroup = new(); + configurator(newTrainGroup); + AddTrainGroup(newTrainGroup); + } + } + + internal record TrainGroup : ITrainGroup { + private List ModifyableStations { get; set; } = new(); + public ITrainRoute Route { get; init; } = new TrainRoute(); + public ITrainStatus? Status { get; private set; } + public IReadOnlyList Stations => ModifyableStations.AsReadOnly(); + + private void AddStopDescription(ITrainStopDescription stopDescription) { + ModifyableStations.Add(stopDescription); + } + + internal void AddStopDescription(Action configurator) { + TrainStopDescription newStopDescription = new(); + configurator(newStopDescription); + AddStopDescription(newStopDescription); + } + + internal void ConfigureRoute(Action configurator) { + configurator((TrainRoute)Route); + } + + internal void MakeStatus(Action configurator) { + TrainStatus newStatus = new(); + configurator(newStatus); + Status = newStatus; + } + } + + internal record TrainRoute : ITrainRoute { + public TrainRoute() { + From = ""; + To = ""; + } + + public string From { get; set; } + public string To { get; set; } + } + + internal record TrainStatus : ITrainStatus { + public int Delay { get; set; } + public string Station { get; set; } = ""; + public StatusKind State { get; set; } + } + + internal record TrainStopDescription : ITrainStopDescription { + private List ModifyableNotes { get; } = new(); + public string Name { get; set; } = ""; + public string LinkName { get; set; } = ""; + public int Km { get; set; } + public int? StoppingTime { get; set; } + public string? Platform { get; set; } + public ITrainStopArrDep? Arrival { get; private set; } + public ITrainStopArrDep? Departure { get; private set; } + public IReadOnlyList Notes => ModifyableNotes.AsReadOnly(); + + internal void MakeArrival(Action configurator) { + TrainStopArrDep newArrival = new(); + configurator(newArrival); + Arrival = newArrival; + } + + internal void MakeDeparture(Action configurator) { + TrainStopArrDep newDeparture = new(); + configurator(newDeparture); + Departure = newDeparture; + } + + class DepartsAsNote : ITrainStopDepartsAsNote { + public NoteKind Kind => NoteKind.DepartsAs; + public string Rank { get; set; } = ""; + public string Number { get; set; } = ""; + public DateTimeOffset DepartureDate { get; set; } + } + + class TrainNumberChangeNote : ITrainStopTrainNumberChangeNote { + public NoteKind Kind => NoteKind.TrainNumberChange; + public string Rank { get; set; } = ""; + public string Number { get; set; } = ""; + } + + class ReceivingWagonsNote : ITrainStopReceivingWagonsNote { + public NoteKind Kind => NoteKind.ReceivingWagons; + public string Station { get; set; } = ""; + } + + class DetachingWagonsNote : ITrainStopReceivingWagonsNote { + public NoteKind Kind => NoteKind.DetachingWagons; + public string Station { get; set; } = ""; + } + + internal void AddDepartsAsNote(string rank, string number, DateTimeOffset departureDate) { + ModifyableNotes.Add(new DepartsAsNote { Rank = rank, Number = number, DepartureDate = departureDate }); + } + + internal void AddTrainNumberChangeNote(string rank, string number) { + ModifyableNotes.Add(new TrainNumberChangeNote { Rank = rank, Number = number }); + } + + internal void AddReceivingWagonsNote(string station) { + ModifyableNotes.Add(new ReceivingWagonsNote { Station = station }); + } + + internal void AddDetachingWagonsNote(string station) { + ModifyableNotes.Add(new DetachingWagonsNote { Station = station }); + } + } + + public record TrainStopArrDep : ITrainStopArrDep { + public DateTimeOffset ScheduleTime { get; set; } + public IStatus? Status { get; private set; } + + internal void MakeStatus(Action configurator) { + Status.Status newStatus = new(); + configurator(newStatus); + Status = newStatus; + } + } + + #endregion + + #region JSON Converters + + namespace JsonConverters { + internal class StatusKindConverter : JsonConverterFactory { + public override bool CanConvert(Type typeToConvert) { + return typeToConvert == typeof(StatusKind); + } + + public override JsonConverter? CreateConverter(Type typeToConvert, JsonSerializerOptions options) { + return new Converter(); + } + + private class Converter : JsonConverter { + public override StatusKind Read( + ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options + ) { + return reader.GetString() switch { + "arrival" => StatusKind.Arrival, + "departure" => StatusKind.Departure, + "passing" => StatusKind.Passing, + _ => throw new NotImplementedException() + }; + } + + public override void Write(Utf8JsonWriter writer, StatusKind value, JsonSerializerOptions options) { + writer.WriteStringValue(value switch { + StatusKind.Passing => "passing", + StatusKind.Arrival => "arrival", + StatusKind.Departure => "departure", + _ => throw new NotImplementedException() + }); + } + } + } + + internal class NoteKindConverter : JsonConverterFactory { + public override bool CanConvert(Type typeToConvert) { + return typeToConvert == typeof(NoteKind); + } + + public override JsonConverter? CreateConverter(Type typeToConvert, JsonSerializerOptions options) { + return new Converter(); + } + + private class Converter : JsonConverter { + public override NoteKind Read( + ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options + ) { + return reader.GetString() switch { + "departsAs" => NoteKind.DepartsAs, + "trainNumberChange" => NoteKind.TrainNumberChange, + "receivingWagons" => NoteKind.ReceivingWagons, + "detachingWagons" => NoteKind.DetachingWagons, + _ => throw new NotImplementedException() + }; + } + + public override void Write(Utf8JsonWriter writer, NoteKind value, JsonSerializerOptions options) { + writer.WriteStringValue(value switch { + NoteKind.DepartsAs => "departsAs", + NoteKind.TrainNumberChange => "trainNumberChange", + NoteKind.DetachingWagons => "detachingWagons", + NoteKind.ReceivingWagons => "receivingWagons", + _ => throw new NotImplementedException() + }); + } + } + } + } + + #endregion +} diff --git a/scraper/src/Scrapers/Train.cs b/scraper/src/Scrapers/Train.cs index 4c24409..57cd25b 100644 --- a/scraper/src/Scrapers/Train.cs +++ b/scraper/src/Scrapers/Train.cs @@ -1,239 +1,244 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using System.Net; -using System.Net.Http; -using System.Text.RegularExpressions; -using System.Threading.Tasks; -using AngleSharp; -using AngleSharp.Dom; -using AngleSharp.Html.Dom; -using Flurl; -using InfoferScraper.Models.Train; -using NodaTime; -using NodaTime.Extensions; -using scraper.Exceptions; - -namespace InfoferScraper.Scrapers { - public static class TrainScraper { - private const string BaseUrl = "https://mersultrenurilor.infofer.ro/ro-RO/"; - private static readonly Regex TrainInfoRegex = new(@"^([A-Z-]+)\s([0-9]+)\sîn\s([0-9.]+)$"); - private static readonly Regex OperatorRegex = new(@"^Operat\sde\s(.+)$"); - - private static readonly Regex RouteRegex = - new(@$"^Parcurs\stren\s([{Utils.RoLetters} ]+)[-–]([{Utils.RoLetters}\s]+)$"); - - private static readonly Regex SlRegex = - new( - @"^(?:Fără|([0-9]+)\smin)\s(întârziere|mai\sdevreme)\sla\s(trecerea\sfără\soprire\sprin|sosirea\sîn|plecarea\sdin)\s(.+)\.$"); - - private static readonly Dictionary SlStateMap = new() { - { 't', StatusKind.Passing }, - { 's', StatusKind.Arrival }, - { 'p', StatusKind.Departure }, - }; - - private static readonly Regex KmRegex = new(@"^km\s([0-9]+)$"); - private static readonly Regex StoppingTimeRegex = new(@"^([0-9]+)\s(min|sec)\soprire$"); - private static readonly Regex PlatformRegex = new(@"^linia\s(.+)$"); - - private static readonly Regex StationArrdepStatusRegex = - new(@"^(?:(la timp)|(?:((?:\+|-)[0-9]+) min \((?:(?:întârziere)|(?:mai devreme))\)))(\*?)$"); - - private static readonly Regex TrainNumberChangeNoteRegex = - new(@"^Trenul își schimbă numărul în\s([A-Z-]+)\s([0-9]+)$"); - private static readonly Regex DepartsAsNoteRegex = - new(@"^Trenul pleacă cu numărul\s([A-Z-]+)\s([0-9]+)\sîn\s([0-9]{2}).([0-9]{2}).([0-9]{4})$"); - private static readonly Regex ReceivingWagonsNoteRegex = - new(@"^Trenul primește vagoane de la\s(.+)\.$"); - private static readonly Regex DetachingWagonsNoteRegex = - new(@"^Trenul detașează vagoane pentru stația\s(.+)\.$"); - - private static readonly DateTimeZone BucharestTz = DateTimeZoneProviders.Tzdb["Europe/Bucharest"]; - - private static readonly CookieContainer CookieContainer = new(); - private static readonly HttpClient HttpClient = new(new HttpClientHandler { - CookieContainer = CookieContainer, - UseCookies = true, - }) { - BaseAddress = new Uri(BaseUrl), - DefaultRequestVersion = new Version(2, 0), - }; - - public static async Task Scrape(string trainNumber, DateTimeOffset? dateOverride = null) { - var dateOverrideInstant = dateOverride?.ToInstant().InZone(BucharestTz); - dateOverride = dateOverrideInstant?.ToDateTimeOffset(); - TrainScrapeResult result = new(); - - var asConfig = Configuration.Default; - var asContext = BrowsingContext.New(asConfig); - - var firstUrl = "Tren" - .AppendPathSegment(trainNumber); - if (dateOverride != null) { - firstUrl = firstUrl.SetQueryParam("Date", $"{dateOverride:d.MM.yyyy}"); - } - var firstResponse = await HttpClient.GetStringAsync(firstUrl); - var firstDocument = await asContext.OpenAsync(req => req.Content(firstResponse)); - var firstForm = firstDocument.GetElementById("form-search")!; - - var firstResult = firstForm - .QuerySelectorAll("input") - .Where(elem => elem.Name != null) - .ToDictionary(elem => elem.Name!, elem => elem.Value); - - var secondUrl = "".AppendPathSegments("Trains", "TrainsResult"); - var secondResponse = await HttpClient.PostAsync( - secondUrl, -#pragma warning disable CS8620 - new FormUrlEncodedContent(firstResult) -#pragma warning restore CS8620 - ); - var secondResponseContent = await secondResponse.Content.ReadAsStringAsync(); - var secondDocument = await asContext.OpenAsync( - req => req.Content(secondResponseContent) - ); - - var (trainInfoDiv, (_, (_, (resultsDiv, _)))) = secondDocument - .QuerySelectorAll("body > div"); - if (trainInfoDiv == null) { - return null; - } - if (resultsDiv == null) { - throw new TrainNotThisDayException(); - } - trainInfoDiv = trainInfoDiv.QuerySelectorAll(":scope > div > div").First(); - - (result.Rank, (result.Number, (result.Date, _))) = (TrainInfoRegex.Match( - trainInfoDiv.QuerySelector(":scope > h2")!.Text().WithCollapsedSpaces() - ).Groups as IEnumerable).Select(group => group.Value).Skip(1); - var (scrapedDateD, (scrapedDateM, (scrapedDateY, _))) = result.Date - .Split('.') - .Select(int.Parse); - var date = new DateTime(scrapedDateY, scrapedDateM, scrapedDateD); - - result.Operator = (OperatorRegex.Match( - trainInfoDiv.QuerySelector(":scope > p")!.Text().WithCollapsedSpaces() - ).Groups as IEnumerable).Skip(1).First().Value; - - foreach (var groupDiv in resultsDiv.QuerySelectorAll(":scope > div")) { - result.AddTrainGroup(group => { - var statusDiv = groupDiv.QuerySelectorAll(":scope > div").First(); - var routeText = statusDiv.QuerySelector(":scope > h4")!.Text().WithCollapsedSpaces(); - group.ConfigureRoute(route => { - (route.From, (route.To, _)) = (RouteRegex.Match(routeText).Groups as IEnumerable).Skip(1) - .Select(group => group.Value); - }); - - try { - var statusLineMatch = - SlRegex.Match(statusDiv.QuerySelector(":scope > div")!.Text().WithCollapsedSpaces()); - var (slmDelay, (slmLate, (slmArrival, (slmStation, _)))) = - (statusLineMatch.Groups as IEnumerable).Skip(1).Select(group => group.Value); - group.MakeStatus(status => { - status.Delay = string.IsNullOrEmpty(slmDelay) ? 0 : - slmLate == "întârziere" ? int.Parse(slmDelay) : -int.Parse(slmDelay); - status.Station = slmStation; - status.State = SlStateMap[slmArrival[0]]; - }); - } - catch { - // ignored - } - - Utils.DateTimeSequencer dtSeq = new(date.Year, date.Month, date.Day); - var stations = statusDiv.QuerySelectorAll(":scope > ul > li"); - foreach (var station in stations) { - group.AddStopDescription(stopDescription => { - var (left, (middle, (right, _))) = station - .QuerySelectorAll(":scope > div > div"); - var (stopDetails, (stopNotes, _)) = middle - .QuerySelectorAll(":scope > div > div > div"); - stopDescription.Name = stopDetails - .QuerySelectorAll(":scope > div")[0] - .Text() - .WithCollapsedSpaces(); - var scrapedKm = stopDetails - .QuerySelectorAll(":scope > div")[1] - .Text() - .WithCollapsedSpaces(); - stopDescription.Km = int.Parse( - (KmRegex.Match(scrapedKm).Groups as IEnumerable).Skip(1).First().Value - ); - var scrapedStoppingTime = stopDetails - .QuerySelectorAll(":scope > div")[2] - .Text() - .WithCollapsedSpaces(); - if (!string.IsNullOrEmpty(scrapedStoppingTime)) { - var (stValue, (stMinsec, _)) = - (StoppingTimeRegex.Match(scrapedStoppingTime).Groups as IEnumerable) - .Skip(1) - .Select(group => group.Value); - stopDescription.StoppingTime = int.Parse(stValue); - if (stMinsec == "min") stopDescription.StoppingTime *= 60; - } - - var scrapedPlatform = stopDetails - .QuerySelectorAll(":scope > div")[3] - .Text() - .WithCollapsedSpaces(); - if (!string.IsNullOrEmpty(scrapedPlatform)) - stopDescription.Platform = PlatformRegex.Match(scrapedPlatform).Groups[1].Value; - - void ScrapeTime(IElement element, ref TrainStopArrDep arrDep) { - var parts = element.QuerySelectorAll(":scope > div > div > div"); - if (parts.Length == 0) throw new OperationCanceledException(); - var time = parts[0]; - var scrapedTime = time.Text().WithCollapsedSpaces(); - var (stHour, (stMin, _)) = scrapedTime.Split(':').Select(int.Parse); - arrDep.ScheduleTime = BucharestTz.AtLeniently(dtSeq.Next(stHour, stMin).ToLocalDateTime()) - .ToDateTimeOffset(); - - if (parts.Length < 2) return; - - var statusElement = parts[1]; - var (onTime, (delay, (approx, _))) = (StationArrdepStatusRegex.Match( - statusElement.Text().WithCollapsedSpaces(replaceWith: " ") - ).Groups as IEnumerable).Skip(1).Select(group => group.Value); - arrDep.MakeStatus(status => { - status.Delay = string.IsNullOrEmpty(onTime) ? int.Parse(delay) : 0; - status.Real = string.IsNullOrEmpty(approx); - }); - } - - try { - stopDescription.MakeArrival(arrival => { ScrapeTime(left, ref arrival); }); - } - catch (OperationCanceledException) { } - - try { - stopDescription.MakeDeparture(departure => { ScrapeTime(right, ref departure); }); - } - catch (OperationCanceledException) { } - - foreach (var noteDiv in stopNotes.QuerySelectorAll(":scope > div > div")) { - var noteText = noteDiv.Text().WithCollapsedSpaces(); - Match trainNumberChangeMatch, departsAsMatch, detachingWagons, receivingWagons; - if ((trainNumberChangeMatch = TrainNumberChangeNoteRegex.Match(noteText)).Success) { - stopDescription.AddTrainNumberChangeNote(trainNumberChangeMatch.Groups[1].Value, trainNumberChangeMatch.Groups[2].Value); - } - else if ((departsAsMatch = DepartsAsNoteRegex.Match(noteText)).Success) { - var groups = departsAsMatch.Groups; - var departureDate = BucharestTz.AtStrictly(new(int.Parse(groups[5].Value), int.Parse(groups[4].Value), int.Parse(groups[3].Value), 0, 0)); - stopDescription.AddDepartsAsNote(groups[1].Value, groups[2].Value, departureDate.ToDateTimeOffset()); - } - else if ((detachingWagons = DetachingWagonsNoteRegex.Match(noteText)).Success) { - stopDescription.AddDetachingWagonsNote(detachingWagons.Groups[1].Value); - } - else if ((receivingWagons = ReceivingWagonsNoteRegex.Match(noteText)).Success) { - stopDescription.AddReceivingWagonsNote(receivingWagons.Groups[1].Value); - } - } - }); - } - }); - } - return result; - } - } -} // namespace +using System; +using System.Collections.Generic; +using System.Linq; +using System.Net; +using System.Net.Http; +using System.Text.RegularExpressions; +using System.Threading.Tasks; +using AngleSharp; +using AngleSharp.Dom; +using AngleSharp.Html.Dom; +using Flurl; +using InfoferScraper.Models.Train; +using NodaTime; +using NodaTime.Extensions; +using scraper.Exceptions; + +namespace InfoferScraper.Scrapers { + public static class TrainScraper { + private const string BaseUrl = "https://mersultrenurilor.infofer.ro/ro-RO/"; + private static readonly Regex TrainInfoRegex = new(@"^([A-Z-]+)\s([0-9]+)\sîn\s([0-9.]+)$"); + private static readonly Regex OperatorRegex = new(@"^Operat\sde\s(.+)$"); + + private static readonly Regex RouteRegex = + new(@$"^Parcurs\stren\s([{Utils.RoLetters} ]+)[-–]([{Utils.RoLetters}\s]+)$"); + + private static readonly Regex SlRegex = + new( + @"^(?:Fără|([0-9]+)\smin)\s(întârziere|mai\sdevreme)\sla\s(trecerea\sfără\soprire\sprin|sosirea\sîn|plecarea\sdin)\s(.+)\.$"); + + private static readonly Dictionary SlStateMap = new() { + { 't', StatusKind.Passing }, + { 's', StatusKind.Arrival }, + { 'p', StatusKind.Departure }, + }; + + private static readonly Regex KmRegex = new(@"^km\s([0-9]+)$"); + private static readonly Regex StoppingTimeRegex = new(@"^([0-9]+)\s(min|sec)\soprire$"); + private static readonly Regex PlatformRegex = new(@"^linia\s(.+)$"); + + private static readonly Regex StationArrdepStatusRegex = + new(@"^(?:(la timp)|(?:((?:\+|-)[0-9]+) min \((?:(?:întârziere)|(?:mai devreme))\)))(\*?)$"); + + private static readonly Regex TrainNumberChangeNoteRegex = + new(@"^Trenul își schimbă numărul în\s([A-Z-]+)\s([0-9]+)$"); + private static readonly Regex DepartsAsNoteRegex = + new(@"^Trenul pleacă cu numărul\s([A-Z-]+)\s([0-9]+)\sîn\s([0-9]{2}).([0-9]{2}).([0-9]{4})$"); + private static readonly Regex ReceivingWagonsNoteRegex = + new(@"^Trenul primește vagoane de la\s(.+)\.$"); + private static readonly Regex DetachingWagonsNoteRegex = + new(@"^Trenul detașează vagoane pentru stația\s(.+)\.$"); + + private static readonly DateTimeZone BucharestTz = DateTimeZoneProviders.Tzdb["Europe/Bucharest"]; + + private static readonly CookieContainer CookieContainer = new(); + private static readonly HttpClient HttpClient = new(new HttpClientHandler { + CookieContainer = CookieContainer, + UseCookies = true, + }) { + BaseAddress = new Uri(BaseUrl), + DefaultRequestVersion = new Version(2, 0), + }; + + public static async Task Scrape(string trainNumber, DateTimeOffset? dateOverride = null) { + var dateOverrideInstant = dateOverride?.ToInstant().InZone(BucharestTz); + dateOverride = dateOverrideInstant?.ToDateTimeOffset(); + TrainScrapeResult result = new(); + + var asConfig = Configuration.Default; + var asContext = BrowsingContext.New(asConfig); + + var firstUrl = "Tren" + .AppendPathSegment(trainNumber); + if (dateOverride != null) { + firstUrl = firstUrl.SetQueryParam("Date", $"{dateOverride:d.MM.yyyy}"); + } + var firstResponse = await HttpClient.GetStringAsync(firstUrl); + var firstDocument = await asContext.OpenAsync(req => req.Content(firstResponse)); + var firstForm = firstDocument.GetElementById("form-search")!; + + var firstResult = firstForm + .QuerySelectorAll("input") + .Where(elem => elem.Name != null) + .ToDictionary(elem => elem.Name!, elem => elem.Value); + + var secondUrl = "".AppendPathSegments("Trains", "TrainsResult"); + var secondResponse = await HttpClient.PostAsync( + secondUrl, +#pragma warning disable CS8620 + new FormUrlEncodedContent(firstResult) +#pragma warning restore CS8620 + ); + var secondResponseContent = await secondResponse.Content.ReadAsStringAsync(); + var secondDocument = await asContext.OpenAsync( + req => req.Content(secondResponseContent) + ); + + var (trainInfoDiv, (_, (_, (resultsDiv, _)))) = secondDocument + .QuerySelectorAll("body > div"); + if (trainInfoDiv == null) { + return null; + } + if (resultsDiv == null) { + throw new TrainNotThisDayException(); + } + trainInfoDiv = trainInfoDiv.QuerySelectorAll(":scope > div > div").First(); + + (result.Rank, (result.Number, (result.Date, _))) = (TrainInfoRegex.Match( + trainInfoDiv.QuerySelector(":scope > h2")!.Text().WithCollapsedSpaces() + ).Groups as IEnumerable).Select(group => group.Value).Skip(1); + var (scrapedDateD, (scrapedDateM, (scrapedDateY, _))) = result.Date + .Split('.') + .Select(int.Parse); + var date = new DateTime(scrapedDateY, scrapedDateM, scrapedDateD); + + result.Operator = (OperatorRegex.Match( + trainInfoDiv.QuerySelector(":scope > p")!.Text().WithCollapsedSpaces() + ).Groups as IEnumerable).Skip(1).First().Value; + + foreach (var groupDiv in resultsDiv.QuerySelectorAll(":scope > div")) { + result.AddTrainGroup(group => { + var statusDiv = groupDiv.QuerySelectorAll(":scope > div").First(); + var routeText = statusDiv.QuerySelector(":scope > h4")!.Text().WithCollapsedSpaces(); + group.ConfigureRoute(route => { + (route.From, (route.To, _)) = (RouteRegex.Match(routeText).Groups as IEnumerable).Skip(1) + .Select(group => group.Value); + }); + + try { + var statusLineMatch = + SlRegex.Match(statusDiv.QuerySelector(":scope > div")!.Text().WithCollapsedSpaces()); + var (slmDelay, (slmLate, (slmArrival, (slmStation, _)))) = + (statusLineMatch.Groups as IEnumerable).Skip(1).Select(group => group.Value); + group.MakeStatus(status => { + status.Delay = string.IsNullOrEmpty(slmDelay) ? 0 : + slmLate == "întârziere" ? int.Parse(slmDelay) : -int.Parse(slmDelay); + status.Station = slmStation; + status.State = SlStateMap[slmArrival[0]]; + }); + } + catch { + // ignored + } + + Utils.DateTimeSequencer dtSeq = new(date.Year, date.Month, date.Day); + var stations = statusDiv.QuerySelectorAll(":scope > ul > li"); + foreach (var station in stations) { + group.AddStopDescription(stopDescription => { + var (left, (middle, (right, _))) = station + .QuerySelectorAll(":scope > div > div"); + var (stopDetails, (stopNotes, _)) = middle + .QuerySelectorAll(":scope > div > div > div"); + stopDescription.Name = stopDetails + .QuerySelectorAll(":scope > div")[0] + .Text() + .WithCollapsedSpaces(); + stopDescription.LinkName = new Flurl.Url(stopDetails + .QuerySelectorAll(":scope > div")[0] + .QuerySelector(":scope a") + .Attributes["href"] + .Value).PathSegments.Last(); + var scrapedKm = stopDetails + .QuerySelectorAll(":scope > div")[1] + .Text() + .WithCollapsedSpaces(); + stopDescription.Km = int.Parse( + (KmRegex.Match(scrapedKm).Groups as IEnumerable).Skip(1).First().Value + ); + var scrapedStoppingTime = stopDetails + .QuerySelectorAll(":scope > div")[2] + .Text() + .WithCollapsedSpaces(); + if (!string.IsNullOrEmpty(scrapedStoppingTime)) { + var (stValue, (stMinsec, _)) = + (StoppingTimeRegex.Match(scrapedStoppingTime).Groups as IEnumerable) + .Skip(1) + .Select(group => group.Value); + stopDescription.StoppingTime = int.Parse(stValue); + if (stMinsec == "min") stopDescription.StoppingTime *= 60; + } + + var scrapedPlatform = stopDetails + .QuerySelectorAll(":scope > div")[3] + .Text() + .WithCollapsedSpaces(); + if (!string.IsNullOrEmpty(scrapedPlatform)) + stopDescription.Platform = PlatformRegex.Match(scrapedPlatform).Groups[1].Value; + + void ScrapeTime(IElement element, ref TrainStopArrDep arrDep) { + var parts = element.QuerySelectorAll(":scope > div > div > div"); + if (parts.Length == 0) throw new OperationCanceledException(); + var time = parts[0]; + var scrapedTime = time.Text().WithCollapsedSpaces(); + var (stHour, (stMin, _)) = scrapedTime.Split(':').Select(int.Parse); + arrDep.ScheduleTime = BucharestTz.AtLeniently(dtSeq.Next(stHour, stMin).ToLocalDateTime()) + .ToDateTimeOffset(); + + if (parts.Length < 2) return; + + var statusElement = parts[1]; + var (onTime, (delay, (approx, _))) = (StationArrdepStatusRegex.Match( + statusElement.Text().WithCollapsedSpaces(replaceWith: " ") + ).Groups as IEnumerable).Skip(1).Select(group => group.Value); + arrDep.MakeStatus(status => { + status.Delay = string.IsNullOrEmpty(onTime) ? int.Parse(delay) : 0; + status.Real = string.IsNullOrEmpty(approx); + }); + } + + try { + stopDescription.MakeArrival(arrival => { ScrapeTime(left, ref arrival); }); + } + catch (OperationCanceledException) { } + + try { + stopDescription.MakeDeparture(departure => { ScrapeTime(right, ref departure); }); + } + catch (OperationCanceledException) { } + + foreach (var noteDiv in stopNotes.QuerySelectorAll(":scope > div > div")) { + var noteText = noteDiv.Text().WithCollapsedSpaces(); + Match trainNumberChangeMatch, departsAsMatch, detachingWagons, receivingWagons; + if ((trainNumberChangeMatch = TrainNumberChangeNoteRegex.Match(noteText)).Success) { + stopDescription.AddTrainNumberChangeNote(trainNumberChangeMatch.Groups[1].Value, trainNumberChangeMatch.Groups[2].Value); + } + else if ((departsAsMatch = DepartsAsNoteRegex.Match(noteText)).Success) { + var groups = departsAsMatch.Groups; + var departureDate = BucharestTz.AtStrictly(new(int.Parse(groups[5].Value), int.Parse(groups[4].Value), int.Parse(groups[3].Value), 0, 0)); + stopDescription.AddDepartsAsNote(groups[1].Value, groups[2].Value, departureDate.ToDateTimeOffset()); + } + else if ((detachingWagons = DetachingWagonsNoteRegex.Match(noteText)).Success) { + stopDescription.AddDetachingWagonsNote(detachingWagons.Groups[1].Value); + } + else if ((receivingWagons = ReceivingWagonsNoteRegex.Match(noteText)).Success) { + stopDescription.AddReceivingWagonsNote(receivingWagons.Groups[1].Value); + } + } + }); + } + }); + } + return result; + } + } +} // namespace