From 1d9db5b491de114f8677dbaf14a2b1eba354dbcc Mon Sep 17 00:00:00 2001
From: Dan Cojocaru
Date: Tue, 27 Dec 2022 12:36:03 +0200
Subject: [PATCH] Add initial itinerary scraping

---
 ConsoleTest/ConsoleTest.csproj  |   2 +-
 ConsoleTest/Program.cs          |  37 +++++
 scraper/src/Models/Itinerary.cs |  74 ++++++++++
 scraper/src/Scrapers/Route.cs   | 232 ++++++++++++++++++++++++++++++++
 4 files changed, 344 insertions(+), 1 deletion(-)
 create mode 100644 scraper/src/Models/Itinerary.cs
 create mode 100644 scraper/src/Scrapers/Route.cs

diff --git a/ConsoleTest/ConsoleTest.csproj b/ConsoleTest/ConsoleTest.csproj
index 4df0a6c..7c6a3da 100644
--- a/ConsoleTest/ConsoleTest.csproj
+++ b/ConsoleTest/ConsoleTest.csproj
@@ -6,7 +6,7 @@
 
     <PropertyGroup>
         <OutputType>Exe</OutputType>
-        <TargetFramework>net6.0</TargetFramework>
+        <TargetFrameworks>net6.0;net7.0</TargetFrameworks>
     </PropertyGroup>
 
 </Project>
diff --git a/ConsoleTest/Program.cs b/ConsoleTest/Program.cs
index 470e4f0..c4d66da 100644
--- a/ConsoleTest/Program.cs
+++ b/ConsoleTest/Program.cs
@@ -1,4 +1,5 @@
 using System;
+using System.Linq;
 using System.Text.Json;
 using System.Threading.Tasks;
 using InfoferScraper;
@@ -7,6 +8,7 @@ using InfoferScraper.Scrapers;
 while (true) {
 	Console.WriteLine("1. Scrape Train");
 	Console.WriteLine("2. Scrape Station");
+	Console.WriteLine("3. Scrape Itineraries");
 	Console.WriteLine("0. Exit");
 
 	var input = Console.ReadLine()?.Trim();
@@ -17,6 +19,9 @@ while (true) {
 		case "2":
 			await PrintStation();
 			break;
+		case "3":
+			await ScrapeItineraries();
+			break;
 		case null:
 		case "0":
 			goto INPUT_LOOP_BREAK;
@@ -61,3 +66,35 @@ async Task PrintStation() {
 		)
 	);
 }
+async Task ScrapeItineraries() {
+	Console.Write("From station: ");
+	var from = Console.ReadLine();
+	Console.Write("To station: ");
+	var to = Console.ReadLine();
+
+	if (from == null || to == null) return;
+
+	var data = await RouteScraper.Scrape(from, to);
+	// Scrape returns null when no result block was found.
+	if (data == null) {
+		Console.WriteLine("No itineraries found.");
+		return;
+	}
+
+	Console.WriteLine($"{data.Count} itineraries:");
+	Console.WriteLine();
+
+	void PrintArrDepLine(DateTimeOffset date, string station) {
+		Console.WriteLine($"{date:HH:mm} {station}");
+	}
+
+	foreach (var itinerary in data) {
+		foreach (var train in itinerary.Trains) {
+			PrintArrDepLine(train.DepartureDate, train.From);
+			Console.WriteLine($" {train.TrainRank,-4} {train.TrainNumber,-5} ({train.Operator}), {train.Km,3} km via {string.Join(", ", train.IntermediateStops)}");
+			PrintArrDepLine(train.ArrivalDate, train.To);
+		}
+
+		Console.WriteLine();
+	}
+}
diff --git a/scraper/src/Models/Itinerary.cs b/scraper/src/Models/Itinerary.cs
new file mode 100644
index 0000000..521389b
--- /dev/null
+++ b/scraper/src/Models/Itinerary.cs
@@ -0,0 +1,74 @@
+using System;
+using System.Collections.Generic;
+
+namespace scraper.Models.Itinerary;
+
+#region Interfaces
+
+/// <summary>An ordered list of trains that together form one journey.</summary>
+public interface IItinerary {
+	public IReadOnlyList<IItineraryTrain> Trains { get; }
+}
+
+/// <summary>A single train ridden as part of an <see cref="IItinerary"/>.</summary>
+public interface IItineraryTrain {
+	/// <summary>Station where this train is boarded.</summary>
+	public string From { get; }
+	/// <summary>Station where this train is left.</summary>
+	public string To { get; }
+	/// <summary>Stops listed between <see cref="From"/> and <see cref="To"/>.</summary>
+	public IReadOnlyList<string> IntermediateStops { get; }
+	public DateTimeOffset DepartureDate { get; }
+	public DateTimeOffset ArrivalDate { get; }
+	/// <summary>Distance covered on this train, in kilometres.</summary>
+	public int Km { get; }
+	public string Operator { get; }
+	/// <summary>Rank code of the train as printed by the site (upper-case letters and dashes).</summary>
+	public string TrainRank { get; }
+	public string TrainNumber { get; }
+}
+
+#endregion
+
+#region Implementations
+
+/// <summary>Mutable <see cref="IItinerary"/> that the scraper fills in train by train.</summary>
+internal record Itinerary : IItinerary {
+	private List<IItineraryTrain> ModifiableTrains { get; set; } = new();
+
+	public IReadOnlyList<IItineraryTrain> Trains => ModifiableTrains;
+
+	/// <summary>Appends <paramref name="train"/> to the end of the itinerary.</summary>
+	internal void AddTrain(IItineraryTrain train) {
+		ModifiableTrains.Add(train);
+	}
+
+	/// <summary>Creates a train, lets <paramref name="configurator"/> fill it in, then appends it.</summary>
+	internal void AddTrain(Action<ItineraryTrain> configurator) {
+		ItineraryTrain newTrain = new();
+		configurator(newTrain);
+		AddTrain(newTrain);
+	}
+}
+
+/// <summary>Mutable <see cref="IItineraryTrain"/> whose setters are only visible inside the assembly.</summary>
+internal record ItineraryTrain : IItineraryTrain {
+	private List<string> ModifiableIntermediateStops { get; set; } = new();
+
+	public string From { get; internal set; } = "";
+	public string To { get; internal set; } = "";
+	public IReadOnlyList<string> IntermediateStops => ModifiableIntermediateStops;
+	public DateTimeOffset DepartureDate { get; internal set; } = new();
+	public DateTimeOffset ArrivalDate { get; internal set; } = new();
+	public int Km { get; internal set; } = 0;
+	public string Operator { get; internal set; } = "";
+	public string TrainRank { get; internal set; } = "";
+	public string TrainNumber { get; internal set; } = "";
+
+	/// <summary>Appends <paramref name="stop"/> to <see cref="IntermediateStops"/>.</summary>
+	internal void AddIntermediateStop(string stop) {
+		ModifiableIntermediateStops.Add(stop);
+	}
+}
+
+#endregion
\ No newline at end of file
diff --git a/scraper/src/Scrapers/Route.cs b/scraper/src/Scrapers/Route.cs
new file mode 100644
index 0000000..ab07a3d
--- /dev/null
+++ b/scraper/src/Scrapers/Route.cs
@@ -0,0 +1,232 @@
+using System;
+using System.Collections.Generic;
+using System.Globalization;
+using System.Linq;
+using System.Net;
+using System.Net.Http;
+using System.Text.RegularExpressions;
+using System.Threading.Tasks;
+using AngleSharp;
+using AngleSharp.Dom;
+using AngleSharp.Html.Dom;
+using Flurl;
+using InfoferScraper.Models.Train;
+using NodaTime;
+using NodaTime.Extensions;
+using scraper.Models.Itinerary;
+
+namespace InfoferScraper.Scrapers;
+
+/// <summary>
+/// Scrapes train itineraries between two stations from mersultrenurilor.infofer.ro.
+/// </summary>
+public static class RouteScraper {
+	private const string BaseUrl = "https://mersultrenurilor.infofer.ro/ro-RO/";
+	private static readonly DateTimeZone BucharestTz = DateTimeZoneProviders.Tzdb["Europe/Bucharest"];
+
+	private static readonly CookieContainer CookieContainer = new();
+
+	// Shared client: cookies stored by the first request are sent with the second.
+	private static readonly HttpClient HttpClient = new(new HttpClientHandler {
+		CookieContainer = CookieContainer,
+		UseCookies = true,
+	}) {
+		BaseAddress = new Uri(BaseUrl),
+		DefaultRequestVersion = new Version(2, 0),
+	};
+
+	// "<km> km cu <rank> <number>"
+	private static readonly Regex KmTrainRankNoRegex = new(@"^([0-9]+)\skm\scu\s([A-Z-]+)\s([0-9]+)$");
+	// "Operat de <operator>"
+	private static readonly Regex OperatorRegex = new(@$"^Operat\sde\s([{Utils.RoLetters}\s]+)$");
+	// "Ple|Sos <day> <month>[.] <hour>:<minute>"
+	private static readonly Regex DepArrRegex = new(@"^(Ple|Sos)\s([0-9]+)\s([a-z]+)\.?\s([0-9]+):([0-9]+)$");
+
+	// Month abbreviations accepted in departure/arrival labels, mapped to month numbers.
+	private static readonly Dictionary<string, int> Months = new Dictionary<string, int>() {
+		["ian"] = 1,
+		["feb"] = 2,
+		["mar"] = 3,
+		["apr"] = 4,
+		["mai"] = 5,
+		["iun"] = 6,
+		["iul"] = 7,
+		["aug"] = 8,
+		["sep"] = 9,
+		["oct"] = 10,
+		["noi"] = 11,
+		["dec"] = 12,
+	};
+
+	/// <summary>
+	/// Looks up the itineraries from <paramref name="from"/> to <paramref name="to"/>.
+	/// </summary>
+	/// <param name="from">Departure station name, used as a URL path segment.</param>
+	/// <param name="to">Arrival station name, used as a URL path segment.</param>
+	/// <param name="dateOverride">Departure date to query, converted to Bucharest time; when null no date is sent.</param>
+	/// <returns>The itineraries found, or null when the response has no <c>body &gt; div</c> element.</returns>
+	/// <exception cref="FormatException">A departure/arrival label could not be parsed.</exception>
+	public static async Task<IReadOnlyList<IItinerary>?> Scrape(string from, string to, DateTimeOffset? dateOverride = null) {
+		var dateOverrideInstant = dateOverride?.ToInstant().InZone(BucharestTz);
+		dateOverride = dateOverrideInstant?.ToDateTimeOffset();
+
+		var asConfig = Configuration.Default;
+		var asContext = BrowsingContext.New(asConfig);
+
+		var firstUrl = "Rute-trenuri"
+			.AppendPathSegment(from)
+			.AppendPathSegment(to);
+		if (dateOverride != null) {
+			// Invariant culture keeps the Gregorian calendar whatever the host locale is.
+			firstUrl = firstUrl.SetQueryParam("DepartureDate", dateOverride.Value.ToString("d.MM.yyyy", CultureInfo.InvariantCulture));
+		}
+		firstUrl = firstUrl.SetQueryParam("OrderingTypeId", "0");
+		firstUrl = firstUrl.SetQueryParam("TimeSelectionId", "0");
+		firstUrl = firstUrl.SetQueryParam("MinutesInDay", "0");
+		firstUrl = firstUrl.SetQueryParam("ConnectionsTypeId", "1");
+		firstUrl = firstUrl.SetQueryParam("BetweenTrainsMinimumMinutes", "5");
+		firstUrl = firstUrl.SetQueryParam("ChangeStationName", "");
+
+		var firstResponse = await HttpClient.GetStringAsync(firstUrl);
+		var firstDocument = await asContext.OpenAsync(req => req.Content(firstResponse));
+		var firstForm = firstDocument.GetElementById("form-search")!;
+
+		// Every named input of the search form is replayed as the body of the results request.
+		var firstResult = firstForm
+			.QuerySelectorAll<IHtmlInputElement>("input")
+			.Where(elem => elem.Name != null)
+			.ToDictionary(elem => elem.Name!, elem => elem.Value);
+
+		var secondUrl = "".AppendPathSegments("Itineraries", "GetItineraries");
+		using var secondResponse = await HttpClient.PostAsync(
+			secondUrl,
+#pragma warning disable CS8620
+			new FormUrlEncodedContent(firstResult)
+#pragma warning restore CS8620
+		);
+		var secondResponseContent = await secondResponse.Content.ReadAsStringAsync();
+		var secondDocument = await asContext.OpenAsync(
+			req => req.Content(secondResponseContent)
+		);
+
+		var (itineraryInfoDiv, _) = secondDocument
+			.QuerySelectorAll("body > div");
+
+		if (itineraryInfoDiv == null) {
+			return null;
+		}
+
+		// The year is inferred from "now", so take it in the zone the parsed times are read in.
+		var now = SystemClock.Instance.GetCurrentInstant().InZone(BucharestTz).LocalDateTime;
+
+		var itinerariesLi = secondDocument
+			.QuerySelectorAll("body > ul > li");
+		var itineraries = new List<IItinerary>();
+		foreach (var itineraryLi in itinerariesLi) {
+			var itinerary = new Itinerary();
+
+			var cardDivs = itineraryLi.QuerySelectorAll(":scope > div > div > div > div");
+			var detailsDivs = cardDivs[3]
+				.QuerySelectorAll(":scope > div > div")[1]
+				.QuerySelectorAll(":scope > div");
+			var trainItineraryAndDetailsLis = detailsDivs[0]
+				.QuerySelectorAll(":scope > ul > li");
+			var stations = new List<string>();
+			var details = new List<ItineraryTrain>();
+			// Even entries are stations; odd entries describe the train between two stations.
+			foreach (var (idx, li) in trainItineraryAndDetailsLis.Select((li, idx) => (idx, li))) {
+				if (idx % 2 == 0) {
+					// Station
+					stations.Add(
+						li
+							.QuerySelectorAll(":scope > div > div > div > div")[1]
+							.Text()
+							.WithCollapsedSpaces()
+					);
+				}
+				else {
+					// Detail
+					var detailColumns = li.QuerySelectorAll(":scope > div > div");
+					var leftSideDivs = detailColumns[0].QuerySelectorAll(":scope > div");
+
+					var departureDate = ParseDepArrDate(
+						leftSideDivs[0]
+							.QuerySelectorAll(":scope > div")[1]
+							.Text()
+							.WithCollapsedSpaces(),
+						now
+					);
+					var arrivalDate = ParseDepArrDate(
+						leftSideDivs[3]
+							.QuerySelectorAll(":scope > div")[1]
+							.Text()
+							.WithCollapsedSpaces(),
+						now
+					);
+
+					var rightSideDivs = detailColumns[1].QuerySelectorAll(":scope > div > div");
+					var kmRankNumberText = rightSideDivs[0]
+						.QuerySelectorAll(":scope > div > div")[0]
+						.Text()
+						.WithCollapsedSpaces();
+					var kmRankNumberMatch = KmTrainRankNoRegex.Match(kmRankNumberText);
+
+					var operatorText = rightSideDivs[0]
+						.QuerySelectorAll(":scope > div > div")[1]
+						.Text()
+						.WithCollapsedSpaces();
+					var operatorMatch = OperatorRegex.Match(operatorText);
+
+					var train = new ItineraryTrain {
+						ArrivalDate = BucharestTz.AtLeniently(arrivalDate).ToDateTimeOffset(),
+						DepartureDate = BucharestTz.AtLeniently(departureDate).ToDateTimeOffset(),
+						Km = int.Parse(kmRankNumberMatch.Groups[1].Value),
+						TrainRank = kmRankNumberMatch.Groups[2].Value,
+						TrainNumber = kmRankNumberMatch.Groups[3].Value,
+						Operator = operatorMatch.Groups[1].Value,
+					};
+
+					foreach (var div in leftSideDivs[2]
+						.QuerySelectorAll(":scope > div")
+						.Where((_, i) => i % 2 != 0)) {
+						train.AddIntermediateStop(div.Text().WithCollapsedSpaces());
+					}
+
+					details.Add(train);
+				}
+			}
+			foreach (var ((iFrom, iTo), detail) in stations.Zip(stations.Skip(1)).Zip(details)) {
+				detail.From = iFrom;
+				detail.To = iTo;
+				itinerary.AddTrain(detail);
+			}
+
+			itineraries.Add(itinerary);
+		}
+
+		return itineraries;
+	}
+
+	/// <summary>
+	/// Parses a departure/arrival label matched by <see cref="DepArrRegex"/> into a local date-time.
+	/// The label carries no year: the year of <paramref name="now"/> is assumed, and the result is
+	/// moved one year ahead when it would otherwise be more than a day before <paramref name="now"/>.
+	/// </summary>
+	/// <exception cref="FormatException">The text does not match or names an unknown month.</exception>
+	private static LocalDateTime ParseDepArrDate(string text, LocalDateTime now) {
+		var match = DepArrRegex.Match(text);
+		if (!match.Success || !Months.TryGetValue(match.Groups[3].Value, out var month)) {
+			throw new FormatException($"Unrecognised departure/arrival text: \"{text}\"");
+		}
+
+		var date = new LocalDateTime(
+			now.Year,
+			month,
+			int.Parse(match.Groups[2].Value),
+			int.Parse(match.Groups[4].Value),
+			int.Parse(match.Groups[5].Value),
+			0
+		);
+		return date < now.PlusDays(-1) ? date.PlusYears(1) : date;
+	}
+}
\ No newline at end of file