From 6c6d69ee26d3f8c527755ff577f8f88c876db53a Mon Sep 17 00:00:00 2001 From: Dan Cojocaru Date: Fri, 3 Jan 2025 06:50:58 +0100 Subject: [PATCH] Add UserAgent --- scraper/src/Scrapers/Route.cs | 7 + scraper/src/Scrapers/Station.cs | 463 ++++++++++++++++---------------- scraper/src/Scrapers/Train.cs | 7 + 3 files changed, 249 insertions(+), 228 deletions(-) diff --git a/scraper/src/Scrapers/Route.cs b/scraper/src/Scrapers/Route.cs index 3a224c6..675d588 100644 --- a/scraper/src/Scrapers/Route.cs +++ b/scraper/src/Scrapers/Route.cs @@ -3,6 +3,7 @@ using System.Collections.Generic; using System.Linq; using System.Net; using System.Net.Http; +using System.Net.Http.Headers; using System.Text.RegularExpressions; using System.Threading.Tasks; using AngleSharp; @@ -57,6 +58,12 @@ public class RouteScraper { httpClient = new HttpClient(httpClientHandler) { BaseAddress = new Uri(BaseUrl), DefaultRequestVersion = new Version(2, 0), + DefaultRequestHeaders = { + UserAgent = { + new ProductInfoHeaderValue("new-infofer-scraper", "0.0.1"), + new ProductInfoHeaderValue("(developed by DC Dev as a hobby, get in touch at webmasterdcdev.ro for any issues)"), + }, + }, }; } diff --git a/scraper/src/Scrapers/Station.cs b/scraper/src/Scrapers/Station.cs index ac1315d..7f8f121 100644 --- a/scraper/src/Scrapers/Station.cs +++ b/scraper/src/Scrapers/Station.cs @@ -1,228 +1,235 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using System.Net; -using System.Net.Http; -using System.Text.RegularExpressions; -using System.Threading.Tasks; -using AngleSharp; -using AngleSharp.Dom; -using AngleSharp.Html.Dom; -using Flurl; -using InfoferScraper.Models.Station; -using NodaTime; -using NodaTime.Extensions; - -namespace InfoferScraper.Scrapers { - public class StationScraper { - private static readonly Regex StationInfoRegex = new($@"^([{Utils.RoLetters}.0-9 ]+)\sîn\s([0-9.]+)$"); - - private static readonly Regex StoppingTimeRegex = new( - @"^(necunoscută \(stație terminus\))|(?:([0-9]+) (min|sec) \((?:începând cu|până la) ([0-9]{1,2}:[0-9]{2})\))$" - ); - - private static readonly Regex StatusRegex = new( - @"^(?:la timp|([+-]?[0-9]+) min \((?:întârziere|mai devreme)\))(\*?)$" - ); - - private static readonly Regex PlatformRegex = new(@"^linia\s([A-Za-z0-9]+)$"); - - private static readonly Regex TrainUrlDateRegex = new(@"Date=([0-9]{2}).([0-9]{2}).([0-9]{4})"); - - private static readonly DateTimeZone BucharestTz = DateTimeZoneProviders.Tzdb["Europe/Bucharest"]; - - private const string BaseUrl = "https://mersultrenurilor.infofer.ro/ro-RO/"; - - private readonly CookieContainer cookieContainer = new(); - - private readonly HttpClient httpClient; - - public StationScraper(HttpClientHandler? httpClientHandler = null) { - if (httpClientHandler == null) { - httpClientHandler = new HttpClientHandler { - CookieContainer = cookieContainer, - UseCookies = true, - }; - } - else { - httpClientHandler.CookieContainer = cookieContainer; - httpClientHandler.UseCookies = true; - } - httpClient = new HttpClient(httpClientHandler) { - BaseAddress = new Uri(BaseUrl), - DefaultRequestVersion = new Version(2, 0), - }; - } - - public async Task Scrape(string stationName, DateTimeOffset? date = null) { - var dateInstant = date?.ToInstant().InZone(BucharestTz); - date = dateInstant?.ToDateTimeOffset(); - - stationName = stationName.RoLettersToEn(); - - var result = new StationScrapeResult(); - - var asConfig = Configuration.Default; - var asContext = BrowsingContext.New(asConfig); - - var firstUrl = "Statie" - .AppendPathSegment(Regex.Replace(stationName, @"\s", "-")); - if (date != null) { - firstUrl = firstUrl.SetQueryParam("Date", $"{date:d.MM.yyyy}"); - } - var firstResponse = await httpClient.GetStringAsync(firstUrl); - var firstDocument = await asContext.OpenAsync(req => req.Content(firstResponse)); - var firstForm = firstDocument.GetElementById("form-search")!; - - var firstResult = firstForm - .QuerySelectorAll("input") - .Where(elem => elem.Name != null) - .ToDictionary(elem => elem.Name!, elem => elem.Value); - - var secondUrl = "".AppendPathSegments("Stations", "StationsResult"); - var secondResponse = await httpClient.PostAsync( - secondUrl, -#pragma warning disable CS8620 - new FormUrlEncodedContent(firstResult) -#pragma warning restore CS8620 - ); - var secondResponseContent = await secondResponse.Content.ReadAsStringAsync(); - var secondDocument = await asContext.OpenAsync( - req => req.Content(secondResponseContent) - ); - - var (stationInfoDiv, (_, (departuresDiv, (arrivalsDiv, _)))) = secondDocument - .QuerySelectorAll("body > div"); - - (result.StationName, (result.Date, _)) = (StationInfoRegex.Match( - stationInfoDiv - .QuerySelector(":scope > h2")! - .Text() - .WithCollapsedSpaces() - ).Groups as IEnumerable).Skip(1).Select(group => group.Value); - - var (dateDay, (dateMonth, (dateYear, _))) = result.Date.Split('.').Select(int.Parse); - - void ParseArrDepList(IElement element, Action> adder) { - Utils.DateTimeSequencer dtSeq = new(dateYear, dateMonth, dateDay); - - if (element.QuerySelector(":scope > div > ul") == null) return; - - foreach (var trainElement in element.QuerySelectorAll(":scope > div > ul > li")) { - adder(arrDep => { - var divs = trainElement.QuerySelectorAll(":scope > div"); - var dataDiv = divs[0]; - var statusDiv = divs.Length >= 2 ? divs[1] : null; - - var (dataMainDiv, (dataDetailsDiv, _)) = dataDiv - .QuerySelectorAll(":scope > div"); - var (timeDiv, (destDiv, (trainDiv, _))) = dataMainDiv - .QuerySelectorAll(":scope > div"); - var (operatorDiv, (routeDiv, (stoppingTimeDiv, _))) = dataDetailsDiv - .QuerySelectorAll(":scope > div > div"); - - var timeResult = timeDiv - .QuerySelectorAll(":scope > div > div > div")[1] - .Text() - .WithCollapsedSpaces(); - var (stHr, (stMin, _)) = timeResult.Split(':').Select(int.Parse); - arrDep.Time = BucharestTz.AtLeniently( - dtSeq.Next(stHr, stMin).ToLocalDateTime() - ).ToDateTimeOffset(); - - // ReSharper disable once UnusedVariable // stOppositeTime: might be useful in the future - var (unknownSt, (st, (minsec, (stOppositeTime, _)))) = (StoppingTimeRegex.Match( - stoppingTimeDiv.QuerySelectorAll(":scope > div > div")[1] - .Text() - .WithCollapsedSpaces() - ).Groups as IEnumerable).Skip(1).Select(group => group.Value); - if (unknownSt.Length == 0 && st.Length > 0) { - arrDep.StoppingTime = int.Parse(st); - if (minsec == "min") { - arrDep.StoppingTime *= 60; - } - } - - arrDep.ModifyableTrain.Rank = trainDiv - .QuerySelectorAll(":scope > div > div > div")[1] - .QuerySelector(":scope > span")! - .Text() - .WithCollapsedSpaces(); - arrDep.ModifyableTrain.Number = trainDiv - .QuerySelectorAll(":scope > div > div > div")[1] - .QuerySelector(":scope > a")! - .Text() - .WithCollapsedSpaces(); - var trainUri = new Uri( - "http://localhost" + trainDiv - .QuerySelectorAll(":scope > div > div > div")[1] - .QuerySelector(":scope > a")! - .GetAttribute("href")! - ); - var (trainDepDay, (trainDepMonth, (trainDepYear, _))) = TrainUrlDateRegex - .Match(trainUri.Query) - .Groups - .Values - .Skip(1) - .Select(g => int.Parse(g.Value)); - arrDep.ModifyableTrain.DepartureDate = BucharestTz - .AtStartOfDay(new(trainDepYear, trainDepMonth, trainDepDay)) - .ToDateTimeOffset() - .ToUniversalTime(); - arrDep.ModifyableTrain.Terminus = destDiv - .QuerySelectorAll(":scope > div > div > div")[1] - .Text() - .WithCollapsedSpaces(); - arrDep.ModifyableTrain.Operator = operatorDiv - .QuerySelectorAll(":scope > div > div")[1] - .Text() - .WithCollapsedSpaces(); - foreach (var station in routeDiv.QuerySelectorAll(":scope > div > div")[1] - .Text() - .WithCollapsedSpaces() - .Split(" - ")) { - arrDep.ModifyableTrain.AddRouteStation(station); - } - - if (statusDiv == null) { - return; - } - - var statusDivComponents = statusDiv - .QuerySelectorAll(":scope > div")[0] - .QuerySelectorAll(":scope > div"); - - var delayDiv = statusDivComponents[0]; - - var (delayMin, (approx, _)) = (StatusRegex.Match( - delayDiv - .Text() - .WithCollapsedSpaces() - ).Groups as IEnumerable).Skip(1).Select(group => group.Value); - if (delayMin is null && delayDiv.Text().WithCollapsedSpaces() == "anulat") { - arrDep.ModifyableStatus.Cancelled = true; - } - else if (delayMin is null) { - throw new Exception($"Unexpected delayDiv value: {delayDiv.Text().WithCollapsedSpaces()}"); - } - else { - arrDep.ModifyableStatus.Real = string.IsNullOrEmpty(approx); - arrDep.ModifyableStatus.Delay = delayMin.Length == 0 ? 0 : int.Parse(delayMin); - } - - if (statusDivComponents.Length < 2) return; - - var platformDiv = statusDivComponents[1]; - arrDep.ModifyableStatus.Platform = PlatformRegex.Match(platformDiv.Text().WithCollapsedSpaces()) - .Groups[1].Value; - }); - } - } - - ParseArrDepList(departuresDiv, result.AddNewStationDeparture); - ParseArrDepList(arrivalsDiv, result.AddNewStationArrival); - - return result; - } - } -} +using System; +using System.Collections.Generic; +using System.Linq; +using System.Net; +using System.Net.Http; +using System.Net.Http.Headers; +using System.Text.RegularExpressions; +using System.Threading.Tasks; +using AngleSharp; +using AngleSharp.Dom; +using AngleSharp.Html.Dom; +using Flurl; +using InfoferScraper.Models.Station; +using NodaTime; +using NodaTime.Extensions; + +namespace InfoferScraper.Scrapers { + public class StationScraper { + private static readonly Regex StationInfoRegex = new($@"^([{Utils.RoLetters}.0-9 ]+)\sîn\s([0-9.]+)$"); + + private static readonly Regex StoppingTimeRegex = new( + @"^(necunoscută \(stație terminus\))|(?:([0-9]+) (min|sec) \((?:începând cu|până la) ([0-9]{1,2}:[0-9]{2})\))$" + ); + + private static readonly Regex StatusRegex = new( + @"^(?:la timp|([+-]?[0-9]+) min \((?:întârziere|mai devreme)\))(\*?)$" + ); + + private static readonly Regex PlatformRegex = new(@"^linia\s([A-Za-z0-9]+)$"); + + private static readonly Regex TrainUrlDateRegex = new(@"Date=([0-9]{2}).([0-9]{2}).([0-9]{4})"); + + private static readonly DateTimeZone BucharestTz = DateTimeZoneProviders.Tzdb["Europe/Bucharest"]; + + private const string BaseUrl = "https://mersultrenurilor.infofer.ro/ro-RO/"; + + private readonly CookieContainer cookieContainer = new(); + + private readonly HttpClient httpClient; + + public StationScraper(HttpClientHandler? httpClientHandler = null) { + if (httpClientHandler == null) { + httpClientHandler = new HttpClientHandler { + CookieContainer = cookieContainer, + UseCookies = true, + }; + } + else { + httpClientHandler.CookieContainer = cookieContainer; + httpClientHandler.UseCookies = true; + } + httpClient = new HttpClient(httpClientHandler) { + BaseAddress = new Uri(BaseUrl), + DefaultRequestVersion = new Version(2, 0), + DefaultRequestHeaders = { + UserAgent = { + new ProductInfoHeaderValue("new-infofer-scraper", "0.0.1"), + new ProductInfoHeaderValue("(developed by DC Dev as a hobby, get in touch at webmasterdcdev.ro for any issues)"), + }, + }, + }; + } + + public async Task Scrape(string stationName, DateTimeOffset? date = null) { + var dateInstant = date?.ToInstant().InZone(BucharestTz); + date = dateInstant?.ToDateTimeOffset(); + + stationName = stationName.RoLettersToEn(); + + var result = new StationScrapeResult(); + + var asConfig = Configuration.Default; + var asContext = BrowsingContext.New(asConfig); + + var firstUrl = "Statie" + .AppendPathSegment(Regex.Replace(stationName, @"\s", "-")); + if (date != null) { + firstUrl = firstUrl.SetQueryParam("Date", $"{date:d.MM.yyyy}"); + } + var firstResponse = await httpClient.GetStringAsync(firstUrl); + var firstDocument = await asContext.OpenAsync(req => req.Content(firstResponse)); + var firstForm = firstDocument.GetElementById("form-search")!; + + var firstResult = firstForm + .QuerySelectorAll("input") + .Where(elem => elem.Name != null) + .ToDictionary(elem => elem.Name!, elem => elem.Value); + + var secondUrl = "".AppendPathSegments("Stations", "StationsResult"); + var secondResponse = await httpClient.PostAsync( + secondUrl, +#pragma warning disable CS8620 + new FormUrlEncodedContent(firstResult) +#pragma warning restore CS8620 + ); + var secondResponseContent = await secondResponse.Content.ReadAsStringAsync(); + var secondDocument = await asContext.OpenAsync( + req => req.Content(secondResponseContent) + ); + + var (stationInfoDiv, (_, (departuresDiv, (arrivalsDiv, _)))) = secondDocument + .QuerySelectorAll("body > div"); + + (result.StationName, (result.Date, _)) = (StationInfoRegex.Match( + stationInfoDiv + .QuerySelector(":scope > h2")! + .Text() + .WithCollapsedSpaces() + ).Groups as IEnumerable).Skip(1).Select(group => group.Value); + + var (dateDay, (dateMonth, (dateYear, _))) = result.Date.Split('.').Select(int.Parse); + + void ParseArrDepList(IElement element, Action> adder) { + Utils.DateTimeSequencer dtSeq = new(dateYear, dateMonth, dateDay); + + if (element.QuerySelector(":scope > div > ul") == null) return; + + foreach (var trainElement in element.QuerySelectorAll(":scope > div > ul > li")) { + adder(arrDep => { + var divs = trainElement.QuerySelectorAll(":scope > div"); + var dataDiv = divs[0]; + var statusDiv = divs.Length >= 2 ? divs[1] : null; + + var (dataMainDiv, (dataDetailsDiv, _)) = dataDiv + .QuerySelectorAll(":scope > div"); + var (timeDiv, (destDiv, (trainDiv, _))) = dataMainDiv + .QuerySelectorAll(":scope > div"); + var (operatorDiv, (routeDiv, (stoppingTimeDiv, _))) = dataDetailsDiv + .QuerySelectorAll(":scope > div > div"); + + var timeResult = timeDiv + .QuerySelectorAll(":scope > div > div > div")[1] + .Text() + .WithCollapsedSpaces(); + var (stHr, (stMin, _)) = timeResult.Split(':').Select(int.Parse); + arrDep.Time = BucharestTz.AtLeniently( + dtSeq.Next(stHr, stMin).ToLocalDateTime() + ).ToDateTimeOffset(); + + // ReSharper disable once UnusedVariable // stOppositeTime: might be useful in the future + var (unknownSt, (st, (minsec, (stOppositeTime, _)))) = (StoppingTimeRegex.Match( + stoppingTimeDiv.QuerySelectorAll(":scope > div > div")[1] + .Text() + .WithCollapsedSpaces() + ).Groups as IEnumerable).Skip(1).Select(group => group.Value); + if (unknownSt.Length == 0 && st.Length > 0) { + arrDep.StoppingTime = int.Parse(st); + if (minsec == "min") { + arrDep.StoppingTime *= 60; + } + } + + arrDep.ModifyableTrain.Rank = trainDiv + .QuerySelectorAll(":scope > div > div > div")[1] + .QuerySelector(":scope > span")! + .Text() + .WithCollapsedSpaces(); + arrDep.ModifyableTrain.Number = trainDiv + .QuerySelectorAll(":scope > div > div > div")[1] + .QuerySelector(":scope > a")! + .Text() + .WithCollapsedSpaces(); + var trainUri = new Uri( + "http://localhost" + trainDiv + .QuerySelectorAll(":scope > div > div > div")[1] + .QuerySelector(":scope > a")! + .GetAttribute("href")! + ); + var (trainDepDay, (trainDepMonth, (trainDepYear, _))) = TrainUrlDateRegex + .Match(trainUri.Query) + .Groups + .Values + .Skip(1) + .Select(g => int.Parse(g.Value)); + arrDep.ModifyableTrain.DepartureDate = BucharestTz + .AtStartOfDay(new(trainDepYear, trainDepMonth, trainDepDay)) + .ToDateTimeOffset() + .ToUniversalTime(); + arrDep.ModifyableTrain.Terminus = destDiv + .QuerySelectorAll(":scope > div > div > div")[1] + .Text() + .WithCollapsedSpaces(); + arrDep.ModifyableTrain.Operator = operatorDiv + .QuerySelectorAll(":scope > div > div")[1] + .Text() + .WithCollapsedSpaces(); + foreach (var station in routeDiv.QuerySelectorAll(":scope > div > div")[1] + .Text() + .WithCollapsedSpaces() + .Split(" - ")) { + arrDep.ModifyableTrain.AddRouteStation(station); + } + + if (statusDiv == null) { + return; + } + + var statusDivComponents = statusDiv + .QuerySelectorAll(":scope > div")[0] + .QuerySelectorAll(":scope > div"); + + var delayDiv = statusDivComponents[0]; + + var (delayMin, (approx, _)) = (StatusRegex.Match( + delayDiv + .Text() + .WithCollapsedSpaces() + ).Groups as IEnumerable).Skip(1).Select(group => group.Value); + if (delayMin is null && delayDiv.Text().WithCollapsedSpaces() == "anulat") { + arrDep.ModifyableStatus.Cancelled = true; + } + else if (delayMin is null) { + throw new Exception($"Unexpected delayDiv value: {delayDiv.Text().WithCollapsedSpaces()}"); + } + else { + arrDep.ModifyableStatus.Real = string.IsNullOrEmpty(approx); + arrDep.ModifyableStatus.Delay = delayMin.Length == 0 ? 0 : int.Parse(delayMin); + } + + if (statusDivComponents.Length < 2) return; + + var platformDiv = statusDivComponents[1]; + arrDep.ModifyableStatus.Platform = PlatformRegex.Match(platformDiv.Text().WithCollapsedSpaces()) + .Groups[1].Value; + }); + } + } + + ParseArrDepList(departuresDiv, result.AddNewStationDeparture); + ParseArrDepList(arrivalsDiv, result.AddNewStationArrival); + + return result; + } + } +} diff --git a/scraper/src/Scrapers/Train.cs b/scraper/src/Scrapers/Train.cs index e043851..f3351d7 100644 --- a/scraper/src/Scrapers/Train.cs +++ b/scraper/src/Scrapers/Train.cs @@ -3,6 +3,7 @@ using System.Collections.Generic; using System.Linq; using System.Net; using System.Net.Http; +using System.Net.Http.Headers; using System.Text.RegularExpressions; using System.Threading.Tasks; using AngleSharp; @@ -69,6 +70,12 @@ namespace InfoferScraper.Scrapers { httpClient = new HttpClient(httpClientHandler) { BaseAddress = new Uri(BaseUrl), DefaultRequestVersion = new Version(2, 0), + DefaultRequestHeaders = { + UserAgent = { + new ProductInfoHeaderValue("new-infofer-scraper", "0.0.1"), + new ProductInfoHeaderValue("(developed by DC Dev as a hobby, get in touch at webmasterdcdev.ro for any issues)"), + }, + }, }; }