From 422b4727c0b6cd4e0d994db503dc74f0213b8b35 Mon Sep 17 00:00:00 2001
From: Dan Cojocaru
Date: Sun, 1 Sep 2024 03:45:00 +0200
Subject: [PATCH] Add proxy and .NET 8 support

---
 ConsoleTest/ConsoleTest.csproj              |  2 +-
 ConsoleTest/Program.cs                      |  6 +-
 scraper/scraper.csproj                      |  2 +-
 scraper/src/Scrapers/Route.cs               | 57 +++++++++++--------
 scraper/src/Scrapers/Station.cs             | 47 +++++++++------
 scraper/src/Scrapers/Train.cs               | 36 ++++++++----
 server/Models/ProxySettings.cs              |  9 +++
 .../Services/Implementations/DataManager.cs | 22 +++++--
 server/Startup.cs                           | 18 +++++-
 server/Utils/Constants.cs                   |  7 +++
 server/server.csproj                        |  2 +-
 11 files changed, 143 insertions(+), 65 deletions(-)
 create mode 100644 server/Models/ProxySettings.cs
 create mode 100644 server/Utils/Constants.cs

diff --git a/ConsoleTest/ConsoleTest.csproj b/ConsoleTest/ConsoleTest.csproj
index 7c6a3da..40e087c 100644
--- a/ConsoleTest/ConsoleTest.csproj
+++ b/ConsoleTest/ConsoleTest.csproj
@@ -6,7 +6,7 @@
     Exe
-    net6.0;net7.0
+    net6.0;net7.0;net8.0

diff --git a/ConsoleTest/Program.cs b/ConsoleTest/Program.cs
index c4d66da..0ce4b90 100644
--- a/ConsoleTest/Program.cs
+++ b/ConsoleTest/Program.cs
@@ -40,7 +40,7 @@ async Task PrintTrain() {
     Console.WriteLine(
         JsonSerializer.Serialize(
-            await TrainScraper.Scrape(trainNumber),
+            await new TrainScraper().Scrape(trainNumber),
             new JsonSerializerOptions {
                 PropertyNamingPolicy = JsonNamingPolicy.CamelCase,
                 WriteIndented = true,
@@ -58,7 +58,7 @@ async Task PrintStation() {
     Console.WriteLine(
         JsonSerializer.Serialize(
-            await StationScraper.Scrape(stationName),
+            await new StationScraper().Scrape(stationName),
             new JsonSerializerOptions {
                 PropertyNamingPolicy = JsonNamingPolicy.CamelCase,
                 WriteIndented = true,
@@ -74,7 +74,7 @@ async Task ScrapeItineraries() {
     if (from == null || to == null) return;
 
-    var data = await RouteScraper.Scrape(from, to);
+    var data = await new RouteScraper().Scrape(from, to);
 
     Console.WriteLine($"{data.Count} itineraries:");
     Console.WriteLine();

diff --git a/scraper/scraper.csproj b/scraper/scraper.csproj
index 4dfa9a6..25c1486 100644
--- a/scraper/scraper.csproj
+++ b/scraper/scraper.csproj
@@ -2,7 +2,7 @@
     enable
-    net6.0;net7.0
+    net6.0;net7.0;net8.0

diff --git a/scraper/src/Scrapers/Route.cs b/scraper/src/Scrapers/Route.cs
index 110bd5f..3a224c6 100644
--- a/scraper/src/Scrapers/Route.cs
+++ b/scraper/src/Scrapers/Route.cs
@@ -16,20 +16,10 @@ using scraper.Models.Itinerary;
 
 namespace InfoferScraper.Scrapers;
 
-public static class RouteScraper {
+public class RouteScraper {
     private const string BaseUrl = "https://mersultrenurilor.infofer.ro/ro-RO/";
     private static readonly DateTimeZone BucharestTz = DateTimeZoneProviders.Tzdb["Europe/Bucharest"];
 
-    private static readonly CookieContainer CookieContainer = new();
-
-    private static readonly HttpClient HttpClient = new(new HttpClientHandler {
-        CookieContainer = CookieContainer,
-        UseCookies = true,
-    }) {
-        BaseAddress = new Uri(BaseUrl),
-        DefaultRequestVersion = new Version(2, 0),
-    };
-
     private static readonly Regex KmTrainRankNoRegex = new(@"^([0-9]+)\skm\scu\s([A-Z-]+)\s([0-9]+)$");
     private static readonly Regex OperatorRegex = new(@$"^Operat\sde\s([{Utils.RoLetters}\s]+)$");
     private static readonly Regex DepArrRegex = new(@"^(Ple|Sos)\s([0-9]+)\s([a-z]+)\.?\s([0-9]+):([0-9]+)$");
@@ -49,7 +39,28 @@ public static class RouteScraper {
         ["dec"] = 12,
     };
 
-    public static async Task<List<Itinerary>?> Scrape(string from, string to, DateTimeOffset? dateOverride = null) {
+    private readonly CookieContainer cookieContainer = new();
+
+    private readonly HttpClient httpClient;
+
+    public RouteScraper(HttpClientHandler? httpClientHandler = null) {
+        if (httpClientHandler == null) {
+            httpClientHandler = new HttpClientHandler {
+                CookieContainer = cookieContainer,
+                UseCookies = true,
+            };
+        }
+        else {
+            httpClientHandler.CookieContainer = cookieContainer;
+            httpClientHandler.UseCookies = true;
+        }
+        httpClient = new HttpClient(httpClientHandler) {
+            BaseAddress = new Uri(BaseUrl),
+            DefaultRequestVersion = new Version(2, 0),
+        };
+    }
+
+    public async Task<List<Itinerary>?> Scrape(string from, string to, DateTimeOffset? dateOverride = null) {
         var dateOverrideInstant = dateOverride?.ToInstant().InZone(BucharestTz);
         dateOverride = dateOverrideInstant?.ToDateTimeOffset();
         TrainScrapeResult result = new();
@@ -70,7 +81,7 @@ public static class RouteScraper {
         firstUrl = firstUrl.SetQueryParam("BetweenTrainsMinimumMinutes", "5");
         firstUrl = firstUrl.SetQueryParam("ChangeStationName", "");
 
-        var firstResponse = await HttpClient.GetStringAsync(firstUrl);
+        var firstResponse = await httpClient.GetStringAsync(firstUrl);
         var firstDocument = await asContext.OpenAsync(req => req.Content(firstResponse));
         var firstForm = firstDocument.GetElementById("form-search")!;
 
@@ -80,7 +91,7 @@ public static class RouteScraper {
             .ToDictionary(elem => elem.Name!, elem => elem.Value);
 
         var secondUrl = "".AppendPathSegments("Itineraries", "GetItineraries");
-        var secondResponse = await HttpClient.PostAsync(
+        var secondResponse = await httpClient.PostAsync(
             secondUrl,
 #pragma warning disable CS8620
             new FormUrlEncodedContent(firstResult)
@@ -90,10 +101,10 @@ public static class RouteScraper {
         var secondDocument = await asContext.OpenAsync(
             req => req.Content(secondResponseContent)
         );
-        
+
         var (itineraryInfoDiv, _) = secondDocument
             .QuerySelectorAll("body > div");
-        
+
         if (itineraryInfoDiv == null) {
             return null;
         }
@@ -103,7 +114,7 @@ public static class RouteScraper {
         var itineraries = new List<Itinerary>();
         foreach (var itineraryLi in itinerariesLi) {
             var itinerary = new Itinerary();
-            
+
             var cardDivs = itineraryLi.QuerySelectorAll(":scope > div > div > div > div");
             var detailsDivs = cardDivs.Last()
                 .QuerySelectorAll(":scope > div > div")[1]
@@ -127,7 +138,7 @@ public static class RouteScraper {
                     // Detail
                     var detailColumns = li.QuerySelectorAll(":scope > div > div");
                     var leftSideDivs = detailColumns[0].QuerySelectorAll(":scope > div");
-                    
+
                     var departureDateText = leftSideDivs[0]
                         .QuerySelectorAll(":scope > div")[1]
                         .Text()
@@ -144,7 +155,7 @@ public static class RouteScraper {
                     if (departureDate < now.PlusDays(-1)) {
                         departureDate = departureDate.PlusYears(1);
                     }
-                    
+
                     var arrivalDateText = leftSideDivs[3]
                         .QuerySelectorAll(":scope > div")[1]
                         .Text()
@@ -168,7 +179,7 @@ public static class RouteScraper {
                         .Text()
                         .WithCollapsedSpaces();
                     var kmRankNumberMatch = KmTrainRankNoRegex.Match(kmRankNumberText);
-                    
+
                     var operatorText = rightSideDivs[0]
                         .QuerySelectorAll(":scope > div > div")[1]
                         .Text()
                         .WithCollapsedSpaces();
@@ -191,7 +202,7 @@ public static class RouteScraper {
                         if (text == "Nu sunt stații intermediare.") continue;
                         train.AddIntermediateStop(div.Text().WithCollapsedSpaces());
                     }
-                    
+
                     details.Add(train);
                 }
             }
@@ -200,10 +211,10 @@ public static class RouteScraper {
                 detail.To = iTo;
                 itinerary.AddTrain(detail);
             }
-            
+
             itineraries.Add(itinerary);
         }
 
         return itineraries;
     }
-}
\ No newline at end of file
+}
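Usage note (an illustrative sketch, not part of the patch): with RouteScraper, StationScraper and TrainScraper converted from static classes to instances, a caller may pass in its own HttpClientHandler; the scraper attaches its CookieContainer to that handler and builds its HttpClient on top of it, so a proxy configured on the handler applies to every request. The proxy URL and the station/train values below are placeholders, not values taken from this patch.

    using System.Net;
    using System.Net.Http;
    using InfoferScraper.Scrapers;

    // Example: route all scraper traffic through an HTTP proxy.
    var handler = new HttpClientHandler {
        UseProxy = true,
        Proxy = new WebProxy("http://proxy.example.com:8080"),
    };

    var routeScraper = new RouteScraper(handler);
    var itineraries = await routeScraper.Scrape("Bucuresti Nord", "Cluj Napoca");

    // Constructing a scraper without a handler keeps the previous behaviour:
    // a private handler with cookies enabled and no proxy.
    var train = await new TrainScraper().Scrape("1538");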
diff --git a/scraper/src/Scrapers/Station.cs b/scraper/src/Scrapers/Station.cs
index 488b8d7..ac1315d 100644
--- a/scraper/src/Scrapers/Station.cs
+++ b/scraper/src/Scrapers/Station.cs
@@ -14,7 +14,7 @@ using NodaTime;
 using NodaTime.Extensions;
 
 namespace InfoferScraper.Scrapers {
-    public static class StationScraper {
+    public class StationScraper {
         private static readonly Regex StationInfoRegex = new($@"^([{Utils.RoLetters}.0-9 ]+)\sîn\s([0-9.]+)$");
 
         private static readonly Regex StoppingTimeRegex = new(
@@ -28,25 +28,36 @@ namespace InfoferScraper.Scrapers {
         private static readonly Regex PlatformRegex = new(@"^linia\s([A-Za-z0-9]+)$");
 
         private static readonly Regex TrainUrlDateRegex = new(@"Date=([0-9]{2}).([0-9]{2}).([0-9]{4})");
-        
+
         private static readonly DateTimeZone BucharestTz = DateTimeZoneProviders.Tzdb["Europe/Bucharest"];
 
         private const string BaseUrl = "https://mersultrenurilor.infofer.ro/ro-RO/";
 
-        private static readonly CookieContainer CookieContainer = new();
+        private readonly CookieContainer cookieContainer = new();
+
+        private readonly HttpClient httpClient;
 
-        private static readonly HttpClient HttpClient = new(new HttpClientHandler {
-            CookieContainer = CookieContainer,
-            UseCookies = true,
-        }) {
-            BaseAddress = new Uri(BaseUrl),
-            DefaultRequestVersion = new Version(2, 0),
-        };
+        public StationScraper(HttpClientHandler? httpClientHandler = null) {
+            if (httpClientHandler == null) {
+                httpClientHandler = new HttpClientHandler {
+                    CookieContainer = cookieContainer,
+                    UseCookies = true,
+                };
+            }
+            else {
+                httpClientHandler.CookieContainer = cookieContainer;
+                httpClientHandler.UseCookies = true;
+            }
+            httpClient = new HttpClient(httpClientHandler) {
+                BaseAddress = new Uri(BaseUrl),
+                DefaultRequestVersion = new Version(2, 0),
+            };
+        }
 
-        public static async Task Scrape(string stationName, DateTimeOffset? date = null) {
+        public async Task Scrape(string stationName, DateTimeOffset? date = null) {
             var dateInstant = date?.ToInstant().InZone(BucharestTz);
             date = dateInstant?.ToDateTimeOffset();
-            
+
             stationName = stationName.RoLettersToEn();
 
             var result = new StationScrapeResult();
@@ -59,7 +70,7 @@ namespace InfoferScraper.Scrapers {
             if (date != null) {
                 firstUrl = firstUrl.SetQueryParam("Date", $"{date:d.MM.yyyy}");
             }
-            var firstResponse = await HttpClient.GetStringAsync(firstUrl);
+            var firstResponse = await httpClient.GetStringAsync(firstUrl);
             var firstDocument = await asContext.OpenAsync(req => req.Content(firstResponse));
             var firstForm = firstDocument.GetElementById("form-search")!;
 
@@ -69,7 +80,7 @@ namespace InfoferScraper.Scrapers {
                 .ToDictionary(elem => elem.Name!, elem => elem.Value);
 
             var secondUrl = "".AppendPathSegments("Stations", "StationsResult");
-            var secondResponse = await HttpClient.PostAsync(
+            var secondResponse = await httpClient.PostAsync(
                 secondUrl,
 #pragma warning disable CS8620
                 new FormUrlEncodedContent(firstResult)
@@ -167,9 +178,9 @@ namespace InfoferScraper.Scrapers {
                             .Text()
                             .WithCollapsedSpaces();
                         foreach (var station in routeDiv.QuerySelectorAll(":scope > div > div")[1]
-                            .Text()
-                            .WithCollapsedSpaces()
-                            .Split(" - ")) {
+                            .Text()
+                            .WithCollapsedSpaces()
+                            .Split(" - ")) {
                             arrDep.ModifyableTrain.AddRouteStation(station);
                         }
 
@@ -182,7 +193,7 @@ namespace InfoferScraper.Scrapers {
                             .QuerySelectorAll(":scope > div");
 
                         var delayDiv = statusDivComponents[0];
-                        
+
                         var (delayMin, (approx, _)) = (StatusRegex.Match(
                             delayDiv
                                 .Text()
diff --git a/scraper/src/Scrapers/Train.cs b/scraper/src/Scrapers/Train.cs
index abd2c40..e043851 100644
--- a/scraper/src/Scrapers/Train.cs
+++ b/scraper/src/Scrapers/Train.cs
@@ -15,7 +15,7 @@ using NodaTime.Extensions;
 using scraper.Exceptions;
 
 namespace InfoferScraper.Scrapers {
-    public static class TrainScraper {
+    public class TrainScraper {
         private const string BaseUrl = "https://mersultrenurilor.infofer.ro/ro-RO/";
         private static readonly Regex TrainInfoRegex = new(@"^([A-Z-]+)\s([0-9]+)\sîn\s([0-9.]+)$");
         private static readonly Regex OperatorRegex = new(@"^Operat\sde\s(.+)$");
@@ -51,16 +51,28 @@ namespace InfoferScraper.Scrapers {
 
         private static readonly DateTimeZone BucharestTz = DateTimeZoneProviders.Tzdb["Europe/Bucharest"];
 
-        private static readonly CookieContainer CookieContainer = new();
-        private static readonly HttpClient HttpClient = new(new HttpClientHandler {
-            CookieContainer = CookieContainer,
-            UseCookies = true,
-        }) {
-            BaseAddress = new Uri(BaseUrl),
-            DefaultRequestVersion = new Version(2, 0),
-        };
+        private readonly CookieContainer cookieContainer = new();
+        private readonly HttpClient httpClient;
+
+        public TrainScraper(HttpClientHandler? httpClientHandler = null)
+        {
+            if (httpClientHandler == null) {
+                httpClientHandler = new HttpClientHandler {
+                    CookieContainer = cookieContainer,
+                    UseCookies = true,
+                };
+            }
+            else {
+                httpClientHandler.CookieContainer = cookieContainer;
+                httpClientHandler.UseCookies = true;
+            }
+            httpClient = new HttpClient(httpClientHandler) {
+                BaseAddress = new Uri(BaseUrl),
+                DefaultRequestVersion = new Version(2, 0),
+            };
+        }
 
-        public static async Task Scrape(string trainNumber, DateTimeOffset? dateOverride = null) {
+        public async Task Scrape(string trainNumber, DateTimeOffset? dateOverride = null) {
             var dateOverrideInstant = dateOverride?.ToInstant().InZone(BucharestTz);
             dateOverride = dateOverrideInstant?.ToDateTimeOffset();
             TrainScrapeResult result = new();
@@ -73,7 +85,7 @@ namespace InfoferScraper.Scrapers {
             if (dateOverride != null) {
                 firstUrl = firstUrl.SetQueryParam("Date", $"{dateOverride:d.MM.yyyy}");
             }
-            var firstResponse = await HttpClient.GetStringAsync(firstUrl);
+            var firstResponse = await httpClient.GetStringAsync(firstUrl);
             var firstDocument = await asContext.OpenAsync(req => req.Content(firstResponse));
             var firstForm = firstDocument.GetElementById("form-search")!;
 
@@ -83,7 +95,7 @@ namespace InfoferScraper.Scrapers {
                 .ToDictionary(elem => elem.Name!, elem => elem.Value);
 
             var secondUrl = "".AppendPathSegments("Trains", "TrainsResult");
-            var secondResponse = await HttpClient.PostAsync(
+            var secondResponse = await httpClient.PostAsync(
                 secondUrl,
 #pragma warning disable CS8620
                 new FormUrlEncodedContent(firstResult)
diff --git a/server/Models/ProxySettings.cs b/server/Models/ProxySettings.cs
new file mode 100644
index 0000000..389e9c3
--- /dev/null
+++ b/server/Models/ProxySettings.cs
@@ -0,0 +1,9 @@
+namespace Server.Models;
+
+public record ProxySettings(string Url, ProxyCredentials? Credentials = null) {
+    public ProxySettings() : this("") { }
+}
+
+public record ProxyCredentials(string Username, string Password) {
+    public ProxyCredentials() : this("", "") { }
+}
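ProxySettings and ProxyCredentials are plain records meant to be bound from configuration; DataManager (below) turns them into the HttpClientHandler that is shared by the three scrapers. A minimal sketch of that mapping, shown only for clarity and not taken from this patch; note that HttpClientHandler.DefaultProxyCredentials is consulted only for the default (system) proxy, so credentials for an explicitly assigned WebProxy are normally placed on WebProxy.Credentials:

    using System.Net;
    using System.Net.Http;
    using Server.Models;

    static HttpClientHandler BuildProxyHandler(ProxySettings? settings) {
        if (settings == null) {
            // No proxy configured: plain direct connection.
            return new HttpClientHandler { UseProxy = false };
        }
        var proxy = new WebProxy(settings.Url);
        if (settings.Credentials != null) {
            // Credentials attached to the proxy itself are used for proxy authentication.
            proxy.Credentials = new NetworkCredential(
                settings.Credentials.Username,
                settings.Credentials.Password);
        }
        return new HttpClientHandler { UseProxy = true, Proxy = proxy };
    }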
diff --git a/server/Services/Implementations/DataManager.cs b/server/Services/Implementations/DataManager.cs
index f5b99c8..a8e0e25 100644
--- a/server/Services/Implementations/DataManager.cs
+++ b/server/Services/Implementations/DataManager.cs
@@ -1,12 +1,15 @@
 using System;
 using System.Collections.Generic;
 using System.Diagnostics;
+using System.Net;
+using System.Net.Http;
 using System.Threading.Tasks;
 using InfoferScraper;
 using InfoferScraper.Models.Station;
 using InfoferScraper.Models.Train;
 using Microsoft.Extensions.Logging;
 using scraper.Models.Itinerary;
+using Server.Models;
 using Server.Services.Interfaces;
 using Server.Utils;
 
@@ -18,17 +21,26 @@ namespace Server.Services.Implementations {
         private NodaTime.IDateTimeZoneProvider TzProvider { get; }
         private NodaTime.DateTimeZone CfrTimeZone => TzProvider["Europe/Bucharest"];
 
-        public DataManager(NodaTime.IDateTimeZoneProvider tzProvider, IDatabase database, ILogger logger) {
+        public DataManager(NodaTime.IDateTimeZoneProvider tzProvider, IDatabase database, ILogger logger, ProxySettings? proxySettings) {
             this.TzProvider = tzProvider;
             this.Database = database;
             this.Logger = logger;
 
+            HttpClientHandler httpClientHandler = new() {
+                UseProxy = proxySettings != null,
+                Proxy = proxySettings == null ? null : new WebProxy(proxySettings.Url),
+                DefaultProxyCredentials = proxySettings?.Credentials == null ? null : new NetworkCredential(proxySettings.Credentials.Username, proxySettings.Credentials.Password),
+            };
+            InfoferScraper.Scrapers.StationScraper stationScraper = new(httpClientHandler);
+            InfoferScraper.Scrapers.TrainScraper trainScraper = new(httpClientHandler);
+            InfoferScraper.Scrapers.RouteScraper routeScraper = new(httpClientHandler);
+
             stationCache = new(async (t) => {
                 var (stationName, date) = t;
                 Logger.LogDebug("Fetching station {StationName} for date {Date}", stationName, date);
 
                 var zonedDate = new NodaTime.LocalDate(date.Year, date.Month, date.Day).AtStartOfDayInZone(CfrTimeZone);
 
-                var station = await InfoferScraper.Scrapers.StationScraper.Scrape(stationName, zonedDate.ToDateTimeOffset());
+                var station = await stationScraper.Scrape(stationName, zonedDate.ToDateTimeOffset());
                 if (station != null) {
                     _ = Task.Run(async () => {
                         var watch = Stopwatch.StartNew();
@@ -44,7 +56,7 @@ namespace Server.Services.Implementations {
                 Logger.LogDebug("Fetching train {TrainNumber} for date {Date}", trainNumber, date);
 
                 var zonedDate = new NodaTime.LocalDate(date.Year, date.Month, date.Day).AtStartOfDayInZone(CfrTimeZone);
 
-                var train = await InfoferScraper.Scrapers.TrainScraper.Scrape(trainNumber, zonedDate.ToDateTimeOffset());
+                var train = await trainScraper.Scrape(trainNumber, zonedDate.ToDateTimeOffset());
                 if (train != null) {
                     _ = Task.Run(async () => {
                         var watch = Stopwatch.StartNew();
@@ -60,7 +72,7 @@ namespace Server.Services.Implementations {
                 Logger.LogDebug("Fetching itinerary from {From} to {To} for date {Date}", from, to, date);
 
                 var zonedDate = new NodaTime.LocalDate(date.Year, date.Month, date.Day).AtStartOfDayInZone(CfrTimeZone);
 
-                var itineraries = await InfoferScraper.Scrapers.RouteScraper.Scrape(from, to, zonedDate.ToDateTimeOffset());
+                var itineraries = await routeScraper.Scrape(from, to, zonedDate.ToDateTimeOffset());
                 if (itineraries != null) {
                     _ = Task.Run(async () => {
                         var watch = Stopwatch.StartNew();
@@ -99,4 +111,4 @@ namespace Server.Services.Implementations {
             return await itinerariesCache.GetItem((from, to, cfrDate));
         }
     }
-}
\ No newline at end of file
+}
diff --git a/server/Startup.cs b/server/Startup.cs
index ffaea98..4c573df 100644
--- a/server/Startup.cs
+++ b/server/Startup.cs
@@ -10,6 +10,7 @@ using Microsoft.Extensions.Hosting;
 using Microsoft.OpenApi.Models;
 using MongoDB.Bson.Serialization.Conventions;
 using Newtonsoft.Json.Serialization;
+using Server.Models;
 using Server.Models.Database;
 using Server.Services.Implementations;
 using Server.Services.Interfaces;
@@ -30,12 +31,27 @@ namespace Server {
                 });
             }
 
+            services.Configure<ProxySettings>(Configuration.GetSection("Proxy"));
             services.Configure(Configuration.GetSection("TrainDataMongo"));
             var conventionPack = new ConventionPack { new CamelCaseElementNameConvention() };
             ConventionRegistry.Register("camelCase", conventionPack, _ => true);
             services.AddSingleton();
             services.AddSingleton();
-            services.AddSingleton(NodaTime.DateTimeZoneProviders.Tzdb);
+            services.AddSingleton(NodaTime.DateTimeZoneProviders.Tzdb);
+
+            services.AddSingleton((serviceProvider) => {
+                var conf = serviceProvider.GetRequiredService<IConfiguration>();
+                var section = conf.GetSection("FileStorage");
+                switch (section["Type"]) {
+                    case "local": {
+                        var dir = section["Directory"];
+                        return new LocalFileStorage(dir!);
+                    }
+                    default:
+                        throw new Exception("Unable to configure FileStorage");
+                }
+            });
+
             services.AddControllers()
                 .AddNewtonsoftJson(options => {
                     options.SerializerSettings.ContractResolver = new DefaultContractResolver {
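Startup binds the "Proxy" configuration section, while DataManager asks for a ProxySettings? instance in its constructor; how the bound section reaches that parameter is not shown in this patch. One possible bridge, included purely as an assumed illustration, is to materialise the section explicitly when composing services:

    using Microsoft.Extensions.Configuration;
    using Server.Models;

    // Hypothetical helper: returns null when no "Proxy" section is configured,
    // matching DataManager's nullable ProxySettings parameter.
    static ProxySettings? ReadProxySettings(IConfiguration configuration) {
        var section = configuration.GetSection("Proxy");
        return section.Exists() ? section.Get<ProxySettings>() : null;
    }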
diff --git a/server/Utils/Constants.cs b/server/Utils/Constants.cs
new file mode 100644
index 0000000..d59b5cf
--- /dev/null
+++ b/server/Utils/Constants.cs
@@ -0,0 +1,7 @@
+using NodaTime;
+
+namespace Server.Utils;
+
+public static class Constants {
+    public static readonly DateTimeZone BucharestTz = DateTimeZoneProviders.Tzdb["Europe/Bucharest"];
+}
\ No newline at end of file
diff --git a/server/server.csproj b/server/server.csproj
index c06495a..ec644e6 100644
--- a/server/server.csproj
+++ b/server/server.csproj
@@ -5,7 +5,7 @@
     Server
     Server
     11
-    net6.0;net7.0
+    net6.0;net7.0;net8.0