Compare commits

...

2 Commits

  1. 2
      ConsoleTest/ConsoleTest.csproj
  2. 6
      ConsoleTest/Program.cs
  3. 6
      Dockerfile
  4. 2
      scraper/scraper.csproj
  5. 57
      scraper/src/Scrapers/Route.cs
  6. 47
      scraper/src/Scrapers/Station.cs
  7. 36
      scraper/src/Scrapers/Train.cs
  8. 9
      server/Models/ProxySettings.cs
  9. 22
      server/Services/Implementations/DataManager.cs
  10. 5
      server/Startup.cs
  11. 7
      server/Utils/Constants.cs
  12. 2
      server/server.csproj

2
ConsoleTest/ConsoleTest.csproj

@@ -6,7 +6,7 @@
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFrameworks>net6.0;net7.0</TargetFrameworks>
<TargetFrameworks>net6.0;net7.0;net8.0</TargetFrameworks>
</PropertyGroup>
</Project>

6
ConsoleTest/Program.cs

@@ -40,7 +40,7 @@ async Task PrintTrain() {
Console.WriteLine(
JsonSerializer.Serialize(
await TrainScraper.Scrape(trainNumber),
await new TrainScraper().Scrape(trainNumber),
new JsonSerializerOptions {
PropertyNamingPolicy = JsonNamingPolicy.CamelCase,
WriteIndented = true,
@@ -58,7 +58,7 @@ async Task PrintStation() {
Console.WriteLine(
JsonSerializer.Serialize(
await StationScraper.Scrape(stationName),
await new StationScraper().Scrape(stationName),
new JsonSerializerOptions {
PropertyNamingPolicy = JsonNamingPolicy.CamelCase,
WriteIndented = true,
@@ -74,7 +74,7 @@ async Task ScrapeItineraries() {
if (from == null || to == null) return;
var data = await RouteScraper.Scrape(from, to);
var data = await new RouteScraper().Scrape(from, to);
Console.WriteLine($"{data.Count} itineraries:");
Console.WriteLine();

6
Dockerfile

@@ -1,5 +1,5 @@
# https://hub.docker.com/_/microsoft-dotnet
FROM mcr.microsoft.com/dotnet/sdk:7.0 AS build
FROM mcr.microsoft.com/dotnet/sdk:8.0 AS build
WORKDIR /source
# copy csproj and restore as distinct layers
@@ -14,10 +14,10 @@ COPY server/. ./server/
COPY scraper/. ./scraper/
COPY ConsoleTest/. ./ConsoleTest/
WORKDIR /source/server
RUN dotnet publish -f net7.0 -c release -o /app --no-restore
RUN dotnet publish -f net8.0 -c release -o /app --no-restore
# final stage/image
FROM mcr.microsoft.com/dotnet/aspnet:7.0
FROM mcr.microsoft.com/dotnet/aspnet:8.0
WORKDIR /app
COPY --from=build /app ./
ENV INSIDE_DOCKER=true

2
scraper/scraper.csproj

@@ -2,7 +2,7 @@
<PropertyGroup>
<Nullable>enable</Nullable>
<TargetFrameworks>net6.0;net7.0</TargetFrameworks>
<TargetFrameworks>net6.0;net7.0;net8.0</TargetFrameworks>
</PropertyGroup>
<ItemGroup>

57
scraper/src/Scrapers/Route.cs

@@ -16,20 +16,10 @@ using scraper.Models.Itinerary;
namespace InfoferScraper.Scrapers;
public static class RouteScraper {
public class RouteScraper {
private const string BaseUrl = "https://mersultrenurilor.infofer.ro/ro-RO/";
private static readonly DateTimeZone BucharestTz = DateTimeZoneProviders.Tzdb["Europe/Bucharest"];
private static readonly CookieContainer CookieContainer = new();
private static readonly HttpClient HttpClient = new(new HttpClientHandler {
CookieContainer = CookieContainer,
UseCookies = true,
}) {
BaseAddress = new Uri(BaseUrl),
DefaultRequestVersion = new Version(2, 0),
};
private static readonly Regex KmTrainRankNoRegex = new(@"^([0-9]+)\skm\scu\s([A-Z-]+)\s([0-9]+)$");
private static readonly Regex OperatorRegex = new(@$"^Operat\sde\s([{Utils.RoLetters}\s]+)$");
private static readonly Regex DepArrRegex = new(@"^(Ple|Sos)\s([0-9]+)\s([a-z]+)\.?\s([0-9]+):([0-9]+)$");
@@ -49,7 +39,28 @@ public static class RouteScraper {
["dec"] = 12,
};
public static async Task<List<IItinerary>?> Scrape(string from, string to, DateTimeOffset? dateOverride = null) {
private readonly CookieContainer cookieContainer = new();
private readonly HttpClient httpClient;
public RouteScraper(HttpClientHandler? httpClientHandler = null) {
if (httpClientHandler == null) {
httpClientHandler = new HttpClientHandler {
CookieContainer = cookieContainer,
UseCookies = true,
};
}
else {
httpClientHandler.CookieContainer = cookieContainer;
httpClientHandler.UseCookies = true;
}
httpClient = new HttpClient(httpClientHandler) {
BaseAddress = new Uri(BaseUrl),
DefaultRequestVersion = new Version(2, 0),
};
}
public async Task<List<IItinerary>?> Scrape(string from, string to, DateTimeOffset? dateOverride = null) {
var dateOverrideInstant = dateOverride?.ToInstant().InZone(BucharestTz);
dateOverride = dateOverrideInstant?.ToDateTimeOffset();
TrainScrapeResult result = new();
@@ -70,7 +81,7 @@ public static class RouteScraper {
firstUrl = firstUrl.SetQueryParam("BetweenTrainsMinimumMinutes", "5");
firstUrl = firstUrl.SetQueryParam("ChangeStationName", "");
var firstResponse = await HttpClient.GetStringAsync(firstUrl);
var firstResponse = await httpClient.GetStringAsync(firstUrl);
var firstDocument = await asContext.OpenAsync(req => req.Content(firstResponse));
var firstForm = firstDocument.GetElementById("form-search")!;
@@ -80,7 +91,7 @@ public static class RouteScraper {
.ToDictionary(elem => elem.Name!, elem => elem.Value);
var secondUrl = "".AppendPathSegments("Itineraries", "GetItineraries");
var secondResponse = await HttpClient.PostAsync(
var secondResponse = await httpClient.PostAsync(
secondUrl,
#pragma warning disable CS8620
new FormUrlEncodedContent(firstResult)
@@ -90,10 +101,10 @@ public static class RouteScraper {
var secondDocument = await asContext.OpenAsync(
req => req.Content(secondResponseContent)
);
var (itineraryInfoDiv, _) = secondDocument
.QuerySelectorAll("body > div");
if (itineraryInfoDiv == null) {
return null;
}
@@ -103,7 +114,7 @@ public static class RouteScraper {
var itineraries = new List<IItinerary>();
foreach (var itineraryLi in itinerariesLi) {
var itinerary = new Itinerary();
var cardDivs = itineraryLi.QuerySelectorAll(":scope > div > div > div > div");
var detailsDivs = cardDivs.Last()
.QuerySelectorAll(":scope > div > div")[1]
@@ -127,7 +138,7 @@ public static class RouteScraper {
// Detail
var detailColumns = li.QuerySelectorAll(":scope > div > div");
var leftSideDivs = detailColumns[0].QuerySelectorAll(":scope > div");
var departureDateText = leftSideDivs[0]
.QuerySelectorAll(":scope > div")[1]
.Text()
@@ -144,7 +155,7 @@ public static class RouteScraper {
if (departureDate < now.PlusDays(-1)) {
departureDate = departureDate.PlusYears(1);
}
var arrivalDateText = leftSideDivs[3]
.QuerySelectorAll(":scope > div")[1]
.Text()
@@ -168,7 +179,7 @@ public static class RouteScraper {
.Text()
.WithCollapsedSpaces();
var kmRankNumberMatch = KmTrainRankNoRegex.Match(kmRankNumberText);
var operatorText = rightSideDivs[0]
.QuerySelectorAll(":scope > div > div")[1]
.Text()
@@ -191,7 +202,7 @@ public static class RouteScraper {
if (text == "Nu sunt stații intermediare.") continue;
train.AddIntermediateStop(div.Text().WithCollapsedSpaces());
}
details.Add(train);
}
}
@@ -200,10 +211,10 @@ public static class RouteScraper {
detail.To = iTo;
itinerary.AddTrain(detail);
}
itineraries.Add(itinerary);
}
return itineraries;
}
}
}

47
scraper/src/Scrapers/Station.cs

@@ -14,7 +14,7 @@ using NodaTime;
using NodaTime.Extensions;
namespace InfoferScraper.Scrapers {
public static class StationScraper {
public class StationScraper {
private static readonly Regex StationInfoRegex = new($@"^([{Utils.RoLetters}.0-9 ]+)\sîn\s([0-9.]+)$");
private static readonly Regex StoppingTimeRegex = new(
@@ -28,25 +28,36 @@ namespace InfoferScraper.Scrapers {
private static readonly Regex PlatformRegex = new(@"^linia\s([A-Za-z0-9]+)$");
private static readonly Regex TrainUrlDateRegex = new(@"Date=([0-9]{2}).([0-9]{2}).([0-9]{4})");
private static readonly DateTimeZone BucharestTz = DateTimeZoneProviders.Tzdb["Europe/Bucharest"];
private const string BaseUrl = "https://mersultrenurilor.infofer.ro/ro-RO/";
private static readonly CookieContainer CookieContainer = new();
private readonly CookieContainer cookieContainer = new();
private readonly HttpClient httpClient;
private static readonly HttpClient HttpClient = new(new HttpClientHandler {
CookieContainer = CookieContainer,
UseCookies = true,
}) {
BaseAddress = new Uri(BaseUrl),
DefaultRequestVersion = new Version(2, 0),
};
public StationScraper(HttpClientHandler? httpClientHandler = null) {
if (httpClientHandler == null) {
httpClientHandler = new HttpClientHandler {
CookieContainer = cookieContainer,
UseCookies = true,
};
}
else {
httpClientHandler.CookieContainer = cookieContainer;
httpClientHandler.UseCookies = true;
}
httpClient = new HttpClient(httpClientHandler) {
BaseAddress = new Uri(BaseUrl),
DefaultRequestVersion = new Version(2, 0),
};
}
public static async Task<IStationScrapeResult> Scrape(string stationName, DateTimeOffset? date = null) {
public async Task<IStationScrapeResult> Scrape(string stationName, DateTimeOffset? date = null) {
var dateInstant = date?.ToInstant().InZone(BucharestTz);
date = dateInstant?.ToDateTimeOffset();
stationName = stationName.RoLettersToEn();
var result = new StationScrapeResult();
@@ -59,7 +70,7 @@ namespace InfoferScraper.Scrapers {
if (date != null) {
firstUrl = firstUrl.SetQueryParam("Date", $"{date:d.MM.yyyy}");
}
var firstResponse = await HttpClient.GetStringAsync(firstUrl);
var firstResponse = await httpClient.GetStringAsync(firstUrl);
var firstDocument = await asContext.OpenAsync(req => req.Content(firstResponse));
var firstForm = firstDocument.GetElementById("form-search")!;
@@ -69,7 +80,7 @@ namespace InfoferScraper.Scrapers {
.ToDictionary(elem => elem.Name!, elem => elem.Value);
var secondUrl = "".AppendPathSegments("Stations", "StationsResult");
var secondResponse = await HttpClient.PostAsync(
var secondResponse = await httpClient.PostAsync(
secondUrl,
#pragma warning disable CS8620
new FormUrlEncodedContent(firstResult)
@@ -167,9 +178,9 @@ namespace InfoferScraper.Scrapers {
.Text()
.WithCollapsedSpaces();
foreach (var station in routeDiv.QuerySelectorAll(":scope > div > div")[1]
.Text()
.WithCollapsedSpaces()
.Split(" - ")) {
.Text()
.WithCollapsedSpaces()
.Split(" - ")) {
arrDep.ModifyableTrain.AddRouteStation(station);
}
@@ -182,7 +193,7 @@ namespace InfoferScraper.Scrapers {
.QuerySelectorAll(":scope > div");
var delayDiv = statusDivComponents[0];
var (delayMin, (approx, _)) = (StatusRegex.Match(
delayDiv
.Text()

36
scraper/src/Scrapers/Train.cs

@@ -15,7 +15,7 @@ using NodaTime.Extensions;
using scraper.Exceptions;
namespace InfoferScraper.Scrapers {
public static class TrainScraper {
public class TrainScraper {
private const string BaseUrl = "https://mersultrenurilor.infofer.ro/ro-RO/";
private static readonly Regex TrainInfoRegex = new(@"^([A-Z-]+)\s([0-9]+)\sîn\s([0-9.]+)$");
private static readonly Regex OperatorRegex = new(@"^Operat\sde\s(.+)$");
@@ -51,16 +51,28 @@ namespace InfoferScraper.Scrapers {
private static readonly DateTimeZone BucharestTz = DateTimeZoneProviders.Tzdb["Europe/Bucharest"];
private static readonly CookieContainer CookieContainer = new();
private static readonly HttpClient HttpClient = new(new HttpClientHandler {
CookieContainer = CookieContainer,
UseCookies = true,
}) {
BaseAddress = new Uri(BaseUrl),
DefaultRequestVersion = new Version(2, 0),
};
private readonly CookieContainer cookieContainer = new();
private readonly HttpClient httpClient;
public TrainScraper(HttpClientHandler? httpClientHandler = null)
{
if (httpClientHandler == null) {
httpClientHandler = new HttpClientHandler {
CookieContainer = cookieContainer,
UseCookies = true,
};
}
else {
httpClientHandler.CookieContainer = cookieContainer;
httpClientHandler.UseCookies = true;
}
httpClient = new HttpClient(httpClientHandler) {
BaseAddress = new Uri(BaseUrl),
DefaultRequestVersion = new Version(2, 0),
};
}
public static async Task<ITrainScrapeResult?> Scrape(string trainNumber, DateTimeOffset? dateOverride = null) {
public async Task<ITrainScrapeResult?> Scrape(string trainNumber, DateTimeOffset? dateOverride = null) {
var dateOverrideInstant = dateOverride?.ToInstant().InZone(BucharestTz);
dateOverride = dateOverrideInstant?.ToDateTimeOffset();
TrainScrapeResult result = new();
@@ -73,7 +85,7 @@ namespace InfoferScraper.Scrapers {
if (dateOverride != null) {
firstUrl = firstUrl.SetQueryParam("Date", $"{dateOverride:d.MM.yyyy}");
}
var firstResponse = await HttpClient.GetStringAsync(firstUrl);
var firstResponse = await httpClient.GetStringAsync(firstUrl);
var firstDocument = await asContext.OpenAsync(req => req.Content(firstResponse));
var firstForm = firstDocument.GetElementById("form-search")!;
@@ -83,7 +95,7 @@ namespace InfoferScraper.Scrapers {
.ToDictionary(elem => elem.Name!, elem => elem.Value);
var secondUrl = "".AppendPathSegments("Trains", "TrainsResult");
var secondResponse = await HttpClient.PostAsync(
var secondResponse = await httpClient.PostAsync(
secondUrl,
#pragma warning disable CS8620
new FormUrlEncodedContent(firstResult)

9
server/Models/ProxySettings.cs

@@ -0,0 +1,9 @@
namespace Server.Models;
public record ProxySettings(string Url, ProxyCredentials? Credentials = null) {
public ProxySettings() : this("") { }
}
public record ProxyCredentials(string Username, string Password) {
public ProxyCredentials() : this("", "") { }
}

22
server/Services/Implementations/DataManager.cs

@@ -1,12 +1,15 @@
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Net;
using System.Net.Http;
using System.Threading.Tasks;
using InfoferScraper;
using InfoferScraper.Models.Station;
using InfoferScraper.Models.Train;
using Microsoft.Extensions.Logging;
using scraper.Models.Itinerary;
using Server.Models;
using Server.Services.Interfaces;
using Server.Utils;
@@ -18,17 +21,26 @@ namespace Server.Services.Implementations {
private NodaTime.IDateTimeZoneProvider TzProvider { get; }
private NodaTime.DateTimeZone CfrTimeZone => TzProvider["Europe/Bucharest"];
public DataManager(NodaTime.IDateTimeZoneProvider tzProvider, IDatabase database, ILogger<DataManager> logger) {
public DataManager(NodaTime.IDateTimeZoneProvider tzProvider, IDatabase database, ILogger<DataManager> logger, ProxySettings? proxySettings) {
this.TzProvider = tzProvider;
this.Database = database;
this.Logger = logger;
HttpClientHandler httpClientHandler = new (){
UseProxy = proxySettings != null,
Proxy = proxySettings == null ? null : new WebProxy(proxySettings.Url),
DefaultProxyCredentials = proxySettings?.Credentials == null ? null : new NetworkCredential(proxySettings.Credentials.Username, proxySettings.Credentials.Password),
};
InfoferScraper.Scrapers.StationScraper stationScraper = new(httpClientHandler);
InfoferScraper.Scrapers.TrainScraper trainScraper = new(httpClientHandler);
InfoferScraper.Scrapers.RouteScraper routeScraper = new(httpClientHandler);
stationCache = new(async (t) => {
var (stationName, date) = t;
Logger.LogDebug("Fetching station {StationName} for date {Date}", stationName, date);
var zonedDate = new NodaTime.LocalDate(date.Year, date.Month, date.Day).AtStartOfDayInZone(CfrTimeZone);
var station = await InfoferScraper.Scrapers.StationScraper.Scrape(stationName, zonedDate.ToDateTimeOffset());
var station = await stationScraper.Scrape(stationName, zonedDate.ToDateTimeOffset());
if (station != null) {
_ = Task.Run(async () => {
var watch = Stopwatch.StartNew();
@@ -44,7 +56,7 @@ namespace Server.Services.Implementations {
Logger.LogDebug("Fetching train {TrainNumber} for date {Date}", trainNumber, date);
var zonedDate = new NodaTime.LocalDate(date.Year, date.Month, date.Day).AtStartOfDayInZone(CfrTimeZone);
var train = await InfoferScraper.Scrapers.TrainScraper.Scrape(trainNumber, zonedDate.ToDateTimeOffset());
var train = await trainScraper.Scrape(trainNumber, zonedDate.ToDateTimeOffset());
if (train != null) {
_ = Task.Run(async () => {
var watch = Stopwatch.StartNew();
@@ -60,7 +72,7 @@ namespace Server.Services.Implementations {
Logger.LogDebug("Fetching itinerary from {From} to {To} for date {Date}", from, to, date);
var zonedDate = new NodaTime.LocalDate(date.Year, date.Month, date.Day).AtStartOfDayInZone(CfrTimeZone);
var itineraries = await InfoferScraper.Scrapers.RouteScraper.Scrape(from, to, zonedDate.ToDateTimeOffset());
var itineraries = await routeScraper.Scrape(from, to, zonedDate.ToDateTimeOffset());
if (itineraries != null) {
_ = Task.Run(async () => {
var watch = Stopwatch.StartNew();
@@ -99,4 +111,4 @@ namespace Server.Services.Implementations {
return await itinerariesCache.GetItem((from, to, cfrDate));
}
}
}
}

5
server/Startup.cs

@@ -10,6 +10,7 @@ using Microsoft.Extensions.Hosting;
using Microsoft.OpenApi.Models;
using MongoDB.Bson.Serialization.Conventions;
using Newtonsoft.Json.Serialization;
using Server.Models;
using Server.Models.Database;
using Server.Services.Implementations;
using Server.Services.Interfaces;
@@ -30,12 +31,14 @@ namespace Server {
});
}
services.Configure<ProxySettings>(Configuration.GetSection("Proxy"));
services.Configure<MongoSettings>(Configuration.GetSection("TrainDataMongo"));
var conventionPack = new ConventionPack { new CamelCaseElementNameConvention() };
ConventionRegistry.Register("camelCase", conventionPack, _ => true);
services.AddSingleton<IDataManager, DataManager>();
services.AddSingleton<IDatabase, Database>();
services.AddSingleton<NodaTime.IDateTimeZoneProvider>(NodaTime.DateTimeZoneProviders.Tzdb);
services.AddSingleton(NodaTime.DateTimeZoneProviders.Tzdb);
services.AddControllers()
.AddNewtonsoftJson(options => {
options.SerializerSettings.ContractResolver = new DefaultContractResolver {

7
server/Utils/Constants.cs

@@ -0,0 +1,7 @@
using NodaTime;
namespace Server.Utils;
public static class Constants {
public static readonly DateTimeZone BucharestTz = DateTimeZoneProviders.Tzdb["Europe/Bucharest"];
}

2
server/server.csproj

@@ -5,7 +5,7 @@
<AssemblyName>Server</AssemblyName>
<RootNamespace>Server</RootNamespace>
<LangVersion>11</LangVersion>
<TargetFrameworks>net6.0;net7.0</TargetFrameworks>
<TargetFrameworks>net6.0;net7.0;net8.0</TargetFrameworks>
</PropertyGroup>
<ItemGroup>

Loading…
Cancel
Save