Browse Source

Add UserAgent

master
Kenneth Bruen 4 weeks ago
parent
commit
6c6d69ee26
Signed by: kbruen
GPG Key ID: C1980A470C3EE5B1
  1. 7
      scraper/src/Scrapers/Route.cs
  2. 463
      scraper/src/Scrapers/Station.cs
  3. 7
      scraper/src/Scrapers/Train.cs

7
scraper/src/Scrapers/Route.cs

@ -3,6 +3,7 @@ using System.Collections.Generic;
using System.Linq; using System.Linq;
using System.Net; using System.Net;
using System.Net.Http; using System.Net.Http;
using System.Net.Http.Headers;
using System.Text.RegularExpressions; using System.Text.RegularExpressions;
using System.Threading.Tasks; using System.Threading.Tasks;
using AngleSharp; using AngleSharp;
@ -57,6 +58,12 @@ public class RouteScraper {
httpClient = new HttpClient(httpClientHandler) { httpClient = new HttpClient(httpClientHandler) {
BaseAddress = new Uri(BaseUrl), BaseAddress = new Uri(BaseUrl),
DefaultRequestVersion = new Version(2, 0), DefaultRequestVersion = new Version(2, 0),
DefaultRequestHeaders = {
UserAgent = {
new ProductInfoHeaderValue("new-infofer-scraper", "0.0.1"),
new ProductInfoHeaderValue("(developed by DC Dev as a hobby, get in touch at webmaster<at>dcdev.ro for any issues)"),
},
},
}; };
} }

463
scraper/src/Scrapers/Station.cs

@ -1,228 +1,235 @@
using System; using System;
using System.Collections.Generic; using System.Collections.Generic;
using System.Linq; using System.Linq;
using System.Net; using System.Net;
using System.Net.Http; using System.Net.Http;
using System.Text.RegularExpressions; using System.Net.Http.Headers;
using System.Threading.Tasks; using System.Text.RegularExpressions;
using AngleSharp; using System.Threading.Tasks;
using AngleSharp.Dom; using AngleSharp;
using AngleSharp.Html.Dom; using AngleSharp.Dom;
using Flurl; using AngleSharp.Html.Dom;
using InfoferScraper.Models.Station; using Flurl;
using NodaTime; using InfoferScraper.Models.Station;
using NodaTime.Extensions; using NodaTime;
using NodaTime.Extensions;
namespace InfoferScraper.Scrapers {
public class StationScraper { namespace InfoferScraper.Scrapers {
private static readonly Regex StationInfoRegex = new($@"^([{Utils.RoLetters}.0-9 ]+)\sîn\s([0-9.]+)$"); public class StationScraper {
private static readonly Regex StationInfoRegex = new($@"^([{Utils.RoLetters}.0-9 ]+)\sîn\s([0-9.]+)$");
private static readonly Regex StoppingTimeRegex = new(
@"^(necunoscută \(stație terminus\))|(?:([0-9]+) (min|sec) \((?:începând cu|până la) ([0-9]{1,2}:[0-9]{2})\))$" private static readonly Regex StoppingTimeRegex = new(
); @"^(necunoscută \(stație terminus\))|(?:([0-9]+) (min|sec) \((?:începând cu|până la) ([0-9]{1,2}:[0-9]{2})\))$"
);
private static readonly Regex StatusRegex = new(
@"^(?:la timp|([+-]?[0-9]+) min \((?:întârziere|mai devreme)\))(\*?)$" private static readonly Regex StatusRegex = new(
); @"^(?:la timp|([+-]?[0-9]+) min \((?:întârziere|mai devreme)\))(\*?)$"
);
private static readonly Regex PlatformRegex = new(@"^linia\s([A-Za-z0-9]+)$");
private static readonly Regex PlatformRegex = new(@"^linia\s([A-Za-z0-9]+)$");
private static readonly Regex TrainUrlDateRegex = new(@"Date=([0-9]{2}).([0-9]{2}).([0-9]{4})");
private static readonly Regex TrainUrlDateRegex = new(@"Date=([0-9]{2}).([0-9]{2}).([0-9]{4})");
private static readonly DateTimeZone BucharestTz = DateTimeZoneProviders.Tzdb["Europe/Bucharest"];
private static readonly DateTimeZone BucharestTz = DateTimeZoneProviders.Tzdb["Europe/Bucharest"];
private const string BaseUrl = "https://mersultrenurilor.infofer.ro/ro-RO/";
private const string BaseUrl = "https://mersultrenurilor.infofer.ro/ro-RO/";
private readonly CookieContainer cookieContainer = new();
private readonly CookieContainer cookieContainer = new();
private readonly HttpClient httpClient;
private readonly HttpClient httpClient;
public StationScraper(HttpClientHandler? httpClientHandler = null) {
if (httpClientHandler == null) { public StationScraper(HttpClientHandler? httpClientHandler = null) {
httpClientHandler = new HttpClientHandler { if (httpClientHandler == null) {
CookieContainer = cookieContainer, httpClientHandler = new HttpClientHandler {
UseCookies = true, CookieContainer = cookieContainer,
}; UseCookies = true,
} };
else { }
httpClientHandler.CookieContainer = cookieContainer; else {
httpClientHandler.UseCookies = true; httpClientHandler.CookieContainer = cookieContainer;
} httpClientHandler.UseCookies = true;
httpClient = new HttpClient(httpClientHandler) { }
BaseAddress = new Uri(BaseUrl), httpClient = new HttpClient(httpClientHandler) {
DefaultRequestVersion = new Version(2, 0), BaseAddress = new Uri(BaseUrl),
}; DefaultRequestVersion = new Version(2, 0),
} DefaultRequestHeaders = {
UserAgent = {
public async Task<IStationScrapeResult> Scrape(string stationName, DateTimeOffset? date = null) { new ProductInfoHeaderValue("new-infofer-scraper", "0.0.1"),
var dateInstant = date?.ToInstant().InZone(BucharestTz); new ProductInfoHeaderValue("(developed by DC Dev as a hobby, get in touch at webmaster<at>dcdev.ro for any issues)"),
date = dateInstant?.ToDateTimeOffset(); },
},
stationName = stationName.RoLettersToEn(); };
}
var result = new StationScrapeResult();
public async Task<IStationScrapeResult> Scrape(string stationName, DateTimeOffset? date = null) {
var asConfig = Configuration.Default; var dateInstant = date?.ToInstant().InZone(BucharestTz);
var asContext = BrowsingContext.New(asConfig); date = dateInstant?.ToDateTimeOffset();
var firstUrl = "Statie" stationName = stationName.RoLettersToEn();
.AppendPathSegment(Regex.Replace(stationName, @"\s", "-"));
if (date != null) { var result = new StationScrapeResult();
firstUrl = firstUrl.SetQueryParam("Date", $"{date:d.MM.yyyy}");
} var asConfig = Configuration.Default;
var firstResponse = await httpClient.GetStringAsync(firstUrl); var asContext = BrowsingContext.New(asConfig);
var firstDocument = await asContext.OpenAsync(req => req.Content(firstResponse));
var firstForm = firstDocument.GetElementById("form-search")!; var firstUrl = "Statie"
.AppendPathSegment(Regex.Replace(stationName, @"\s", "-"));
var firstResult = firstForm if (date != null) {
.QuerySelectorAll<IHtmlInputElement>("input") firstUrl = firstUrl.SetQueryParam("Date", $"{date:d.MM.yyyy}");
.Where(elem => elem.Name != null) }
.ToDictionary(elem => elem.Name!, elem => elem.Value); var firstResponse = await httpClient.GetStringAsync(firstUrl);
var firstDocument = await asContext.OpenAsync(req => req.Content(firstResponse));
var secondUrl = "".AppendPathSegments("Stations", "StationsResult"); var firstForm = firstDocument.GetElementById("form-search")!;
var secondResponse = await httpClient.PostAsync(
secondUrl, var firstResult = firstForm
#pragma warning disable CS8620 .QuerySelectorAll<IHtmlInputElement>("input")
new FormUrlEncodedContent(firstResult) .Where(elem => elem.Name != null)
#pragma warning restore CS8620 .ToDictionary(elem => elem.Name!, elem => elem.Value);
);
var secondResponseContent = await secondResponse.Content.ReadAsStringAsync(); var secondUrl = "".AppendPathSegments("Stations", "StationsResult");
var secondDocument = await asContext.OpenAsync( var secondResponse = await httpClient.PostAsync(
req => req.Content(secondResponseContent) secondUrl,
); #pragma warning disable CS8620
new FormUrlEncodedContent(firstResult)
var (stationInfoDiv, (_, (departuresDiv, (arrivalsDiv, _)))) = secondDocument #pragma warning restore CS8620
.QuerySelectorAll("body > div"); );
var secondResponseContent = await secondResponse.Content.ReadAsStringAsync();
(result.StationName, (result.Date, _)) = (StationInfoRegex.Match( var secondDocument = await asContext.OpenAsync(
stationInfoDiv req => req.Content(secondResponseContent)
.QuerySelector(":scope > h2")! );
.Text()
.WithCollapsedSpaces() var (stationInfoDiv, (_, (departuresDiv, (arrivalsDiv, _)))) = secondDocument
).Groups as IEnumerable<Group>).Skip(1).Select(group => group.Value); .QuerySelectorAll("body > div");
var (dateDay, (dateMonth, (dateYear, _))) = result.Date.Split('.').Select(int.Parse); (result.StationName, (result.Date, _)) = (StationInfoRegex.Match(
stationInfoDiv
void ParseArrDepList(IElement element, Action<Action<StationArrDep>> adder) { .QuerySelector(":scope > h2")!
Utils.DateTimeSequencer dtSeq = new(dateYear, dateMonth, dateDay); .Text()
.WithCollapsedSpaces()
if (element.QuerySelector(":scope > div > ul") == null) return; ).Groups as IEnumerable<Group>).Skip(1).Select(group => group.Value);
foreach (var trainElement in element.QuerySelectorAll(":scope > div > ul > li")) { var (dateDay, (dateMonth, (dateYear, _))) = result.Date.Split('.').Select(int.Parse);
adder(arrDep => {
var divs = trainElement.QuerySelectorAll(":scope > div"); void ParseArrDepList(IElement element, Action<Action<StationArrDep>> adder) {
var dataDiv = divs[0]; Utils.DateTimeSequencer dtSeq = new(dateYear, dateMonth, dateDay);
var statusDiv = divs.Length >= 2 ? divs[1] : null;
if (element.QuerySelector(":scope > div > ul") == null) return;
var (dataMainDiv, (dataDetailsDiv, _)) = dataDiv
.QuerySelectorAll(":scope > div"); foreach (var trainElement in element.QuerySelectorAll(":scope > div > ul > li")) {
var (timeDiv, (destDiv, (trainDiv, _))) = dataMainDiv adder(arrDep => {
.QuerySelectorAll(":scope > div"); var divs = trainElement.QuerySelectorAll(":scope > div");
var (operatorDiv, (routeDiv, (stoppingTimeDiv, _))) = dataDetailsDiv var dataDiv = divs[0];
.QuerySelectorAll(":scope > div > div"); var statusDiv = divs.Length >= 2 ? divs[1] : null;
var timeResult = timeDiv var (dataMainDiv, (dataDetailsDiv, _)) = dataDiv
.QuerySelectorAll(":scope > div > div > div")[1] .QuerySelectorAll(":scope > div");
.Text() var (timeDiv, (destDiv, (trainDiv, _))) = dataMainDiv
.WithCollapsedSpaces(); .QuerySelectorAll(":scope > div");
var (stHr, (stMin, _)) = timeResult.Split(':').Select(int.Parse); var (operatorDiv, (routeDiv, (stoppingTimeDiv, _))) = dataDetailsDiv
arrDep.Time = BucharestTz.AtLeniently( .QuerySelectorAll(":scope > div > div");
dtSeq.Next(stHr, stMin).ToLocalDateTime()
).ToDateTimeOffset(); var timeResult = timeDiv
.QuerySelectorAll(":scope > div > div > div")[1]
// ReSharper disable once UnusedVariable // stOppositeTime: might be useful in the future .Text()
var (unknownSt, (st, (minsec, (stOppositeTime, _)))) = (StoppingTimeRegex.Match( .WithCollapsedSpaces();
stoppingTimeDiv.QuerySelectorAll(":scope > div > div")[1] var (stHr, (stMin, _)) = timeResult.Split(':').Select(int.Parse);
.Text() arrDep.Time = BucharestTz.AtLeniently(
.WithCollapsedSpaces() dtSeq.Next(stHr, stMin).ToLocalDateTime()
).Groups as IEnumerable<Group>).Skip(1).Select(group => group.Value); ).ToDateTimeOffset();
if (unknownSt.Length == 0 && st.Length > 0) {
arrDep.StoppingTime = int.Parse(st); // ReSharper disable once UnusedVariable // stOppositeTime: might be useful in the future
if (minsec == "min") { var (unknownSt, (st, (minsec, (stOppositeTime, _)))) = (StoppingTimeRegex.Match(
arrDep.StoppingTime *= 60; stoppingTimeDiv.QuerySelectorAll(":scope > div > div")[1]
} .Text()
} .WithCollapsedSpaces()
).Groups as IEnumerable<Group>).Skip(1).Select(group => group.Value);
arrDep.ModifyableTrain.Rank = trainDiv if (unknownSt.Length == 0 && st.Length > 0) {
.QuerySelectorAll(":scope > div > div > div")[1] arrDep.StoppingTime = int.Parse(st);
.QuerySelector(":scope > span")! if (minsec == "min") {
.Text() arrDep.StoppingTime *= 60;
.WithCollapsedSpaces(); }
arrDep.ModifyableTrain.Number = trainDiv }
.QuerySelectorAll(":scope > div > div > div")[1]
.QuerySelector(":scope > a")! arrDep.ModifyableTrain.Rank = trainDiv
.Text() .QuerySelectorAll(":scope > div > div > div")[1]
.WithCollapsedSpaces(); .QuerySelector(":scope > span")!
var trainUri = new Uri( .Text()
"http://localhost" + trainDiv .WithCollapsedSpaces();
.QuerySelectorAll(":scope > div > div > div")[1] arrDep.ModifyableTrain.Number = trainDiv
.QuerySelector(":scope > a")! .QuerySelectorAll(":scope > div > div > div")[1]
.GetAttribute("href")! .QuerySelector(":scope > a")!
); .Text()
var (trainDepDay, (trainDepMonth, (trainDepYear, _))) = TrainUrlDateRegex .WithCollapsedSpaces();
.Match(trainUri.Query) var trainUri = new Uri(
.Groups "http://localhost" + trainDiv
.Values .QuerySelectorAll(":scope > div > div > div")[1]
.Skip(1) .QuerySelector(":scope > a")!
.Select(g => int.Parse(g.Value)); .GetAttribute("href")!
arrDep.ModifyableTrain.DepartureDate = BucharestTz );
.AtStartOfDay(new(trainDepYear, trainDepMonth, trainDepDay)) var (trainDepDay, (trainDepMonth, (trainDepYear, _))) = TrainUrlDateRegex
.ToDateTimeOffset() .Match(trainUri.Query)
.ToUniversalTime(); .Groups
arrDep.ModifyableTrain.Terminus = destDiv .Values
.QuerySelectorAll(":scope > div > div > div")[1] .Skip(1)
.Text() .Select(g => int.Parse(g.Value));
.WithCollapsedSpaces(); arrDep.ModifyableTrain.DepartureDate = BucharestTz
arrDep.ModifyableTrain.Operator = operatorDiv .AtStartOfDay(new(trainDepYear, trainDepMonth, trainDepDay))
.QuerySelectorAll(":scope > div > div")[1] .ToDateTimeOffset()
.Text() .ToUniversalTime();
.WithCollapsedSpaces(); arrDep.ModifyableTrain.Terminus = destDiv
foreach (var station in routeDiv.QuerySelectorAll(":scope > div > div")[1] .QuerySelectorAll(":scope > div > div > div")[1]
.Text() .Text()
.WithCollapsedSpaces() .WithCollapsedSpaces();
.Split(" - ")) { arrDep.ModifyableTrain.Operator = operatorDiv
arrDep.ModifyableTrain.AddRouteStation(station); .QuerySelectorAll(":scope > div > div")[1]
} .Text()
.WithCollapsedSpaces();
if (statusDiv == null) { foreach (var station in routeDiv.QuerySelectorAll(":scope > div > div")[1]
return; .Text()
} .WithCollapsedSpaces()
.Split(" - ")) {
var statusDivComponents = statusDiv arrDep.ModifyableTrain.AddRouteStation(station);
.QuerySelectorAll(":scope > div")[0] }
.QuerySelectorAll(":scope > div");
if (statusDiv == null) {
var delayDiv = statusDivComponents[0]; return;
}
var (delayMin, (approx, _)) = (StatusRegex.Match(
delayDiv var statusDivComponents = statusDiv
.Text() .QuerySelectorAll(":scope > div")[0]
.WithCollapsedSpaces() .QuerySelectorAll(":scope > div");
).Groups as IEnumerable<Group>).Skip(1).Select(group => group.Value);
if (delayMin is null && delayDiv.Text().WithCollapsedSpaces() == "anulat") { var delayDiv = statusDivComponents[0];
arrDep.ModifyableStatus.Cancelled = true;
} var (delayMin, (approx, _)) = (StatusRegex.Match(
else if (delayMin is null) { delayDiv
throw new Exception($"Unexpected delayDiv value: {delayDiv.Text().WithCollapsedSpaces()}"); .Text()
} .WithCollapsedSpaces()
else { ).Groups as IEnumerable<Group>).Skip(1).Select(group => group.Value);
arrDep.ModifyableStatus.Real = string.IsNullOrEmpty(approx); if (delayMin is null && delayDiv.Text().WithCollapsedSpaces() == "anulat") {
arrDep.ModifyableStatus.Delay = delayMin.Length == 0 ? 0 : int.Parse(delayMin); arrDep.ModifyableStatus.Cancelled = true;
} }
else if (delayMin is null) {
if (statusDivComponents.Length < 2) return; throw new Exception($"Unexpected delayDiv value: {delayDiv.Text().WithCollapsedSpaces()}");
}
var platformDiv = statusDivComponents[1]; else {
arrDep.ModifyableStatus.Platform = PlatformRegex.Match(platformDiv.Text().WithCollapsedSpaces()) arrDep.ModifyableStatus.Real = string.IsNullOrEmpty(approx);
.Groups[1].Value; arrDep.ModifyableStatus.Delay = delayMin.Length == 0 ? 0 : int.Parse(delayMin);
}); }
}
} if (statusDivComponents.Length < 2) return;
ParseArrDepList(departuresDiv, result.AddNewStationDeparture); var platformDiv = statusDivComponents[1];
ParseArrDepList(arrivalsDiv, result.AddNewStationArrival); arrDep.ModifyableStatus.Platform = PlatformRegex.Match(platformDiv.Text().WithCollapsedSpaces())
.Groups[1].Value;
return result; });
} }
} }
}
ParseArrDepList(departuresDiv, result.AddNewStationDeparture);
ParseArrDepList(arrivalsDiv, result.AddNewStationArrival);
return result;
}
}
}

7
scraper/src/Scrapers/Train.cs

@ -3,6 +3,7 @@ using System.Collections.Generic;
using System.Linq; using System.Linq;
using System.Net; using System.Net;
using System.Net.Http; using System.Net.Http;
using System.Net.Http.Headers;
using System.Text.RegularExpressions; using System.Text.RegularExpressions;
using System.Threading.Tasks; using System.Threading.Tasks;
using AngleSharp; using AngleSharp;
@ -69,6 +70,12 @@ namespace InfoferScraper.Scrapers {
httpClient = new HttpClient(httpClientHandler) { httpClient = new HttpClient(httpClientHandler) {
BaseAddress = new Uri(BaseUrl), BaseAddress = new Uri(BaseUrl),
DefaultRequestVersion = new Version(2, 0), DefaultRequestVersion = new Version(2, 0),
DefaultRequestHeaders = {
UserAgent = {
new ProductInfoHeaderValue("new-infofer-scraper", "0.0.1"),
new ProductInfoHeaderValue("(developed by DC Dev as a hobby, get in touch at webmaster<at>dcdev.ro for any issues)"),
},
},
}; };
} }

Loading…
Cancel
Save