Kenneth Bruen
4 weeks ago
3 changed files with 249 additions and 228 deletions
@ -1,228 +1,235 @@ |
|||||||
using System; |
using System; |
||||||
using System.Collections.Generic; |
using System.Collections.Generic; |
||||||
using System.Linq; |
using System.Linq; |
||||||
using System.Net; |
using System.Net; |
||||||
using System.Net.Http; |
using System.Net.Http; |
||||||
using System.Text.RegularExpressions; |
using System.Net.Http.Headers; |
||||||
using System.Threading.Tasks; |
using System.Text.RegularExpressions; |
||||||
using AngleSharp; |
using System.Threading.Tasks; |
||||||
using AngleSharp.Dom; |
using AngleSharp; |
||||||
using AngleSharp.Html.Dom; |
using AngleSharp.Dom; |
||||||
using Flurl; |
using AngleSharp.Html.Dom; |
||||||
using InfoferScraper.Models.Station; |
using Flurl; |
||||||
using NodaTime; |
using InfoferScraper.Models.Station; |
||||||
using NodaTime.Extensions; |
using NodaTime; |
||||||
|
using NodaTime.Extensions; |
||||||
namespace InfoferScraper.Scrapers { |
|
||||||
public class StationScraper { |
namespace InfoferScraper.Scrapers { |
||||||
private static readonly Regex StationInfoRegex = new($@"^([{Utils.RoLetters}.0-9 ]+)\sîn\s([0-9.]+)$"); |
public class StationScraper { |
||||||
|
private static readonly Regex StationInfoRegex = new($@"^([{Utils.RoLetters}.0-9 ]+)\sîn\s([0-9.]+)$"); |
||||||
private static readonly Regex StoppingTimeRegex = new( |
|
||||||
@"^(necunoscută \(stație terminus\))|(?:([0-9]+) (min|sec) \((?:începând cu|până la) ([0-9]{1,2}:[0-9]{2})\))$" |
private static readonly Regex StoppingTimeRegex = new( |
||||||
); |
@"^(necunoscută \(stație terminus\))|(?:([0-9]+) (min|sec) \((?:începând cu|până la) ([0-9]{1,2}:[0-9]{2})\))$" |
||||||
|
); |
||||||
private static readonly Regex StatusRegex = new( |
|
||||||
@"^(?:la timp|([+-]?[0-9]+) min \((?:întârziere|mai devreme)\))(\*?)$" |
private static readonly Regex StatusRegex = new( |
||||||
); |
@"^(?:la timp|([+-]?[0-9]+) min \((?:întârziere|mai devreme)\))(\*?)$" |
||||||
|
); |
||||||
private static readonly Regex PlatformRegex = new(@"^linia\s([A-Za-z0-9]+)$"); |
|
||||||
|
private static readonly Regex PlatformRegex = new(@"^linia\s([A-Za-z0-9]+)$"); |
||||||
private static readonly Regex TrainUrlDateRegex = new(@"Date=([0-9]{2}).([0-9]{2}).([0-9]{4})"); |
|
||||||
|
private static readonly Regex TrainUrlDateRegex = new(@"Date=([0-9]{2}).([0-9]{2}).([0-9]{4})"); |
||||||
private static readonly DateTimeZone BucharestTz = DateTimeZoneProviders.Tzdb["Europe/Bucharest"]; |
|
||||||
|
private static readonly DateTimeZone BucharestTz = DateTimeZoneProviders.Tzdb["Europe/Bucharest"]; |
||||||
private const string BaseUrl = "https://mersultrenurilor.infofer.ro/ro-RO/"; |
|
||||||
|
private const string BaseUrl = "https://mersultrenurilor.infofer.ro/ro-RO/"; |
||||||
private readonly CookieContainer cookieContainer = new(); |
|
||||||
|
private readonly CookieContainer cookieContainer = new(); |
||||||
private readonly HttpClient httpClient; |
|
||||||
|
private readonly HttpClient httpClient; |
||||||
public StationScraper(HttpClientHandler? httpClientHandler = null) { |
|
||||||
if (httpClientHandler == null) { |
public StationScraper(HttpClientHandler? httpClientHandler = null) { |
||||||
httpClientHandler = new HttpClientHandler { |
if (httpClientHandler == null) { |
||||||
CookieContainer = cookieContainer, |
httpClientHandler = new HttpClientHandler { |
||||||
UseCookies = true, |
CookieContainer = cookieContainer, |
||||||
}; |
UseCookies = true, |
||||||
} |
}; |
||||||
else { |
} |
||||||
httpClientHandler.CookieContainer = cookieContainer; |
else { |
||||||
httpClientHandler.UseCookies = true; |
httpClientHandler.CookieContainer = cookieContainer; |
||||||
} |
httpClientHandler.UseCookies = true; |
||||||
httpClient = new HttpClient(httpClientHandler) { |
} |
||||||
BaseAddress = new Uri(BaseUrl), |
httpClient = new HttpClient(httpClientHandler) { |
||||||
DefaultRequestVersion = new Version(2, 0), |
BaseAddress = new Uri(BaseUrl), |
||||||
}; |
DefaultRequestVersion = new Version(2, 0), |
||||||
} |
DefaultRequestHeaders = { |
||||||
|
UserAgent = { |
||||||
public async Task<IStationScrapeResult> Scrape(string stationName, DateTimeOffset? date = null) { |
new ProductInfoHeaderValue("new-infofer-scraper", "0.0.1"), |
||||||
var dateInstant = date?.ToInstant().InZone(BucharestTz); |
new ProductInfoHeaderValue("(developed by DC Dev as a hobby, get in touch at webmaster<at>dcdev.ro for any issues)"), |
||||||
date = dateInstant?.ToDateTimeOffset(); |
}, |
||||||
|
}, |
||||||
stationName = stationName.RoLettersToEn(); |
}; |
||||||
|
} |
||||||
var result = new StationScrapeResult(); |
|
||||||
|
public async Task<IStationScrapeResult> Scrape(string stationName, DateTimeOffset? date = null) { |
||||||
var asConfig = Configuration.Default; |
var dateInstant = date?.ToInstant().InZone(BucharestTz); |
||||||
var asContext = BrowsingContext.New(asConfig); |
date = dateInstant?.ToDateTimeOffset(); |
||||||
|
|
||||||
var firstUrl = "Statie" |
stationName = stationName.RoLettersToEn(); |
||||||
.AppendPathSegment(Regex.Replace(stationName, @"\s", "-")); |
|
||||||
if (date != null) { |
var result = new StationScrapeResult(); |
||||||
firstUrl = firstUrl.SetQueryParam("Date", $"{date:d.MM.yyyy}"); |
|
||||||
} |
var asConfig = Configuration.Default; |
||||||
var firstResponse = await httpClient.GetStringAsync(firstUrl); |
var asContext = BrowsingContext.New(asConfig); |
||||||
var firstDocument = await asContext.OpenAsync(req => req.Content(firstResponse)); |
|
||||||
var firstForm = firstDocument.GetElementById("form-search")!; |
var firstUrl = "Statie" |
||||||
|
.AppendPathSegment(Regex.Replace(stationName, @"\s", "-")); |
||||||
var firstResult = firstForm |
if (date != null) { |
||||||
.QuerySelectorAll<IHtmlInputElement>("input") |
firstUrl = firstUrl.SetQueryParam("Date", $"{date:d.MM.yyyy}"); |
||||||
.Where(elem => elem.Name != null) |
} |
||||||
.ToDictionary(elem => elem.Name!, elem => elem.Value); |
var firstResponse = await httpClient.GetStringAsync(firstUrl); |
||||||
|
var firstDocument = await asContext.OpenAsync(req => req.Content(firstResponse)); |
||||||
var secondUrl = "".AppendPathSegments("Stations", "StationsResult"); |
var firstForm = firstDocument.GetElementById("form-search")!; |
||||||
var secondResponse = await httpClient.PostAsync( |
|
||||||
secondUrl, |
var firstResult = firstForm |
||||||
#pragma warning disable CS8620 |
.QuerySelectorAll<IHtmlInputElement>("input") |
||||||
new FormUrlEncodedContent(firstResult) |
.Where(elem => elem.Name != null) |
||||||
#pragma warning restore CS8620 |
.ToDictionary(elem => elem.Name!, elem => elem.Value); |
||||||
); |
|
||||||
var secondResponseContent = await secondResponse.Content.ReadAsStringAsync(); |
var secondUrl = "".AppendPathSegments("Stations", "StationsResult"); |
||||||
var secondDocument = await asContext.OpenAsync( |
var secondResponse = await httpClient.PostAsync( |
||||||
req => req.Content(secondResponseContent) |
secondUrl, |
||||||
); |
#pragma warning disable CS8620 |
||||||
|
new FormUrlEncodedContent(firstResult) |
||||||
var (stationInfoDiv, (_, (departuresDiv, (arrivalsDiv, _)))) = secondDocument |
#pragma warning restore CS8620 |
||||||
.QuerySelectorAll("body > div"); |
); |
||||||
|
var secondResponseContent = await secondResponse.Content.ReadAsStringAsync(); |
||||||
(result.StationName, (result.Date, _)) = (StationInfoRegex.Match( |
var secondDocument = await asContext.OpenAsync( |
||||||
stationInfoDiv |
req => req.Content(secondResponseContent) |
||||||
.QuerySelector(":scope > h2")! |
); |
||||||
.Text() |
|
||||||
.WithCollapsedSpaces() |
var (stationInfoDiv, (_, (departuresDiv, (arrivalsDiv, _)))) = secondDocument |
||||||
).Groups as IEnumerable<Group>).Skip(1).Select(group => group.Value); |
.QuerySelectorAll("body > div"); |
||||||
|
|
||||||
var (dateDay, (dateMonth, (dateYear, _))) = result.Date.Split('.').Select(int.Parse); |
(result.StationName, (result.Date, _)) = (StationInfoRegex.Match( |
||||||
|
stationInfoDiv |
||||||
void ParseArrDepList(IElement element, Action<Action<StationArrDep>> adder) { |
.QuerySelector(":scope > h2")! |
||||||
Utils.DateTimeSequencer dtSeq = new(dateYear, dateMonth, dateDay); |
.Text() |
||||||
|
.WithCollapsedSpaces() |
||||||
if (element.QuerySelector(":scope > div > ul") == null) return; |
).Groups as IEnumerable<Group>).Skip(1).Select(group => group.Value); |
||||||
|
|
||||||
foreach (var trainElement in element.QuerySelectorAll(":scope > div > ul > li")) { |
var (dateDay, (dateMonth, (dateYear, _))) = result.Date.Split('.').Select(int.Parse); |
||||||
adder(arrDep => { |
|
||||||
var divs = trainElement.QuerySelectorAll(":scope > div"); |
void ParseArrDepList(IElement element, Action<Action<StationArrDep>> adder) { |
||||||
var dataDiv = divs[0]; |
Utils.DateTimeSequencer dtSeq = new(dateYear, dateMonth, dateDay); |
||||||
var statusDiv = divs.Length >= 2 ? divs[1] : null; |
|
||||||
|
if (element.QuerySelector(":scope > div > ul") == null) return; |
||||||
var (dataMainDiv, (dataDetailsDiv, _)) = dataDiv |
|
||||||
.QuerySelectorAll(":scope > div"); |
foreach (var trainElement in element.QuerySelectorAll(":scope > div > ul > li")) { |
||||||
var (timeDiv, (destDiv, (trainDiv, _))) = dataMainDiv |
adder(arrDep => { |
||||||
.QuerySelectorAll(":scope > div"); |
var divs = trainElement.QuerySelectorAll(":scope > div"); |
||||||
var (operatorDiv, (routeDiv, (stoppingTimeDiv, _))) = dataDetailsDiv |
var dataDiv = divs[0]; |
||||||
.QuerySelectorAll(":scope > div > div"); |
var statusDiv = divs.Length >= 2 ? divs[1] : null; |
||||||
|
|
||||||
var timeResult = timeDiv |
var (dataMainDiv, (dataDetailsDiv, _)) = dataDiv |
||||||
.QuerySelectorAll(":scope > div > div > div")[1] |
.QuerySelectorAll(":scope > div"); |
||||||
.Text() |
var (timeDiv, (destDiv, (trainDiv, _))) = dataMainDiv |
||||||
.WithCollapsedSpaces(); |
.QuerySelectorAll(":scope > div"); |
||||||
var (stHr, (stMin, _)) = timeResult.Split(':').Select(int.Parse); |
var (operatorDiv, (routeDiv, (stoppingTimeDiv, _))) = dataDetailsDiv |
||||||
arrDep.Time = BucharestTz.AtLeniently( |
.QuerySelectorAll(":scope > div > div"); |
||||||
dtSeq.Next(stHr, stMin).ToLocalDateTime() |
|
||||||
).ToDateTimeOffset(); |
var timeResult = timeDiv |
||||||
|
.QuerySelectorAll(":scope > div > div > div")[1] |
||||||
// ReSharper disable once UnusedVariable // stOppositeTime: might be useful in the future |
.Text() |
||||||
var (unknownSt, (st, (minsec, (stOppositeTime, _)))) = (StoppingTimeRegex.Match( |
.WithCollapsedSpaces(); |
||||||
stoppingTimeDiv.QuerySelectorAll(":scope > div > div")[1] |
var (stHr, (stMin, _)) = timeResult.Split(':').Select(int.Parse); |
||||||
.Text() |
arrDep.Time = BucharestTz.AtLeniently( |
||||||
.WithCollapsedSpaces() |
dtSeq.Next(stHr, stMin).ToLocalDateTime() |
||||||
).Groups as IEnumerable<Group>).Skip(1).Select(group => group.Value); |
).ToDateTimeOffset(); |
||||||
if (unknownSt.Length == 0 && st.Length > 0) { |
|
||||||
arrDep.StoppingTime = int.Parse(st); |
// ReSharper disable once UnusedVariable // stOppositeTime: might be useful in the future |
||||||
if (minsec == "min") { |
var (unknownSt, (st, (minsec, (stOppositeTime, _)))) = (StoppingTimeRegex.Match( |
||||||
arrDep.StoppingTime *= 60; |
stoppingTimeDiv.QuerySelectorAll(":scope > div > div")[1] |
||||||
} |
.Text() |
||||||
} |
.WithCollapsedSpaces() |
||||||
|
).Groups as IEnumerable<Group>).Skip(1).Select(group => group.Value); |
||||||
arrDep.ModifyableTrain.Rank = trainDiv |
if (unknownSt.Length == 0 && st.Length > 0) { |
||||||
.QuerySelectorAll(":scope > div > div > div")[1] |
arrDep.StoppingTime = int.Parse(st); |
||||||
.QuerySelector(":scope > span")! |
if (minsec == "min") { |
||||||
.Text() |
arrDep.StoppingTime *= 60; |
||||||
.WithCollapsedSpaces(); |
} |
||||||
arrDep.ModifyableTrain.Number = trainDiv |
} |
||||||
.QuerySelectorAll(":scope > div > div > div")[1] |
|
||||||
.QuerySelector(":scope > a")! |
arrDep.ModifyableTrain.Rank = trainDiv |
||||||
.Text() |
.QuerySelectorAll(":scope > div > div > div")[1] |
||||||
.WithCollapsedSpaces(); |
.QuerySelector(":scope > span")! |
||||||
var trainUri = new Uri( |
.Text() |
||||||
"http://localhost" + trainDiv |
.WithCollapsedSpaces(); |
||||||
.QuerySelectorAll(":scope > div > div > div")[1] |
arrDep.ModifyableTrain.Number = trainDiv |
||||||
.QuerySelector(":scope > a")! |
.QuerySelectorAll(":scope > div > div > div")[1] |
||||||
.GetAttribute("href")! |
.QuerySelector(":scope > a")! |
||||||
); |
.Text() |
||||||
var (trainDepDay, (trainDepMonth, (trainDepYear, _))) = TrainUrlDateRegex |
.WithCollapsedSpaces(); |
||||||
.Match(trainUri.Query) |
var trainUri = new Uri( |
||||||
.Groups |
"http://localhost" + trainDiv |
||||||
.Values |
.QuerySelectorAll(":scope > div > div > div")[1] |
||||||
.Skip(1) |
.QuerySelector(":scope > a")! |
||||||
.Select(g => int.Parse(g.Value)); |
.GetAttribute("href")! |
||||||
arrDep.ModifyableTrain.DepartureDate = BucharestTz |
); |
||||||
.AtStartOfDay(new(trainDepYear, trainDepMonth, trainDepDay)) |
var (trainDepDay, (trainDepMonth, (trainDepYear, _))) = TrainUrlDateRegex |
||||||
.ToDateTimeOffset() |
.Match(trainUri.Query) |
||||||
.ToUniversalTime(); |
.Groups |
||||||
arrDep.ModifyableTrain.Terminus = destDiv |
.Values |
||||||
.QuerySelectorAll(":scope > div > div > div")[1] |
.Skip(1) |
||||||
.Text() |
.Select(g => int.Parse(g.Value)); |
||||||
.WithCollapsedSpaces(); |
arrDep.ModifyableTrain.DepartureDate = BucharestTz |
||||||
arrDep.ModifyableTrain.Operator = operatorDiv |
.AtStartOfDay(new(trainDepYear, trainDepMonth, trainDepDay)) |
||||||
.QuerySelectorAll(":scope > div > div")[1] |
.ToDateTimeOffset() |
||||||
.Text() |
.ToUniversalTime(); |
||||||
.WithCollapsedSpaces(); |
arrDep.ModifyableTrain.Terminus = destDiv |
||||||
foreach (var station in routeDiv.QuerySelectorAll(":scope > div > div")[1] |
.QuerySelectorAll(":scope > div > div > div")[1] |
||||||
.Text() |
.Text() |
||||||
.WithCollapsedSpaces() |
.WithCollapsedSpaces(); |
||||||
.Split(" - ")) { |
arrDep.ModifyableTrain.Operator = operatorDiv |
||||||
arrDep.ModifyableTrain.AddRouteStation(station); |
.QuerySelectorAll(":scope > div > div")[1] |
||||||
} |
.Text() |
||||||
|
.WithCollapsedSpaces(); |
||||||
if (statusDiv == null) { |
foreach (var station in routeDiv.QuerySelectorAll(":scope > div > div")[1] |
||||||
return; |
.Text() |
||||||
} |
.WithCollapsedSpaces() |
||||||
|
.Split(" - ")) { |
||||||
var statusDivComponents = statusDiv |
arrDep.ModifyableTrain.AddRouteStation(station); |
||||||
.QuerySelectorAll(":scope > div")[0] |
} |
||||||
.QuerySelectorAll(":scope > div"); |
|
||||||
|
if (statusDiv == null) { |
||||||
var delayDiv = statusDivComponents[0]; |
return; |
||||||
|
} |
||||||
var (delayMin, (approx, _)) = (StatusRegex.Match( |
|
||||||
delayDiv |
var statusDivComponents = statusDiv |
||||||
.Text() |
.QuerySelectorAll(":scope > div")[0] |
||||||
.WithCollapsedSpaces() |
.QuerySelectorAll(":scope > div"); |
||||||
).Groups as IEnumerable<Group>).Skip(1).Select(group => group.Value); |
|
||||||
if (delayMin is null && delayDiv.Text().WithCollapsedSpaces() == "anulat") { |
var delayDiv = statusDivComponents[0]; |
||||||
arrDep.ModifyableStatus.Cancelled = true; |
|
||||||
} |
var (delayMin, (approx, _)) = (StatusRegex.Match( |
||||||
else if (delayMin is null) { |
delayDiv |
||||||
throw new Exception($"Unexpected delayDiv value: {delayDiv.Text().WithCollapsedSpaces()}"); |
.Text() |
||||||
} |
.WithCollapsedSpaces() |
||||||
else { |
).Groups as IEnumerable<Group>).Skip(1).Select(group => group.Value); |
||||||
arrDep.ModifyableStatus.Real = string.IsNullOrEmpty(approx); |
if (delayMin is null && delayDiv.Text().WithCollapsedSpaces() == "anulat") { |
||||||
arrDep.ModifyableStatus.Delay = delayMin.Length == 0 ? 0 : int.Parse(delayMin); |
arrDep.ModifyableStatus.Cancelled = true; |
||||||
} |
} |
||||||
|
else if (delayMin is null) { |
||||||
if (statusDivComponents.Length < 2) return; |
throw new Exception($"Unexpected delayDiv value: {delayDiv.Text().WithCollapsedSpaces()}"); |
||||||
|
} |
||||||
var platformDiv = statusDivComponents[1]; |
else { |
||||||
arrDep.ModifyableStatus.Platform = PlatformRegex.Match(platformDiv.Text().WithCollapsedSpaces()) |
arrDep.ModifyableStatus.Real = string.IsNullOrEmpty(approx); |
||||||
.Groups[1].Value; |
arrDep.ModifyableStatus.Delay = delayMin.Length == 0 ? 0 : int.Parse(delayMin); |
||||||
}); |
} |
||||||
} |
|
||||||
} |
if (statusDivComponents.Length < 2) return; |
||||||
|
|
||||||
ParseArrDepList(departuresDiv, result.AddNewStationDeparture); |
var platformDiv = statusDivComponents[1]; |
||||||
ParseArrDepList(arrivalsDiv, result.AddNewStationArrival); |
arrDep.ModifyableStatus.Platform = PlatformRegex.Match(platformDiv.Text().WithCollapsedSpaces()) |
||||||
|
.Groups[1].Value; |
||||||
return result; |
}); |
||||||
} |
} |
||||||
} |
} |
||||||
} |
|
||||||
|
ParseArrDepList(departuresDiv, result.AddNewStationDeparture); |
||||||
|
ParseArrDepList(arrivalsDiv, result.AddNewStationArrival); |
||||||
|
|
||||||
|
return result; |
||||||
|
} |
||||||
|
} |
||||||
|
} |
||||||
|
Loading…
Reference in new issue