Kenneth Bruen
2 years ago
4 changed files with 302 additions and 1 deletions
@ -0,0 +1,62 @@
|
||||
using System; |
||||
using System.Collections.Generic; |
||||
|
||||
namespace scraper.Models.Itinerary; |
||||
|
||||
#region Interfaces |
||||
|
||||
/// <summary>
/// Read-only view of an itinerary: an ordered list of train legs.
/// </summary>
public interface IItinerary {
    /// <summary>The trains that make up this itinerary, in the order they were added.</summary>
    IReadOnlyList<IItineraryTrain> Trains { get; }
}
||||
|
||||
/// <summary>
/// Read-only view of a single train leg inside an <see cref="IItinerary"/>.
/// </summary>
public interface IItineraryTrain {
    /// <summary>Name of the station where this leg starts.</summary>
    string From { get; }

    /// <summary>Name of the station where this leg ends.</summary>
    string To { get; }

    /// <summary>Stops between <see cref="From"/> and <see cref="To"/>, in the order they were added.</summary>
    IReadOnlyList<string> IntermediateStops { get; }

    /// <summary>Departure moment of this leg.</summary>
    DateTimeOffset DepartureDate { get; }

    /// <summary>Arrival moment of this leg.</summary>
    DateTimeOffset ArrivalDate { get; }

    /// <summary>Length of this leg in kilometres.</summary>
    int Km { get; }

    /// <summary>Name of the company operating the train.</summary>
    string Operator { get; }

    /// <summary>Rank (category code) of the train.</summary>
    string TrainRank { get; }

    /// <summary>Number of the train, kept as text.</summary>
    string TrainNumber { get; }
}
||||
|
||||
#endregion |
||||
|
||||
#region Implementations |
||||
|
||||
/// <summary>
/// Assembly-internal <see cref="IItinerary"/> implementation whose train list
/// can be extended through the <c>AddTrain</c> overloads.
/// </summary>
internal record Itinerary : IItinerary {
    // Backing list; exposed to consumers only through the read-only Trains view.
    private List<IItineraryTrain> TrainList { get; } = new();

    /// <inheritdoc />
    public IReadOnlyList<IItineraryTrain> Trains => TrainList;

    /// <summary>Appends an already built train to the itinerary.</summary>
    /// <param name="train">The train to append.</param>
    internal void AddTrain(IItineraryTrain train) => TrainList.Add(train);

    /// <summary>
    /// Creates a fresh <see cref="ItineraryTrain"/>, lets <paramref name="configurator"/>
    /// fill it in, then appends it to the itinerary.
    /// </summary>
    /// <param name="configurator">Callback that populates the new train.</param>
    internal void AddTrain(Action<ItineraryTrain> configurator) {
        var train = new ItineraryTrain();
        configurator(train);
        TrainList.Add(train);
    }
}
||||
|
||||
/// <summary>
/// Assembly-internal <see cref="IItineraryTrain"/> implementation. All properties
/// are settable from inside the assembly; every property starts out empty/zero.
/// </summary>
internal record ItineraryTrain : IItineraryTrain {
    // Backing list; exposed to consumers only through the read-only IntermediateStops view.
    private List<string> StopList { get; } = new();

    /// <inheritdoc />
    public string From { get; internal set; } = "";

    /// <inheritdoc />
    public string To { get; internal set; } = "";

    /// <inheritdoc />
    public IReadOnlyList<string> IntermediateStops => StopList;

    /// <inheritdoc />
    public DateTimeOffset DepartureDate { get; internal set; } = default;

    /// <inheritdoc />
    public DateTimeOffset ArrivalDate { get; internal set; } = default;

    /// <inheritdoc />
    public int Km { get; internal set; }

    /// <inheritdoc />
    public string Operator { get; internal set; } = "";

    /// <inheritdoc />
    public string TrainRank { get; internal set; } = "";

    /// <inheritdoc />
    public string TrainNumber { get; internal set; } = "";

    /// <summary>Appends a stop to <see cref="IntermediateStops"/>.</summary>
    /// <param name="stop">Name of the stop to append.</param>
    internal void AddIntermediateStop(string stop) => StopList.Add(stop);
}
||||
|
||||
#endregion |
@ -0,0 +1,207 @@
|
||||
using System; |
||||
using System.Collections.Generic; |
||||
using System.Linq; |
||||
using System.Net; |
||||
using System.Net.Http; |
||||
using System.Text.RegularExpressions; |
||||
using System.Threading.Tasks; |
||||
using AngleSharp; |
||||
using AngleSharp.Dom; |
||||
using AngleSharp.Html.Dom; |
||||
using Flurl; |
||||
using InfoferScraper.Models.Train; |
||||
using NodaTime; |
||||
using NodaTime.Extensions; |
||||
using scraper.Models.Itinerary; |
||||
|
||||
namespace InfoferScraper.Scrapers; |
||||
|
||||
/// <summary>
/// Scrapes itineraries between two stations from the Infofer timetable website
/// (see <see cref="BaseUrl"/>).
/// </summary>
public static class RouteScraper {
    private const string BaseUrl = "https://mersultrenurilor.infofer.ro/ro-RO/";
    private static readonly DateTimeZone BucharestTz = DateTimeZoneProviders.Tzdb["Europe/Bucharest"];

    // All formatting/parsing here is machine-readable (query strings, scraped digits),
    // so it must not depend on the culture of the thread running the scraper.
    private static readonly IFormatProvider Invariant = System.Globalization.CultureInfo.InvariantCulture;

    private static readonly CookieContainer CookieContainer = new();

    // Single shared client: the cookies set by the first (GET) request are sent with the second (POST) one.
    private static readonly HttpClient HttpClient = new(new HttpClientHandler {
        CookieContainer = CookieContainer,
        UseCookies = true,
    }) {
        BaseAddress = new Uri(BaseUrl),
        DefaultRequestVersion = new Version(2, 0),
    };

    // "<km> km cu <RANK> <number>"
    private static readonly Regex KmTrainRankNoRegex = new(@"^([0-9]+)\skm\scu\s([A-Z-]+)\s([0-9]+)$");
    // "Operat de <operator name>"
    private static readonly Regex OperatorRegex = new(@$"^Operat\sde\s([{Utils.RoLetters}\s]+)$");
    // "Ple|Sos <day> <month abbreviation>[.] <hour>:<minute>" (presumably departure/arrival prefixes)
    private static readonly Regex DepArrRegex = new(@"^(Ple|Sos)\s([0-9]+)\s([a-z]+)\.?\s([0-9]+):([0-9]+)$");

    // Month abbreviations as matched by DepArrRegex, mapped to month numbers.
    private static readonly Dictionary<string, int> Months = new() {
        ["ian"] = 1,
        ["feb"] = 2,
        ["mar"] = 3,
        ["apr"] = 4,
        ["mai"] = 5,
        ["iun"] = 6,
        ["iul"] = 7,
        ["aug"] = 8,
        ["sep"] = 9,
        ["oct"] = 10,
        ["noi"] = 11,
        ["dec"] = 12,
    };

    /// <summary>
    /// Downloads and parses the itineraries between two stations.
    /// </summary>
    /// <param name="from">Departure station, appended to the request URL as a path segment.</param>
    /// <param name="to">Arrival station, appended to the request URL as a path segment.</param>
    /// <param name="dateOverride">
    /// Optional date to search for. It is converted to the Europe/Bucharest time zone before use.
    /// When omitted, no <c>DepartureDate</c> query parameter is sent.
    /// </param>
    /// <returns>
    /// The parsed itineraries, or <c>null</c> when the itineraries response contains no
    /// top-level <c>body &gt; div</c> element.
    /// </returns>
    public static async Task<List<IItinerary>?> Scrape(string from, string to, DateTimeOffset? dateOverride = null) {
        var dateOverrideZoned = dateOverride?.ToInstant().InZone(BucharestTz);
        dateOverride = dateOverrideZoned?.ToDateTimeOffset();

        // The site prints dates without a year, so the year is inferred relative to a reference
        // moment: the requested date when one is given, otherwise the current Bucharest wall-clock
        // time (not the server's local time, which may be in another zone).
        var referenceDate = dateOverrideZoned?.LocalDateTime
            ?? SystemClock.Instance.GetCurrentInstant().InZone(BucharestTz).LocalDateTime;

        var asConfig = Configuration.Default;
        var asContext = BrowsingContext.New(asConfig);

        // Step 1: GET the search page; its form carries the inputs needed for the real query.
        var firstUrl = "Rute-trenuri"
            .AppendPathSegment(from)
            .AppendPathSegment(to);
        if (dateOverride is { } departureDay) {
            firstUrl = firstUrl.SetQueryParam("DepartureDate", departureDay.ToString("d.MM.yyyy", Invariant));
        }
        firstUrl = firstUrl
            .SetQueryParam("OrderingTypeId", "0")
            .SetQueryParam("TimeSelectionId", "0")
            .SetQueryParam("MinutesInDay", "0")
            .SetQueryParam("ConnectionsTypeId", "1")
            .SetQueryParam("BetweenTrainsMinimumMinutes", "5")
            .SetQueryParam("ChangeStationName", "");

        var firstResponse = await HttpClient.GetStringAsync(firstUrl);
        var firstDocument = await asContext.OpenAsync(req => req.Content(firstResponse));
        var firstForm = firstDocument.GetElementById("form-search")!;

        var firstResult = firstForm
            .QuerySelectorAll<IHtmlInputElement>("input")
            .Where(elem => elem.Name != null)
            .ToDictionary(elem => elem.Name!, elem => elem.Value);

        // Step 2: POST the form inputs back to obtain the itineraries fragment.
        var secondUrl = "".AppendPathSegments("Itineraries", "GetItineraries");
        using var secondResponse = await HttpClient.PostAsync(
            secondUrl,
#pragma warning disable CS8620
            new FormUrlEncodedContent(firstResult)
#pragma warning restore CS8620
        );
        var secondResponseContent = await secondResponse.Content.ReadAsStringAsync();
        var secondDocument = await asContext.OpenAsync(
            req => req.Content(secondResponseContent)
        );

        var (itineraryInfoDiv, _) = secondDocument
            .QuerySelectorAll("body > div");

        if (itineraryInfoDiv == null) {
            return null;
        }

        var itineraries = new List<IItinerary>();
        foreach (var itineraryLi in secondDocument.QuerySelectorAll("body > ul > li")) {
            itineraries.Add(ParseItinerary(itineraryLi, referenceDate));
        }

        return itineraries;
    }

    /// <summary>Parses one itinerary list item into an <see cref="Itinerary"/>.</summary>
    private static Itinerary ParseItinerary(IElement itineraryLi, LocalDateTime referenceDate) {
        var cardDivs = itineraryLi.QuerySelectorAll(":scope > div > div > div > div");
        var detailsDivs = cardDivs[3]
            .QuerySelectorAll(":scope > div > div")[1]
            .QuerySelectorAll(":scope > div");
        var trainItineraryAndDetailsLis = detailsDivs[0]
            .QuerySelectorAll(":scope > ul > li");

        // The list alternates: station, train detail, station, train detail, ..., station.
        var stations = new List<string>();
        var details = new List<ItineraryTrain>();
        var idx = 0;
        foreach (var li in trainItineraryAndDetailsLis) {
            if (idx % 2 == 0) {
                stations.Add(
                    li
                        .QuerySelectorAll(":scope > div > div > div > div")[1]
                        .Text()
                        .WithCollapsedSpaces()
                );
            }
            else {
                details.Add(ParseTrain(li, referenceDate));
            }
            idx++;
        }

        // Each train runs between two consecutive stations of the list.
        var itinerary = new Itinerary();
        foreach (var ((iFrom, iTo), detail) in stations.Zip(stations.Skip(1)).Zip(details)) {
            detail.From = iFrom;
            detail.To = iTo;
            itinerary.AddTrain(detail);
        }

        return itinerary;
    }

    /// <summary>
    /// Parses one train-detail list item. <c>From</c> and <c>To</c> are left empty;
    /// the caller fills them in from the neighbouring station items.
    /// </summary>
    private static ItineraryTrain ParseTrain(IElement detailLi, LocalDateTime referenceDate) {
        var detailColumns = detailLi.QuerySelectorAll(":scope > div > div");
        var leftSideDivs = detailColumns[0].QuerySelectorAll(":scope > div");

        var departureDateText = leftSideDivs[0]
            .QuerySelectorAll(":scope > div")[1]
            .Text()
            .WithCollapsedSpaces();
        var departureDate = ParseDepArrDate(departureDateText, referenceDate);

        var arrivalDateText = leftSideDivs[3]
            .QuerySelectorAll(":scope > div")[1]
            .Text()
            .WithCollapsedSpaces();
        var arrivalDate = ParseDepArrDate(arrivalDateText, referenceDate);

        var rightSideDivs = detailColumns[1].QuerySelectorAll(":scope > div > div");
        var trainInfoDivs = rightSideDivs[0].QuerySelectorAll(":scope > div > div");

        var kmRankNumberText = trainInfoDivs[0].Text().WithCollapsedSpaces();
        var kmRankNumberMatch = KmTrainRankNoRegex.Match(kmRankNumberText);

        var operatorText = trainInfoDivs[1].Text().WithCollapsedSpaces();
        var operatorMatch = OperatorRegex.Match(operatorText);

        var train = new ItineraryTrain {
            ArrivalDate = BucharestTz.AtLeniently(arrivalDate).ToDateTimeOffset(),
            DepartureDate = BucharestTz.AtLeniently(departureDate).ToDateTimeOffset(),
            Km = int.Parse(kmRankNumberMatch.Groups[1].Value, Invariant),
            TrainRank = kmRankNumberMatch.Groups[2].Value,
            TrainNumber = kmRankNumberMatch.Groups[3].Value,
            Operator = operatorMatch.Groups[1].Value,
        };

        // Only every second child div is taken as a stop name.
        foreach (var div in leftSideDivs[2]
                     .QuerySelectorAll(":scope > div")
                     .Where((_, i) => i % 2 != 0)) {
            train.AddIntermediateStop(div.Text().WithCollapsedSpaces());
        }

        return train;
    }

    /// <summary>
    /// Parses a departure/arrival text matched by <see cref="DepArrRegex"/> into a local date-time.
    /// The text carries no year: the year of <paramref name="referenceDate"/> is assumed, and the
    /// result is moved one year forward when it would otherwise fall more than one day before
    /// <paramref name="referenceDate"/> (year rollover).
    /// </summary>
    /// <exception cref="KeyNotFoundException">
    /// The text does not match <see cref="DepArrRegex"/> or uses an unknown month abbreviation.
    /// </exception>
    private static LocalDateTime ParseDepArrDate(string text, LocalDateTime referenceDate) {
        var match = DepArrRegex.Match(text);
        var date = new LocalDateTime(
            referenceDate.Year,
            Months[match.Groups[3].Value],
            int.Parse(match.Groups[2].Value, Invariant),
            int.Parse(match.Groups[4].Value, Invariant),
            int.Parse(match.Groups[5].Value, Invariant),
            0
        );
        return date < referenceDate.PlusDays(-1) ? date.PlusYears(1) : date;
    }
}
Loading…
Reference in new issue