Browse Source

Add initial itinerary scraping

master
Kenneth Bruen 2 years ago
parent
commit
1d9db5b491
Signed by: kbruen
GPG Key ID: C1980A470C3EE5B1
  1. 2
      ConsoleTest/ConsoleTest.csproj
  2. 32
      ConsoleTest/Program.cs
  3. 62
      scraper/src/Models/Itinerary.cs
  4. 207
      scraper/src/Scrapers/Route.cs

2
ConsoleTest/ConsoleTest.csproj

@ -6,7 +6,7 @@
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net6.0</TargetFramework>
<TargetFrameworks>net6.0;net7.0</TargetFrameworks>
</PropertyGroup>
</Project>

32
ConsoleTest/Program.cs

@ -1,4 +1,5 @@
using System;
using System.Linq;
using System.Text.Json;
using System.Threading.Tasks;
using InfoferScraper;
@ -7,6 +8,7 @@ using InfoferScraper.Scrapers;
while (true) {
Console.WriteLine("1. Scrape Train");
Console.WriteLine("2. Scrape Station");
Console.WriteLine("3. Scrape Itineraries");
Console.WriteLine("0. Exit");
var input = Console.ReadLine()?.Trim();
@ -17,6 +19,9 @@ while (true) {
case "2":
await PrintStation();
break;
case "3":
await ScrapeItineraries();
break;
case null:
case "0":
goto INPUT_LOOP_BREAK;
@ -61,3 +66,30 @@ async Task PrintStation() {
)
);
}
// Prompts for a departure and an arrival station, scrapes the itineraries
// between them via RouteScraper.Scrape and prints every train of every
// itinerary (departure line, train details, arrival line) to the console.
// Returns without scraping if either prompt hits end-of-input.
async Task ScrapeItineraries() {
    Console.Write("From station: ");
    var from = Console.ReadLine();
    Console.Write("To station: ");
    var to = Console.ReadLine();
    if (from is null || to is null) {
        return;
    }

    var data = await RouteScraper.Scrape(from, to);
    // Scrape is declared as returning a nullable list (it returns null when
    // the result page has no info block), so guard before dereferencing.
    if (data is null) {
        Console.WriteLine("No itineraries found.");
        return;
    }

    Console.WriteLine($"{data.Count} itineraries:");
    Console.WriteLine();

    // Prints one "HH:mm station" line for a departure or an arrival.
    void PrintArrDepLine(DateTimeOffset date, string station) {
        Console.WriteLine($"{date:HH:mm} {station}");
    }

    foreach (var itinerary in data) {
        foreach (var train in itinerary.Trains) {
            PrintArrDepLine(train.DepartureDate, train.From);
            Console.WriteLine($" {train.TrainRank,-4} {train.TrainNumber,-5} ({train.Operator}), {train.Km,3} km via {string.Join(", ", train.IntermediateStops)}");
            PrintArrDepLine(train.ArrivalDate, train.To);
        }

        // Blank line separates consecutive itineraries.
        Console.WriteLine();
    }
}

62
scraper/src/Models/Itinerary.cs

@ -0,0 +1,62 @@
using System;
using System.Collections.Generic;
namespace scraper.Models.Itinerary;
#region Interfaces
/// <summary>
/// Read-only view of one itinerary: an ordered list of trains.
/// </summary>
public interface IItinerary {
    /// <summary>The trains that make up this itinerary, in the order they were added.</summary>
    IReadOnlyList<IItineraryTrain> Trains { get; }
}
/// <summary>
/// Read-only view of a single train leg inside an itinerary.
/// </summary>
public interface IItineraryTrain {
    /// <summary>Name of the station this leg departs from.</summary>
    string From { get; }

    /// <summary>Name of the station this leg arrives at.</summary>
    string To { get; }

    /// <summary>Stops recorded for this leg between <see cref="From"/> and <see cref="To"/>.</summary>
    IReadOnlyList<string> IntermediateStops { get; }

    /// <summary>Departure date and time of this leg.</summary>
    DateTimeOffset DepartureDate { get; }

    /// <summary>Arrival date and time of this leg.</summary>
    DateTimeOffset ArrivalDate { get; }

    /// <summary>Distance covered by this leg, in kilometres.</summary>
    int Km { get; }

    /// <summary>Name of the operator running the train.</summary>
    string Operator { get; }

    /// <summary>Rank (category code) of the train.</summary>
    string TrainRank { get; }

    /// <summary>Number of the train, kept as text.</summary>
    string TrainNumber { get; }
}
#endregion
#region Implementations
/// <summary>
/// Assembly-internal <see cref="IItinerary"/> implementation that can be
/// filled train by train and is then exposed through the read-only interface.
/// </summary>
internal record Itinerary : IItinerary {
    // Mutable storage behind the read-only Trains view.
    private readonly List<IItineraryTrain> _trains = new();

    /// <summary>The trains added so far, in insertion order.</summary>
    public IReadOnlyList<IItineraryTrain> Trains => _trains;

    /// <summary>Appends an already built train to the itinerary.</summary>
    internal void AddTrain(IItineraryTrain train) => _trains.Add(train);

    /// <summary>
    /// Creates a fresh <see cref="ItineraryTrain"/>, lets
    /// <paramref name="configurator"/> populate it, then appends it.
    /// </summary>
    internal void AddTrain(Action<ItineraryTrain> configurator) {
        var created = new ItineraryTrain();
        configurator(created);
        _trains.Add(created);
    }
}
/// <summary>
/// Assembly-internal <see cref="IItineraryTrain"/> implementation whose
/// properties can be set from inside the assembly.
/// </summary>
internal record ItineraryTrain : IItineraryTrain {
    // Mutable storage behind the read-only IntermediateStops view.
    private readonly List<string> _intermediateStops = new();

    /// <summary>Departure station name; empty until set.</summary>
    public string From { get; internal set; } = string.Empty;

    /// <summary>Arrival station name; empty until set.</summary>
    public string To { get; internal set; } = string.Empty;

    /// <summary>The intermediate stops added so far, in insertion order.</summary>
    public IReadOnlyList<string> IntermediateStops => _intermediateStops;

    /// <summary>Departure date and time; default value until set.</summary>
    public DateTimeOffset DepartureDate { get; internal set; }

    /// <summary>Arrival date and time; default value until set.</summary>
    public DateTimeOffset ArrivalDate { get; internal set; }

    /// <summary>Distance in kilometres; zero until set.</summary>
    public int Km { get; internal set; }

    /// <summary>Operator name; empty until set.</summary>
    public string Operator { get; internal set; } = string.Empty;

    /// <summary>Train rank; empty until set.</summary>
    public string TrainRank { get; internal set; } = string.Empty;

    /// <summary>Train number; empty until set.</summary>
    public string TrainNumber { get; internal set; } = string.Empty;

    /// <summary>Appends one intermediate stop to this train.</summary>
    internal void AddIntermediateStop(string stop) => _intermediateStops.Add(stop);
}
#endregion

207
scraper/src/Scrapers/Route.cs

@ -0,0 +1,207 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Net.Http;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using AngleSharp;
using AngleSharp.Dom;
using AngleSharp.Html.Dom;
using Flurl;
using InfoferScraper.Models.Train;
using NodaTime;
using NodaTime.Extensions;
using scraper.Models.Itinerary;
namespace InfoferScraper.Scrapers;
/// <summary>
/// Scrapes train itineraries between two stations from the Infofer
/// "Mersul trenurilor" website.
/// </summary>
public static class RouteScraper {
    private const string BaseUrl = "https://mersultrenurilor.infofer.ro/ro-RO/";

    // Zone used to interpret the scraped departure/arrival times.
    private static readonly DateTimeZone BucharestTz = DateTimeZoneProviders.Tzdb["Europe/Bucharest"];

    // Shared by the single HttpClient so cookies set by the first request
    // are sent with the follow-up POST.
    private static readonly CookieContainer CookieContainer = new();

    private static readonly HttpClient HttpClient = new(new HttpClientHandler {
        CookieContainer = CookieContainer,
        UseCookies = true,
    }) {
        BaseAddress = new Uri(BaseUrl),
        DefaultRequestVersion = new Version(2, 0),
    };

    // "<km> km cu <RANK> <number>"
    private static readonly Regex KmTrainRankNoRegex = new(@"^([0-9]+)\skm\scu\s([A-Z-]+)\s([0-9]+)$");

    // "Operat de <operator name>"
    private static readonly Regex OperatorRegex = new(@$"^Operat\sde\s([{Utils.RoLetters}\s]+)$");

    // "Ple|Sos <day> <month abbreviation>[.] <hour>:<minute>"
    private static readonly Regex DepArrRegex = new(@"^(Ple|Sos)\s([0-9]+)\s([a-z]+)\.?\s([0-9]+):([0-9]+)$");

    // Month abbreviations accepted in departure/arrival texts.
    private static readonly Dictionary<string, int> Months = new Dictionary<string, int>() {
        ["ian"] = 1,
        ["feb"] = 2,
        ["mar"] = 3,
        ["apr"] = 4,
        ["mai"] = 5,
        ["iun"] = 6,
        ["iul"] = 7,
        ["aug"] = 8,
        ["sep"] = 9,
        ["oct"] = 10,
        ["noi"] = 11,
        ["dec"] = 12,
    };

    /// <summary>
    /// Fetches the itineraries from <paramref name="from"/> to <paramref name="to"/>.
    /// </summary>
    /// <param name="from">Departure station, used as a URL path segment.</param>
    /// <param name="to">Arrival station, used as a URL path segment.</param>
    /// <param name="dateOverride">
    /// Optional date to query; it is converted to the Europe/Bucharest zone
    /// before being sent as the <c>DepartureDate</c> query parameter.
    /// </param>
    /// <returns>
    /// The parsed itineraries, or <c>null</c> when the response contains no
    /// top-level <c>div</c> under <c>body</c>.
    /// </returns>
    /// <exception cref="FormatException">
    /// A departure/arrival or distance text does not have the expected shape.
    /// </exception>
    public static async Task<List<IItinerary>?> Scrape(string from, string to, DateTimeOffset? dateOverride = null) {
        var dateOverrideInstant = dateOverride?.ToInstant().InZone(BucharestTz);
        dateOverride = dateOverrideInstant?.ToDateTimeOffset();

        var asConfig = Configuration.Default;
        var asContext = BrowsingContext.New(asConfig);

        var firstUrl = "Rute-trenuri"
            .AppendPathSegment(from)
            .AppendPathSegment(to);
        if (dateOverride != null) {
            firstUrl = firstUrl.SetQueryParam("DepartureDate", $"{dateOverride:d.MM.yyyy}");
        }
        firstUrl = firstUrl.SetQueryParam("OrderingTypeId", "0");
        firstUrl = firstUrl.SetQueryParam("TimeSelectionId", "0");
        firstUrl = firstUrl.SetQueryParam("MinutesInDay", "0");
        firstUrl = firstUrl.SetQueryParam("ConnectionsTypeId", "1");
        firstUrl = firstUrl.SetQueryParam("BetweenTrainsMinimumMinutes", "5");
        firstUrl = firstUrl.SetQueryParam("ChangeStationName", "");

        // Step 1: load the search page and collect the named inputs of its form.
        var firstResponse = await HttpClient.GetStringAsync(firstUrl);
        var firstDocument = await asContext.OpenAsync(req => req.Content(firstResponse));
        var firstForm = firstDocument.GetElementById("form-search")!;
        var firstResult = firstForm
            .QuerySelectorAll<IHtmlInputElement>("input")
            .Where(elem => elem.Name != null)
            .ToDictionary(elem => elem.Name!, elem => elem.Value);

        // Step 2: post those inputs back to obtain the itineraries fragment.
        var secondUrl = "".AppendPathSegments("Itineraries", "GetItineraries");
        // The response message is disposable; release it once the body is read.
        using var secondResponse = await HttpClient.PostAsync(
            secondUrl,
#pragma warning disable CS8620
            new FormUrlEncodedContent(firstResult)
#pragma warning restore CS8620
        );
        var secondResponseContent = await secondResponse.Content.ReadAsStringAsync();
        var secondDocument = await asContext.OpenAsync(
            req => req.Content(secondResponseContent)
        );

        var (itineraryInfoDiv, _) = secondDocument
            .QuerySelectorAll("body > div");
        if (itineraryInfoDiv == null) {
            return null;
        }

        // The scraped times are interpreted in Bucharest time, so "now" must be
        // taken in that zone too rather than in the machine's local zone.
        var now = SystemClock.Instance.GetCurrentInstant().InZone(BucharestTz).LocalDateTime;

        var itineraries = new List<IItinerary>();
        foreach (var itineraryLi in secondDocument.QuerySelectorAll("body > ul > li")) {
            itineraries.Add(ParseItinerary(itineraryLi, now));
        }
        return itineraries;
    }

    /// <summary>Parses one itinerary list item into an <see cref="Itinerary"/>.</summary>
    private static Itinerary ParseItinerary(IElement itineraryLi, LocalDateTime now) {
        var itinerary = new Itinerary();

        var cardDivs = itineraryLi.QuerySelectorAll(":scope > div > div > div > div");
        var detailsDivs = cardDivs[3]
            .QuerySelectorAll(":scope > div > div")[1]
            .QuerySelectorAll(":scope > div");
        var trainItineraryAndDetailsLis = detailsDivs[0]
            .QuerySelectorAll(":scope > ul > li");

        // The list alternates: even entries are stations, odd entries are the
        // train details between the surrounding two stations.
        var stations = new List<string>();
        var details = new List<ItineraryTrain>();
        foreach (var (idx, li) in trainItineraryAndDetailsLis.Select((li, idx) => (idx, li))) {
            if (idx % 2 == 0) {
                stations.Add(
                    li
                        .QuerySelectorAll(":scope > div > div > div > div")[1]
                        .Text()
                        .WithCollapsedSpaces()
                );
            }
            else {
                details.Add(ParseTrain(li, now));
            }
        }

        // Pair each train with the station before and after it.
        foreach (var ((iFrom, iTo), detail) in stations.Zip(stations.Skip(1)).Zip(details)) {
            detail.From = iFrom;
            detail.To = iTo;
            itinerary.AddTrain(detail);
        }

        return itinerary;
    }

    /// <summary>
    /// Parses one train-detail list item (dates, distance, rank, number,
    /// operator and intermediate stops). From/To are filled in by the caller.
    /// </summary>
    private static ItineraryTrain ParseTrain(IElement li, LocalDateTime now) {
        var detailColumns = li.QuerySelectorAll(":scope > div > div");
        var leftSideDivs = detailColumns[0].QuerySelectorAll(":scope > div");

        var departureDateText = leftSideDivs[0]
            .QuerySelectorAll(":scope > div")[1]
            .Text()
            .WithCollapsedSpaces();
        var departureDate = ParseDepArrDate(departureDateText, now);

        var arrivalDateText = leftSideDivs[3]
            .QuerySelectorAll(":scope > div")[1]
            .Text()
            .WithCollapsedSpaces();
        var arrivalDate = ParseDepArrDate(arrivalDateText, now);

        var rightSideDivs = detailColumns[1].QuerySelectorAll(":scope > div > div");
        var kmRankNumberText = rightSideDivs[0]
            .QuerySelectorAll(":scope > div > div")[0]
            .Text()
            .WithCollapsedSpaces();
        var kmRankNumberMatch = KmTrainRankNoRegex.Match(kmRankNumberText);

        var operatorText = rightSideDivs[0]
            .QuerySelectorAll(":scope > div > div")[1]
            .Text()
            .WithCollapsedSpaces();
        var operatorMatch = OperatorRegex.Match(operatorText);

        var train = new ItineraryTrain {
            ArrivalDate = BucharestTz.AtLeniently(arrivalDate).ToDateTimeOffset(),
            DepartureDate = BucharestTz.AtLeniently(departureDate).ToDateTimeOffset(),
            Km = int.Parse(kmRankNumberMatch.Groups[1].Value),
            TrainRank = kmRankNumberMatch.Groups[2].Value,
            TrainNumber = kmRankNumberMatch.Groups[3].Value,
            Operator = operatorMatch.Groups[1].Value,
        };

        // Only every second child div (odd index) is read as a stop name.
        foreach (var div in leftSideDivs[2]
                     .QuerySelectorAll(":scope > div")
                     .Where((_, i) => i % 2 != 0)) {
            train.AddIntermediateStop(div.Text().WithCollapsedSpaces());
        }

        return train;
    }

    /// <summary>
    /// Parses a departure/arrival text into a local date-time. The text has no
    /// year, so the year of <paramref name="now"/> is assumed; if that yields a
    /// moment more than one day before <paramref name="now"/>, the date is
    /// moved to the following year.
    /// </summary>
    /// <exception cref="FormatException">The text does not match the expected shape.</exception>
    private static LocalDateTime ParseDepArrDate(string text, LocalDateTime now) {
        var match = DepArrRegex.Match(text);
        if (!match.Success) {
            throw new FormatException($"Unexpected departure/arrival text: '{text}'");
        }
        if (!Months.TryGetValue(match.Groups[3].Value, out var month)) {
            throw new FormatException($"Unknown month abbreviation in departure/arrival text: '{text}'");
        }

        var date = new LocalDateTime(
            now.Year,
            month,
            int.Parse(match.Groups[2].Value),
            int.Parse(match.Groups[4].Value),
            int.Parse(match.Groups[5].Value),
            0
        );
        if (date < now.PlusDays(-1)) {
            date = date.PlusYears(1);
        }
        return date;
    }
}
Loading…
Cancel
Save