import re
from typing import Any, Dict

import pandas as pd
import requests
from bs4 import BeautifulSoup


class JourneybookScraper:
    """Scrape a vehicle's monthly journeybook table into a pandas DataFrame.

    The scraper issues an HTTP GET with basic authentication against
    ``base_url`` and parses the HTML table found in the response.
    """

    # Exact ``class`` attribute value of the journeybook table in the page.
    _TABLE_CLASS = 'table table-striped table-bordered table-condensed table-sm'

    # Header names as they appear in the page once spaces are removed.
    _SOURCE_COLUMNS = ("Datum", "Počátečnístav", "Koncovýstav", "Ujeto[km]")

    # Column names of the returned DataFrame (the last one is filled from
    # the separate refuelling rows, not from a table column).
    _OUTPUT_COLUMNS = [
        "Datum",
        "Počáteční stav",
        "Koncový stav",
        "Ujeto [km]",
        "Natankováno [l|kg]",
    ]

    # Refuelled amount; the fractional part is optional so that whole-number
    # amounts are captured as well.
    _REFUEL_RE = re.compile(r'natankováno\s+(\d+(?:\.\d+)?)\s*\[l\|kg\]')

    # Date written as "D. M. YYYY" (single whitespace after each dot).
    _DATE_RE = re.compile(r'\d{1,2}\.\s\d{1,2}\.\s\d{4}')

    def __init__(
        self,
        username: str,
        password: str,
        vehicle_registration: str = "4SH1148",
        *,
        timeout: float = 30.0,
        verify_ssl: bool = False,
    ):
        """Store credentials and request settings.

        Args:
            username: User name for HTTP basic authentication.
            password: Password for HTTP basic authentication.
            vehicle_registration: Value sent as the ``rz`` query parameter.
            timeout: Seconds to wait for the server before giving up.
            verify_ssl: Whether to verify the server's TLS certificate.
                Defaults to ``False`` to keep the historical behaviour.
        """
        self.username = username
        self.password = password
        self.vehicle_registration = vehicle_registration
        self.base_url = "https://kj.colsys.cz/prehled_mesic.php"
        self.timeout = timeout
        # SECURITY: with verification off, the credentials are sent without
        # authenticating the server. Pass verify_ssl=True where possible.
        self.verify_ssl = verify_ssl

    @staticmethod
    def normalize_date(date_str: str) -> str:
        """Return *date_str* with all whitespace removed."""
        return re.sub(r'\s+', '', date_str)

    def scrape_month(self, month: str) -> pd.DataFrame:
        """Scrape journeybook data for a given month.

        Args:
            month: Month identifier; ``"-01"`` is appended to it to build
                the ``den`` query parameter (presumably ``"YYYY-MM"`` --
                confirm against the server).

        Returns:
            DataFrame with columns: Datum, Počáteční stav, Koncový stav,
            Ujeto [km], Natankováno [l|kg]. Journey rows leave the last
            column empty; refuelling rows fill only the last column.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            ValueError: If the table, its head/body, or a required column
                is missing from the page.
            IndexError: If a journey row has fewer cells than the position
                of a required column.
        """
        response = requests.get(
            self.base_url,
            # Let requests URL-encode the query values.
            params={"rz": self.vehicle_registration, "den": f"{month}-01"},
            auth=(self.username, self.password),
            timeout=self.timeout,
            verify=self.verify_ssl,
        )
        response.raise_for_status()
        return self._parse_journeybook(response.content)

    @classmethod
    def _parse_journeybook(cls, html) -> pd.DataFrame:
        """Parse the journeybook page markup into the output DataFrame."""
        soup = BeautifulSoup(html, 'html.parser')
        table = soup.find('table', class_=cls._TABLE_CLASS)
        if table is None:
            raise ValueError("Journeybook table not found")

        thead = table.find('thead')
        tbody = table.find('tbody')
        if thead is None or tbody is None:
            raise ValueError("Journeybook table is missing its <thead> or <tbody>")

        # Spaces are stripped so header matching ignores spacing differences.
        headers = [th.text.strip().replace(" ", "") for th in thead.find_all('th')]
        for col in cls._SOURCE_COLUMNS:
            if col not in headers:
                raise ValueError(f"Column '{col}' not found. Headers: {headers}")
        wanted = [headers.index(col) for col in cls._SOURCE_COLUMNS]

        rows = []
        for row in tbody.find_all('tr'):
            if "Tankováno" in row.text:
                # Refuelling rows carry only the amount; other columns stay empty.
                match = cls._REFUEL_RE.search(row.text.strip())
                amount = match.group(1) if match else ""
                rows.append([""] * len(wanted) + [amount])
            elif row.find('form') is not None:
                # Journey rows are the ones that contain a form.
                cells = cls._extract_cells(row, headers)
                rows.append([cells[i] for i in wanted] + [""])

        return pd.DataFrame(rows, columns=cls._OUTPUT_COLUMNS)

    @classmethod
    def _extract_cells(cls, row, headers) -> list:
        """Return one value per ``<td>`` of *row*, in document order."""
        cells = []
        for position, cell in enumerate(row.find_all('td')):
            input_field = cell.find('input')
            if input_field is not None:
                # Editable cells keep their value in the input element.
                cells.append(input_field.get('value', ''))
                continue

            text = cell.text.strip()
            # Rows may hold more cells than there are headers; such trailing
            # cells are never date cells.
            is_date = position < len(headers) and headers[position] == "Datum"
            date_match = cls._DATE_RE.search(text) if is_date else None
            if date_match:
                cells.append(cls.normalize_date(date_match.group()))
            else:
                cells.append(text)
        return cells