# kniha_jizd_web/backend/scrapers/journeybook_scraper.py

import re
import requests
from bs4 import BeautifulSoup
import pandas as pd
from typing import Dict, Any


class JourneybookScraper:
    """Scrape the monthly journeybook table of one vehicle.

    The page at ``base_url`` is fetched with the stored username/password
    passed as ``auth`` to ``requests.get`` and the HTML table is converted
    into a ``pandas.DataFrame``.
    """

    # Value of the ``class`` attribute used to locate the journeybook table.
    _TABLE_CLASS = 'table table-striped table-bordered table-condensed table-sm'

    # Header names as they read once every space is removed from the <th> text.
    _SOURCE_COLUMNS = ("Datum", "Počátečnístav", "Koncovýstav", "Ujeto[km]")

    # Column names of the returned DataFrame (source columns + refuel amount).
    _OUTPUT_COLUMNS = (
        "Datum",
        "Počáteční stav",
        "Koncový stav",
        "Ujeto [km]",
        "Natankováno [l|kg]",
    )

    # Refuelled amount; the fractional part is optional so that whole-number
    # amounts ("natankováno 40 [l|kg]") are captured as well.
    _REFUEL_RE = re.compile(r'natankováno\s(\d+(?:\.\d+)?)\s\[l\|kg\]')

    # Date written as "D. M. YYYY" with a single whitespace after each dot.
    _DATE_RE = re.compile(r'\d{1,2}\.\s\d{1,2}\.\s\d{4}')

    def __init__(
        self,
        username: str,
        password: str,
        vehicle_registration: str = "4SH1148",
        *,
        timeout: float = 30.0,
        verify_tls: bool = False,
    ):
        """Store credentials and request settings.

        Args:
            username: User name sent with every request.
            password: Password sent with every request.
            vehicle_registration: Value of the ``rz`` query parameter.
            timeout: Seconds to wait for the server before giving up.
            verify_tls: Whether the server's TLS certificate is verified.
        """
        self.username = username
        self.password = password
        self.vehicle_registration = vehicle_registration
        self.base_url = "https://kj.colsys.cz/prehled_mesic.php"
        self.timeout = timeout
        # SECURITY: the default stays False only to preserve the previous
        # behaviour. With verification off the credentials can be intercepted;
        # pass verify_tls=True wherever the server certificate is trusted.
        self.verify_tls = verify_tls

    @staticmethod
    def normalize_date(date_str: str) -> str:
        """Return *date_str* with all whitespace removed ("1. 2. 2024" -> "1.2.2024")."""
        return re.sub(r'\s+', '', date_str)

    @classmethod
    def _extract_refuel_amount(cls, text: str) -> str:
        """Return the amount following "natankováno" in *text*, or "" if absent."""
        match = cls._REFUEL_RE.search(text)
        return match.group(1) if match else ""

    def _read_cells(self, row, headers) -> list:
        """Return one value per <td> of *row*.

        A cell containing an <input> contributes that input's ``value``
        attribute; any other cell contributes its stripped text. Text cells in
        the "Datum" column are reduced to the whitespace-free date when one is
        found in them.
        """
        cells = []
        for cell in row.find_all('td'):
            input_field = cell.find('input')
            if input_field is not None:
                cells.append(input_field.get('value', ''))
                continue

            text = cell.text.strip()
            position = len(cells)
            # Cells beyond the last header have no column name; keep their text.
            if position < len(headers) and headers[position] == "Datum":
                date_match = self._DATE_RE.search(text)
                if date_match:
                    text = self.normalize_date(date_match.group())
            cells.append(text)
        return cells

    def scrape_month(self, month: str) -> pd.DataFrame:
        """
        Scrape journeybook data for a given month.

        Args:
            month: Prefix that, with "-01" appended, forms the ``den`` query
                parameter (presumably "YYYY-MM" - confirm against the server).

        Returns:
            DataFrame with columns: Datum, Počáteční stav, Koncový stav,
            Ujeto [km], Natankováno [l|kg]. Journey rows leave the last column
            empty; refuel rows fill only the last column.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            ValueError: If the table, its header/body, a required column or a
                required cell is missing.
        """
        response = requests.get(
            self.base_url,
            # Let requests build and URL-encode the query string.
            params={"rz": self.vehicle_registration, "den": f"{month}-01"},
            auth=(self.username, self.password),
            timeout=self.timeout,
            verify=self.verify_tls,
        )
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')
        table = soup.find('table', class_=self._TABLE_CLASS)
        if table is None:
            raise ValueError("Journeybook table not found")

        thead = table.find('thead')
        tbody = table.find('tbody')
        if thead is None or tbody is None:
            raise ValueError("Journeybook table is missing its <thead> or <tbody>")

        # Spaces are dropped so header matching ignores spacing differences.
        headers = [th.text.strip().replace(" ", "") for th in thead.find_all('th')]
        for col in self._SOURCE_COLUMNS:
            if col not in headers:
                raise ValueError(f"Column '{col}' not found. Headers: {headers}")
        wanted_indexes = [headers.index(col) for col in self._SOURCE_COLUMNS]

        rows = []
        for row in tbody.find_all('tr'):
            if "Tankováno" in row.text:
                # Refuel row: only the last column carries a value.
                amount = self._extract_refuel_amount(row.text)
                rows.append([""] * len(self._SOURCE_COLUMNS) + [amount])
            elif row.find('form') is not None:
                cells = self._read_cells(row, headers)
                if max(wanted_indexes) >= len(cells):
                    raise ValueError(
                        f"Journey row has {len(cells)} cells, "
                        f"expected at least {max(wanted_indexes) + 1}"
                    )
                # Journey row: refuel column stays empty.
                rows.append([cells[i] for i in wanted_indexes] + [""])

        return pd.DataFrame(rows, columns=list(self._OUTPUT_COLUMNS))