import logging
from typing import Optional

import numpy as np
import pandas as pd

logger = logging.getLogger(__name__)


class KilometerCalculator:
    """Recalculate the odometer columns of a scraped journey log."""

    # Number of refueling column pairs created in the output; additional
    # refuelings recorded for the same date are dropped.
    _MAX_REFUELINGS = 2

    @staticmethod
    def recalculate(
        df: pd.DataFrame,
        start_km: Optional[int] = None,
        end_km: Optional[int] = None,
        variance: float = 0.1,
    ) -> pd.DataFrame:
        """Recalculate kilometers with a seeded random distribution.

        Rows with a non-empty "Datum" are kept as journey rows. Rows without
        a date but with a "Natankováno [l|kg]" value are treated as
        refuelings of the most recent journey date and folded into the
        "Natankováno N [l|kg]" / "Tankováno při N [km]" columns (N = 1..2).
        The distance ``end_km - start_km`` is then spread over the journey
        rows and "Ujeto [km]", "Počáteční stav" and "Koncový stav" are
        rewritten accordingly.

        The random generator is seeded from ``start_km`` and ``end_km``, so
        the same input always yields the same output. A generator local to
        this call is used; NumPy's global random state is left untouched.

        Args:
            df: DataFrame with journey data. It is not modified.
            start_km: Override starting kilometers (uses the first row's
                "Počáteční stav" if None).
            end_km: Override ending kilometers (uses the last row's
                "Koncový stav" if None).
            variance: Random variance factor (default 0.1 = 10%); the
                standard deviation of the per-day distance is
                ``average * variance``.

        Returns:
            A new DataFrame containing only the journey rows, with NaN
            replaced by None for JSON serialization.

        Raises:
            ValueError: If ``df`` is empty, contains no journey rows, or a
                start/end value taken from ``df`` is missing.
        """
        if df.empty:
            raise ValueError("No valid days found")

        # Continuous 0..n-1 labels so that ``df.at[i, ...]`` lines up with
        # positional order; this also detaches us from the caller's frame.
        df = df.reset_index(drop=True)

        # Read through the column (not ``df.iloc[0][col]``) so a mixed
        # int/float row cannot upcast the value, then coerce to a plain int:
        # NumPy floats are rejected as seeds.
        start_km = KilometerCalculator._to_whole_km(
            df["Počáteční stav"].iloc[0] if start_km is None else start_km,
            "start_km",
        )
        end_km = KilometerCalculator._to_whole_km(
            df["Koncový stav"].iloc[-1] if end_km is None else end_km,
            "end_km",
        )
        logger.info("Start KM: %s, End KM: %s", start_km, end_km)

        # RandomState(seed) produces the same stream as np.random.seed(seed)
        # followed by the module-level functions, without reseeding the
        # process-wide generator.
        seed = (start_km * 1000 + end_km) % (2**31)
        rng = np.random.RandomState(seed)
        logger.info("Using deterministic seed: %s", seed)

        journey_rows, refuel_data = (
            KilometerCalculator._split_journeys_and_refuelings(df)
        )
        logger.info(
            "Consolidated to %s journey days, %s days with refueling",
            len(journey_rows),
            len(refuel_data),
        )

        # Keep only the journey rows and swap the single refueling column
        # for a fixed set of numbered column pairs.
        df = df.iloc[journey_rows].copy().reset_index(drop=True)
        df = df.drop(columns=["Natankováno [l|kg]"])

        max_refuelings = KilometerCalculator._MAX_REFUELINGS
        for refuel_num in range(1, max_refuelings + 1):
            df[f"Natankováno {refuel_num} [l|kg]"] = None
            df[f"Tankováno při {refuel_num} [km]"] = None

        # NOTE(review): refuelings are keyed by date value, so journey rows
        # that share a date each receive the same refueling entries --
        # confirm the input has one journey row per date.
        for i in df.index:
            amounts = refuel_data.get(df.at[i, "Datum"], [])[:max_refuelings]
            for refuel_num, amount in enumerate(amounts, start=1):
                df.at[i, f"Natankováno {refuel_num} [l|kg]"] = amount

        date_mask = df["Datum"].notna()
        num_days = int(date_mask.sum())
        if num_days == 0:
            raise ValueError("No valid days found")

        total_kilometers = end_km - start_km
        logger.info(
            "Total km to distribute: %s across %s days",
            total_kilometers,
            num_days,
        )
        km_per_day = KilometerCalculator._distribute_kilometers(
            total_kilometers, num_days, variance, rng
        )
        df.loc[date_mask, "Ujeto [km]"] = km_per_day

        # Chain the odometer states. The bounds are tracked as Python ints
        # instead of being read back from the frame, whose scalars may be
        # NumPy types depending on the column dtype.
        day_bounds = []
        current_km = start_km
        for i, driven in zip(df.index, km_per_day):
            day_end = current_km + int(driven)
            df.at[i, "Počáteční stav"] = current_km
            df.at[i, "Koncový stav"] = day_end
            day_bounds.append((current_km, day_end))
            current_km = day_end

        # Pin the final reading to end_km. The distribution already sums to
        # the total, so this is only a safeguard.
        last = len(df) - 1
        df.at[last, "Koncový stav"] = end_km
        day_bounds[last] = (day_bounds[last][0], end_km)

        # Place every refueling at a random odometer reading within its day.
        for i, (day_start, day_end) in zip(df.index, day_bounds):
            for refuel_num in range(1, max_refuelings + 1):
                amount = df.at[i, f"Natankováno {refuel_num} [l|kg]"]
                if pd.isna(amount) or amount == "":
                    continue
                if day_end > day_start:
                    refuel_km = rng.randint(day_start, day_end + 1)
                else:
                    refuel_km = day_start
                df.at[i, f"Tankováno při {refuel_num} [km]"] = refuel_km

        # Replace NaN values with None for JSON serialization.
        df = df.replace({np.nan: None})
        return df

    @staticmethod
    def _to_whole_km(value, name: str) -> int:
        """Coerce an odometer value to a plain ``int`` (fractions truncated)."""
        if value is None or pd.isna(value):
            raise ValueError(
                f"{name} is missing; pass it explicitly or provide it in the data"
            )
        return int(value)

    @staticmethod
    def _split_journeys_and_refuelings(df: pd.DataFrame):
        """Return journey row labels and refuel amounts grouped by date."""
        journey_rows = []
        refuel_data = {}
        last_date = None
        for i in df.index:
            datum = df.at[i, "Datum"]
            refuel_amount = df.at[i, "Natankováno [l|kg]"]
            if pd.notna(datum) and datum != "":
                # A dated row is a journey.
                journey_rows.append(i)
                last_date = datum
            elif pd.notna(refuel_amount) and refuel_amount != "" and last_date:
                # An undated row with an amount refuels the preceding journey.
                refuel_data.setdefault(last_date, []).append(refuel_amount)
        return journey_rows, refuel_data

    @staticmethod
    def _distribute_kilometers(
        total_kilometers: int,
        num_days: int,
        variance: float,
        rng: np.random.RandomState,
    ) -> np.ndarray:
        """Draw integer per-day distances that sum exactly to the total."""
        avg_km_per_day = total_kilometers / num_days
        km_per_day = np.abs(
            rng.normal(avg_km_per_day, avg_km_per_day * variance, num_days)
        )
        km_per_day = np.round(km_per_day).astype(int)

        difference = int(total_kilometers - km_per_day.sum())
        logger.info("Difference to distribute: %s", difference)
        if difference != 0:
            # Spread the rounding error evenly; the first ``remaining`` days
            # absorb the leftover one kilometer each.
            adjustment, remaining = divmod(difference, num_days)
            km_per_day += adjustment
            km_per_day[:remaining] += 1
        return km_per_day