Initial commit - Journey book (kniha jízd) automation system

Features:
- FastAPI backend for scraping attendance and journey book data
- Deterministic kilometer distribution with random variance
- Refueling form filling with km values
- Next.js frontend with date range selector
- Docker deployment setup

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Docker Config Backup
2025-10-10 15:41:11 +02:00
commit 3b5d9fd940
40 changed files with 3777 additions and 0 deletions

4
backend/.env.example Normal file
View File

@@ -0,0 +1,4 @@
USERNAME=your_username
PASSWORD=your_password
VEHICLE_REGISTRATION=4SH1148
DATABASE_URL=sqlite:///./journeybook.db

14
backend/Dockerfile Normal file
View File

@@ -0,0 +1,14 @@
# Backend image: runs the FastAPI app (api.main:app) under uvicorn on Python 3.11.
FROM python:3.11-slim
WORKDIR /app
# Install gcc and remove the apt package lists in the same layer.
# NOTE(review): presumably gcc is needed to build a Python dependency — confirm.
RUN apt-get update && apt-get install -y \
gcc \
&& rm -rf /var/lib/apt/lists/*
# Requirements are copied and installed before the rest of the sources.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY . .
# NOTE(review): --reload is uvicorn's auto-reload (development) mode — confirm it
# is intended for the deployed image. The server listens on port 8002 here, while
# the __main__ guard in api/main.py uses port 8000.
CMD ["uvicorn", "api.main:app", "--host", "0.0.0.0", "--port", "8002", "--reload"]

243
backend/api/main.py Normal file
View File

@@ -0,0 +1,243 @@
from fastapi import FastAPI, HTTPException, Depends
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from typing import Optional
import pandas as pd
from io import BytesIO
from fastapi.responses import StreamingResponse
import logging
from datetime import datetime
from dateutil.relativedelta import relativedelta
from scrapers import AttendanceScraper, JourneybookScraper
from calculators.kilometer_calculator import KilometerCalculator
from fillers import JourneybookFiller
# Configure logging at INFO level and create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# FastAPI application object; the route handlers in this module register on it.
app = FastAPI(title="Kniha Jízd API", version="1.0.0")
# CORS: every origin, method and header is allowed, with credentials enabled.
# NOTE(review): wildcard origins combined with allow_credentials=True is very
# permissive — confirm this is intended outside local development.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
class ScrapeRequest(BaseModel):
    """Request body for the single-month scraping endpoints."""

    # Credentials handed to the scrapers, which use them for HTTP authentication.
    username: str
    password: str
    # The scrapers append "-01" to this value when building the URL, so it is
    # presumably expected in YYYY-MM form.
    month: str
    vehicle_registration: str = "4SH1148"
class CalculateRequest(BaseModel):
    """Request body for the calculate and Excel-export endpoints."""

    # Credentials handed to the scrapers, which use them for HTTP authentication.
    username: str
    password: str
    start_date: str  # Format: YYYY-MM-DD
    end_date: str  # Format: YYYY-MM-DD
    # Odometer values passed to KilometerCalculator.recalculate as the start and
    # end of the distance to distribute.
    start_km: int
    end_km: int
    vehicle_registration: str = "4SH1148"
    # Random variance factor forwarded to KilometerCalculator.recalculate.
    variance: float = 0.1
class FillRequest(CalculateRequest):
    """Request body for the fill endpoint; extends CalculateRequest."""

    # Forwarded to JourneybookFiller.fill_month; defaults to the non-submitting mode.
    dry_run: bool = True
@app.get("/health")
async def health_check():
    """Return a static payload reporting the service as healthy."""
    return {"status": "healthy"}
@app.post("/api/scrape/attendance")
async def scrape_attendance(request: ScrapeRequest):
    """Scrape the attendance page for one month.

    Returns the requested month, the list of dates the attendance scraper
    reported (sick days, vacation, unpaid leave) and how many there are.
    Any failure is logged and surfaced as an HTTP 500 carrying the error text.
    """
    try:
        absence_days = AttendanceScraper(
            request.username,
            request.password,
        ).scrape_month(request.month)
        payload = dict(
            month=request.month,
            sick_vacation_days=absence_days,
            count=len(absence_days),
        )
        return payload
    except Exception as e:
        logger.error(f"Attendance scraping failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))
@app.post("/api/scrape/journeybook")
async def scrape_journeybook(request: ScrapeRequest):
    """Scrape the journeybook table for one month.

    Returns the requested month, the scraped rows as a list of records and the
    number of rows. Any failure is logged and surfaced as an HTTP 500 carrying
    the error text.
    """
    try:
        frame = JourneybookScraper(
            request.username,
            request.password,
            request.vehicle_registration,
        ).scrape_month(request.month)
        records = frame.to_dict(orient='records')
        return dict(month=request.month, entries=records, count=len(frame))
    except Exception as e:
        logger.error(f"Journeybook scraping failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))
@app.post("/api/calculate")
async def calculate_kilometers(request: CalculateRequest):
    """Scrape data, filter sick days, and recalculate kilometers.

    Scrapes attendance and journeybook data for every calendar month touched by
    ``start_date``..``end_date``, keeps the journey rows that lie inside the
    range and are not on a sick/vacation day (rows with an empty ``Datum``,
    i.e. refuelling rows, are always kept), and redistributes the distance
    between ``start_km`` and ``end_km`` via ``KilometerCalculator.recalculate``.

    Returns:
        A dict with the date range label, the start/end km, the number of
        filtered attendance days, the recalculated rows and their count.

    Raises:
        HTTPException: 400 when ``start_date`` is after ``end_date``; 500 with
            the error text for any other failure.
    """
    # NOTE(review): the scrapers issue blocking HTTP requests inside this
    # coroutine; consider offloading them to a worker thread.
    try:
        start = datetime.strptime(request.start_date, "%Y-%m-%d")
        end = datetime.strptime(request.end_date, "%Y-%m-%d")
        if start > end:
            raise HTTPException(
                status_code=400,
                detail="start_date must not be after end_date",
            )

        attendance_scraper = AttendanceScraper(request.username, request.password)
        journeybook_scraper = JourneybookScraper(
            request.username, request.password, request.vehicle_registration
        )

        # Collect data from all months in the range. Iteration starts on the
        # first day of the start month: stepping from e.g. Jan 31 lands on
        # Feb 28, which would skip February for an end date of Feb 15.
        all_attendance_dates = []
        all_dfs = []
        current = start.replace(day=1)
        while current <= end:
            month_str = current.strftime("%Y-%m")
            logger.info(f"Scraping attendance for {month_str}")
            all_attendance_dates.extend(attendance_scraper.scrape_month(month_str))
            logger.info(f"Scraping journeybook for {month_str}")
            all_dfs.append(journeybook_scraper.scrape_month(month_str))
            current = current + relativedelta(months=1)

        # Combine all months
        df = pd.concat(all_dfs, ignore_index=True) if all_dfs else pd.DataFrame()

        # Filter by actual date range, but preserve refueling rows (empty Datum)
        if not df.empty:
            date_parsed = pd.to_datetime(df["Datum"], format="%d.%m.%Y", errors='coerce')
            is_refuel = df["Datum"].isna() | (df["Datum"] == "")
            is_in_range = (date_parsed >= start) & (date_parsed <= end)
            df = df[is_in_range | is_refuel]

        logger.info(f"Filtering out {len(all_attendance_dates)} sick/vacation days")
        # Only filter journey rows, not refueling rows
        is_refuel = df["Datum"].isna() | (df["Datum"] == "")
        df = df[~df["Datum"].isin(all_attendance_dates) | is_refuel]

        logger.info("Recalculating kilometers")
        df = KilometerCalculator.recalculate(
            df, request.start_km, request.end_km, request.variance
        )
        return {
            "month": f"{request.start_date} - {request.end_date}",
            "start_km": request.start_km,
            "end_km": request.end_km,
            "filtered_days": len(all_attendance_dates),
            "entries": df.to_dict(orient='records'),
            "total_entries": len(df)
        }
    except HTTPException:
        # Deliberate HTTP errors must reach the client with their own status.
        raise
    except Exception as e:
        # logger.exception records the message together with the traceback.
        logger.exception(f"Calculate failed: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e
@app.post("/api/export/excel")
async def export_to_excel(request: CalculateRequest):
    """Generate and download an Excel file with the recalculated journeybook.

    Performs the same scrape / filter / recalculate steps as the calculate
    endpoint and streams the resulting table as an ``.xlsx`` attachment named
    ``journeybook_<start_date>_<end_date>.xlsx``.

    Raises:
        HTTPException: 400 when ``start_date`` is after ``end_date``; 500 with
            the error text for any other failure.
    """
    try:
        start = datetime.strptime(request.start_date, "%Y-%m-%d")
        end = datetime.strptime(request.end_date, "%Y-%m-%d")
        if start > end:
            raise HTTPException(
                status_code=400,
                detail="start_date must not be after end_date",
            )

        attendance_scraper = AttendanceScraper(request.username, request.password)
        journeybook_scraper = JourneybookScraper(
            request.username, request.password, request.vehicle_registration
        )

        # Collect data from all months in the range. Iteration starts on the
        # first day of the start month: stepping from e.g. Jan 31 lands on
        # Feb 28, which would skip February for an end date of Feb 15.
        all_attendance_dates = []
        all_dfs = []
        current = start.replace(day=1)
        while current <= end:
            month_str = current.strftime("%Y-%m")
            all_attendance_dates.extend(attendance_scraper.scrape_month(month_str))
            all_dfs.append(journeybook_scraper.scrape_month(month_str))
            current = current + relativedelta(months=1)

        # Combine all months
        df = pd.concat(all_dfs, ignore_index=True) if all_dfs else pd.DataFrame()

        # Filter by actual date range, but preserve refueling rows (empty Datum)
        if not df.empty:
            date_parsed = pd.to_datetime(df["Datum"], format="%d.%m.%Y", errors='coerce')
            is_refuel = df["Datum"].isna() | (df["Datum"] == "")
            is_in_range = (date_parsed >= start) & (date_parsed <= end)
            df = df[is_in_range | is_refuel]

        # Only filter journey rows, not refueling rows
        is_refuel = df["Datum"].isna() | (df["Datum"] == "")
        df = df[~df["Datum"].isin(all_attendance_dates) | is_refuel]

        df = KilometerCalculator.recalculate(
            df, request.start_km, request.end_km, request.variance
        )

        # Write the workbook into memory and rewind so the response streams it
        # from the beginning.
        output = BytesIO()
        df.to_excel(output, index=False, engine='openpyxl')
        output.seek(0)
        filename = f"journeybook_{request.start_date}_{request.end_date}.xlsx"
        return StreamingResponse(
            output,
            media_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            headers={"Content-Disposition": f"attachment; filename={filename}"}
        )
    except HTTPException:
        # Deliberate HTTP errors must reach the client with their own status.
        raise
    except Exception as e:
        logger.exception(f"Export failed: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e
@app.post("/api/fill/journeybook")
async def fill_journeybook(request: FillRequest):
    """Fill calculated data back to kj.colsys.cz (restricted to January 2025).

    Scrapes January 2025, drops rows on sick/vacation days, recalculates the
    kilometres and hands the result to ``JourneybookFiller.fill_month``. With
    ``dry_run`` set, the filler only reports what it would submit.

    Returns:
        The result dict produced by ``JourneybookFiller.fill_month``.

    Raises:
        HTTPException: 400 when the requested range is rejected by the
            January-2025 guard; 500 with the error text for any other failure.
    """
    try:
        logger.info(f"Fill request received with dry_run={request.dry_run}")

        # Only allow January 2025 for testing.
        # NOTE(review): an end_date later than 2025-01-31 passes this check even
        # though only January is processed below — confirm that is intended.
        if request.start_date != "2025-01-01" or request.end_date < "2025-01-31":
            raise HTTPException(status_code=400, detail="Only January 2025 is allowed for testing")

        attendance_scraper = AttendanceScraper(request.username, request.password)
        journeybook_scraper = JourneybookScraper(
            request.username, request.password, request.vehicle_registration
        )

        # Get January data
        month_str = "2025-01"
        attendance_dates = attendance_scraper.scrape_month(month_str)
        df = journeybook_scraper.scrape_month(month_str)

        # Filter sick/vacation days
        df = df[~df["Datum"].isin(attendance_dates)]

        # Recalculate kilometers
        df = KilometerCalculator.recalculate(
            df, request.start_km, request.end_km, request.variance
        )

        # Fill data back (supports dry_run mode)
        filler = JourneybookFiller(
            request.username, request.password, request.vehicle_registration
        )
        return filler.fill_month(df, month_str, dry_run=request.dry_run)
    except HTTPException:
        # The 400 raised by the guard above is an Exception subclass; without
        # this clause the generic handler below would turn it into a 500.
        raise
    except Exception as e:
        logger.exception(f"Fill failed: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e
# Start a uvicorn server for the app when this module is executed as a script.
if __name__ == "__main__":
    import uvicorn
    # NOTE(review): this uses port 8000, whereas the Dockerfile CMD starts the
    # server on port 8002 — confirm which one clients should target.
    uvicorn.run(app, host="0.0.0.0", port=8000)

View File

@@ -0,0 +1,147 @@
import pandas as pd
import numpy as np
from typing import Optional
import logging
logger = logging.getLogger(__name__)


class KilometerCalculator:
    """Redistributes a total odometer distance across scraped journey days."""

    @staticmethod
    def _coerce_km(value, label: str) -> int:
        """Convert an odometer value (int, float or numeric string) to ``int``."""
        try:
            return int(float(value))
        except (TypeError, ValueError) as exc:
            raise ValueError(f"Invalid {label} value: {value!r}") from exc

    @staticmethod
    def recalculate(
        df: pd.DataFrame,
        start_km: Optional[int] = None,
        end_km: Optional[int] = None,
        variance: float = 0.1
    ) -> pd.DataFrame:
        """
        Recalculate kilometers with a seeded random distribution.

        Rows with a non-empty ``Datum`` are journey rows. Rows with an empty
        ``Datum`` and a refuelling amount are merged into the closest preceding
        journey row (at most two refuellings per date are kept). The distance
        ``end_km - start_km`` is then split across the journey rows and the
        start/end odometer states are rebuilt from ``start_km``.

        Args:
            df: DataFrame with journey data; needs the columns ``Datum`` and
                ``Natankováno [l|kg]``.
            start_km: Override starting kilometers (uses the first journey
                row's ``Počáteční stav`` if None).
            end_km: Override ending kilometers (uses the last journey row's
                ``Koncový stav`` if None).
            variance: Random variance factor (default 0.1 = 10%).

        Returns:
            A new DataFrame with one row per journey row, the original
            refuelling column replaced by ``Natankováno N [l|kg]`` /
            ``Tankováno při N [km]`` (N = 1, 2), and NaN replaced by None.

        Raises:
            ValueError: if there are no journey rows, if a start/end value is
                not numeric, or if ``end_km`` is smaller than ``start_km``.
        """
        refuel_column = "Natankováno [l|kg]"
        # Maximum 2 refuelings per day
        max_refuelings = 2

        if df.empty:
            raise ValueError("No valid days found")

        # Reset the index so positions and labels agree after upstream filtering.
        df = df.copy().reset_index(drop=True)

        # Merge refueling rows into journey rows (consolidate by date).
        # Scraped data structure: journey rows have dates, refueling rows
        # follow with an empty date.
        journey_rows = []
        refuel_data = {}  # refuelling amounts keyed by journey date
        last_date = None
        for position, (datum, refuel_amount) in enumerate(
            zip(df["Datum"], df[refuel_column])
        ):
            if pd.notna(datum) and datum != "":
                journey_rows.append(position)
                last_date = datum
            elif pd.notna(refuel_amount) and refuel_amount != "" and last_date:
                # Refuelling row: attach it to the last journey date seen.
                refuel_data.setdefault(last_date, []).append(refuel_amount)

        if not journey_rows:
            raise ValueError("No valid days found")
        logger.info(
            f"Consolidated to {len(journey_rows)} journey days, "
            f"{len(refuel_data)} days with refueling"
        )

        # Keep only journey rows from here on.
        df = df.iloc[journey_rows].copy().reset_index(drop=True)

        # Defaults come from journey rows (refuelling rows carry no km state)
        # and are coerced to int because scraped values arrive as strings.
        if start_km is None:
            start_km = df.iloc[0]["Počáteční stav"]
        if end_km is None:
            end_km = df.iloc[-1]["Koncový stav"]
        start_km = KilometerCalculator._coerce_km(start_km, "start_km")
        end_km = KilometerCalculator._coerce_km(end_km, "end_km")
        logger.info(f"Start KM: {start_km}, End KM: {end_km}")

        total_kilometers = end_km - start_km
        if total_kilometers < 0:
            raise ValueError("end_km must be greater than or equal to start_km")

        # Deterministic seed based on start/end km: the same input always
        # produces the same output. A private RandomState yields the same
        # number stream as seeding the global generator, without disturbing
        # other users of numpy's global random state.
        seed = (start_km * 1000 + end_km) % (2**31)
        rng = np.random.RandomState(seed)
        logger.info(f"Using deterministic seed: {seed}")

        # Replace the single refuelling column with exactly two slots.
        df = df.drop(columns=[refuel_column])
        for slot in range(1, max_refuelings + 1):
            df[f"Natankováno {slot} [l|kg]"] = None
            df[f"Tankováno při {slot} [km]"] = None

        # Fill in refueling data (max 2 per day)
        for position, datum in enumerate(df["Datum"]):
            amounts = refuel_data.get(datum, [])[:max_refuelings]
            for slot, amount in enumerate(amounts, start=1):
                df.at[position, f"Natankováno {slot} [l|kg]"] = amount

        date_mask = df["Datum"].notna()
        num_days = int(date_mask.sum())
        if num_days == 0:
            raise ValueError("No valid days found")
        logger.info(f"Total km to distribute: {total_kilometers} across {num_days} days")

        avg_km_per_day = total_kilometers / num_days
        km_per_day = np.abs(rng.normal(avg_km_per_day, avg_km_per_day * variance, num_days))
        km_per_day = np.round(km_per_day).astype(int)

        # Spread the rounding difference so the daily values sum exactly to
        # total_kilometers: everyone gets the floor share, the first
        # `remaining` days get one extra.
        difference = int(total_kilometers - np.sum(km_per_day))
        logger.info(f"Difference to distribute: {difference}")
        if difference != 0:
            adjustment, remaining = divmod(difference, num_days)
            km_per_day += adjustment
            km_per_day[:remaining] += 1
        df.loc[date_mask, "Ujeto [km]"] = km_per_day

        # Rebuild the odometer states; the daily ranges are kept as plain ints
        # so the refuelling step does not depend on the column dtype.
        day_ranges = []
        current_km = start_km
        for position, day_km in enumerate(km_per_day):
            day_start = current_km
            current_km = day_start + int(day_km)
            df.at[position, "Počáteční stav"] = day_start
            df.at[position, "Koncový stav"] = current_km
            day_ranges.append((day_start, current_km))

        # Calculate "Tankováno při [km]" for rows with refueling data
        for position, (day_start, day_end) in enumerate(day_ranges):
            for slot in range(1, max_refuelings + 1):
                amount = df.at[position, f"Natankováno {slot} [l|kg]"]
                if pd.isna(amount) or amount == "":
                    continue
                if day_end > day_start:
                    # Random km within the day's journey range (inclusive).
                    refuel_km = int(rng.randint(day_start, day_end + 1))
                else:
                    refuel_km = day_start
                df.at[position, f"Tankováno při {slot} [km]"] = refuel_km

        # Replace NaN values with None for JSON serialization
        df = df.replace({np.nan: None})
        return df

View File

@@ -0,0 +1,3 @@
from .journeybook_filler import JourneybookFiller
__all__ = ['JourneybookFiller']

View File

@@ -0,0 +1,281 @@
import requests
from bs4 import BeautifulSoup
import pandas as pd
import logging
from typing import Dict, Any, List
import time
logger = logging.getLogger(__name__)


class JourneybookFiller:
    """Writes recalculated journey data back into the kj.colsys.cz month view.

    The month page is fetched, every table row containing a form is matched
    against the DataFrame by date, and the form is re-submitted either with
    updated values or with its delete button.
    """

    # Seconds to wait for the remote server before a request is abandoned.
    REQUEST_TIMEOUT = 30

    # Input types that a browser only submits when they are the pressed button.
    _BUTTON_TYPES = ("submit", "button", "reset", "image")

    def __init__(self, username: str, password: str, vehicle_registration: str = "4SH1148"):
        """Store the credentials and prepare an authenticated HTTP session."""
        self.username = username
        self.password = password
        self.vehicle_registration = vehicle_registration
        self.base_url = "https://kj.colsys.cz/prehled_mesic.php"
        self.session = requests.Session()
        self.session.auth = (username, password)
        # NOTE(review): TLS certificate verification is disabled for this
        # session — confirm this is required for the target host.
        self.session.verify = False

    def fill_month(self, df: pd.DataFrame, month: str, dry_run: bool = True) -> Dict[str, Any]:
        """
        Fill journeybook data for a given month.

        Args:
            df: DataFrame with calculated journey data
            month: Month in format YYYY-MM (e.g., "2025-01")
            dry_run: If True, only show what would be filled without actually submitting

        Returns:
            Dict with results including success/failure counts. In dry-run
            mode: the number of prepared updates/deletes plus the first five
            of each as a sample.

        Raises:
            ValueError: if the journeybook table is not present on the page.
        """
        import re

        url = f"{self.base_url}?rz={self.vehicle_registration}&den={month}-01"
        logger.info(f"Fetching form for month {month}, vehicle {self.vehicle_registration}")
        response = self.session.get(url, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')
        table = soup.find('table', class_='table table-striped table-bordered table-condensed table-sm')
        if not table:
            raise ValueError("Journeybook table not found")

        # Fall back to the table itself when the markup has no <tbody>.
        table_rows = (table.find('tbody') or table).find_all('tr')
        logger.info(f"Found {len(table_rows)} table rows")
        logger.info(f"DataFrame has {len(df)} rows")

        updates = []
        deletes = []
        for i, row in enumerate(table_rows):
            form = row.find('form')
            if not form:
                continue  # Skip rows without forms
            cells = row.find_all('td')
            if not cells:
                continue

            # The first cell holds the date mixed with button text,
            # e.g. "Zapiš2. 1. 2025Zapiš".
            date_text = cells[0].get_text(strip=True)
            date_match = re.search(r'(\d{1,2}\.\s*\d{1,2}\.\s*\d{4})', date_text)
            if not date_match:
                logger.debug(f"Skipping row {i} (no date pattern)")
                continue
            # Strip every kind of whitespace, matching the scrapers'
            # normalize_date, so the value compares equal to df["Datum"].
            clean_date = re.sub(r'\s+', '', date_match.group(1))

            # Determine if this is a journey form or refueling form
            form_data = self._extract_form_data(form)
            is_refueling_form = any(
                btn.get("name") == "f_ulozitkm" for btn in form_data["buttons"]
            )

            matching_rows = df[df["Datum"] == clean_date]
            if len(matching_rows) > 0:
                # Row exists in our data - update it
                updates.append(
                    self._prepare_update(
                        form_data,
                        matching_rows.iloc[0],
                        update_mode=True,
                        is_refueling=is_refueling_form,
                    )
                )
                form_type = "refueling" if is_refueling_form else "journey"
                logger.info(f"Matched {form_type} row {i} with date {clean_date}")
            else:
                # Row exists on website but not in our data - delete it
                deletes.append(
                    self._prepare_update(
                        form_data,
                        None,
                        update_mode=False,
                        is_refueling=is_refueling_form,
                    )
                )
                logger.info(f"Will delete row {i} with date {clean_date}")

        if dry_run:
            logger.info("DRY RUN MODE - No data will be submitted")
            return {
                "dry_run": True,
                "month": month,
                "updates_prepared": len(updates),
                "deletes_prepared": len(deletes),
                "updates": updates[:5],  # Show first 5 as sample
                "deletes": deletes[:5]
            }

        # Actually submit the data: all updates first, then all deletes.
        results = {
            "month": month,
            "updates_total": len(updates),
            "deletes_total": len(deletes),
            "updates_successful": 0,
            "updates_failed": 0,
            "deletes_successful": 0,
            "deletes_failed": 0,
            "errors": []
        }
        self._submit_batch(updates, "update", "Updating", results)
        self._submit_batch(deletes, "delete", "Deleting", results)
        return results

    def _submit_batch(self, items: List[Dict[str, Any]], kind: str, verb: str, results: Dict[str, Any]) -> None:
        """Submit prepared forms one by one, recording per-item success or failure in ``results``."""
        total = len(items)
        for position, item in enumerate(items, start=1):
            try:
                logger.info(f"{verb} row {position}/{total}")
                self._submit_form(item)
                results[f"{kind}s_successful"] += 1
                time.sleep(0.5)  # Rate limiting
            except Exception as e:
                # Best effort: one failing row must not abort the rest.
                logger.error(f"Failed to {kind} row {position}: {e}")
                results[f"{kind}s_failed"] += 1
                results["errors"].append({"type": kind, "row": position, "error": str(e)})

    def _extract_form_data(self, form) -> Dict[str, Any]:
        """Extract the form's action/method, its submittable fields and its buttons.

        Unchecked checkboxes and radio buttons are left out of ``fields``
        because a browser would not submit them either.
        """
        form_data = {
            "action": form.get('action', ''),
            "method": form.get('method', 'post'),
            "fields": {},
            "buttons": []
        }

        # Get all input fields
        for input_field in form.find_all('input'):
            name = input_field.get('name', '')
            value = input_field.get('value', '')
            field_type = input_field.get('type', 'text')
            kind = str(field_type).lower()
            is_unchecked_toggle = (
                kind in ('checkbox', 'radio') and input_field.get('checked') is None
            )
            if name and not is_unchecked_toggle:
                form_data["fields"][name] = {
                    "value": value,
                    "type": field_type
                }
            # Track input buttons
            if kind in ('submit', 'button'):
                form_data["buttons"].append({
                    "name": name,
                    "value": value,
                    "type": field_type
                })

        # CRITICAL: Also get <button> elements (not just <input type="submit">)
        for button in form.find_all('button'):
            name = button.get('name', '')
            if not name:
                continue
            form_data["buttons"].append({
                "name": name,
                "value": button.get('value', button.get_text(strip=True)),
                "type": button.get('type', 'submit')
            })
        return form_data

    def _prepare_update(self, form_data: Dict, row_data: pd.Series, update_mode: bool = True, is_refueling: bool = False) -> Dict[str, Any]:
        """Prepare form data with updated values from DataFrame or for deletion

        Args:
            form_data: Extracted form data
            row_data: DataFrame row with journey data (None for delete)
            update_mode: True to update row, False to delete row
            is_refueling: True if this is a refueling form, False if journey form

        Returns:
            Dict with the form ``action``, ``method``, the POST ``data`` and the
            list of available ``buttons``.
        """
        update = {
            "action": form_data["action"],
            "method": form_data["method"],
            "data": {},
            "buttons": form_data.get("buttons", [])
        }

        # Copy the existing data fields. Button-type inputs are skipped: only
        # the one button chosen below may appear in the POST, otherwise a
        # delete button could be sent together with a save button.
        for field_name, field_info in form_data["fields"].items():
            if str(field_info.get("type", "")).lower() in self._BUTTON_TYPES:
                continue
            update["data"][field_name] = field_info["value"]

        if not update_mode:
            # Delete mode - find and add "Smazat" (Delete) button
            delete_added = False
            for button in update["buttons"]:
                button_value = button.get("value", "")
                if "Smazat" in button_value or "smazat" in button.get("name", "").lower():
                    if button.get("name"):
                        update["data"][button["name"]] = button["value"]
                        logger.info(f"Adding DELETE button: {button['name']}={button['value']}")
                        delete_added = True
                    break
            if not delete_added:
                logger.warning(f"No delete button found! Available buttons: {update['buttons']}")
            return update

        # Update mode - handle refueling forms vs journey forms differently
        if is_refueling:
            # Refueling form - fill f_km from the "Tankováno při N [km]"
            # columns. The form does not say whether it is the first or second
            # refuelling of the day, so the current f_km value decides.
            has_first = (
                "Tankováno při 1 [km]" in row_data
                and pd.notna(row_data["Tankováno při 1 [km]"])
            )
            if has_first and "f_km" in update["data"]:
                current_km = update["data"].get("f_km", "0")
                refuel_1_km = int(row_data["Tankováno při 1 [km]"])
                if current_km == "0" or current_km == "":
                    # If f_km is 0 or empty, fill with refuel 1
                    update["data"]["f_km"] = str(refuel_1_km)
                    logger.info(f"Filling refuel km: f_km={refuel_1_km}")
                elif "Tankováno při 2 [km]" in row_data and pd.notna(row_data["Tankováno při 2 [km]"]):
                    refuel_2_km = int(row_data["Tankováno při 2 [km]"])
                    if int(current_km) != refuel_1_km:
                        # This might be the second refueling form
                        update["data"]["f_km"] = str(refuel_2_km)
                        logger.info(f"Filling refuel 2 km: f_km={refuel_2_km}")
        else:
            # Journey form - ONLY update f_ujeto (distance traveled) and let
            # kj.colsys.cz calculate f_cil_km (end km) automatically.
            if "Ujeto [km]" in row_data and pd.notna(row_data["Ujeto [km]"]):
                if "f_ujeto" in update["data"]:
                    update["data"]["f_ujeto"] = str(int(row_data["Ujeto [km]"]))

        # Add button click - look for "Uložit km" or "Přepočítat" buttons
        # (this excludes the "Uzavřít měsíc" button).
        button_added = False
        for button in update["buttons"]:
            button_value = button.get("value", "")
            button_name = button.get("name", "")
            if "Uložit" in button_value or "Přepočítat" in button_value or "ulozit" in button_name.lower():
                # Include the button in the POST data to trigger its action
                if button_name:
                    update["data"][button_name] = button_value
                    logger.info(f"Adding button to POST: {button_name}={button_value}")
                    button_added = True
                break
        if not button_added:
            logger.warning(f"No save button found! Available buttons: {update['buttons']}")
        return update

    def _submit_form(self, update: Dict[str, Any]):
        """Submit a prepared form via POST.

        The form action is resolved against the month page URL the way a
        browser would: absolute URLs are used as-is, relative and ``/``-rooted
        paths are joined, and an empty action posts back to the page itself.

        Raises:
            requests.HTTPError: when the server answers with an error status.
        """
        from urllib.parse import urljoin

        url = urljoin(self.base_url, update["action"])
        response = self.session.post(
            url,
            data=update["data"],
            allow_redirects=False,
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        logger.info(f"Form submitted successfully: {response.status_code}")

View File

@@ -0,0 +1,3 @@
from .journey import Journey, JourneyEntry, RefuelingEntry
__all__ = ["Journey", "JourneyEntry", "RefuelingEntry"]

37
backend/models/journey.py Normal file
View File

@@ -0,0 +1,37 @@
from datetime import datetime
from typing import Optional
from pydantic import BaseModel, Field
class JourneyEntry(BaseModel):
    """One journey record; every field except the date is optional or defaulted."""

    date: str
    start_km: Optional[int] = None
    end_km: Optional[int] = None
    distance_km: Optional[int] = None
    is_sick_day: bool = False
    is_vacation: bool = False
class RefuelingEntry(BaseModel):
    """One refuelling record: date, amount and an optional odometer reading."""

    date: str
    amount_liters: float
    km_at_refuel: Optional[int] = None
class Journey(BaseModel):
    """A month of journeybook data with its start and end odometer values."""

    # Pydantic v2 configuration. The class-based ``Config`` used previously is
    # deprecated in v2; a plain dict avoids needing an extra import.
    model_config = {
        "json_schema_extra": {
            "example": {
                "month": "2024-03",
                "start_km": 12000,
                "end_km": 13500,
                "entries": [],
                "refueling_entries": []
            }
        }
    }

    # Month in YYYY-MM form, enforced by the pattern.
    month: str = Field(..., pattern=r"^\d{4}-\d{2}$")
    # Both odometer values must be strictly positive.
    start_km: int = Field(..., gt=0)
    end_km: int = Field(..., gt=0)
    entries: list[JourneyEntry] = []
    refueling_entries: list[RefuelingEntry] = []

15
backend/requirements.txt Normal file
View File

@@ -0,0 +1,15 @@
fastapi==0.109.0
uvicorn[standard]==0.27.0
pydantic==2.5.3
pydantic-settings==2.1.0
sqlalchemy==2.0.25
alembic==1.13.1
pandas==2.2.0
numpy==1.26.3
requests==2.31.0
beautifulsoup4==4.12.3
playwright==1.41.0
python-dotenv==1.0.0
python-multipart==0.0.6
openpyxl==3.1.2
python-dateutil==2.8.2

View File

@@ -0,0 +1,4 @@
from .attendance_scraper import AttendanceScraper
from .journeybook_scraper import JourneybookScraper
__all__ = ["AttendanceScraper", "JourneybookScraper"]

View File

@@ -0,0 +1,45 @@
import re
import requests
from bs4 import BeautifulSoup
from typing import List
class AttendanceScraper:
    """Scrapes the attendance page for days marked as absence."""

    # Seconds to wait for the remote server before a request is abandoned;
    # without a timeout a stalled server would block the caller indefinitely.
    REQUEST_TIMEOUT = 30

    # Lower-case markers in the presence column that flag a day as absence.
    ABSENCE_MARKERS = ("sick day", "dovolená", "neplacené volno")

    def __init__(self, username: str, password: str):
        """Store the credentials used for HTTP authentication."""
        self.username = username
        self.password = password
        self.base_url = "https://agenda.colsys.cz/dochazka/index.php"

    @staticmethod
    def normalize_date(date_str: str) -> str:
        """Return *date_str* with all whitespace removed."""
        return re.sub(r'\s+', '', date_str)

    def scrape_month(self, month: str) -> List[str]:
        """
        Scrape attendance data for a given month.

        Args:
            month: Month string; "-01" is appended to build the query
                parameter, so presumably YYYY-MM.

        Returns:
            List of whitespace-normalized dates with sick days, vacation, or
            unpaid leave.

        Raises:
            ValueError: if the attendance table is not present on the page.
            requests.HTTPError: when the server answers with an error status.
        """
        url = f"{self.base_url}?kdy={month}-01"
        # NOTE(review): TLS certificate verification is disabled — confirm
        # this is required for the target host.
        response = requests.get(
            url,
            auth=(self.username, self.password),
            verify=False,
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')
        table = soup.find('table', class_='restrikce')
        if not table:
            raise ValueError("Attendance table not found")

        attendance_dates = []
        # The first row is skipped (presumably the header row).
        for row in table.find_all('tr')[1:]:
            cells = row.find_all('td')
            if len(cells) < 3:
                continue
            presence = cells[2].text.strip().lower()
            if any(marker in presence for marker in self.ABSENCE_MARKERS):
                attendance_dates.append(self.normalize_date(cells[0].text.strip()))
        return attendance_dates

View File

@@ -0,0 +1,73 @@
import re
import requests
from bs4 import BeautifulSoup
import pandas as pd
from typing import Dict, Any
class JourneybookScraper:
    """Scrapes the journeybook month view of kj.colsys.cz into a DataFrame."""

    # Seconds to wait for the remote server before a request is abandoned;
    # without a timeout a stalled server would block the caller indefinitely.
    REQUEST_TIMEOUT = 30

    # Refuelled amount inside a refuelling row; accepts both "40.5" and "40".
    REFUEL_AMOUNT_PATTERN = re.compile(r'natankováno\s(\d+(?:\.\d+)?)\s\[l\|kg\]')

    # Date as rendered in the table, e.g. "2. 1. 2025".
    DATE_PATTERN = re.compile(r'\d{1,2}\.\s\d{1,2}\.\s\d{4}')

    def __init__(self, username: str, password: str, vehicle_registration: str = "4SH1148"):
        """Store the credentials and the vehicle whose journeybook is scraped."""
        self.username = username
        self.password = password
        self.vehicle_registration = vehicle_registration
        self.base_url = "https://kj.colsys.cz/prehled_mesic.php"

    @staticmethod
    def normalize_date(date_str: str) -> str:
        """Return *date_str* with all whitespace removed."""
        return re.sub(r'\s+', '', date_str)

    def scrape_month(self, month: str) -> pd.DataFrame:
        """
        Scrape journeybook data for a given month.

        Journey rows (table rows containing a form) yield the date and the km
        columns with an empty refuelling amount. Rows whose text contains
        "Tankováno" yield empty journey columns plus the refuelled amount.

        Args:
            month: Month string; "-01" is appended to build the query
                parameter, so presumably YYYY-MM.

        Returns:
            DataFrame with columns: Datum, Počáteční stav, Koncový stav,
            Ujeto [km], Natankováno [l|kg]

        Raises:
            ValueError: if the table or one of the required columns is missing.
            requests.HTTPError: when the server answers with an error status.
        """
        url = f"{self.base_url}?rz={self.vehicle_registration}&den={month}-01"
        # NOTE(review): TLS certificate verification is disabled — confirm
        # this is required for the target host.
        response = requests.get(
            url,
            auth=(self.username, self.password),
            verify=False,
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')
        table = soup.find('table', class_='table table-striped table-bordered table-condensed table-sm')
        if not table:
            raise ValueError("Journeybook table not found")

        # Header texts are compared with spaces removed.
        headers = [
            th.text.strip().replace(" ", "")
            for th in table.find('thead').find_all('th')
        ]
        columns_to_keep = ["Datum", "Počátečnístav", "Koncovýstav", "Ujeto[km]"]
        new_headers = ["Datum", "Počáteční stav", "Koncový stav", "Ujeto [km]", "Natankováno [l|kg]"]
        for col in columns_to_keep:
            if col not in headers:
                raise ValueError(f"Column '{col}' not found. Headers: {headers}")
        # Resolve the column positions once instead of once per row.
        keep_positions = [headers.index(col) for col in columns_to_keep]

        rows = []
        for row in table.find('tbody').find_all('tr'):
            if "Tankováno" in row.text:
                amount_match = self.REFUEL_AMOUNT_PATTERN.search(row.text.strip())
                amount = amount_match.group(1) if amount_match else ""
                rows.append([""] * len(columns_to_keep) + [amount])
            elif row.find('form'):
                cells = []
                for cell in row.find_all('td'):
                    input_field = cell.find('input')
                    if input_field:
                        # Editable cells carry their value in the <input>.
                        cells.append(input_field.get('value', ''))
                        continue
                    text = cell.text.strip()
                    # Guard the header lookup so a row with more cells than
                    # headers does not raise IndexError.
                    in_date_column = (
                        len(cells) < len(headers) and headers[len(cells)] == "Datum"
                    )
                    if in_date_column:
                        date_match = self.DATE_PATTERN.search(text)
                        if date_match:
                            text = self.normalize_date(date_match.group())
                    cells.append(text)
                rows.append([cells[pos] for pos in keep_positions] + [""])

        df = pd.DataFrame(rows, columns=new_headers)
        return df