import re
from typing import List, Union

import requests
from bs4 import BeautifulSoup


class AttendanceScraper:
    """Fetch a monthly attendance page and extract the dates marked as absences.

    The page is requested with HTTP basic authentication and parsed with
    BeautifulSoup; rows of the ``table.restrikce`` element are inspected for
    absence keywords.
    """

    # Lower-case substrings looked for in the presence column:
    # sick day, vacation ("dovolená") and unpaid leave ("neplacené volno").
    _ABSENCE_KEYWORDS = ("sick day", "dovolená", "neplacené volno")

    def __init__(
        self,
        username: str,
        password: str,
        *,
        timeout: float = 30.0,
        verify: Union[bool, str] = False,
    ):
        """Store credentials and request settings.

        Args:
            username: User name for HTTP basic authentication.
            password: Password for HTTP basic authentication.
            timeout: Seconds passed as the ``timeout`` argument of
                ``requests.get`` so a stalled server cannot block forever.
            verify: Passed as the ``verify`` argument of ``requests.get``.
                Defaults to ``False`` (no TLS certificate verification) to
                keep the historical behaviour of this class.
        """
        self.username = username
        self.password = password
        self.timeout = timeout
        # SECURITY NOTE: with verify=False the server certificate is not
        # checked, so the basic-auth credentials can be intercepted by a
        # man-in-the-middle. Pass verify=True (or a CA bundle path) when the
        # server certificate can be validated.
        self.verify = verify
        self.base_url = "https://agenda.colsys.cz/dochazka/index.php"

    @staticmethod
    def normalize_date(date_str: str) -> str:
        """Return *date_str* with every whitespace character removed."""
        return re.sub(r'\s+', '', date_str)

    def scrape_month(self, month: str) -> List[str]:
        """Scrape attendance data for a given month.

        Args:
            month: Month identifier interpolated into the query string as
                ``kdy={month}-01`` (presumably ``YYYY-MM``; the value is not
                validated here).

        Returns:
            The whitespace-free dates (first column) of every table row whose
            third column mentions a sick day, vacation, or unpaid leave.

        Raises:
            requests.RequestException: If the request fails, times out, or
                the server answers with an error status.
            ValueError: If the page contains no ``table.restrikce`` element.
        """
        url = f"{self.base_url}?kdy={month}-01"
        response = requests.get(
            url,
            auth=(self.username, self.password),
            timeout=self.timeout,
            verify=self.verify,
        )
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')
        table = soup.find('table', class_='restrikce')
        if table is None:
            raise ValueError("Attendance table not found")

        attendance_dates = []
        # The first row is skipped as the table header.
        for row in table.find_all('tr')[1:]:
            cells = row.find_all('td')
            if len(cells) < 3:
                continue
            # Collapse whitespace runs (including non-breaking spaces and
            # line breaks) to single spaces so multi-word keywords still
            # match, and lower-case once instead of once per keyword.
            presence = " ".join(cells[2].text.split()).lower()
            if any(keyword in presence for keyword in self._ABSENCE_KEYWORDS):
                attendance_dates.append(self.normalize_date(cells[0].text))
        return attendance_dates