"""Detect VV sheets in an Excel workbook and find items with inconsistent unit prices across them. A "VV" sheet is identified by either: - Its name contains "VV" (case-insensitive), OR - It has a typical VV header row with columns matching Poř./Kód/Popis/MJ/Výměra/cena. Items are matched by their description text (normalised: trimmed, multiple spaces collapsed). The unit-price comparison is exact (rounded to 2 decimals to absorb floating-point noise). """ import logging import re from collections import defaultdict from pathlib import Path from typing import Iterable import openpyxl from openpyxl.utils import get_column_letter from openpyxl.styles import Alignment, Border, Font, PatternFill, Side from openpyxl.workbook import Workbook logger = logging.getLogger(__name__) # Heuristic header keywords (Czech). We look for the row with at least three of these. HEADER_HINTS = { "popis": ["popis", "název", "název položky", "naziv"], "mj": ["mj", "j.j.", "jednotka", "měrná jednotka"], "vymera": ["výměra", "vymera", "množství", "mnozstvi", "počet", "pocet"], # Unit-price column. Common Czech spellings include "Jedn. cena", # "J. cena", "Jednotková cena", "cena/jed", "kč/mj", ... "cena_jed": ["jednotková cena", "jednotkova cena", "jedn. cena", "jedn.cena", "jedn cena", "j. cena", "j.cena", "j cena", "jed. cena", "jed.cena", "jed cena", "cena/jed", "cena za jednotku", "cena j.", "cena jed", "cena za mj", "kč/mj"], "cena_tot": ["cena celkem", "cena", "celkem"], } # Allowed VV-name patterns (case-insensitive substring match). 
VV_NAME_PATTERNS = ["vv", "výkaz", "vykaz"]


def normalise(text) -> str:
    """Return *text* trimmed, lower-cased and with whitespace runs collapsed.

    ``None`` becomes the empty string; any other value is converted with
    ``str()`` first, so numbers and other cell values are accepted too.
    """
    if text is None:
        return ""
    return re.sub(r"\s+", " ", str(text).strip()).lower()


def is_vv_sheet(ws) -> tuple[bool, dict | None]:
    """Return (is_vv, header_columns) where header_columns maps role → col index (1-based).

    A sheet counts as VV when its title contains one of ``VV_NAME_PATTERNS``
    or when a header row is recognised.  A header row needs at least three
    matched roles, including "popis" and one of the price roles.

    ``header_columns`` is ``None`` when no header row was found (the sheet may
    still be reported as VV because of its name).  Otherwise it additionally
    carries the key ``"_header_row"`` with the 1-based row index of the header.
    """
    name_match = any(p in ws.title.lower() for p in VV_NAME_PATTERNS)

    # Scan first 12 rows (and first 14 columns) for a header row.
    header_cols: dict[str, int] | None = None
    for row_idx in range(1, min(13, ws.max_row + 1)):
        matched_roles: dict[str, int] = {}
        for col_idx in range(1, min(15, ws.max_column + 1)):
            val = normalise(ws.cell(row=row_idx, column=col_idx).value)
            if not val:
                continue  # an empty cell cannot match any hint
            for role, hints in HEADER_HINTS.items():
                if role in matched_roles:
                    continue
                # startswith() also covers exact equality; first role wins.
                if val.startswith(tuple(hints)):
                    matched_roles[role] = col_idx
                    break

        has_price_col = "cena_jed" in matched_roles or "cena_tot" in matched_roles
        if len(matched_roles) >= 3 and "popis" in matched_roles and has_price_col:
            matched_roles["_header_row"] = row_idx
            header_cols = matched_roles
            break

    return (name_match or header_cols is not None), header_cols


def extract_items(ws, header_cols: dict) -> list[dict]:
    """Return the item dicts of a VV sheet given its header columns.

    Items are returned even when unit_price is missing (None) so the UI can
    report "this sheet is a VV but has no prices" instead of silently dropping
    everything.

    Each item has the keys ``row``, ``description``, ``description_norm``,
    ``mj``, ``quantity`` and ``unit_price`` (rounded to 2 decimals, or None).
    An empty list is returned when ``header_cols`` has no "popis" column.
    """
    header_row = header_cols.get("_header_row", 1)
    popis_col = header_cols.get("popis")
    mj_col = header_cols.get("mj")
    vymera_col = header_cols.get("vymera")
    cena_jed_col = header_cols.get("cena_jed")
    if not popis_col:
        return []

    items: list[dict] = []
    for r in range(header_row + 1, ws.max_row + 1):
        popis = ws.cell(row=r, column=popis_col).value
        if popis is None or not str(popis).strip():
            continue
        popis_text = str(popis).strip()

        # Skip section rows like "001: Rozvaděče" — empty MJ + colon in popis
        mj_val = ws.cell(row=r, column=mj_col).value if mj_col else None
        if not mj_val and ":" in popis_text and len(popis_text) < 60:
            continue

        up: float | None = None
        if cena_jed_col:
            up = _to_float(ws.cell(row=r, column=cena_jed_col).value)
            if up is not None:
                up = round(up, 2)

        quantity = None
        if vymera_col:
            quantity = _to_float(ws.cell(row=r, column=vymera_col).value)

        items.append({
            "row": r,
            "description": popis_text,
            "description_norm": normalise(popis_text),
            "mj": str(mj_val).strip() if mj_val else "",
            "quantity": quantity,
            "unit_price": up,
        })
    return items


def _to_float(v):
    """Convert a cell value to a finite float, or return None.

    Returns None for ``None``, empty/blank strings, booleans (an Excel
    TRUE/FALSE is not a price or quantity), values that cannot be parsed and
    non-finite results (NaN / ±inf would poison the price comparison).

    Text cells may use Czech number formatting: whitespace used as a thousands
    separator (including non-breaking spaces) is removed and a single decimal
    comma is accepted, e.g. ``"1 234,50"`` → ``1234.5``.
    """
    if v is None or isinstance(v, bool):
        return None
    if isinstance(v, str):
        # str.split() splits on every Unicode whitespace char, NBSP included.
        v = "".join(v.split())
        if not v:
            return None
        if v.count(",") == 1 and "." not in v:
            v = v.replace(",", ".")
    try:
        number = float(v)
    except (ValueError, TypeError, OverflowError):
        return None
    # The chained comparison is False for NaN as well as for ±inf.
    if not float("-inf") < number < float("inf"):
        return None
    return number


def _find_inconsistencies(vv_items: dict) -> list[dict]:
    """Return descriptions priced differently on two or more sheets.

    ``vv_items`` maps sheet name → list of item dicts as produced by
    ``extract_items``.  The result is sorted by description (case-insensitive).
    """
    # Only items WITH a unit price participate in the price-consistency check
    grouped: dict[str, list[tuple[str, dict]]] = defaultdict(list)
    for sheet_name, items in vv_items.items():
        for it in items:
            if it["unit_price"] is not None:
                grouped[it["description_norm"]].append((sheet_name, it))

    # Inconsistencies: same description appearing in 2+ sheets with different price
    inconsistencies = []
    for entries in grouped.values():
        if len({sheet for sheet, _ in entries}) < 2:
            continue  # single sheet only — not a cross-sheet issue
        prices = {round(it["unit_price"], 2) for _, it in entries}
        if len(prices) < 2:
            continue
        inconsistencies.append({
            # Use the longest seen description as canonical (more readable)
            "description": max((it["description"] for _, it in entries), key=len),
            "occurrences": len(entries),
            "distinct_prices": sorted(prices),
            "rows": [
                {
                    "sheet": sheet_name,
                    "row": it["row"],
                    "mj": it["mj"],
                    "unit_price": it["unit_price"],
                }
                for sheet_name, it in entries
            ],
        })

    # Sort by description for stable output
    inconsistencies.sort(key=lambda x: x["description"].lower())
    return inconsistencies


def analyse(xlsx_path: Path) -> dict:
    """Run the full price-check analysis. Returns a structured report.

    The workbook is loaded with ``data_only=True`` so formula cells yield
    their cached values.  The report contains:

    - ``sheets``: one info dict per worksheet (``name``, ``is_vv``, ``items``,
      ``priced_items``, ``has_unit_price_col``),
    - ``vv_sheet_count`` and ``vv_sheets_with_prices``,
    - ``total_inconsistencies`` and the ``inconsistencies`` list itself.
    """
    wb = openpyxl.load_workbook(xlsx_path, data_only=True)

    sheets_info = []
    vv_items: dict[str, list[dict]] = {}
    for ws in wb.worksheets:
        is_vv, header_cols = is_vv_sheet(ws)
        info = {
            "name": ws.title,
            "is_vv": bool(is_vv),
            "items": 0,           # total rows recognised as items
            "priced_items": 0,    # items with a unit price filled in
            "has_unit_price_col": False,
        }
        # A sheet recognised by name only has no header and thus no items.
        if is_vv and header_cols:
            info["has_unit_price_col"] = bool(header_cols.get("cena_jed"))
            items = extract_items(ws, header_cols)
            info["items"] = len(items)
            info["priced_items"] = sum(
                1 for it in items if it["unit_price"] is not None
            )
            vv_items[ws.title] = items
        sheets_info.append(info)

    inconsistencies = _find_inconsistencies(vv_items)
    vv_sheets_with_prices = sum(
        1 for s in sheets_info if s["is_vv"] and s["priced_items"] > 0
    )
    return {
        "sheets": sheets_info,
        "vv_sheet_count": sum(1 for s in sheets_info if s["is_vv"]),
        "vv_sheets_with_prices": vv_sheets_with_prices,
        "total_inconsistencies": len(inconsistencies),
        "inconsistencies": inconsistencies,
    }


# ── Excel report writer ─────────────────────────────────────────────
BLUE = "1F4E78"
WHITE = "FFFFFF"
GRAY = "F2F2F2"
RED_BG = "FCE4E4"
THIN = Side(style="thin", color="BFBFBF")
BORDER = Border(left=THIN, right=THIN, top=THIN, bottom=THIN)


def _write_inconsistency_sheet(ws, result: dict, source_filename: str) -> None:
    """Fill *ws* with the title, header and one row per inconsistent item."""
    # Title row
    ws.cell(row=1, column=1, value="Kontrola jednotkových cen ve výkazech výměr").font = \
        Font(name="Arial", bold=True, size=14, color=BLUE)
    ws.merge_cells("A1:F1")
    ws.cell(row=2, column=1, value=f"Zdroj: {source_filename}").font = \
        Font(name="Arial", italic=True, size=10, color="595959")
    ws.merge_cells("A2:F2")

    # Header
    headers = ["Název položky", "List VV", "Řádek", "MJ", "Jednotková cena", "Poznámka"]
    header_font = Font(name="Arial", bold=True, size=11, color=WHITE)
    header_fill = PatternFill("solid", fgColor=BLUE)
    for c, h in enumerate(headers, 1):
        cell = ws.cell(row=4, column=c, value=h)
        cell.font = header_font
        cell.fill = header_fill
        cell.alignment = Alignment(horizontal="center", vertical="center")
        cell.border = BORDER

    row = 5
    if not result["inconsistencies"]:
        ws.cell(row=row, column=1,
                value="Žádné nesoulady — všechny položky se stejným názvem mají shodné jednotkové ceny.").font = \
            Font(name="Arial", size=11, color="006100", italic=True)
        ws.merge_cells(start_row=row, start_column=1, end_row=row, end_column=6)
    else:
        body_font = Font(name="Arial", size=10)
        highlight = PatternFill("solid", fgColor=RED_BG)
        for inc in result["inconsistencies"]:
            prices = [r["unit_price"] for r in inc["rows"]]
            min_price, max_price = min(prices), max(prices)
            for r_info in inc["rows"]:
                price = r_info["unit_price"]
                note_parts = []
                if price == min_price:
                    note_parts.append("nejnižší")
                if price == max_price:
                    note_parts.append("nejvyšší")
                values = [
                    inc["description"],
                    r_info["sheet"],
                    r_info["row"],
                    r_info["mj"],
                    price,
                    ", ".join(note_parts),
                ]
                # Highlight rows with the highest price as a visual cue
                is_highest = price == max_price and min_price != max_price
                for c, v in enumerate(values, 1):
                    cell = ws.cell(row=row, column=c, value=v)
                    cell.font = body_font
                    cell.border = BORDER
                    if c == 5:
                        cell.number_format = '#,##0.00 "Kč";[Red]-#,##0.00 "Kč";"-"'
                        cell.alignment = Alignment(horizontal="right", vertical="top")
                    else:
                        cell.alignment = Alignment(vertical="top", wrap_text=(c == 1))
                    if is_highest:
                        cell.fill = highlight
                row += 1

    # Column widths
    widths = {1: 56, 2: 22, 3: 8, 4: 8, 5: 16, 6: 16}
    for c, w in widths.items():
        ws.column_dimensions[get_column_letter(c)].width = w
    ws.freeze_panes = "A5"
    # The filter always spans at least the header plus one row.
    ws.auto_filter.ref = f"A4:F{max(5, row - 1)}"


def _write_overview_sheet(s2, sheets: list) -> None:
    """Fill *s2* with one row per workbook sheet and its detection status."""
    s2.cell(row=1, column=1, value="Přehled listů v sešitu").font = \
        Font(name="Arial", bold=True, size=14, color=BLUE)
    s2.merge_cells("A1:D1")

    s2_headers = ["Název listu", "Je VV?", "Počet položek s cenou", "Poznámka"]
    header_font = Font(name="Arial", bold=True, size=11, color=WHITE)
    header_fill = PatternFill("solid", fgColor=BLUE)
    for c, h in enumerate(s2_headers, 1):
        cell = s2.cell(row=3, column=c, value=h)
        cell.font = header_font
        cell.fill = header_fill
        cell.border = BORDER

    body_font = Font(name="Arial", size=10)
    for r, s in enumerate(sheets, 4):
        priced = s.get("priced_items", 0)
        note = ""
        if s["is_vv"] and priced == 0:
            note = "list VV bez jednotkových cen — nelze kontrolovat"
        elif not s["is_vv"]:
            note = "neidentifikován jako VV"
        values = [s["name"], "Ano" if s["is_vv"] else "Ne", priced, note]
        for c, v in enumerate(values, 1):
            cell = s2.cell(row=r, column=c, value=v)
            cell.border = BORDER
            cell.font = body_font

    for c, w in {1: 30, 2: 10, 3: 22, 4: 40}.items():
        s2.column_dimensions[get_column_letter(c)].width = w


def write_report(result: dict, source_filename: str, out_path: Path) -> Path:
    """Write the analysis *result* as an Excel report and return *out_path*.

    The workbook has two sheets: "Nesoulady" lists every inconsistent item
    occurrence (rows with the highest price are highlighted) and
    "Detekované listy" summarises every sheet of the source workbook.

    ``result`` is the dict returned by ``analyse``; ``source_filename`` is
    only shown in the report's subtitle.
    """
    wb = Workbook()
    ws = wb.active
    ws.title = "Nesoulady"
    _write_inconsistency_sheet(ws, result, source_filename)

    # ── Second sheet: per-sheet breakdown ─────────────────────
    _write_overview_sheet(wb.create_sheet("Detekované listy"), result["sheets"])

    wb.save(str(out_path))
    return out_path