# AI_portal/vv-check/vv_logic.py

"""Detect VV sheets in an Excel workbook and find items with inconsistent
unit prices across them.

A "VV" sheet is identified by either:
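  - Its name contains "VV", "výkaz" or "vykaz" (case-insensitive), OR
  - One of its first 12 rows looks like a VV header: at least three recognised
    columns (Popis / MJ / Výměra / jednotková cena / cena celkem), including
    Popis and at least one price column.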

Items are matched by their description text (normalised: trimmed, whitespace
runs collapsed, lower-cased). Only descriptions that occur in two or more
sheets are compared. The unit-price comparison is exact after rounding to
2 decimals (to absorb floating-point noise).
"""
import logging
import math
import re
from collections import defaultdict
from pathlib import Path
from typing import Iterable

import openpyxl
from openpyxl.styles import Alignment, Border, Font, PatternFill, Side
from openpyxl.utils import get_column_letter
from openpyxl.workbook import Workbook

logger = logging.getLogger(__name__)

# Header keywords (Czech) per column role. A row counts as the header when at
# least three roles are recognised in it. Dict order matters: roles are tried
# in this order for every cell, so the unit-price spellings are checked before
# the much broader total-price hints ("cena", "celkem").
HEADER_HINTS = {
    # Item description
    "popis": [
        "popis",
        "název",
        "název položky",
        "naziv",
    ],
    # Unit of measure
    "mj": [
        "mj",
        "j.j.",
        "jednotka",
        "měrná jednotka",
    ],
    # Quantity
    "vymera": [
        "výměra",
        "vymera",
        "množství",
        "mnozstvi",
        "počet",
        "pocet",
    ],
    # Unit price. Common Czech spellings include "Jedn. cena", "J. cena",
    # "Jednotková cena", "cena/jed", "kč/mj", ...
    "cena_jed": [
        "jednotková cena",
        "jednotkova cena",
        "jedn. cena",
        "jedn.cena",
        "jedn cena",
        "j. cena",
        "j.cena",
        "j cena",
        "jed. cena",
        "jed.cena",
        "jed cena",
        "cena/jed",
        "cena za jednotku",
        "cena j.",
        "cena jed",
        "cena za mj",
        "kč/mj",
    ],
    # Total price
    "cena_tot": [
        "cena celkem",
        "cena",
        "celkem",
    ],
}

# Sheet-name fragments that mark a sheet as a VV (matched as lower-case
# substrings of the sheet title).
VV_NAME_PATTERNS = [
    "vv",
    "výkaz",
    "vykaz",
]


def normalise(text) -> str:
    """Return *text* as a comparison key.

    The value is converted with ``str()``, stripped of surrounding
    whitespace, has every run of whitespace replaced by a single space and
    is lower-cased. ``None`` yields the empty string.
    """
    if text is None:
        return ""
    trimmed = str(text).strip()
    single_spaced = re.sub(r"\s+", " ", trimmed)
    return single_spaced.lower()


def is_vv_sheet(ws) -> tuple[bool, dict | None]:
    """Decide whether *ws* is a VV sheet and locate its header columns.

    Rows 1-12 and columns 1-14 are scanned. Each normalised cell text is
    assigned to the first not-yet-claimed role in ``HEADER_HINTS`` one of
    whose hints it starts with. A row is accepted as the header when it
    yields at least three roles, including ``"popis"`` and at least one of
    ``"cena_jed"`` / ``"cena_tot"``.

    Returns:
        ``(True, columns)`` when a header row is found, where ``columns``
        maps role → 1-based column index plus ``"_header_row"`` → row index.
        Otherwise ``(name_match, None)``, where ``name_match`` tells whether
        the sheet title contains one of ``VV_NAME_PATTERNS``.
    """
    name_match = any(pattern in ws.title.lower() for pattern in VV_NAME_PATTERNS)

    for row_idx in range(1, min(13, ws.max_row + 1)):
        roles: dict[str, int] = {}
        for col_idx in range(1, min(15, ws.max_column + 1)):
            text = normalise(ws.cell(row=row_idx, column=col_idx).value)
            # First unclaimed role whose hint list has a prefix of this cell.
            role = next(
                (name for name, hints in HEADER_HINTS.items()
                 if name not in roles and text.startswith(tuple(hints))),
                None,
            )
            if role is not None:
                roles[role] = col_idx

        has_price_column = "cena_jed" in roles or "cena_tot" in roles
        if len(roles) >= 3 and "popis" in roles and has_price_column:
            roles["_header_row"] = row_idx
            return True, roles

    return name_match, None


def extract_items(ws, header_cols: dict) -> list[dict]:
    """Return the item rows found below the header of a VV sheet.

    *header_cols* is the role → column mapping produced by ``is_vv_sheet``
    (``"_header_row"`` defaults to 1 when absent). Rows with an empty
    description are skipped, as are section headings — rows with no unit of
    measure whose description is shorter than 60 characters and contains a
    colon, e.g. "001: Rozvaděče".

    Items are returned even when their unit price is missing (``None``) so
    the caller can report "this sheet is a VV but has no prices" instead of
    silently dropping everything. Unit prices are rounded to 2 decimals.

    Each item is a dict with the keys ``row``, ``description``,
    ``description_norm``, ``mj``, ``quantity`` and ``unit_price``. An empty
    list is returned when there is no description column.
    """
    description_col = header_cols.get("popis")
    if not description_col:
        return []
    first_data_row = header_cols.get("_header_row", 1) + 1
    unit_col = header_cols.get("mj")
    quantity_col = header_cols.get("vymera")
    price_col = header_cols.get("cena_jed")

    def value_at(row_no, col_no):
        return ws.cell(row=row_no, column=col_no).value

    collected: list[dict] = []
    for row_no in range(first_data_row, ws.max_row + 1):
        raw_description = value_at(row_no, description_col)
        if raw_description is None:
            continue
        description = str(raw_description).strip()
        if not description:
            continue

        unit = value_at(row_no, unit_col) if unit_col else None
        is_section_heading = (
            not unit and ":" in description and len(description) < 60
        )
        if is_section_heading:
            continue

        price: float | None = None
        if price_col:
            price = _to_float(value_at(row_no, price_col))
            if price is not None:
                price = round(price, 2)

        quantity = _to_float(value_at(row_no, quantity_col)) if quantity_col else None

        collected.append({
            "row": row_no,
            "description": description,
            "description_norm": normalise(description),
            "mj": str(unit).strip() if unit else "",
            "quantity": quantity,
            "unit_price": price,
        })
    return collected


def _to_float(v):
    if v is None or v == "":
        return None
    try:
        return float(v)
    except (ValueError, TypeError):
        return None


def analyse(xlsx_path: Path) -> dict:
    """Run the full price-check analysis on a workbook.

    The workbook is loaded with ``data_only=True``, so formula cells are read
    as their cached values. Every worksheet is classified with
    ``is_vv_sheet``; sheets with a detected header row have their items
    extracted. Items that carry a unit price are grouped by normalised
    description, and a group is reported when it spans at least two sheets
    and holds at least two distinct prices (rounded to 2 decimals).

    Returns a dict with the keys:
        sheets: one summary dict per worksheet (``name``, ``is_vv``,
            ``items``, ``priced_items``, ``has_unit_price_col``).
        vv_sheet_count: number of sheets classified as VV.
        vv_sheets_with_prices: VV sheets with at least one priced item.
        total_inconsistencies: length of ``inconsistencies``.
        inconsistencies: list sorted by lower-cased description; each entry
            has ``description``, ``occurrences``, ``distinct_prices`` and
            ``rows`` (``sheet``, ``row``, ``mj``, ``unit_price``).
    """
    workbook = openpyxl.load_workbook(xlsx_path, data_only=True)

    sheet_reports = []
    # normalised description → [(sheet title, item), ...]; priced items only
    by_description: dict[str, list[tuple[str, dict]]] = defaultdict(list)

    for sheet in workbook.worksheets:
        detected, columns = is_vv_sheet(sheet)
        report = {
            "name": sheet.title,
            "is_vv": bool(detected),
            "items": 0,            # total rows recognised as items
            "priced_items": 0,     # items with a unit price filled in
            "has_unit_price_col": False,
        }
        sheet_reports.append(report)
        # A sheet matched by name only has no header columns to read from.
        if not (detected and columns):
            continue

        found = extract_items(sheet, columns)
        priced = [item for item in found if item["unit_price"] is not None]
        report["has_unit_price_col"] = bool(columns.get("cena_jed"))
        report["items"] = len(found)
        report["priced_items"] = len(priced)
        for item in priced:
            by_description[item["description_norm"]].append((sheet.title, item))

    conflicts = []
    for occurrences in by_description.values():
        # Repeats inside a single sheet are not a cross-sheet issue.
        if len({sheet_name for sheet_name, _ in occurrences}) < 2:
            continue
        distinct = {round(item["unit_price"], 2) for _, item in occurrences}
        if len(distinct) < 2:
            continue
        conflicts.append({
            # The longest spelling seen is the most readable one.
            "description": max(
                (item["description"] for _, item in occurrences), key=len
            ),
            "occurrences": len(occurrences),
            "distinct_prices": sorted(distinct),
            "rows": [
                {
                    "sheet": sheet_name,
                    "row": item["row"],
                    "mj": item["mj"],
                    "unit_price": item["unit_price"],
                }
                for sheet_name, item in occurrences
            ],
        })

    # Stable, case-insensitive ordering of the output.
    conflicts.sort(key=lambda entry: entry["description"].lower())

    vv_total = sum(1 for rep in sheet_reports if rep["is_vv"])
    vv_priced = sum(
        1 for rep in sheet_reports if rep["is_vv"] and rep["priced_items"] > 0
    )
    return {
        "sheets": sheet_reports,
        "vv_sheet_count": vv_total,
        "vv_sheets_with_prices": vv_priced,
        "total_inconsistencies": len(conflicts),
        "inconsistencies": conflicts,
    }


# ── Excel report writer ─────────────────────────────────────────────

# Colours are hex RGB strings handed to the openpyxl style objects in write_report.
BLUE = "1F4E78"     # title text and header-row fill
WHITE = "FFFFFF"    # header-row text
GRAY = "F2F2F2"     # NOTE(review): not referenced anywhere in this file as shown
RED_BG = "FCE4E4"   # fill for rows carrying the highest price of a group
THIN = Side(style="thin", color="BFBFBF")  # thin light-grey border edge
BORDER = Border(left=THIN, right=THIN, top=THIN, bottom=THIN)  # same edge on all four sides


def _fill_inconsistency_sheet(ws, result: dict, source_filename: str) -> None:
    """Fill *ws* ("Nesoulady") with one row per occurrence of a conflicting item."""
    ws.title = "Nesoulady"

    # Title and source lines, each merged across the six report columns.
    title_cell = ws.cell(row=1, column=1,
                         value="Kontrola jednotkových cen ve výkazech výměr")
    title_cell.font = Font(name="Arial", bold=True, size=14, color=BLUE)
    ws.merge_cells("A1:F1")
    source_cell = ws.cell(row=2, column=1, value=f"Zdroj: {source_filename}")
    source_cell.font = Font(name="Arial", italic=True, size=10, color="595959")
    ws.merge_cells("A2:F2")

    # Column header row.
    header_font = Font(name="Arial", bold=True, size=11, color=WHITE)
    header_fill = PatternFill("solid", fgColor=BLUE)
    header_alignment = Alignment(horizontal="center", vertical="center")
    headers = ["Název položky", "List VV", "Řádek", "MJ", "Jednotková cena", "Poznámka"]
    for col, text in enumerate(headers, 1):
        cell = ws.cell(row=4, column=col, value=text)
        cell.font = header_font
        cell.fill = header_fill
        cell.alignment = header_alignment
        cell.border = BORDER

    # Style objects are identical for every data cell, so build them once
    # instead of once per cell.
    body_font = Font(name="Arial", size=10)
    plain_alignment = Alignment(vertical="top", wrap_text=False)
    wrapped_alignment = Alignment(vertical="top", wrap_text=True)
    price_alignment = Alignment(horizontal="right", vertical="top")
    price_format = '#,##0.00 "Kč";[Red]-#,##0.00 "Kč";"-"'
    highlight_fill = PatternFill("solid", fgColor=RED_BG)

    row = 5  # first data row; after the loop it points one past the last one
    inconsistencies = result["inconsistencies"]
    if not inconsistencies:
        message_cell = ws.cell(
            row=row, column=1,
            value="Žádné nesoulady — všechny položky se stejným názvem mají shodné jednotkové ceny.")
        message_cell.font = Font(name="Arial", size=11, color="006100", italic=True)
        ws.merge_cells(start_row=row, start_column=1, end_row=row, end_column=6)

    for inc in inconsistencies:
        group_prices = [occ["unit_price"] for occ in inc["rows"]]
        min_price, max_price = min(group_prices), max(group_prices)
        for occ in inc["rows"]:
            price = occ["unit_price"]
            labels = []
            if price == min_price:
                labels.append("nejnižší")
            if price == max_price:
                labels.append("nejvyšší")
            # Rows with the highest price get a red background as a visual cue.
            highlight = price == max_price and min_price != max_price

            values = [
                inc["description"],
                occ["sheet"],
                occ["row"],
                occ["mj"],
                price,
                ", ".join(labels),
            ]
            for col, value in enumerate(values, 1):
                cell = ws.cell(row=row, column=col, value=value)
                cell.font = body_font
                cell.border = BORDER
                if col == 5:
                    cell.number_format = price_format
                    cell.alignment = price_alignment
                else:
                    # Only the description column wraps.
                    cell.alignment = wrapped_alignment if col == 1 else plain_alignment
                if highlight:
                    cell.fill = highlight_fill
            row += 1

    for col, width in {1: 56, 2: 22, 3: 8, 4: 8, 5: 16, 6: 16}.items():
        ws.column_dimensions[get_column_letter(col)].width = width
    ws.freeze_panes = "A5"
    # The filter always spans at least the header and the first body row.
    ws.auto_filter.ref = f"A4:F{max(5, row - 1)}"


def _fill_overview_sheet(ws, sheets: list) -> None:
    """Fill *ws* with one row per source sheet: name, VV flag, priced-item count, note."""
    ws.cell(row=1, column=1, value="Přehled listů v sešitu").font = \
        Font(name="Arial", bold=True, size=14, color=BLUE)
    ws.merge_cells("A1:D1")

    header_font = Font(name="Arial", bold=True, size=11, color=WHITE)
    header_fill = PatternFill("solid", fgColor=BLUE)
    headers = ["Název listu", "Je VV?", "Počet položek s cenou", "Poznámka"]
    for col, text in enumerate(headers, 1):
        cell = ws.cell(row=3, column=col, value=text)
        cell.font = header_font
        cell.fill = header_fill
        cell.border = BORDER

    body_font = Font(name="Arial", size=10)
    for row, sheet in enumerate(sheets, 4):
        priced = sheet.get("priced_items", 0)
        if sheet["is_vv"] and priced == 0:
            note = "list VV bez jednotkových cen — nelze kontrolovat"
        elif not sheet["is_vv"]:
            note = "neidentifikován jako VV"
        else:
            note = ""
        values = [sheet["name"], "Ano" if sheet["is_vv"] else "Ne", priced, note]
        for col, value in enumerate(values, 1):
            cell = ws.cell(row=row, column=col, value=value)
            cell.border = BORDER
            cell.font = body_font

    for col, width in {1: 30, 2: 10, 3: 22, 4: 40}.items():
        ws.column_dimensions[get_column_letter(col)].width = width


def write_report(result: dict, source_filename: str, out_path: Path) -> Path:
    """Write the analysis *result* as a formatted two-sheet Excel workbook.

    Sheet "Nesoulady" lists every occurrence of every inconsistent item
    (description, sheet, row, unit, unit price, lowest/highest note) and
    highlights the rows carrying the highest price of their group. Sheet
    "Detekované listy" lists every sheet of the source workbook with its VV
    flag and number of priced items.

    Args:
        result: report dict as returned by ``analyse``; the keys
            ``"inconsistencies"`` and ``"sheets"`` are read.
        source_filename: name shown in the "Zdroj:" line of the report.
        out_path: destination file; it is passed to ``Workbook.save``.

    Returns:
        *out_path*, unchanged.
    """
    wb = Workbook()
    _fill_inconsistency_sheet(wb.active, result, source_filename)
    _fill_overview_sheet(wb.create_sheet("Detekované listy"), result["sheets"])
    wb.save(str(out_path))
    return out_path