AI_portal/vv-compare/vv_compare.py

"""Compare original vs new VV (Výkaz Výměr) Excel files.

Produces a 4-sheet report:
  - Souhrn   — overview table with per-sheet counts + grand totals
  - Změny    — items present in both but with different quantity / MJ
  - Přidané  — items in new but not in original
  - Odebrané — items in original but not in new

Following Jirka's spec exactly: section rows are skipped, change = qty/MJ
diff only (NOT price), heavy use of colour-coding.
"""
import logging
import re
from collections import defaultdict
from pathlib import Path

import openpyxl
from openpyxl.styles import Alignment, Border, Font, PatternFill, Side
from openpyxl.utils import get_column_letter
from openpyxl.workbook import Workbook

logger = logging.getLogger(__name__)

HEADER_HINTS = {
    "por":   ["poř.", "por.", "pořadí", "č.", "č"],
    "kod":   ["kód", "kod"],
    "popis": ["popis", "název", "název položky"],
    "mj":    ["mj", "j.j.", "jednotka"],
    "vymera": ["výměra", "vymera", "množství", "mnozstvi"],
    "cena_jed": ["jednotková cena", "jednotkova cena", "j. cena", "j.cena",
                 "jed. cena", "cena/jed", "cena j.", "cena za jednotku"],
    "cena_tot": ["cena celkem", "celkem", "cena"],
}

# Colors per spec
BLUE = "1F4E78"
WHITE = "FFFFFF"
GRAY = "F2F2F2"
YELLOW = "FFF2CC"      # changes
GREEN_BG = "D9EAD3"    # added
RED_BG = "F4CCCC"      # removed
GREEN_DIFF = "E4F0DC"  # positive qty diff
RED_DIFF = "FCE4E4"    # negative qty diff
DARK_GREEN = "006100"
DARK_RED = "C00000"
GRAY_TEXT = "595959"
BORDER_GRAY = "BFBFBF"

THIN = Side(style="thin", color=BORDER_GRAY)
BORDER = Border(left=THIN, right=THIN, top=THIN, bottom=THIN)


def normalise(text) -> str:
    if text is None:
        return ""
    return re.sub(r"\s+", " ", str(text).strip())


def _to_float(v):
    if v is None or v == "":
        return None
    try:
        return float(v)
    except (ValueError, TypeError):
        return None


def find_header(ws) -> dict | None:
    """Return header column mapping or None if no VV-like header found."""
    for row_idx in range(1, min(13, ws.max_row + 1)):
        matched = {}
        for col_idx in range(1, min(15, ws.max_column + 1)):
            val = normalise(ws.cell(row=row_idx, column=col_idx).value).lower()
            for role, hints in HEADER_HINTS.items():
                if role in matched:
                    continue
                if any(val == h or val.startswith(h) for h in hints):
                    matched[role] = col_idx
                    break
        if "popis" in matched and "vymera" in matched and "mj" in matched:
            matched["_header_row"] = row_idx
            return matched
    return None


def extract_items(ws, header: dict) -> tuple[list[dict], str | None]:
    """Return (items, hala_name)."""
    header_row = header["_header_row"]
    # Hall/object name: row 4 col C per spec, but be flexible
    hala = None
    for r in range(1, header_row):
        for c in range(1, min(8, ws.max_column + 1)):
            v = ws.cell(row=r, column=c).value
            if v and isinstance(v, str) and 5 < len(v) < 200 \
                    and "vykaz" not in v.lower() and "výkaz" not in v.lower():
                hala = v.strip()
                break
        if hala:
            break

    items = []
    current_section = None
    popis_col = header["popis"]
    mj_col = header["mj"]
    vymera_col = header["vymera"]
    cena_jed_col = header.get("cena_jed")
    cena_tot_col = header.get("cena_tot")
    kod_col = header.get("kod")

    for r in range(header_row + 1, ws.max_row + 1):
        popis = ws.cell(row=r, column=popis_col).value
        if popis is None or not str(popis).strip():
            continue
        popis_text = str(popis).strip()
        mj_val = ws.cell(row=r, column=mj_col).value
        # Detect section row: empty MJ + description contains ":"
        if (not mj_val or not str(mj_val).strip()) and ":" in popis_text \
                and len(popis_text) < 100:
            current_section = popis_text
            continue
        vymera = _to_float(ws.cell(row=r, column=vymera_col).value)
        # Skip rows without quantity (probably subtotals)
        if vymera is None:
            continue
        items.append({
            "row": r,
            "section": current_section,
            "kod": ws.cell(row=r, column=kod_col).value if kod_col else None,
            "description": popis_text,
            "description_norm": normalise(popis_text),
            "mj": str(mj_val).strip() if mj_val else "",
            "vymera": vymera,
            "cena_jed": _to_float(ws.cell(row=r, column=cena_jed_col).value) if cena_jed_col else None,
            "cena_tot": _to_float(ws.cell(row=r, column=cena_tot_col).value) if cena_tot_col else None,
        })
    return items, hala


def analyse_workbook(path: Path) -> dict:
    """Return {sheet_name: {items, hala, header_found}}."""
    wb = openpyxl.load_workbook(path, data_only=True)
    out = {}
    for ws in wb.worksheets:
        header = find_header(ws)
        if not header:
            continue
        items, hala = extract_items(ws, header)
        if not items:
            continue
        out[ws.title] = {"items": items, "hala": hala, "header": header}
    wb.close()
    return out


def compare(orig_path: Path, new_path: Path) -> dict:
    orig_sheets = analyse_workbook(orig_path)
    new_sheets = analyse_workbook(new_path)

    # Match sheets by exact name first; fall back to position-based match for unmatched.
    matched_pairs: list[tuple[str | None, str | None]] = []
    orig_used: set[str] = set()
    new_used: set[str] = set()
    for name in orig_sheets:
        if name in new_sheets:
            matched_pairs.append((name, name))
            orig_used.add(name)
            new_used.add(name)
    remaining_orig = [s for s in orig_sheets if s not in orig_used]
    remaining_new = [s for s in new_sheets if s not in new_used]
    # Position match by order of declaration
    for o, n in zip(remaining_orig, remaining_new):
        matched_pairs.append((o, n))
    # Orphans (only orig)
    for o in remaining_orig[len(remaining_new):]:
        matched_pairs.append((o, None))
    for n in remaining_new[len(remaining_orig):]:
        matched_pairs.append((None, n))

    per_sheet = []
    all_changes = []
    all_added = []
    all_removed = []

    for orig_name, new_name in matched_pairs:
        orig_items = orig_sheets.get(orig_name, {}).get("items", []) if orig_name else []
        new_items  = new_sheets.get(new_name, {}).get("items", []) if new_name else []
        hala = (new_sheets.get(new_name, {}).get("hala") if new_name else None) \
            or (orig_sheets.get(orig_name, {}).get("hala") if orig_name else None) \
            or ""
        sheet_label = new_name or orig_name or ""

        # Pair items by (section, description_norm)
        orig_index = defaultdict(list)
        for it in orig_items:
            orig_index[(it["section"] or "", it["description_norm"])].append(it)
        new_index = defaultdict(list)
        for it in new_items:
            new_index[(it["section"] or "", it["description_norm"])].append(it)

        changes = []
        added = []
        removed = []

        # Items present in both
        for key, new_list in new_index.items():
            orig_list = orig_index.get(key, [])
            if not orig_list:
                for it in new_list:
                    added.append({
                        "sheet": sheet_label, "hala": hala,
                        "section": key[0], "description": it["description"],
                        "mj": it["mj"], "vymera": it["vymera"],
                        "cena_jed": it["cena_jed"], "cena_tot": it["cena_tot"],
                    })
                continue
            # Compare in pairs (one-to-one by index)
            for idx, new_it in enumerate(new_list):
                if idx >= len(orig_list):
                    # Duplicate added entry
                    added.append({
                        "sheet": sheet_label, "hala": hala,
                        "section": key[0], "description": new_it["description"],
                        "mj": new_it["mj"], "vymera": new_it["vymera"],
                        "cena_jed": new_it["cena_jed"], "cena_tot": new_it["cena_tot"],
                    })
                    continue
                orig_it = orig_list[idx]
                qty_diff = (new_it["vymera"] or 0) - (orig_it["vymera"] or 0)
                mj_diff = orig_it["mj"] != new_it["mj"]
                if abs(qty_diff) > 1e-9 or mj_diff:
                    changes.append({
                        "sheet": sheet_label, "hala": hala,
                        "section": key[0], "description": new_it["description"],
                        "mj_orig": orig_it["mj"], "mj_new": new_it["mj"],
                        "vymera_orig": orig_it["vymera"], "vymera_new": new_it["vymera"],
                        "vymera_diff": qty_diff,
                        "cena_jed_orig": orig_it["cena_jed"],
                        "cena_orig": orig_it["cena_tot"],
                        "cena_new": new_it["cena_tot"],
                    })
        # Items only in original
        for key, orig_list in orig_index.items():
            if key in new_index:
                # Handle leftover originals (orig had more duplicates than new)
                used = min(len(new_index[key]), len(orig_list))
                for it in orig_list[used:]:
                    removed.append({
                        "sheet": sheet_label, "hala": hala,
                        "section": key[0], "description": it["description"],
                        "mj": it["mj"], "vymera": it["vymera"],
                        "cena_jed": it["cena_jed"], "cena_tot": it["cena_tot"],
                    })
                continue
            for it in orig_list:
                removed.append({
                    "sheet": sheet_label, "hala": hala,
                    "section": key[0], "description": it["description"],
                    "mj": it["mj"], "vymera": it["vymera"],
                    "cena_jed": it["cena_jed"], "cena_tot": it["cena_tot"],
                })

        per_sheet.append({
            "sheet": sheet_label,
            "hala": hala,
            "orig_count": len(orig_items),
            "new_count": len(new_items),
            "changes": len(changes),
            "added": len(added),
            "removed": len(removed),
        })
        all_changes.extend(changes)
        all_added.extend(added)
        all_removed.extend(removed)

    return {
        "per_sheet": per_sheet,
        "changes": all_changes,
        "added": all_added,
        "removed": all_removed,
    }


# ── Excel report writer (4 sheets per spec) ───────────────────

NUM_QTY = '#,##0.##;[Red]-#,##0.##;"-"'
NUM_CZK = '#,##0.00 "Kč";[Red]-#,##0.00 "Kč";"-"'


def _title(ws, text: str, subtitle: str = ""):
    cell = ws.cell(row=1, column=1, value=text)
    cell.font = Font(name="Arial", bold=True, size=14, color=BLUE)
    ws.merge_cells(start_row=1, start_column=1, end_row=1, end_column=12)
    if subtitle:
        sub = ws.cell(row=2, column=1, value=subtitle)
        sub.font = Font(name="Arial", italic=True, size=10, color=GRAY_TEXT)
        ws.merge_cells(start_row=2, start_column=1, end_row=2, end_column=12)


def _hdr_cell(cell):
    cell.font = Font(name="Arial", bold=True, size=11, color=WHITE)
    cell.fill = PatternFill("solid", fgColor=BLUE)
    cell.alignment = Alignment(horizontal="center", vertical="center", wrap_text=True)
    cell.border = BORDER


def _body_cell(cell, num_format: str | None = None, fill: str | None = None,
               bold: bool = False, color: str | None = None,
               horizontal: str | None = None):
    cell.font = Font(name="Arial", size=10, bold=bold,
                     color=color or "000000")
    cell.border = BORDER
    if fill:
        cell.fill = PatternFill("solid", fgColor=fill)
    if num_format:
        cell.number_format = num_format
    cell.alignment = Alignment(horizontal=horizontal or "left",
                               vertical="top",
                               wrap_text=True)


def write_compare_report(result: dict, orig_filename: str, new_filename: str,
                         out_path: Path) -> Path:
    wb = Workbook()

    # ── 1. Souhrn ─────────────────────────────────────────
    ws = wb.active
    ws.title = "Souhrn"
    _title(ws, "Porovnání výkazu výměr",
           f"Původní: {orig_filename} · Nový: {new_filename}")

    headers = ["List", "Hala / objekt", "Počet pol. (původní)", "Počet pol. (nový)",
               "Změněné položky", "Přidané položky", "Odebrané položky"]
    for c, h in enumerate(headers, 1):
        _hdr_cell(ws.cell(row=4, column=c, value=h))
    r = 5
    for ps in result["per_sheet"]:
        ws.cell(row=r, column=1, value=ps["sheet"])
        ws.cell(row=r, column=2, value=ps["hala"])
        ws.cell(row=r, column=3, value=ps["orig_count"])
        ws.cell(row=r, column=4, value=ps["new_count"])
        ws.cell(row=r, column=5, value=ps["changes"])
        ws.cell(row=r, column=6, value=ps["added"])
        ws.cell(row=r, column=7, value=ps["removed"])
        for c in range(1, 8):
            _body_cell(ws.cell(row=r, column=c))
        # Colored highlights when > 0
        if ps["changes"]:
            _body_cell(ws.cell(row=r, column=5), fill=YELLOW, bold=True,
                       horizontal="right")
        if ps["added"]:
            _body_cell(ws.cell(row=r, column=6), fill=GREEN_BG, bold=True,
                       color=DARK_GREEN, horizontal="right")
        if ps["removed"]:
            _body_cell(ws.cell(row=r, column=7), fill=RED_BG, bold=True,
                       color=DARK_RED, horizontal="right")
        r += 1
    # CELKEM row
    if result["per_sheet"]:
        first_data_row = 5
        last_data_row = r - 1
        ws.cell(row=r, column=1, value="CELKEM")
        for c in range(3, 8):
            col_letter = get_column_letter(c)
            ws.cell(row=r, column=c,
                    value=f"=SUM({col_letter}{first_data_row}:{col_letter}{last_data_row})")
        for c in range(1, 8):
            _body_cell(ws.cell(row=r, column=c),
                       fill=GRAY, bold=True,
                       horizontal=("right" if c >= 3 else "left"))
        r += 1

    # Recap box
    r += 1
    ws.cell(row=r, column=1, value="Rekapitulace změn (celkem)").font = \
        Font(name="Arial", bold=True, size=12, color=BLUE)
    r += 1
    for label, cnt, fill, color in [
        ("Změněné", len(result["changes"]), YELLOW, None),
        ("Přidané", len(result["added"]), GREEN_BG, DARK_GREEN),
        ("Odebrané", len(result["removed"]), RED_BG, DARK_RED),
    ]:
        _body_cell(ws.cell(row=r, column=1, value=label), bold=True)
        _body_cell(ws.cell(row=r, column=2, value=cnt),
                   fill=fill, bold=True, color=color, horizontal="right")
        r += 1

    r += 1
    note = ws.cell(row=r, column=1,
                   value=("Za „změnu\" je považován pouze rozdíl ve výměře nebo MJ. "
                          "Cenové rozdíly se ignorují, protože nový VV obvykle ceny "
                          "neobsahuje."))
    note.font = Font(name="Arial", italic=True, size=9, color=GRAY_TEXT)
    note.alignment = Alignment(wrap_text=True)
    ws.merge_cells(start_row=r, start_column=1, end_row=r, end_column=7)

    for c, w in {1: 22, 2: 30, 3: 18, 4: 18, 5: 18, 6: 18, 7: 18}.items():
        ws.column_dimensions[get_column_letter(c)].width = w
    ws.row_dimensions[4].height = 32

    # ── 2. Změny ─────────────────────────────────────────
    ws = wb.create_sheet("Změny")
    _title(ws, "Změněné položky (rozdíl ve výměře nebo MJ)")
    cols = ["List", "Hala", "Sekce", "Popis položky",
            "MJ orig.", "MJ nová", "Výměra orig.", "Výměra nová", "Rozdíl výměra",
            "Jed. cena orig.", "Cena orig.", "Cena nová"]
    for c, h in enumerate(cols, 1):
        _hdr_cell(ws.cell(row=4, column=c, value=h))
    r = 5
    for ch in result["changes"]:
        ws.cell(row=r, column=1, value=ch["sheet"])
        ws.cell(row=r, column=2, value=ch["hala"])
        ws.cell(row=r, column=3, value=ch["section"])
        ws.cell(row=r, column=4, value=ch["description"])
        ws.cell(row=r, column=5, value=ch["mj_orig"])
        ws.cell(row=r, column=6, value=ch["mj_new"])
        ws.cell(row=r, column=7, value=ch["vymera_orig"])
        ws.cell(row=r, column=8, value=ch["vymera_new"])
        ws.cell(row=r, column=9, value=ch["vymera_diff"])
        ws.cell(row=r, column=10, value=ch["cena_jed_orig"])
        ws.cell(row=r, column=11, value=ch["cena_orig"])
        ws.cell(row=r, column=12, value=ch["cena_new"])
        for c in range(1, 13):
            _body_cell(ws.cell(row=r, column=c))
        # Number formats
        for c in (7, 8, 9):
            ws.cell(row=r, column=c).number_format = NUM_QTY
            ws.cell(row=r, column=c).alignment = Alignment(horizontal="right", vertical="top")
        for c in (10, 11, 12):
            ws.cell(row=r, column=c).number_format = NUM_CZK
            ws.cell(row=r, column=c).alignment = Alignment(horizontal="right", vertical="top")
        # Highlight diff cell
        diff_fill = GREEN_DIFF if ch["vymera_diff"] > 0 else RED_DIFF
        _body_cell(ws.cell(row=r, column=9), fill=diff_fill, bold=True,
                   num_format=NUM_QTY, horizontal="right")
        r += 1

    widths = {1: 14, 2: 22, 3: 22, 4: 50, 5: 10, 6: 10, 7: 14, 8: 14, 9: 14, 10: 14, 11: 14, 12: 14}
    for c, w in widths.items():
        ws.column_dimensions[get_column_letter(c)].width = w
    ws.freeze_panes = "A5"
    ws.auto_filter.ref = f"A4:L{max(5, r - 1)}"
    ws.row_dimensions[4].height = 32

    # ── 3. Přidané ─────────────────────────────────────────
    ws = wb.create_sheet("Přidané")
    _title(ws, "Přidané položky (jsou v novém VV, nebyly v původním)")
    add_cols = ["List", "Hala", "Sekce", "Popis položky", "MJ", "Výměra",
                "Jed. cena", "Cena"]
    for c, h in enumerate(add_cols, 1):
        _hdr_cell(ws.cell(row=4, column=c, value=h))
    r = 5
    for it in result["added"]:
        vals = [it["sheet"], it["hala"], it["section"], it["description"],
                it["mj"], it["vymera"], it["cena_jed"], it["cena_tot"]]
        for c, v in enumerate(vals, 1):
            _body_cell(ws.cell(row=r, column=c, value=v),
                       fill=GREEN_BG, color=DARK_GREEN)
        ws.cell(row=r, column=6).number_format = NUM_QTY
        ws.cell(row=r, column=6).alignment = Alignment(horizontal="right", vertical="top")
        ws.cell(row=r, column=7).number_format = NUM_CZK
        ws.cell(row=r, column=7).alignment = Alignment(horizontal="right", vertical="top")
        ws.cell(row=r, column=8).number_format = NUM_CZK
        ws.cell(row=r, column=8).alignment = Alignment(horizontal="right", vertical="top")
        r += 1
    for c, w in {1: 14, 2: 22, 3: 22, 4: 50, 5: 10, 6: 14, 7: 14, 8: 14}.items():
        ws.column_dimensions[get_column_letter(c)].width = w
    ws.freeze_panes = "A5"
    ws.auto_filter.ref = f"A4:H{max(5, r - 1)}"
    ws.row_dimensions[4].height = 32

    # ── 4. Odebrané ─────────────────────────────────────────
    ws = wb.create_sheet("Odebrané")
    _title(ws, "Odebrané položky (byly v původním VV, v novém nejsou)")
    for c, h in enumerate(add_cols, 1):
        _hdr_cell(ws.cell(row=4, column=c, value=h))
    r = 5
    first_data_row = r
    for it in result["removed"]:
        vals = [it["sheet"], it["hala"], it["section"], it["description"],
                it["mj"], it["vymera"], it["cena_jed"], it["cena_tot"]]
        for c, v in enumerate(vals, 1):
            _body_cell(ws.cell(row=r, column=c, value=v),
                       fill=RED_BG, color=DARK_RED)
        ws.cell(row=r, column=6).number_format = NUM_QTY
        ws.cell(row=r, column=6).alignment = Alignment(horizontal="right", vertical="top")
        ws.cell(row=r, column=7).number_format = NUM_CZK
        ws.cell(row=r, column=7).alignment = Alignment(horizontal="right", vertical="top")
        ws.cell(row=r, column=8).number_format = NUM_CZK
        ws.cell(row=r, column=8).alignment = Alignment(horizontal="right", vertical="top")
        r += 1
    last_data_row = r - 1
    if last_data_row >= first_data_row:
        # Sum cena_tot column
        ws.cell(row=r, column=1, value="Součet")
        ws.cell(row=r, column=8,
                value=f"=SUM(H{first_data_row}:H{last_data_row})")
        for c in range(1, 9):
            _body_cell(ws.cell(row=r, column=c), fill=GRAY, bold=True,
                       horizontal=("right" if c == 8 else "left"))
        ws.cell(row=r, column=8).number_format = NUM_CZK
    for c, w in {1: 14, 2: 22, 3: 22, 4: 50, 5: 10, 6: 14, 7: 14, 8: 14}.items():
        ws.column_dimensions[get_column_letter(c)].width = w
    ws.freeze_panes = "A5"
    ws.row_dimensions[4].height = 32

    wb.save(str(out_path))
    return out_path