# AI_portal/contract-check/pdf_annotator.py

"""Add color-coded highlights and a Czech-correct summary page to a PDF."""
import logging
from pathlib import Path

import fitz  # PyMuPDF

logger = logging.getLogger(__name__)

# Highlight / bullet colours per check status, as RGB triples in the 0-1
# range that PyMuPDF expects.
COLORS = dict(
    ok=(0.69, 0.91, 0.69),       # green
    warning=(1.00, 0.90, 0.45),  # yellow
    problem=(1.00, 0.65, 0.65),  # red
    missing=(0.85, 0.85, 0.85),  # grey
)

# DejaVu Sans (installed by the fonts-dejavu-core package) covers the full
# Czech glyph set; these are the regular and bold faces.
FONT_PATH_SANS = "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"
FONT_PATH_BOLD = "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf"


def annotate(input_pdf: Path, output_pdf: Path, items: list[dict],
             overall_summary: str = "", risk_level: str = "",
             skip_highlights: bool = False,
             contract_name: str = "") -> Path:
    """Highlight quoted excerpts in a PDF, prepend a summary, and save it.

    Args:
        input_pdf: PDF to read.
        output_pdf: Where the annotated copy is written.
        items: Check results. Each dict may carry "status" (a key of COLORS;
            anything else is coloured as "warning"), "title" / "label" / "id"
            (first truthy one is used as the annotation title) and "excerpts",
            a list of dicts with "text" (the quote to search for) and an
            optional "comment".
        overall_summary: Free text passed on to the summary page.
        risk_level: Risk key passed on to the summary page.
        skip_highlights: When true no per-excerpt highlights are added (used
            for OCR'd PDFs that have no text layer); the summary is still built.
        contract_name: Shown on the summary page and, when non-empty, written
            into the PDF title metadata.

    Returns:
        ``output_pdf``, unchanged.
    """
    doc = fitz.open(str(input_pdf))
    # Everything that can raise happens inside try/finally so the source
    # document is always closed, even if searching, merging or saving fails.
    try:
        highlighted_count = 0
        not_found_count = 0
        items_to_annotate = [] if skip_highlights else items
        if skip_highlights:
            logger.info("Skipping per-excerpt highlights (OCR'd PDF — no text layer)")
        for item in items_to_annotate:
            color = COLORS.get(item.get("status", "warning"), COLORS["warning"])
            title = item.get("title") or item.get("label") or item.get("id", "")
            for ex in item.get("excerpts") or []:
                quote = (ex.get("text") or "").strip()
                comment = (ex.get("comment") or "").strip()
                if not quote:
                    continue
                # Pop-up text is the same for every rectangle of this excerpt.
                note = f"{title}\n\n{comment}" if comment else title
                found_any = False
                for page in doc:
                    rects = page.search_for(quote, quads=False)
                    if not rects and len(quote) > 20:
                        # The full quote did not match on this page; retry
                        # with its first 20 characters only.
                        rects = page.search_for(quote[:20], quads=False)
                    if not rects:
                        continue
                    for rect in rects:
                        annot = page.add_highlight_annot(rect)
                        annot.set_colors(stroke=color)
                        annot.set_info(title=title, content=note)
                        annot.update()
                    found_any = True
                # Counters are per excerpt, not per rectangle or per page.
                if found_any:
                    highlighted_count += 1
                else:
                    not_found_count += 1
                    logger.info("Quote not found in PDF: %r", quote[:60])

        # Build & prepend summary
        summary = _build_summary_pdf(doc, items, overall_summary, risk_level,
                                     contract_name)
        if summary:
            try:
                doc.insert_pdf(summary, start_at=0)
            finally:
                # The temporary summary document is no longer needed whether
                # or not the merge succeeded.
                summary.close()

        # Set PDF metadata title for nice display in viewers
        if contract_name:
            meta = doc.metadata or {}
            meta["title"] = f"Kontrola: {contract_name}"
            doc.set_metadata(meta)

        doc.save(str(output_pdf), garbage=4, deflate=True)
    finally:
        doc.close()
    logger.info("Annotated PDF: highlighted=%d not_found=%d", highlighted_count, not_found_count)
    return output_pdf


def _build_summary_pdf(orig_doc, items: list[dict],
                       overall_summary: str, risk_level: str,
                       contract_name: str):
    """Build a 1-N page summary PDF using a Czech-supporting font.

    Args:
        orig_doc: The opened source document; only its page count and the
            size of its first page are read.
        items: Check results. Reads "status", "title" / "label" / "id",
            "summary" and "excerpts" (dicts with "page", "text", "comment").
        overall_summary: Free text printed under the title; skipped if empty.
        risk_level: "low" / "medium" / "high" get a Czech label and a colour;
            any other non-empty value is printed upper-cased in grey.
        contract_name: Printed as a small header line; skipped if empty.

    Returns:
        A new fitz document holding the summary pages (the caller must close
        it), or None when ``orig_doc`` has no pages.
    """
    if not orig_doc.page_count:
        return None
    src_rect = orig_doc[0].rect
    width = max(float(src_rect.width), 595.0)  # ensure at least A4
    height = max(float(src_rect.height), 842.0)

    new = fitz.open()
    page = new.new_page(width=width, height=height)
    _register_fonts(page)

    margin_x = 50.0
    top_y = 50.0
    y = top_y
    title_size = 18
    body_size = 10.5
    line_h = body_size * 1.45

    # A new item (or the list heading) starts on a fresh page once y is
    # within 80pt of the bottom edge; paragraphs inside an item only break
    # when y is within 40pt of it, so a heading is not separated from a short
    # body that would still have fitted.
    item_limit = height - 80
    para_limit = height - 40

    def break_page_if_below(cur_page, cur_y, limit):
        """Return (page, y), starting a new page when cur_y is past limit."""
        if cur_y > limit:
            cur_page = new.new_page(width=width, height=height)
            _register_fonts(cur_page)
            return cur_page, top_y
        return cur_page, cur_y

    # Header line: filename
    if contract_name:
        _draw_text(page, contract_name, margin_x, y, font="sans",
                   size=11, color=(0.40, 0.45, 0.55))
        y += 18

    # Title
    _draw_text(page, "Kontrola smluvních podmínek", margin_x, y,
               font="bold", size=title_size, color=(0.06, 0.10, 0.20))
    y += title_size * 1.6

    # Risk badge line
    if risk_level:
        labels = {"low": "NÍZKÉ", "medium": "STŘEDNÍ", "high": "VYSOKÉ"}
        colors = {
            "low": (0.20, 0.65, 0.32),
            "medium": (0.85, 0.55, 0.10),
            "high": (0.80, 0.20, 0.20),
        }
        _draw_text(page, f"Celková míra rizika: {labels.get(risk_level, risk_level.upper())}",
                   margin_x, y, font="bold", size=12,
                   color=colors.get(risk_level, (0.4, 0.4, 0.4)))
        y += 22

    # Overall summary
    # NOTE: _wrap_text draws onto a single page, so one extremely long
    # paragraph can still run past the bottom edge.
    if overall_summary:
        y = _wrap_text(page, overall_summary, margin_x, y,
                       width - 2 * margin_x, body_size, font="sans")
        y += 12

    y += 6
    # Keep the divider and the list heading on a page with room below them.
    page, y = break_page_if_below(page, y, item_limit)
    page.draw_line((margin_x, y), (width - margin_x, y),
                   color=(0.85, 0.85, 0.85))
    y += 16

    _draw_text(page, "Položky kontroly", margin_x, y,
               font="bold", size=12, color=(0.15, 0.20, 0.30))
    y += 18

    status_labels = {"ok": "OK", "warning": "POZOR",
                     "problem": "PROBLÉM", "missing": "CHYBÍ"}
    indent_x = margin_x + 18
    text_width = width - 2 * margin_x - 18
    for item in items:
        page, y = break_page_if_below(page, y, item_limit)

        # `or ""` also covers a "status" key that is present but None.
        status = item.get("status") or ""
        color = COLORS.get(status, (0.6, 0.6, 0.6))
        label = status_labels.get(status, str(status).upper())
        title = item.get("title") or item.get("label") or item.get("id", "")

        # Colored bullet square
        page.draw_rect(
            fitz.Rect(margin_x, y, margin_x + 10, y + 10),
            color=color, fill=color,
        )
        _draw_text(page, f"[{label}]  {title}",
                   indent_x, y, font="bold", size=11,
                   color=(0.06, 0.10, 0.20))
        y += line_h + 4

        summary = item.get("summary", "")
        if summary:
            page, y = break_page_if_below(page, y, para_limit)
            y = _wrap_text(page, summary, indent_x, y,
                           text_width, body_size,
                           font="sans", color=(0.30, 0.35, 0.45))

        # List page references for each excerpt
        for ex in item.get("excerpts") or []:
            pg = ex.get("page")
            text = (ex.get("text") or "").strip()
            if not text:
                continue
            pg_str = f"str. {pg}: " if pg else ""
            # Long quotes are cut to 87 characters plus an ellipsis.
            snippet = text if len(text) <= 90 else text[:87] + "…"
            page, y = break_page_if_below(page, y, para_limit)
            y = _wrap_text(page, f"• {pg_str}„{snippet}\"",
                           indent_x, y,
                           text_width, body_size - 0.5,
                           font="sans", color=(0.25, 0.30, 0.40))
            cmt = (ex.get("comment") or "").strip()
            if cmt:
                page, y = break_page_if_below(page, y, para_limit)
                y = _wrap_text(page, f"   — {cmt}",
                               indent_x, y,
                               text_width, body_size - 0.5,
                               font="sans", color=(0.45, 0.50, 0.60))

        y += 12

    return new


# ── font helpers ─────────────────────────────────────────

def _register_fonts(page):
    """Register DejaVu Sans on ``page`` under the aliases "sans" and "bold".

    Best effort: a font that cannot be inserted is logged and skipped, never
    raised. If the bold face cannot be inserted, the regular face is
    registered under the "bold" alias instead.
    """
    try:
        page.insert_font(fontname="sans", fontfile=FONT_PATH_SANS)
    except Exception as e:
        logger.warning("Could not register DejaVuSans: %s", e)
    try:
        page.insert_font(fontname="bold", fontfile=FONT_PATH_BOLD)
    except Exception as e:
        logger.warning("Could not register DejaVuSans-Bold, "
                       "falling back to regular: %s", e)
        # Fall back to regular for bold
        try:
            page.insert_font(fontname="bold", fontfile=FONT_PATH_SANS)
        except Exception as fallback_err:
            logger.warning("Could not register fallback font for bold: %s",
                           fallback_err)


def _draw_text(page, text: str, x: float, y: float,
               font: str = "sans", size: float = 10.5,
               color: tuple = (0.06, 0.10, 0.20)):
    """Draw one line of text whose top edge is at ``y``.

    The text is inserted at the point ``(x, y + size)``. If inserting with
    the named page font raises, the call is repeated without a font name so
    PyMuPDF uses its built-in default (diacritics may be mangled, but the
    page is still produced).
    """
    anchor = (x, y + size)
    shared = {"fontsize": size, "color": color}
    try:
        page.insert_text(anchor, text, fontname=font, **shared)
    except Exception:
        page.insert_text(anchor, text, **shared)


# fitz.Font objects used only for measuring text, keyed by the page-font
# alias ("sans" / "bold"). A value of None means loading was tried and failed.
_MEASURE_FONTS: dict = {}


def _measuring_font(alias: str):
    """Return a cached fitz.Font for a page-font alias, or None if unavailable."""
    if alias not in _MEASURE_FONTS:
        fontfile = {"sans": FONT_PATH_SANS, "bold": FONT_PATH_BOLD}.get(alias)
        loaded = None
        if fontfile:
            try:
                loaded = fitz.Font(fontfile=fontfile)
            except Exception as exc:
                logger.warning("Could not load %s for text measuring: %s",
                               fontfile, exc)
        _MEASURE_FONTS[alias] = loaded
    return _MEASURE_FONTS[alias]


def _wrap_text(page, text: str, x: float, y: float, max_width: float,
               font_size: float, font: str = "sans",
               color: tuple = (0.06, 0.10, 0.20)) -> float:
    """Word-wrap ``text`` into ``max_width`` and draw it line by line.

    Whitespace (including newlines) is collapsed; words are never split, so
    a single word wider than ``max_width`` is drawn on its own line as is.

    Args:
        page: Page to draw on.
        text: Text to wrap.
        x: Left edge of every line.
        y: Top of the first line.
        max_width: Maximum line width, in the same units as the font size.
        font_size: Font size; the line height is 1.45 times this.
        font: Page-font alias passed to _draw_text.
        color: Text colour passed to _draw_text.

    Returns:
        The y position just below the last drawn line (``y`` unchanged when
        ``text`` contains no words).
    """
    words = text.split()
    if not words:
        return y

    line_h = font_size * 1.45
    metrics = _measuring_font(font)

    def measure(s: str) -> float:
        # fitz.get_text_length() only knows PyMuPDF's built-in font names, so
        # for the embedded aliases the real font file is measured instead.
        # Any failure falls back to a rough half-em-per-character estimate.
        try:
            if metrics is not None:
                return metrics.text_length(s, fontsize=font_size)
            return fitz.get_text_length(s, fontname=font, fontsize=font_size)
        except Exception:
            return len(s) * font_size * 0.50

    line = ""
    for word in words:
        candidate = f"{line} {word}" if line else word
        if line and measure(candidate) > max_width:
            # The word does not fit: flush the current line and start a new
            # one with this word.
            _draw_text(page, line, x, y, font=font,
                       size=font_size, color=color)
            y += line_h
            line = word
        else:
            line = candidate
    if line:
        _draw_text(page, line, x, y, font=font, size=font_size, color=color)
        y += line_h
    return y