"""Add color-coded highlights and a Czech-correct summary page to a PDF."""
import logging
from pathlib import Path

import fitz  # PyMuPDF

logger = logging.getLogger(__name__)

# RGB 0-1 for PyMuPDF
COLORS = {
    "ok": (0.69, 0.91, 0.69),  # green
    "warning": (1.00, 0.90, 0.45),  # yellow
    "problem": (1.00, 0.65, 0.65),  # red
    "missing": (0.85, 0.85, 0.85),  # grey
}

# DejaVu Sans is shipped via fonts-dejavu-core; supports full Czech glyph set.
FONT_PATH_SANS = "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"
FONT_PATH_BOLD = "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf"

# Line height as a multiple of the font size, shared by all wrapped text.
_LINE_HEIGHT_FACTOR = 1.45

# Cache of fitz.Font objects used only for measuring text, keyed by font file
# path. A value of None records that loading the file already failed.
_MEASURE_FONTS: dict = {}


def annotate(input_pdf: Path, output_pdf: Path, items: list[dict],
             overall_summary: str = "", risk_level: str = "",
             skip_highlights: bool = False,
             contract_name: str = "") -> Path:
    """Open input_pdf, highlight excerpts, prepend a summary page.

    Args:
        input_pdf: Path of the PDF to read.
        output_pdf: Path the annotated PDF is saved to.
        items: Check items. Each is a dict that may carry ``status``,
            ``title``/``label``/``id``, ``summary`` and ``excerpts`` (a list
            of dicts with ``text``, ``comment`` and ``page``).
        overall_summary: Free text printed at the top of the summary page.
        risk_level: ``"low"``, ``"medium"`` or ``"high"``; any other
            non-empty value is printed upper-cased.
        skip_highlights: When true, no per-excerpt highlights are added
            (the summary page is still built from ``items``).
        contract_name: Shown on the summary page and used for the PDF title
            metadata.

    Returns:
        ``output_pdf``, unchanged.
    """
    items = items or []
    highlighted_count = 0
    not_found_count = 0

    doc = fitz.open(str(input_pdf))
    try:
        items_to_annotate = [] if skip_highlights else items
        if skip_highlights:
            logger.info("Skipping per-excerpt highlights (OCR'd PDF — no text layer)")

        for item in items_to_annotate:
            color = COLORS.get(item.get("status", "warning"), COLORS["warning"])
            title = _item_title(item)
            for ex in item.get("excerpts") or []:
                quote = (ex.get("text") or "").strip()
                comment = (ex.get("comment") or "").strip()
                if not quote:
                    continue
                if _highlight_quote(doc, quote, color, title, comment):
                    highlighted_count += 1
                else:
                    not_found_count += 1
                    logger.info("Quote not found in PDF: %r", quote[:60])

        # Build & prepend summary
        summary = _build_summary_pdf(doc, items, overall_summary, risk_level,
                                     contract_name)
        if summary is not None:
            try:
                doc.insert_pdf(summary, start_at=0)
            finally:
                # Close the temporary document even if the insert fails.
                summary.close()

        # Set PDF metadata title for nice display in viewers
        if contract_name:
            meta = dict(doc.metadata or {})
            meta["title"] = f"Kontrola: {contract_name}"
            doc.set_metadata(meta)

        doc.save(str(output_pdf), garbage=4, deflate=True)
    finally:
        # Always release the source document, including on errors.
        doc.close()

    logger.info("Annotated PDF: highlighted=%d not_found=%d",
                highlighted_count, not_found_count)
    return output_pdf


def _item_title(item: dict) -> str:
    """Return the item's display title: ``title``, else ``label``, else ``id``."""
    value = item.get("title") or item.get("label") or item.get("id", "")
    # Annotation info and text drawing need a string; ids may be numeric.
    return "" if value is None else str(value)


def _highlight_quote(doc, quote: str, color: tuple, title: str,
                     comment: str) -> bool:
    """Highlight every hit of ``quote`` in ``doc``; return True if any was found."""
    content = f"{title}\n\n{comment}" if comment else title
    found_any = False
    for page in doc:
        rects = page.search_for(quote, quads=False)
        if not rects and len(quote) > 20:
            # Fall back to the first 20 characters when the full quote
            # does not match on this page.
            rects = page.search_for(quote[:20], quads=False)
        if not rects:
            continue
        for rect in rects:
            annot = page.add_highlight_annot(rect)
            annot.set_colors(stroke=color)
            annot.set_info(title=title, content=content)
            annot.update()
            found_any = True
    return found_any


def _build_summary_pdf(orig_doc, items: list[dict], overall_summary: str,
                       risk_level: str, contract_name: str):
    """Build a 1-N page summary PDF using a Czech-supporting font.

    Returns a new, open document that the caller must close, or None when
    ``orig_doc`` has no pages. Pages take the size of the first page of
    ``orig_doc``, but never less than 595 x 842 points.
    """
    if not orig_doc.page_count:
        return None

    src_rect = orig_doc[0].rect
    width = max(float(src_rect.width), 595.0)  # ensure at least A4
    height = max(float(src_rect.height), 842.0)

    new = fitz.open()
    try:
        _layout_summary(new, width, height, items or [], overall_summary,
                        risk_level, contract_name)
    except Exception:
        # Do not leak the half-built document if layout fails.
        new.close()
        raise
    return new


def _layout_summary(new, width: float, height: float, items: list[dict],
                    overall_summary: str, risk_level: str,
                    contract_name: str) -> None:
    """Draw the summary content into ``new``, adding pages as needed."""
    margin_x = 50.0
    top_y = 50.0
    bottom_margin = 40.0
    title_size = 18
    body_size = 10.5
    line_h = body_size * _LINE_HEIGHT_FACTOR
    indent_x = margin_x + 18
    indent_width = width - 2 * margin_x - 18

    page = new.new_page(width=width, height=height)
    _register_fonts(page)
    y = top_y

    def ensure_room(needed: float) -> None:
        """Start a new page if ``needed`` points do not fit above the bottom margin."""
        nonlocal page, y
        if y + needed > height - bottom_margin:
            page = new.new_page(width=width, height=height)
            _register_fonts(page)  # fonts are registered per page
            y = top_y

    def write_wrapped(text: str, x: float, max_width: float, size: float,
                      color: tuple) -> None:
        """Word-wrap ``text`` in the regular font, breaking pages between lines."""
        nonlocal y
        step = size * _LINE_HEIGHT_FACTOR
        for line in _wrap_lines(text, max_width, size, "sans"):
            ensure_room(step)
            _draw_text(page, line, x, y, font="sans", size=size, color=color)
            y += step

    # Header line: filename
    if contract_name:
        _draw_text(page, contract_name, margin_x, y, font="sans", size=11,
                   color=(0.40, 0.45, 0.55))
        y += 18

    # Title
    _draw_text(page, "Kontrola smluvních podmínek", margin_x, y, font="bold",
               size=title_size, color=(0.06, 0.10, 0.20))
    y += title_size * 1.6

    # Risk badge line
    if risk_level:
        labels = {"low": "NÍZKÉ", "medium": "STŘEDNÍ", "high": "VYSOKÉ"}
        colors = {
            "low": (0.20, 0.65, 0.32),
            "medium": (0.85, 0.55, 0.10),
            "high": (0.80, 0.20, 0.20),
        }
        _draw_text(page,
                   f"Celková míra rizika: {labels.get(risk_level, risk_level.upper())}",
                   margin_x, y, font="bold", size=12,
                   color=colors.get(risk_level, (0.4, 0.4, 0.4)))
        y += 22

    # Overall summary
    if overall_summary:
        write_wrapped(overall_summary, margin_x, width - 2 * margin_x,
                      body_size, (0.06, 0.10, 0.20))
        y += 12

    # Keep the divider and section heading on the same page as the first
    # item: together they use 40pt and an item needs 40pt more.
    ensure_room(80.0)
    y += 6
    page.draw_line((margin_x, y), (width - margin_x, y),
                   color=(0.85, 0.85, 0.85))
    y += 16

    _draw_text(page, "Položky kontroly", margin_x, y, font="bold", size=12,
               color=(0.15, 0.20, 0.30))
    y += 18

    status_labels = {"ok": "OK", "warning": "POZOR", "problem": "PROBLÉM",
                     "missing": "CHYBÍ"}

    for item in items:
        # Need new page? Equivalent to `y > height - 80`: room for the item
        # header plus at least one body line.
        ensure_room(40.0)

        # `or ""` guards against an explicit None status.
        status = item.get("status") or ""
        color = COLORS.get(status, (0.6, 0.6, 0.6))
        label = status_labels.get(status, str(status).upper())
        title = _item_title(item)

        # Colored bullet square
        page.draw_rect(
            fitz.Rect(margin_x, y, margin_x + 10, y + 10),
            color=color, fill=color,
        )
        _draw_text(page, f"[{label}] {title}", indent_x, y, font="bold",
                   size=11, color=(0.06, 0.10, 0.20))
        y += line_h + 4

        summary = item.get("summary", "")
        if summary:
            write_wrapped(summary, indent_x, indent_width, body_size,
                          (0.30, 0.35, 0.45))

        # List page references for each excerpt
        # NOTE(review): `page` presumably refers to the original document's
        # numbering; the prepended summary shifts viewer page numbers — confirm.
        for ex in item.get("excerpts") or []:
            pg = ex.get("page")
            text = (ex.get("text") or "").strip()
            if not text:
                continue
            pg_str = f"str. {pg}: " if pg else ""
            snippet = text if len(text) <= 90 else text[:87] + "…"
            write_wrapped(f"• {pg_str}„{snippet}\"", indent_x, indent_width,
                          body_size - 0.5, (0.25, 0.30, 0.40))
            cmt = (ex.get("comment") or "").strip()
            if cmt:
                write_wrapped(f" — {cmt}", indent_x, indent_width,
                              body_size - 0.5, (0.45, 0.50, 0.60))

        y += 12


# ── font helpers ─────────────────────────────────────────
def _register_fonts(page):
    """Insert DejaVu Sans (regular + bold) on the page if available.

    Failures are logged, not raised: ``_draw_text`` falls back to the
    PyMuPDF built-in font when a registered name is unavailable.
    """
    try:
        page.insert_font(fontname="sans", fontfile=FONT_PATH_SANS)
    except Exception as e:
        logger.warning("Could not register DejaVuSans: %s", e)
    try:
        page.insert_font(fontname="bold", fontfile=FONT_PATH_BOLD)
    except Exception as e:
        logger.debug("Could not register DejaVuSans-Bold: %s", e)
        # Fall back to regular for bold
        try:
            page.insert_font(fontname="bold", fontfile=FONT_PATH_SANS)
        except Exception as fallback_err:
            logger.warning("Could not register a bold font: %s", fallback_err)


def _draw_text(page, text: str, x: float, y: float, font: str = "sans",
               size: float = 10.5, color: tuple = (0.06, 0.10, 0.20)):
    """Render a single line at baseline y+size."""
    try:
        page.insert_text((x, y + size), text, fontname=font, fontsize=size,
                         color=color)
    except Exception:
        # Fallback to PyMuPDF built-in (may mangle diacritics but won't crash)
        page.insert_text((x, y + size), text, fontsize=size, color=color)


def _measuring_font(font: str):
    """Return a cached ``fitz.Font`` for measuring ``font``, or None.

    "bold" falls back to the regular file, mirroring ``_register_fonts``.
    """
    if font == "bold":
        candidates = (FONT_PATH_BOLD, FONT_PATH_SANS)
    elif font == "sans":
        candidates = (FONT_PATH_SANS,)
    else:
        return None
    for path in candidates:
        if path not in _MEASURE_FONTS:
            try:
                _MEASURE_FONTS[path] = fitz.Font(fontfile=path)
            except Exception as e:
                logger.debug("Could not load %s for text measuring: %s", path, e)
                _MEASURE_FONTS[path] = None
        if _MEASURE_FONTS[path] is not None:
            return _MEASURE_FONTS[path]
    return None


def _text_width(s: str, font: str, font_size: float) -> float:
    """Return the rendered width of ``s`` in points (best effort)."""
    metrics = _measuring_font(font)
    if metrics is not None:
        try:
            return metrics.text_length(s, fontsize=font_size)
        except Exception as e:
            logger.debug("Font.text_length failed, estimating instead: %s", e)
    # fitz.get_text_length() is a module-level function that only knows the
    # built-in font names, so it raises for "sans"/"bold"; the final fallback
    # is a rough half-em-per-character estimate.
    try:
        return fitz.get_text_length(s, fontname=font, fontsize=font_size)
    except Exception:
        return len(s) * font_size * 0.50


def _wrap_lines(text: str, max_width: float, font_size: float,
                font: str = "sans") -> list[str]:
    """Split ``text`` on whitespace into lines no wider than ``max_width``.

    A single word wider than ``max_width`` is kept whole on its own line.
    """
    lines: list[str] = []
    line = ""
    for word in text.split():
        candidate = f"{line} {word}".strip()
        if line and _text_width(candidate, font, font_size) > max_width:
            lines.append(line)
            line = word
        else:
            line = candidate
    if line:
        lines.append(line)
    return lines


def _wrap_text(page, text: str, x: float, y: float, max_width: float,
               font_size: float, font: str = "sans",
               color: tuple = (0.06, 0.10, 0.20)) -> float:
    """Word-wrap `text` and return the new y position.

    All lines are drawn on ``page``; no page break is performed here.
    """
    line_h = font_size * _LINE_HEIGHT_FACTOR
    for line in _wrap_lines(text, max_width, font_size, font):
        _draw_text(page, line, x, y, font=font, size=font_size, color=color)
        y += line_h
    return y