Apps:
- dwg-rooms: extract room numbers from DWG/DXF
- dwg-counting: count symbols in PDF drawings (OpenCV template matching)
- contract-check: review PDF contracts against a checklist (Claude vision + Tesseract OCR fallback)
- email-drafter: bullet notes → polished Czech/English business emails
- invoice-extractor: PDF/image invoice → structured data → Excel
- translator: Czech-first translator across 19 languages with tone control
- vv-check: find inconsistent unit prices across VV sheets in one workbook
- vv-compare: diff original vs new VV files (changes / added / removed)
- feature-request: portal users submit ideas + sample files

Infrastructure:
- LiteLLM gateway with per-app virtual keys + budgets
- Langfuse observability
- Geist font, shared theme, cross-subdomain back link + theme sync via cookie/URL
- Caddy reverse proxy on *.klas.chat
258 lines
9.2 KiB
Python
258 lines
9.2 KiB
Python
"""Add color-coded highlights and a Czech-correct summary page to a PDF."""
|
|
import logging
|
|
from pathlib import Path
|
|
|
|
import fitz # PyMuPDF
|
|
|
|
# Module-level logger named after this module.
logger = logging.getLogger(__name__)


# Colours keyed by checklist status, used both for highlight annotations and
# for the bullet squares on the summary page.  PyMuPDF takes RGB components
# as floats in the 0-1 range.
COLORS = dict(
    ok=(0.69, 0.91, 0.69),       # green
    warning=(1.00, 0.90, 0.45),  # yellow
    problem=(1.00, 0.65, 0.65),  # red
    missing=(0.85, 0.85, 0.85),  # grey
)


# DejaVu Sans (shipped via fonts-dejavu-core) supports the full Czech glyph set.
FONT_PATH_SANS = "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"
FONT_PATH_BOLD = "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf"
|
|
|
|
|
|
def _highlight_quote(doc, quote: str, color: tuple, title: str, comment: str) -> bool:
    """Highlight every hit of ``quote`` on every page of ``doc``.

    Returns True when at least one highlight annotation was added.
    """
    note = f"{title}\n\n{comment}" if comment else title
    found_any = False
    for page in doc:
        rects = page.search_for(quote, quads=False)
        if not rects and len(quote) > 20:
            # Full quote not found on this page: retry with only its first
            # 20 characters (presumably to tolerate text-layer differences
            # further into the quote -- confirm intent with the author).
            rects = page.search_for(quote[:20], quads=False)
        if not rects:
            continue
        for rect in rects:
            annot = page.add_highlight_annot(rect)
            annot.set_colors(stroke=color)
            annot.set_info(title=title, content=note)
            annot.update()
            found_any = True
    return found_any


def annotate(input_pdf: Path, output_pdf: Path, items: list[dict],
             overall_summary: str = "", risk_level: str = "",
             skip_highlights: bool = False,
             contract_name: str = "") -> Path:
    """Open ``input_pdf``, highlight excerpts, prepend a summary page, save.

    Args:
        input_pdf: PDF to read.
        output_pdf: Path the annotated PDF is written to.
        items: Checklist results.  The keys read here are ``status``
            (selects the highlight colour, default ``"warning"``),
            ``title`` / ``label`` / ``id`` (first non-empty is used as the
            annotation title) and ``excerpts`` (list of dicts with ``text``
            and optional ``comment``).
        overall_summary: Free text passed through to the summary page.
        risk_level: Risk key passed through to the summary page.
        skip_highlights: When True no per-excerpt highlights are added;
            the summary page is still built from all ``items``.
        contract_name: Shown on the summary page and, when non-empty,
            written to the PDF metadata title as ``"Kontrola: <name>"``.

    Returns:
        ``output_pdf``.

    The document is closed even when searching, summary building or saving
    raises, so no file handle is leaked on failure.
    """
    doc = fitz.open(str(input_pdf))
    highlighted_count = 0
    not_found_count = 0
    try:
        if skip_highlights:
            logger.info("Skipping per-excerpt highlights (OCR'd PDF — no text layer)")
        items_to_annotate = [] if skip_highlights else items

        for item in items_to_annotate:
            color = COLORS.get(item.get("status", "warning"), COLORS["warning"])
            title = item.get("title") or item.get("label") or item.get("id", "")
            for ex in item.get("excerpts") or []:
                quote = (ex.get("text") or "").strip()
                if not quote:
                    continue
                comment = (ex.get("comment") or "").strip()
                # Counters are per excerpt, not per highlight rectangle.
                if _highlight_quote(doc, quote, color, title, comment):
                    highlighted_count += 1
                else:
                    not_found_count += 1
                    logger.info("Quote not found in PDF: %r", quote[:60])

        # Build & prepend summary
        summary = _build_summary_pdf(doc, items, overall_summary, risk_level,
                                     contract_name)
        if summary is not None:
            try:
                doc.insert_pdf(summary, start_at=0)
            finally:
                summary.close()

        # Set PDF metadata title for nice display in viewers
        if contract_name:
            meta = doc.metadata or {}
            meta["title"] = f"Kontrola: {contract_name}"
            doc.set_metadata(meta)

        doc.save(str(output_pdf), garbage=4, deflate=True)
    finally:
        doc.close()

    logger.info("Annotated PDF: highlighted=%d not_found=%d", highlighted_count, not_found_count)
    return output_pdf
|
|
|
|
|
|
def _build_summary_pdf(orig_doc, items: list[dict],
|
|
overall_summary: str, risk_level: str,
|
|
contract_name: str):
|
|
"""Build a 1-N page summary PDF using a Czech-supporting font."""
|
|
if not orig_doc.page_count:
|
|
return None
|
|
src_rect = orig_doc[0].rect
|
|
width = max(float(src_rect.width), 595.0) # ensure at least A4
|
|
height = max(float(src_rect.height), 842.0)
|
|
|
|
new = fitz.open()
|
|
page = new.new_page(width=width, height=height)
|
|
_register_fonts(page)
|
|
|
|
margin_x = 50.0
|
|
y = 50.0
|
|
title_size = 18
|
|
body_size = 10.5
|
|
line_h = body_size * 1.45
|
|
|
|
# Header line: filename
|
|
if contract_name:
|
|
_draw_text(page, contract_name, margin_x, y, font="sans",
|
|
size=11, color=(0.40, 0.45, 0.55))
|
|
y += 18
|
|
|
|
# Title
|
|
_draw_text(page, "Kontrola smluvních podmínek", margin_x, y,
|
|
font="bold", size=title_size, color=(0.06, 0.10, 0.20))
|
|
y += title_size * 1.6
|
|
|
|
# Risk badge line
|
|
if risk_level:
|
|
labels = {"low": "NÍZKÉ", "medium": "STŘEDNÍ", "high": "VYSOKÉ"}
|
|
colors = {
|
|
"low": (0.20, 0.65, 0.32),
|
|
"medium": (0.85, 0.55, 0.10),
|
|
"high": (0.80, 0.20, 0.20),
|
|
}
|
|
_draw_text(page, f"Celková míra rizika: {labels.get(risk_level, risk_level.upper())}",
|
|
margin_x, y, font="bold", size=12,
|
|
color=colors.get(risk_level, (0.4, 0.4, 0.4)))
|
|
y += 22
|
|
|
|
# Overall summary
|
|
if overall_summary:
|
|
y = _wrap_text(page, overall_summary, margin_x, y,
|
|
width - 2 * margin_x, body_size, font="sans")
|
|
y += 12
|
|
|
|
y += 6
|
|
page.draw_line((margin_x, y), (width - margin_x, y),
|
|
color=(0.85, 0.85, 0.85))
|
|
y += 16
|
|
|
|
_draw_text(page, "Položky kontroly", margin_x, y,
|
|
font="bold", size=12, color=(0.15, 0.20, 0.30))
|
|
y += 18
|
|
|
|
status_labels = {"ok": "OK", "warning": "POZOR",
|
|
"problem": "PROBLÉM", "missing": "CHYBÍ"}
|
|
for item in items:
|
|
# Need new page?
|
|
if y > height - 80:
|
|
page = new.new_page(width=width, height=height)
|
|
_register_fonts(page)
|
|
y = 50
|
|
|
|
status = item.get("status", "")
|
|
color = COLORS.get(status, (0.6, 0.6, 0.6))
|
|
label = status_labels.get(status, status.upper())
|
|
title = item.get("title") or item.get("label") or item.get("id", "")
|
|
|
|
# Colored bullet square
|
|
page.draw_rect(
|
|
fitz.Rect(margin_x, y, margin_x + 10, y + 10),
|
|
color=color, fill=color,
|
|
)
|
|
_draw_text(page, f"[{label}] {title}",
|
|
margin_x + 18, y, font="bold", size=11,
|
|
color=(0.06, 0.10, 0.20))
|
|
y += line_h + 4
|
|
|
|
summary = item.get("summary", "")
|
|
if summary:
|
|
y = _wrap_text(page, summary, margin_x + 18, y,
|
|
width - 2 * margin_x - 18, body_size,
|
|
font="sans", color=(0.30, 0.35, 0.45))
|
|
|
|
# List page references for each excerpt
|
|
excerpts = item.get("excerpts") or []
|
|
if excerpts:
|
|
for ex in excerpts:
|
|
pg = ex.get("page")
|
|
text = (ex.get("text") or "").strip()
|
|
if not text:
|
|
continue
|
|
pg_str = f"str. {pg}: " if pg else ""
|
|
snippet = text if len(text) <= 90 else text[:87] + "…"
|
|
y = _wrap_text(page, f"• {pg_str}„{snippet}\"",
|
|
margin_x + 18, y,
|
|
width - 2 * margin_x - 18, body_size - 0.5,
|
|
font="sans", color=(0.25, 0.30, 0.40))
|
|
cmt = (ex.get("comment") or "").strip()
|
|
if cmt:
|
|
y = _wrap_text(page, f" — {cmt}",
|
|
margin_x + 18, y,
|
|
width - 2 * margin_x - 18, body_size - 0.5,
|
|
font="sans", color=(0.45, 0.50, 0.60))
|
|
|
|
y += 12
|
|
|
|
return new
|
|
|
|
|
|
# ── font helpers ─────────────────────────────────────────
|
|
|
|
def _register_fonts(page):
    """Register DejaVu Sans on ``page`` as the fonts ``"sans"`` and ``"bold"``.

    Best effort: nothing is raised.  If the bold file cannot be inserted the
    regular file is registered under ``"bold"`` instead.  Every failure is
    logged so a missing font package shows up in the logs rather than only
    as mangled diacritics in the output.
    """
    # ``except Exception`` is deliberate: this is a best-effort boundary and
    # the exact exception types raised by ``insert_font`` are not pinned here.
    try:
        page.insert_font(fontname="sans", fontfile=FONT_PATH_SANS)
    except Exception as e:
        logger.warning("Could not register DejaVuSans: %s", e)

    try:
        page.insert_font(fontname="bold", fontfile=FONT_PATH_BOLD)
    except Exception as e:
        logger.warning("Could not register DejaVuSans-Bold, falling back to regular: %s", e)
        # Fall back to regular for bold
        try:
            page.insert_font(fontname="bold", fontfile=FONT_PATH_SANS)
        except Exception as fallback_err:
            logger.warning("Could not register fallback font for bold: %s", fallback_err)
|
|
|
|
|
|
def _draw_text(page, text: str, x: float, y: float,
|
|
font: str = "sans", size: float = 10.5,
|
|
color: tuple = (0.06, 0.10, 0.20)):
|
|
"""Render a single line at baseline y+size."""
|
|
try:
|
|
page.insert_text((x, y + size), text,
|
|
fontname=font, fontsize=size, color=color)
|
|
except Exception:
|
|
# Fallback to PyMuPDF built-in (may mangle diacritics but won't crash)
|
|
page.insert_text((x, y + size), text,
|
|
fontsize=size, color=color)
|
|
|
|
|
|
def _wrap_text(page, text: str, x: float, y: float, max_width: float,
|
|
font_size: float, font: str = "sans",
|
|
color: tuple = (0.06, 0.10, 0.20)) -> float:
|
|
"""Word-wrap `text` and return the new y position."""
|
|
line_h = font_size * 1.45
|
|
# PyMuPDF has Page.get_text_length() for width calculation
|
|
def measure(s: str) -> float:
|
|
try:
|
|
return fitz.get_text_length(s, fontname=font, fontsize=font_size)
|
|
except Exception:
|
|
return len(s) * font_size * 0.50
|
|
|
|
words = text.split()
|
|
if not words:
|
|
return y
|
|
line = ""
|
|
for word in words:
|
|
candidate = (line + " " + word).strip()
|
|
if measure(candidate) > max_width and line:
|
|
_draw_text(page, line, x, y, font=font,
|
|
size=font_size, color=color)
|
|
y += line_h
|
|
line = word
|
|
else:
|
|
line = candidate
|
|
if line:
|
|
_draw_text(page, line, x, y, font=font, size=font_size, color=color)
|
|
y += line_h
|
|
return y
|