Initial portal commit: landing + 9 AI-powered apps
Apps: - dwg-rooms: extract room numbers from DWG/DXF - dwg-counting: count symbols in PDF drawings (OpenCV template matching) - contract-check: review PDF contracts against a checklist (Claude vision + Tesseract OCR fallback) - email-drafter: bullet notes → polished Czech/English business emails - invoice-extractor: PDF/image invoice → structured data → Excel - translator: Czech-first translator across 19 languages with tone control - vv-check: find inconsistent unit prices across VV sheets in one workbook - vv-compare: diff original vs new VV files (changes / added / removed) - feature-request: portal users submit ideas + sample files Infrastructure: - LiteLLM gateway with per-app virtual keys + budgets - Langfuse observability - Geist font, shared theme, cross-subdomain back link + theme sync via cookie/URL - Caddy reverse proxy on *.klas.chat
This commit is contained in:
257
contract-check/pdf_annotator.py
Normal file
257
contract-check/pdf_annotator.py
Normal file
@@ -0,0 +1,257 @@
|
||||
"""Add color-coded highlights and a Czech-correct summary page to a PDF."""
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
import fitz # PyMuPDF
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Status colours. PyMuPDF takes RGB components as floats in the 0-1 range.
_GREEN = (0.69, 0.91, 0.69)
_YELLOW = (1.00, 0.90, 0.45)
_RED = (1.00, 0.65, 0.65)
_GREY = (0.85, 0.85, 0.85)

# Maps an item's "status" value to the colour used for it.
COLORS = {
    "ok": _GREEN,
    "warning": _YELLOW,
    "problem": _RED,
    "missing": _GREY,
}

# DejaVu Sans is shipped via fonts-dejavu-core; supports full Czech glyph set.
FONT_PATH_SANS = "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"
FONT_PATH_BOLD = "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf"
|
||||
|
||||
|
||||
def _highlight_quote(doc, quote: str, color: tuple, title, comment: str) -> bool:
    """Highlight every occurrence of `quote` in `doc`; return True if any was found.

    Each page is searched for the full quote first. When a page has no hit
    and the quote is longer than 20 characters, the page is searched again
    for just the first 20 characters.
    """
    content = f"{title}\n\n{comment}" if comment else title
    found_any = False
    for page in doc:
        rects = page.search_for(quote, quads=False)
        if not rects and len(quote) > 20:
            rects = page.search_for(quote[:20], quads=False)
        for rect in rects or []:
            annot = page.add_highlight_annot(rect)
            annot.set_colors(stroke=color)
            annot.set_info(title=title, content=content)
            annot.update()
            found_any = True
    return found_any


def annotate(input_pdf: Path, output_pdf: Path, items: list[dict],
             overall_summary: str = "", risk_level: str = "",
             skip_highlights: bool = False,
             contract_name: str = "") -> Path:
    """Open input_pdf, highlight excerpts, prepend a summary page, save to output_pdf.

    Args:
        input_pdf: PDF to read.
        output_pdf: Destination of the annotated copy.
        items: Checklist results. Each dict is read for "status", for
            "title" / "label" / "id" (first truthy one is the annotation
            title) and for "excerpts", a list of dicts with "text" and an
            optional "comment".
        overall_summary: Free text forwarded to the summary page.
        risk_level: Risk key forwarded to the summary page.
        skip_highlights: When true, no excerpt is searched or highlighted;
            the summary page is still produced.
        contract_name: Shown on the summary page and, if non-empty, written
            to the PDF metadata title as "Kontrola: <name>".

    Returns:
        `output_pdf`, unchanged.
    """
    doc = fitz.open(str(input_pdf))
    # try/finally so the document handle is released even when searching,
    # summary building or saving raises.
    try:
        highlighted_count = 0
        not_found_count = 0
        items_to_annotate = [] if skip_highlights else items
        if skip_highlights:
            logger.info("Skipping per-excerpt highlights (OCR'd PDF — no text layer)")

        for item in items_to_annotate:
            # Unknown or absent statuses are drawn in the "warning" colour.
            color = COLORS.get(item.get("status", "warning"), COLORS["warning"])
            title = item.get("title") or item.get("label") or item.get("id", "")
            for ex in item.get("excerpts") or []:
                quote = (ex.get("text") or "").strip()
                if not quote:
                    continue
                comment = (ex.get("comment") or "").strip()
                if _highlight_quote(doc, quote, color, title, comment):
                    highlighted_count += 1
                else:
                    not_found_count += 1
                    logger.info("Quote not found in PDF: %r", quote[:60])

        # Build & prepend summary
        summary = _build_summary_pdf(doc, items, overall_summary, risk_level,
                                     contract_name)
        if summary is not None:
            try:
                doc.insert_pdf(summary, start_at=0)
            finally:
                # The summary is a separate document and must be closed
                # whether or not the merge succeeded.
                summary.close()

        # Set PDF metadata title for nice display in viewers
        if contract_name:
            meta = doc.metadata or {}
            meta["title"] = f"Kontrola: {contract_name}"
            doc.set_metadata(meta)

        doc.save(str(output_pdf), garbage=4, deflate=True)
    finally:
        doc.close()

    logger.info("Annotated PDF: highlighted=%d not_found=%d", highlighted_count, not_found_count)
    return output_pdf
|
||||
|
||||
|
||||
def _render_summary_pages(new, width: float, height: float, items: list[dict],
                          overall_summary: str, risk_level: str,
                          contract_name: str) -> None:
    """Draw the summary header and one entry per item into the document `new`."""
    margin_x = 50.0
    top_y = 50.0
    # Below this y a new page is started before drawing the next entry.
    bottom_limit = height - 80
    title_size = 18
    body_size = 10.5
    line_h = body_size * 1.45
    indent_x = margin_x + 18
    indent_width = width - 2 * margin_x - 18

    def start_page():
        # Fonts are registered on every page that is created.
        fresh = new.new_page(width=width, height=height)
        _register_fonts(fresh)
        return fresh

    page = start_page()
    y = top_y

    # Header line: filename
    if contract_name:
        _draw_text(page, contract_name, margin_x, y, font="sans",
                   size=11, color=(0.40, 0.45, 0.55))
        y += 18

    # Title
    _draw_text(page, "Kontrola smluvních podmínek", margin_x, y,
               font="bold", size=title_size, color=(0.06, 0.10, 0.20))
    y += title_size * 1.6

    # Risk badge line
    if risk_level:
        labels = {"low": "NÍZKÉ", "medium": "STŘEDNÍ", "high": "VYSOKÉ"}
        colors = {
            "low": (0.20, 0.65, 0.32),
            "medium": (0.85, 0.55, 0.10),
            "high": (0.80, 0.20, 0.20),
        }
        _draw_text(page, f"Celková míra rizika: {labels.get(risk_level, risk_level.upper())}",
                   margin_x, y, font="bold", size=12,
                   color=colors.get(risk_level, (0.4, 0.4, 0.4)))
        y += 22

    # Overall summary
    if overall_summary:
        y = _wrap_text(page, overall_summary, margin_x, y,
                       width - 2 * margin_x, body_size, font="sans")
        y += 12

    y += 6
    page.draw_line((margin_x, y), (width - margin_x, y),
                   color=(0.85, 0.85, 0.85))
    y += 16

    _draw_text(page, "Položky kontroly", margin_x, y,
               font="bold", size=12, color=(0.15, 0.20, 0.30))
    y += 18

    status_labels = {"ok": "OK", "warning": "POZOR",
                     "problem": "PROBLÉM", "missing": "CHYBÍ"}
    for item in items:
        if y > bottom_limit:
            page = start_page()
            y = top_y

        # `or ""` (not a .get default) so an explicit None status cannot
        # reach .upper() below.
        status = item.get("status") or ""
        color = COLORS.get(status, (0.6, 0.6, 0.6))
        label = status_labels.get(status, status.upper())
        title = item.get("title") or item.get("label") or item.get("id") or ""

        # Colored bullet square
        page.draw_rect(
            fitz.Rect(margin_x, y, margin_x + 10, y + 10),
            color=color, fill=color,
        )
        _draw_text(page, f"[{label}] {title}",
                   indent_x, y, font="bold", size=11,
                   color=(0.06, 0.10, 0.20))
        y += line_h + 4

        summary = item.get("summary", "")
        if summary:
            y = _wrap_text(page, summary, indent_x, y,
                           indent_width, body_size,
                           font="sans", color=(0.30, 0.35, 0.45))

        # List page references for each excerpt
        for ex in item.get("excerpts") or []:
            text = (ex.get("text") or "").strip()
            if not text:
                continue
            # Re-check per excerpt: an item with many excerpts would
            # otherwise keep drawing past the bottom of the page.
            if y > bottom_limit:
                page = start_page()
                y = top_y
            pg = ex.get("page")
            pg_str = f"str. {pg}: " if pg else ""
            snippet = text if len(text) <= 90 else text[:87] + "…"
            y = _wrap_text(page, f"• {pg_str}„{snippet}\"",
                           indent_x, y,
                           indent_width, body_size - 0.5,
                           font="sans", color=(0.25, 0.30, 0.40))
            cmt = (ex.get("comment") or "").strip()
            if cmt:
                y = _wrap_text(page, f"   — {cmt}",
                               indent_x, y,
                               indent_width, body_size - 0.5,
                               font="sans", color=(0.45, 0.50, 0.60))

        y += 12


def _build_summary_pdf(orig_doc, items: list[dict],
                       overall_summary: str, risk_level: str,
                       contract_name: str):
    """Build a 1-N page summary PDF using a Czech-supporting font.

    Args:
        orig_doc: The document being annotated; only its page count and the
            rectangle of its first page are read.
        items: Checklist results; each dict is read for "status",
            "title" / "label" / "id", "summary" and "excerpts" (dicts with
            "text", optional "page" and "comment").
        overall_summary: Free text drawn under the title when non-empty.
        risk_level: "low" / "medium" / "high" select a Czech label and a
            colour; any other non-empty value is shown upper-cased.
        contract_name: Drawn as a header line when non-empty.

    Returns:
        A new, open document that the caller must close, or None when
        `orig_doc` has no pages.
    """
    if not orig_doc.page_count:
        return None
    src_rect = orig_doc[0].rect
    width = max(float(src_rect.width), 595.0)   # ensure at least A4
    height = max(float(src_rect.height), 842.0)

    new = fitz.open()
    try:
        _render_summary_pages(new, width, height, items,
                              overall_summary, risk_level, contract_name)
    except Exception:
        # Do not leak the half-built document when rendering fails.
        new.close()
        raise
    return new
|
||||
|
||||
|
||||
# ── font helpers ─────────────────────────────────────────
|
||||
|
||||
def _register_fonts(page):
    """Insert DejaVu Sans on the page as "sans" and "bold", if available.

    Registration is best-effort: failures are logged and never raised. If
    the bold font file cannot be inserted, the regular file is registered
    under the "bold" name instead.
    """
    try:
        page.insert_font(fontname="sans", fontfile=FONT_PATH_SANS)
    except Exception as e:
        logger.warning("Could not register DejaVuSans: %s", e)

    try:
        page.insert_font(fontname="bold", fontfile=FONT_PATH_BOLD)
    except Exception as e:
        logger.warning("Could not register DejaVuSans-Bold, using regular instead: %s", e)
        # Fall back to regular for bold
        try:
            page.insert_font(fontname="bold", fontfile=FONT_PATH_SANS)
        except Exception as fallback_err:
            logger.warning("Could not register fallback bold font: %s", fallback_err)
|
||||
|
||||
|
||||
def _draw_text(page, text: str, x: float, y: float,
|
||||
font: str = "sans", size: float = 10.5,
|
||||
color: tuple = (0.06, 0.10, 0.20)):
|
||||
"""Render a single line at baseline y+size."""
|
||||
try:
|
||||
page.insert_text((x, y + size), text,
|
||||
fontname=font, fontsize=size, color=color)
|
||||
except Exception:
|
||||
# Fallback to PyMuPDF built-in (may mangle diacritics but won't crash)
|
||||
page.insert_text((x, y + size), text,
|
||||
fontsize=size, color=color)
|
||||
|
||||
|
||||
# Cache of fitz.Font objects used only for measuring text, keyed by the
# alias under which the font is registered on pages ("sans" / "bold").
# A value of None records that loading was attempted and failed.
_METRIC_FONTS: dict = {}


def _metric_font(alias: str):
    """Return a cached fitz.Font for a registered alias, or None if unavailable."""
    if alias not in _METRIC_FONTS:
        path = {"sans": FONT_PATH_SANS, "bold": FONT_PATH_BOLD}.get(alias)
        loaded = None
        if path is not None:
            try:
                loaded = fitz.Font(fontfile=path)
            except Exception as exc:
                logger.warning("Could not load %s for text measurement: %s", path, exc)
        _METRIC_FONTS[alias] = loaded
    return _METRIC_FONTS[alias]


def _wrap_text(page, text: str, x: float, y: float, max_width: float,
               font_size: float, font: str = "sans",
               color: tuple = (0.06, 0.10, 0.20)) -> float:
    """Word-wrap `text` and return the new y position.

    Words are split on whitespace and packed greedily into lines no wider
    than `max_width`; a single word wider than `max_width` is drawn on its
    own line without being broken. Each line is drawn with `_draw_text` and
    advances y by 1.45 * `font_size`. All lines go onto `page`; no page
    break is performed here.

    Returns:
        The y coordinate just below the last drawn line, or `y` unchanged
        when `text` contains no words.
    """
    line_h = font_size * 1.45
    metric_font = _metric_font(font)

    def measure(s: str) -> float:
        # Prefer the real metrics of the DejaVu file behind the alias.
        # fitz.get_text_length() only knows built-in font names, so for
        # "sans"/"bold" it would always fall through to the rough estimate.
        if metric_font is not None:
            return metric_font.text_length(s, fontsize=font_size)
        try:
            return fitz.get_text_length(s, fontname=font, fontsize=font_size)
        except Exception:
            # Last resort: assume an average glyph width of half the size.
            return len(s) * font_size * 0.50

    words = text.split()
    if not words:
        return y

    line = ""
    for word in words:
        candidate = (line + " " + word).strip()
        if measure(candidate) > max_width and line:
            _draw_text(page, line, x, y, font=font,
                       size=font_size, color=color)
            y += line_h
            line = word
        else:
            line = candidate
    if line:
        _draw_text(page, line, x, y, font=font, size=font_size, color=color)
        y += line_h
    return y
|
||||
Reference in New Issue
Block a user