Initial portal commit: landing + 9 AI-powered apps
Apps:
- dwg-rooms: extract room numbers from DWG/DXF
- dwg-counting: count symbols in PDF drawings (OpenCV template matching)
- contract-check: review PDF contracts against a checklist (Claude vision + Tesseract OCR fallback)
- email-drafter: bullet notes → polished Czech/English business emails
- invoice-extractor: PDF/image invoice → structured data → Excel
- translator: Czech-first translator across 19 languages with tone control
- vv-check: find inconsistent unit prices across VV sheets in one workbook
- vv-compare: diff original vs new VV files (changes / added / removed)
- feature-request: portal users submit ideas + sample files

Infrastructure:
- LiteLLM gateway with per-app virtual keys + budgets
- Langfuse observability
- Geist font, shared theme, cross-subdomain back link + theme sync via cookie/URL
- Caddy reverse proxy on *.klas.chat
This commit is contained in:
190
invoice-extractor/extractor.py
Normal file
190
invoice-extractor/extractor.py
Normal file
@@ -0,0 +1,190 @@
|
||||
"""Invoice data extraction via Claude vision."""
|
||||
import base64
|
||||
import io
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import fitz # PyMuPDF
|
||||
from PIL import Image
|
||||
from openai import AsyncOpenAI
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Module-wide client instance; stays None until _get_client() builds it on first use.
_client: AsyncOpenAI | None = None
|
||||
# Model name passed to the chat-completions call; override with the LLM_MODEL env var.
MODEL = os.getenv("LLM_MODEL", "anthropic/claude-sonnet-4-20250514")
|
||||
# Only the first MAX_PAGES pages of a PDF are rendered; later pages are ignored.
MAX_PAGES = 6 # cap to keep token cost predictable
|
||||
# Images whose longer side exceeds this many pixels are downscaled before encoding.
MAX_IMG_LONG_EDGE = 2200 # px — enough resolution for invoice details
|
||||
|
||||
|
||||
def _get_client() -> AsyncOpenAI:
    """Return the shared ``AsyncOpenAI`` client, creating it on first call.

    The gateway URL and API key are read from the ``LITELLM_BASE_URL`` and
    ``LITELLM_API_KEY`` environment variables (with built-in fallbacks) only
    when the client is first constructed; later calls reuse the same instance.
    """
    global _client
    if _client is not None:
        return _client

    gateway_url = os.getenv("LITELLM_BASE_URL", "http://host.docker.internal:4000/v1")
    gateway_key = os.getenv("LITELLM_API_KEY", "sk-dummy")
    _client = AsyncOpenAI(base_url=gateway_url, api_key=gateway_key)
    return _client
|
||||
|
||||
|
||||
# System prompt (Czech) sent with every extraction request. It fixes the JSON
# shape the model must return; _normalize() later coerces the numeric fields.
SYSTEM_PROMPT = """Jste přesný extraktor dat z českých faktur. Z obrázku/obrázků faktury vyextrahujete strukturovaná data pro účetní systém.

Vraťte POUZE platný JSON v tomto tvaru, žádné markdown obaly, žádný komentář mimo JSON:

{
  "invoice_number": "string|null",
  "issue_date": "YYYY-MM-DD|null",
  "due_date": "YYYY-MM-DD|null",
  "taxable_date": "YYYY-MM-DD|null",
  "variable_symbol": "string|null",
  "constant_symbol": "string|null",
  "specific_symbol": "string|null",
  "currency": "CZK",
  "supplier": {
    "name": "string|null",
    "ico": "string|null",
    "dic": "string|null",
    "address": "string|null"
  },
  "customer": {
    "name": "string|null",
    "ico": "string|null",
    "dic": "string|null",
    "address": "string|null"
  },
  "bank_account": "string|null",
  "iban": "string|null",
  "payment_method": "převod|hotově|karta|dobírka|jiné|null",
  "line_items": [
    {
      "description": "string",
      "quantity": "number|null",
      "unit": "string|null",
      "unit_price_excluding_vat": "number|null",
      "vat_rate": "number|null",
      "total_excluding_vat": "number|null",
      "total_including_vat": "number|null"
    }
  ],
  "vat_breakdown": [
    {"rate": 21, "base": 0.0, "vat": 0.0, "total": 0.0}
  ],
  "total_excluding_vat": "number|null",
  "total_vat": "number|null",
  "total_including_vat": "number|null",
  "notes": "string|null"
}

Pravidla:
- Datumy zapisujte ve formátu YYYY-MM-DD (ISO 8601).
- Čísla zapisujte jako desetinná čísla s tečkou jako oddělovačem (např. 1234.56). NIKDY nepoužívejte čárku ani mezery v číslech.
- Pokud údaj na faktuře není, použijte null. NEVYMÝŠLEJTE si.
- IČO je 8místné číslo (může mít vedoucí nuly). DIČ obvykle začíná „CZ".
- variable_symbol je obvykle stejný jako číslo faktury, ale ne vždy — zapisujte přesně co je na faktuře.
- U položek (line_items) zachovejte přesné pořadí jak jsou na faktuře.
- Pokud je faktura v jiné měně než CZK, zapište správný kód měny (EUR, USD, atd.).
- Adresy zapisujte jako jeden řetězec (ulice + číslo, PSČ město)."""
|
||||
|
||||
|
||||
def _parse_json_reply(text: str):
    """Parse the model reply as JSON, tolerating prose around the JSON object.

    Tries the text as-is first; if that fails, retries on the span from the
    first ``{`` to the last ``}``. Raises ``json.JSONDecodeError`` when
    neither attempt yields valid JSON.
    """
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        start, end = text.find("{"), text.rfind("}")
        if start == -1 or end <= start:
            raise
        return json.loads(text[start:end + 1])


async def extract_invoice(pdf_or_image_path: Path) -> dict:
    """Extract structured invoice data from a PDF or image via a vision LLM.

    The file is rendered to base64 PNG page images by ``_to_images``, sent
    together with ``SYSTEM_PROMPT`` to the model named by ``MODEL``, and the
    JSON reply is parsed and passed through ``_normalize``.

    Args:
        pdf_or_image_path: Path to the invoice file.

    Returns:
        The parsed invoice dict as returned by ``_normalize``.

    Raises:
        RuntimeError: If the file yields no page images, the LLM call fails,
            the reply is empty, or the reply is not a JSON object.
    """
    # NOTE(review): rendering is synchronous and runs on the event loop thread.
    images = _to_images(pdf_or_image_path)
    if not images:
        raise RuntimeError("Soubor neobsahuje žádné zobrazitelné stránky")

    content = [
        {"type": "text",
         "text": "Z následujících obrázků faktury vyextrahujte strukturovaná data podle definovaného JSON tvaru."},
    ]
    content.extend(
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/png;base64,{img_b64}"},
        }
        for img_b64 in images
    )

    try:
        resp = await _get_client().chat.completions.create(
            model=MODEL,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": content},
            ],
            temperature=0.0,
            max_tokens=4000,
        )
    except Exception as exc:
        # Boundary with the remote service: log the traceback, surface one error type.
        logger.exception("LLM call failed")
        raise RuntimeError(f"AI extrakce selhala: {exc}") from exc

    if not resp.choices:
        logger.error("LLM response contained no choices")
        raise RuntimeError("AI extrakce selhala: model nevrátil žádnou odpověď")

    choice = resp.choices[0]
    if getattr(choice, "finish_reason", None) == "length":
        # Output hit max_tokens; the JSON below is most likely cut off.
        logger.warning("LLM output was truncated at max_tokens")

    raw = (choice.message.content or "").strip()
    # The prompt forbids markdown fences, but strip them if the model adds them anyway.
    raw = raw.removeprefix("```json").removeprefix("```").removesuffix("```").strip()
    try:
        data = _parse_json_reply(raw)
    except json.JSONDecodeError as exc:
        logger.error("JSON parse failed: %s\n%s", exc, raw[:500])
        raise RuntimeError(f"Nepodařilo se zpracovat odpověď AI: {exc}") from exc

    if not isinstance(data, dict):
        # _normalize() needs a mapping; a bare list/string/number is not a usable invoice.
        logger.error("LLM returned JSON of type %s instead of an object", type(data).__name__)
        raise RuntimeError("Nepodařilo se zpracovat odpověď AI: očekáván JSON objekt")

    return _normalize(data)
|
||||
|
||||
|
||||
def _to_images(path: Path) -> list[str]:
    """Render a PDF or image file to base64-encoded PNG strings, one per page.

    PDFs are rasterized at 200 dpi, at most ``MAX_PAGES`` pages; a single
    image file (.jpg, .jpeg, .png, .webp) yields a one-element list. Every
    page goes through ``_compress``.

    Args:
        path: File to render; the type is chosen from its suffix
            (case-insensitive).

    Returns:
        Base64 PNG strings in page order (empty for a PDF with no pages).

    Raises:
        RuntimeError: If the suffix is not a supported format.
    """
    suffix = path.suffix.lower()

    if suffix == ".pdf":
        pages = []
        # The context manager closes the document even if rendering a page raises.
        with fitz.open(str(path)) as doc:
            # zip() with the range first stops after MAX_PAGES pages.
            for _, page in zip(range(MAX_PAGES), doc):
                pix = page.get_pixmap(dpi=200, alpha=False)
                img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
                pages.append(_compress(img))
        return pages

    if suffix in (".jpg", ".jpeg", ".png", ".webp"):
        # convert() returns a fully loaded copy, so the file handle can be
        # released right away instead of lingering until garbage collection.
        with Image.open(path) as opened:
            img = opened.convert("RGB")
        return [_compress(img)]

    raise RuntimeError(f"Nepodporovaný formát: {suffix}")
|
||||
|
||||
|
||||
def _compress(img: Image.Image) -> str:
    """Return *img* as a base64-encoded PNG, downscaled if it is too large.

    When the longer side exceeds ``MAX_IMG_LONG_EDGE`` the image is resized
    with LANCZOS resampling so that the longer side fits that limit, keeping
    the aspect ratio. Smaller images are encoded unchanged.

    Args:
        img: Image to encode.

    Returns:
        ASCII base64 text of the optimized PNG bytes.
    """
    longest = max(img.size)
    if longest > MAX_IMG_LONG_EDGE:
        ratio = MAX_IMG_LONG_EDGE / longest
        # Clamp to 1 px: truncation would otherwise give a 0-px side for
        # extreme aspect ratios (e.g. 5000x1), which cannot be resized or saved.
        target = tuple(max(1, int(side * ratio)) for side in img.size)
        img = img.resize(target, Image.LANCZOS)

    buf = io.BytesIO()
    img.save(buf, format="PNG", optimize=True)
    return base64.b64encode(buf.getvalue()).decode("ascii")
|
||||
|
||||
|
||||
def _normalize(data: dict) -> dict:
    """Coerce the known numeric invoice fields to ``float`` (or ``None``).

    Touches the three top-level totals, the numeric keys of each
    ``line_items`` entry and the keys of each ``vat_breakdown`` entry. Only
    keys that are present are rewritten; all other fields are left alone.
    Entries that are not dicts, and sections that are not lists, are skipped.

    Args:
        data: Parsed invoice dict; modified in place.

    Returns:
        The same ``data`` object, for convenience.
    """

    def _num(v):
        """Return *v* as a float, or None when it is empty or unparseable."""
        if v is None or v == "":
            return None
        try:
            if isinstance(v, (int, float)):
                return float(v)
            # Drop all whitespace, including the non-breaking spaces used as
            # thousands separators ("1\u00a0234,56").
            text = "".join(str(v).split())
            if "," in text and "." in text:
                # Both marks present: the right-most one is the decimal mark,
                # the other is a thousands separator ("1.234,56", "1,234.56").
                thousands = "." if text.rfind(",") > text.rfind(".") else ","
                text = text.replace(thousands, "")
            return float(text.replace(",", "."))
        except (ValueError, TypeError, OverflowError):
            return None

    def _coerce(record, keys):
        """Convert the listed keys of *record* in place; ignore non-dict records."""
        if not isinstance(record, dict):
            return
        for key in keys:
            if key in record:
                record[key] = _num(record[key])

    for k in ("total_excluding_vat", "total_vat", "total_including_vat"):
        if k in data:
            data[k] = _num(data[k])

    nested_fields = (
        ("line_items", ("quantity", "unit_price_excluding_vat", "vat_rate",
                        "total_excluding_vat", "total_including_vat")),
        ("vat_breakdown", ("rate", "base", "vat", "total")),
    )
    for section, keys in nested_fields:
        entries = data.get(section)
        if not isinstance(entries, list):
            continue
        for entry in entries:
            _coerce(entry, keys)

    return data
|
||||
Reference in New Issue
Block a user