Initial portal commit: landing + 9 AI-powered apps

Apps:
- dwg-rooms: extract room numbers from DWG/DXF
- dwg-counting: count symbols in PDF drawings (OpenCV template matching)
- contract-check: review PDF contracts against a checklist (Claude vision + Tesseract OCR fallback)
- email-drafter: bullet notes → polished Czech/English business emails
- invoice-extractor: PDF/image invoice → structured data → Excel
- translator: Czech-first translator across 19 languages with tone control
- vv-check: find inconsistent unit prices across VV sheets in one workbook
- vv-compare: diff original vs new VV files (changes / added / removed)
- feature-request: portal users submit ideas + sample files

Infrastructure:
- LiteLLM gateway with per-app virtual keys + budgets
- Langfuse observability
- Geist font, shared theme, cross-subdomain back link + theme sync via cookie/URL
- Caddy reverse proxy on *.klas.chat
2026-05-13 15:25:04 +02:00
commit 48cef99257
139 changed files with 20171 additions and 0 deletions
--- a/dwg-counting/counting.py
+++ b/dwg-counting/counting.py
@@ -0,0 +1,286 @@
+"""Shape-based symbol counting via contour matching.
+
+Why not cv2.matchTemplate: CAD symbols are usually thin-line drawings on a
+white background (~10-20% ink). Normalized cross-correlation gives spuriously
+high scores for any sparse-ink region (e.g. wall edges), producing false
+positives everywhere.
+
+Approach used here:
+  1. Binarize template + drawing to "ink" maps.
+  2. Find external contours in both.
+  3. Use the template's main contour as a shape reference.
+  4. For every contour in the drawing, compare via cv2.matchShapes (Hu moments
+     — invariant to scale, rotation, translation).
+  5. Filter by area ratio (similar size to template, allowing ±factor).
+  6. Keep contours below a shape-distance threshold.
+
+This is the classic approach to CAD symbol takeoff.
+
+NOTE(review): the steps above describe a contour / cv2.matchShapes pipeline,
+but count_template() in this module currently scores candidates with
+cv2.matchTemplate (the larger of 1 - TM_SQDIFF_NORMED and TM_CCOEFF_NORMED)
+followed by non-max suppression, and debug_template() also uses
+cv2.matchTemplate. Reconcile this description with the implementation.
+"""
+import logging
+from pathlib import Path
+from typing import Iterable
+
+import cv2
+import numpy as np
+
+logger = logging.getLogger(__name__)
+
+# Default minimum similarity a position must reach to count as a match in
+# count_template (compared with `sim >= threshold`). Higher = stricter.
+# 0.65 is permissive, 0.85 strict.
+DEFAULT_THRESHOLD = 0.7
+
+
+def _prep(img_path: Path) -> np.ndarray:
+    """Load an image as grayscale and return its inverted binary ink map.
+
+    Pixels with a gray value of 245 or below become 255 (ink); brighter
+    pixels become 0 (background).
+
+    Deliberately no dilation: contour-based matching needs strokes to stay
+    thin, otherwise separate symbols fuse into one big contour through the
+    dense CAD linework.
+
+    Raises:
+        RuntimeError: if OpenCV cannot read ``img_path``.
+    """
+    gray = cv2.imread(str(img_path), cv2.IMREAD_GRAYSCALE)
+    if gray is None:
+        raise RuntimeError(f"Could not load {img_path}")
+    # cv2.threshold returns (used_threshold, image); only the image matters.
+    ink_map = cv2.threshold(gray, 245, 255, cv2.THRESH_BINARY_INV)[1]
+    return ink_map
+
+
+def _crop_to_content(template: np.ndarray, bg_threshold: int = 240) -> np.ndarray:
+    """Crop a 2-D template to the bounding box of its content, plus 2 px.
+
+    After _prep the template is BINARY (ink=255, bg=0), so 'content' is every
+    pixel with value > 0. An array is treated as binary when its maximum is
+    <= 1, or when its minimum is 0 and its maximum is 255. Anything else is
+    treated as grayscale, where content is every pixel darker than
+    ``bg_threshold`` (kept for backward compatibility).
+
+    Args:
+        template: 2-D image array (binary ink map or grayscale).
+        bg_threshold: grayscale value below which a pixel counts as content;
+            only used for non-binary input.
+
+    Returns:
+        A view of ``template`` cropped to the content bounding box, padded by
+        2 px on each side and clamped to the array bounds. The input is
+        returned unchanged when it is empty or contains no content.
+    """
+    # max()/min() raise on zero-size arrays; there is nothing to crop anyway.
+    if template.size == 0:
+        return template
+
+    lowest, highest = template.min(), template.max()
+    is_binary = highest <= 1 or (lowest == 0 and highest == 255)
+    mask = template > 0 if is_binary else template < bg_threshold
+
+    # Project the mask onto each axis instead of listing every content pixel:
+    # only the first/last occupied row and column are needed.
+    rows = np.flatnonzero(mask.any(axis=1))
+    if rows.size == 0:
+        return template
+    cols = np.flatnonzero(mask.any(axis=0))
+
+    pad = 2
+    y0 = max(0, int(rows[0]) - pad)
+    x0 = max(0, int(cols[0]) - pad)
+    y1 = min(template.shape[0], int(rows[-1]) + 1 + pad)
+    x1 = min(template.shape[1], int(cols[-1]) + 1 + pad)
+    return template[y0:y1, x0:x1]
+
+
+def _nms(boxes: list[tuple], scores: list[float], overlap_thresh: float = 0.3) -> list[int]:
+    """Greedy non-max suppression over (x1, y1, x2, y2) boxes.
+
+    Boxes are visited from highest to lowest score. Each visited box is kept,
+    and every remaining box whose IoU with it exceeds ``overlap_thresh`` is
+    dropped. Returns the indices (into ``boxes``) of the kept boxes, in the
+    order they were selected.
+    """
+    if not boxes:
+        return []
+
+    coords = np.array(boxes, dtype=np.float32)
+    left, top = coords[:, 0], coords[:, 1]
+    right, bottom = coords[:, 2], coords[:, 3]
+    box_area = (right - left) * (bottom - top)
+
+    remaining = np.argsort(scores)[::-1]
+    kept = []
+    while remaining.size > 0:
+        best, rest = remaining[0], remaining[1:]
+        kept.append(int(best))
+
+        # Intersection rectangle of the best box with every other candidate.
+        inter_left = np.maximum(left[best], left[rest])
+        inter_top = np.maximum(top[best], top[rest])
+        inter_right = np.minimum(right[best], right[rest])
+        inter_bottom = np.minimum(bottom[best], bottom[rest])
+        inter_w = np.maximum(0.0, inter_right - inter_left)
+        inter_h = np.maximum(0.0, inter_bottom - inter_top)
+        overlap = inter_w * inter_h
+
+        combined = box_area[best] + box_area[rest] - overlap
+        # Floor the denominator so degenerate (zero-area) boxes don't divide by 0.
+        iou = overlap / np.maximum(combined, 1e-6)
+        remaining = rest[iou <= overlap_thresh]
+    return kept
+
+
+# Longest-side pixel limit for the drawing: count_template downsamples any
+# drawing whose larger dimension exceeds this before matching.
+MAX_DRAWING_PX = 9000  # effectively no downscale for typical A1/A0
+
+
+def debug_template(template_path: Path, drawing_path: Path) -> dict:
+    """Return diagnostics for tuning a symbol template.
+
+    Both images are binarized with _prep and the template is cropped to its
+    ink with _crop_to_content. The cropped template is then matched against
+    the drawing (cv2.TM_CCOEFF_NORMED) at a fixed set of scales, and once
+    more at its native size to count positions above several thresholds.
+
+    Returns:
+        A dict with the keys ``template_size``, ``template_cropped_size``,
+        ``template_ink_pixels``, ``template_total_pixels`` and
+        ``drawing_size``. If the cropped template is under 8 px on a side,
+        an ``error`` key is added and nothing else is computed. Otherwise
+        the dict also carries ``max_score`` (best score over the scale
+        sweep; -1.0 when no scaled template fits in the drawing),
+        ``scale_scan`` (one entry per scale that fits) and
+        ``matches_at_threshold`` (raw position counts at native size, not
+        de-duplicated; all zero when the template is larger than the
+        drawing).
+
+    Raises:
+        RuntimeError: if either image cannot be loaded.
+    """
+    template = _prep(template_path)
+    template_cropped = _crop_to_content(template)
+    drawing = _prep(drawing_path)
+    info = {
+        "template_size": list(template.shape),
+        "template_cropped_size": list(template_cropped.shape),
+        "template_ink_pixels": int((template_cropped > 0).sum()
+                                   if template_cropped.max() == 255 and template_cropped.min() == 0
+                                   else (template_cropped < 240).sum()),
+        "template_total_pixels": int(template_cropped.size),
+        "drawing_size": list(drawing.shape),
+    }
+    if min(template_cropped.shape) < 8:
+        info["error"] = "template too small after content crop"
+        return info
+
+    # Scale sweep — finds the size at which template matches best
+    scale_scan = []
+    best_overall = -1.0
+    for scale in (0.15, 0.25, 0.35, 0.5, 0.7, 0.85, 1.0, 1.2, 1.5, 2.0):
+        nw = max(8, int(template_cropped.shape[1] * scale))
+        nh = max(8, int(template_cropped.shape[0] * scale))
+        if nh > drawing.shape[0] or nw > drawing.shape[1]:
+            continue
+        tmpl = cv2.resize(template_cropped, (nw, nh), interpolation=cv2.INTER_AREA)
+        res = cv2.matchTemplate(drawing, tmpl, cv2.TM_CCOEFF_NORMED)
+        m = float(res.max())
+        scale_scan.append({
+            "scale": scale,
+            "template_px": [nh, nw],
+            "max_score": round(m, 3),
+            "count_at_0.7": int((res >= 0.7).sum()),
+            "count_at_0.6": int((res >= 0.6).sum()),
+        })
+        best_overall = max(best_overall, m)
+    info["max_score"] = best_overall
+    info["scale_scan"] = scale_scan
+
+    # Threshold counts at scale=1.0 for reference
+    thresholds = (0.60, 0.70, 0.75, 0.80, 0.85, 0.90)
+    tmpl_h, tmpl_w = template_cropped.shape
+    if tmpl_h > drawing.shape[0] or tmpl_w > drawing.shape[1]:
+        # The template does not fit inside the drawing, so there is no valid
+        # placement to score; skip the call instead of handing OpenCV a
+        # template larger than the image (same guard as the sweep above).
+        info["matches_at_threshold"] = {f"{t:.2f}": 0 for t in thresholds}
+    else:
+        result = cv2.matchTemplate(drawing, template_cropped, cv2.TM_CCOEFF_NORMED)
+        info["matches_at_threshold"] = {
+            f"{t:.2f}": int((result >= t).sum()) for t in thresholds
+        }
+    return info
+
+
+def _merge_template_contour(ink: np.ndarray) -> np.ndarray | None:
+    """Fuse a template's strokes and return its largest contour.
+
+    A morphological close with a 5x5 kernel bridges small gaps so that
+    separate strokes of one symbol (e.g. the line + arc of '-C') join into a
+    single shape. The template is small enough that this is safe and does
+    not affect matching against the drawing.
+
+    Returns the contour with the greatest area, or None when the closed
+    image contains no contours at all.
+    """
+    kernel = np.ones((5, 5), np.uint8)
+    bridged = cv2.morphologyEx(ink, cv2.MORPH_CLOSE, kernel)
+    outlines, _hierarchy = cv2.findContours(
+        bridged, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE
+    )
+    if not outlines:
+        return None
+
+    # Linear scan for the largest area; the first contour wins on ties.
+    largest = outlines[0]
+    largest_area = cv2.contourArea(largest)
+    for candidate in outlines[1:]:
+        candidate_area = cv2.contourArea(candidate)
+        if candidate_area > largest_area:
+            largest, largest_area = candidate, candidate_area
+    return largest
+
+
+def count_template(
+    template_path: Path,
+    drawing_path: Path,
+    threshold: float = DEFAULT_THRESHOLD,
+    rotations: Iterable[int] = (0, 90, 180, 270),
+    scales: Iterable[float] = (0.6, 0.8, 1.0, 1.25, 1.5),
+    exclude_box: tuple | None = None,
+    area_tolerance: float = 200.0,
+    mirror: bool = True,
+) -> dict:
+    """Count template occurrences via multi-scale, multi-rotation matching.
+
+    Every rotated / scaled / (optionally) mirrored variant of the template is
+    slid over the drawing's ink map with cv2.matchTemplate. The similarity of
+    a position is the larger of ``1 - TM_SQDIFF_NORMED`` and
+    ``TM_CCOEFF_NORMED``. Positions at or above ``threshold`` become
+    candidate boxes, and overlapping candidates are collapsed with non-max
+    suppression (IoU above 0.3 is suppressed).
+
+    With mirror=True the template is also tested flipped horizontally — at
+    each rotation — so we catch mirrored instances (e.g. a socket facing
+    left vs facing right).
+
+    Args:
+        template_path: image of the symbol to look for.
+        drawing_path: image of the drawing to search.
+        threshold: minimum similarity for a position to count as a hit.
+        rotations: angles in degrees. 90/180/270 use np.rot90; any other
+            non-zero angle is warped onto an expanded canvas. Any iterable
+            is accepted, including one-shot iterators.
+        scales: resize factors applied to each rotated variant. Any iterable
+            is accepted, including one-shot iterators.
+        exclude_box: optional ``(x, y, w, h)`` region, in original drawing
+            pixels, that is blanked before matching (e.g. the legend).
+        area_tolerance: currently unused by this implementation; kept so
+            existing callers keep working.
+        mirror: also search with the horizontally flipped template.
+
+    Returns:
+        ``{"count": int, "matches": [...], "threshold_used": threshold}``
+        where each match is ``{"x", "y", "w", "h", "score"}`` expressed in
+        original drawing pixels.
+
+    Raises:
+        RuntimeError: if either image cannot be loaded.
+    """
+    import concurrent.futures
+
+    # Materialize once: both are re-iterated inside the nested loops below,
+    # so a one-shot iterable (e.g. a generator) would otherwise be exhausted
+    # after the first pass and silently drop most variants.
+    rotations = tuple(rotations)
+    scales = tuple(scales)
+
+    template = _prep(template_path)
+    template = _crop_to_content(template)
+    drawing = _prep(drawing_path)
+
+    # Downsample oversized drawings; coord_scale maps results back afterwards.
+    # NOTE(review): the template is not resized by the same factor — confirm
+    # whether templates are expected at the drawing's original resolution.
+    coord_scale = 1.0
+    if max(drawing.shape) > MAX_DRAWING_PX:
+        coord_scale = max(drawing.shape) / MAX_DRAWING_PX
+        new_w = int(drawing.shape[1] / coord_scale)
+        new_h = int(drawing.shape[0] / coord_scale)
+        drawing = cv2.resize(drawing, (new_w, new_h), interpolation=cv2.INTER_AREA)
+
+    if exclude_box is not None:
+        # Convert the box to (possibly downsampled) drawing coordinates,
+        # clamp it to the image, and erase the ink inside it.
+        ex_x, ex_y, ex_w, ex_h = exclude_box
+        ex_x = int(ex_x / coord_scale)
+        ex_y = int(ex_y / coord_scale)
+        ex_w = int(ex_w / coord_scale)
+        ex_h = int(ex_h / coord_scale)
+        h, w = drawing.shape
+        ex_x = max(0, min(w, ex_x))
+        ex_y = max(0, min(h, ex_y))
+        ex_x2 = max(0, min(w, ex_x + ex_w))
+        ex_y2 = max(0, min(h, ex_y + ex_h))
+        drawing[ex_y:ex_y2, ex_x:ex_x2] = 0
+        logger.info("Masked legend region (%d,%d,%d,%d)", ex_x, ex_y, ex_w, ex_h)
+
+    if min(template.shape) < 8 or int((template > 0).sum()) < 5:
+        logger.warning("Template too small / empty after preprocessing")
+        return {"count": 0, "matches": [], "threshold_used": threshold}
+
+    # Base variants: the template itself and, optionally, its mirror image.
+    variants = [template]
+    if mirror:
+        variants.append(cv2.flip(template, 1))
+
+    def _rotate(img, angle_deg):
+        if angle_deg == 0:
+            return img
+        if angle_deg in (90, 180, 270):
+            return np.rot90(img, k=angle_deg // 90)
+        # Arbitrary angle — rotate onto an expanded canvas so nothing is
+        # clipped; uncovered corners are filled with 0 (background in the
+        # ink map).
+        h, w = img.shape[:2]
+        center = (w / 2, h / 2)
+        M = cv2.getRotationMatrix2D(center, angle_deg, 1.0)
+        cos = abs(M[0, 0])
+        sin = abs(M[0, 1])
+        new_w = int(h * sin + w * cos)
+        new_h = int(h * cos + w * sin)
+        M[0, 2] += new_w / 2 - center[0]
+        M[1, 2] += new_h / 2 - center[1]
+        return cv2.warpAffine(img, M, (new_w, new_h),
+                              flags=cv2.INTER_NEAREST, borderValue=0)
+
+    # Build the full job list (one entry per rot×scale×mirror)
+    jobs_list = []
+    for variant in variants:
+        for rot in rotations:
+            base = _rotate(variant, rot)
+            for scale in scales:
+                new_w = max(8, int(base.shape[1] * scale))
+                new_h = max(8, int(base.shape[0] * scale))
+                # A template larger than the drawing cannot be placed anywhere.
+                if new_h > drawing.shape[0] or new_w > drawing.shape[1]:
+                    continue
+                tmpl = cv2.resize(base, (new_w, new_h),
+                                  interpolation=cv2.INTER_AREA)
+                jobs_list.append((tmpl, new_w, new_h))
+
+    def _run(args):
+        tmpl, new_w, new_h = args
+        sq = cv2.matchTemplate(drawing, tmpl, cv2.TM_SQDIFF_NORMED)
+        cc = cv2.matchTemplate(drawing, tmpl, cv2.TM_CCOEFF_NORMED)
+        # SQDIFF is a distance (0 = identical), so invert it before taking
+        # the better of the two similarity scores.
+        sim = np.maximum(1.0 - sq, cc)
+        ys, xs = np.nonzero(sim >= threshold)
+        # Pull coordinates and scores out in bulk rather than indexing the
+        # array once per hit.
+        hit_scores = sim[ys, xs].tolist()
+        return [
+            (float(x), float(y), float(x + new_w), float(y + new_h), score)
+            for x, y, score in zip(xs.tolist(), ys.tolist(), hit_scores)
+        ]
+
+    all_boxes: list[tuple] = []
+    all_scores: list[float] = []
+
+    # Threading: cv2.matchTemplate releases the GIL, so threads give real speedup
+    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as pool:
+        for hits in pool.map(_run, jobs_list):
+            for x0, y0, x1, y1, score in hits:
+                all_boxes.append((x0, y0, x1, y1))
+                all_scores.append(score)
+
+    if not all_boxes:
+        return {"count": 0, "matches": [], "threshold_used": threshold}
+
+    keep = _nms(all_boxes, all_scores, overlap_thresh=0.3)
+    matches = []
+    for i in keep:
+        x0, y0, x1, y1 = all_boxes[i]
+        # Scale back up to original drawing pixels.
+        matches.append({
+            "x": int(x0 * coord_scale),
+            "y": int(y0 * coord_scale),
+            "w": int((x1 - x0) * coord_scale),
+            "h": int((y1 - y0) * coord_scale),
+            "score": round(all_scores[i], 3),
+        })
+    return {"count": len(matches), "matches": matches, "threshold_used": threshold}