Files
AI_portal/dwg-rooms/extractor.py
Ondřej Glaser 48cef99257 Initial portal commit: landing + 9 AI-powered apps
Apps:
- dwg-rooms: extract room numbers from DWG/DXF
- dwg-counting: count symbols in PDF drawings (OpenCV template matching)
- contract-check: review PDF contracts against a checklist (Claude vision + Tesseract OCR fallback)
- email-drafter: bullet notes → polished Czech/English business emails
- invoice-extractor: PDF/image invoice → structured data → Excel
- translator: Czech-first translator across 19 languages with tone control
- vv-check: find inconsistent unit prices across VV sheets in one workbook
- vv-compare: diff original vs new VV files (changes / added / removed)
- feature-request: portal users submit ideas + sample files

Infrastructure:
- LiteLLM gateway with per-app virtual keys + budgets
- Langfuse observability
- Geist font, shared theme, cross-subdomain back link + theme sync via cookie/URL
- Caddy reverse proxy on *.klas.chat
2026-05-13 15:25:04 +02:00

169 lines
5.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Rule-based room extraction from DXF files."""
import math
import re
import ezdxf
# Defaults shown to the user; they can remove or replace them.
DEFAULT_EXAMPLES = ["č.m. 0301", "01024"]

# Lower-case substrings that mark a text as a surface/finish/measurement note
# rather than a room description (matched against the lower-cased text).
# NOTE(review): the entry after "m2" was an empty string "" in the original.
# An empty keyword is a substring of every text, so every text was classified
# as a measurement. It is presumably a lost non-ASCII spelling of square
# metres; "m²" is used here — confirm against the upstream file.
MEASUREMENT_KW = (
    "podlaha:", "stěny:", "strop:", "m2", "m²", "povrchová", "nátěr",
    "penetrační", "vrstva", "odstín", "ral ", "chodníky:", "klenba:",
    "portály:", "epoxidový", "beton", "dlažba", "omítka", "obklad",
)
def example_to_regex(example: str) -> re.Pattern | None:
"""
Convert an example like '4-22408' or 'č.m. 0301' into a compiled regex.
- digits become wildcards (\\d, exactly N digits where the example had N digits)
- everything else is matched literally
- an optional trailing letter is allowed (for variants like '0301a')
"""
if not example or not example.strip():
return None
s = example.strip()
parts: list[str] = []
run = 0
for ch in s:
if ch.isdigit():
run += 1
continue
if run:
parts.append(rf"\d{{{run}}}")
run = 0
parts.append(re.escape(ch))
if run:
parts.append(rf"\d{{{run}}}")
pattern = "^(" + "".join(parts) + r"[a-zA-Z]?)$"
try:
return re.compile(pattern, re.IGNORECASE)
except re.error:
return None
def compile_examples(examples: list[str] | None) -> list[re.Pattern]:
items = examples if examples is not None else DEFAULT_EXAMPLES
out = []
for ex in items:
rx = example_to_regex(ex)
if rx is not None:
out.append(rx)
return out
def _clean_mtext(text: str) -> str:
text = re.sub(r"\{\\[^;]*;", "", text)
text = re.sub(r"\\[Pp]", " ", text)
text = text.replace("}", "")
text = re.sub(r"\s*\d+[,\.]\d*\s*m2\s*$", "", text)
return " ".join(text.split()).strip()
def _is_measurement(text: str) -> bool:
    """Return True if ``text`` contains any measurement/finish keyword.

    The comparison is done on the lower-cased text against MEASUREMENT_KW.
    """
    lowered = text.lower()
    # An empty keyword is a substring of every string; skip such entries so a
    # blank item in the keyword list cannot flag every text as a measurement.
    return any(kw and kw in lowered for kw in MEASUREMENT_KW)
def _is_room_marker(text: str, regexes: list[re.Pattern]) -> re.Match | None:
s = text.strip()
for rx in regexes:
m = rx.fullmatch(s)
if m:
return m
return None
def _is_dimension(text: str, regexes: list[re.Pattern]) -> bool:
    """Tell whether ``text`` is a bare numeric value rather than a room number.

    A text claimed by any active room pattern is never a dimension. Otherwise
    whitespace and the separators ``. , - x × +`` are removed and the text
    counts as a dimension when only digits remain.
    """
    if _is_room_marker(text, regexes) is not None:
        return False
    digits_only = re.sub(r"[\s.,\-x×+]", "", text)
    # str.isdigit() is already False for an empty string.
    return digits_only.isdigit()
def _all_text_entities(dxf_path: str) -> list[dict]:
    """Collect the non-empty TEXT and MTEXT entities from a DXF modelspace.

    Returns a list of ``{"text", "x", "y"}`` dicts, where x/y come from the
    entity's insert point. TEXT content is stripped; MTEXT content is passed
    through ``_clean_mtext``. Entities of any other type are ignored.

    Reading is best-effort per entity: an entity that raises while being
    inspected is skipped, and the failure is logged at debug level instead of
    being silently discarded. Errors from opening the file itself propagate.
    """
    import logging  # local import keeps this block self-contained

    log = logging.getLogger(__name__)
    doc = ezdxf.readfile(dxf_path)
    found: list[dict] = []
    for ent in doc.modelspace():
        try:
            kind = ent.dxftype()
            if kind == "TEXT":
                text = ent.dxf.text.strip()
            elif kind == "MTEXT":
                text = _clean_mtext(ent.text)
            else:
                continue
            insert = ent.dxf.insert
            x, y = insert.x, insert.y
        except Exception:
            # Deliberately broad: one malformed entity must not abort the
            # whole extraction, but the skip is now visible when debugging.
            log.debug("Skipping unreadable text entity in %s", dxf_path, exc_info=True)
            continue
        if text:
            found.append({"text": text, "x": x, "y": y})
    return found
def _nearest_description(rx_x: float, ry: float, candidates: list[dict],
regexes: list[re.Pattern], max_dist: float = 8000) -> str | None:
best, best_d = None, max_dist
for c in candidates:
t = c["text"]
if _is_measurement(t) or _is_dimension(t, regexes) or len(t.strip()) < 2:
continue
if _is_room_marker(t, regexes):
continue
d = math.hypot(c["x"] - rx_x, c["y"] - ry)
if d < best_d:
best_d, best = d, t
return best
def extract_rooms(dxf_path: str, examples: list[str] | None = None) -> tuple[list[dict], list[dict]]:
    """Extract rooms from a DXF file; return ``(rooms, unmatched_texts)``.

    ``examples`` are user-provided room-number examples; None means the
    module defaults. Each distinct room number yields one record (first
    occurrence wins) carrying its nearest description, rounded position,
    source tag and a confidence of 1.0 with a description or 0.6 without.
    ``unmatched_texts`` holds the remaining text entities that were not used
    as a description, are neither measurements nor dimensions, and are longer
    than three characters.
    """
    patterns = compile_examples(examples)

    # Split every text entity into room markers and everything else.
    markers: list[dict] = []
    leftovers: list[dict] = []
    for ent in _all_text_entities(dxf_path):
        hit = _is_room_marker(ent["text"], patterns)
        if hit:
            markers.append({"room": hit.group(1), "x": ent["x"], "y": ent["y"]})
        else:
            leftovers.append(ent)

    claimed_descriptions: set[str] = set()
    handled_numbers: set[str] = set()
    rooms: list[dict] = []
    for marker in markers:
        number = marker["room"]
        if number in handled_numbers:
            continue
        handled_numbers.add(number)

        description = _nearest_description(marker["x"], marker["y"], leftovers, patterns)
        record = {
            "room": number,
            "description": description or "",
            "x": round(marker["x"], 1),
            "y": round(marker["y"], 1),
            "source": "rule",
            "confidence": 1.0 if description else 0.6,
        }
        rooms.append(record)
        if description:
            claimed_descriptions.add(description)

    unmatched: list[dict] = []
    for ent in leftovers:
        label = ent["text"]
        if label in claimed_descriptions:
            continue
        if _is_measurement(label) or _is_dimension(label, patterns):
            continue
        if len(label) > 3:
            unmatched.append(ent)
    return rooms, unmatched