Apps: - dwg-rooms: extract room numbers from DWG/DXF - dwg-counting: count symbols in PDF drawings (OpenCV template matching) - contract-check: review PDF contracts against a checklist (Claude vision + Tesseract OCR fallback) - email-drafter: bullet notes → polished Czech/English business emails - invoice-extractor: PDF/image invoice → structured data → Excel - translator: Czech-first translator across 19 languages with tone control - vv-check: find inconsistent unit prices across VV sheets in one workbook - vv-compare: diff original vs new VV files (changes / added / removed) - feature-request: portal users submit ideas + sample files Infrastructure: - LiteLLM gateway with per-app virtual keys + budgets - Langfuse observability - Geist font, shared theme, cross-subdomain back link + theme sync via cookie/URL - Caddy reverse proxy on *.klas.chat
169 lines
5.0 KiB
Python
169 lines
5.0 KiB
Python
"""Rule-based room extraction from DXF files."""
|
||
import math
|
||
import re
|
||
|
||
import ezdxf
|
||
|
||
# Default room-number examples shown to the user; they can remove or replace
# them. Used by compile_examples() when the caller passes None.
DEFAULT_EXAMPLES = ["č.m. 0301", "01024"]

# Lowercase substrings checked by _is_measurement() against the lowercased
# text. Any text containing one of them is skipped both as a room description
# candidate and in the "unmatched" list returned by extract_rooms().
MEASUREMENT_KW = (
    "podlaha:", "stěny:", "strop:", "m2", "m²", "povrchová", "nátěr",
    "penetrační", "vrstva", "odstín", "ral ", "chodníky:", "klenba:",
    "portály:", "epoxidový", "beton", "dlažba", "omítka", "obklad",
)
|
||
|
||
|
||
def example_to_regex(example: str) -> re.Pattern | None:
|
||
"""
|
||
Convert an example like '4-22408' or 'č.m. 0301' into a compiled regex.
|
||
- digits become wildcards (\\d, exactly N digits where the example had N digits)
|
||
- everything else is matched literally
|
||
- an optional trailing letter is allowed (for variants like '0301a')
|
||
"""
|
||
if not example or not example.strip():
|
||
return None
|
||
s = example.strip()
|
||
parts: list[str] = []
|
||
run = 0
|
||
for ch in s:
|
||
if ch.isdigit():
|
||
run += 1
|
||
continue
|
||
if run:
|
||
parts.append(rf"\d{{{run}}}")
|
||
run = 0
|
||
parts.append(re.escape(ch))
|
||
if run:
|
||
parts.append(rf"\d{{{run}}}")
|
||
pattern = "^(" + "".join(parts) + r"[a-zA-Z]?)$"
|
||
try:
|
||
return re.compile(pattern, re.IGNORECASE)
|
||
except re.error:
|
||
return None
|
||
|
||
|
||
def compile_examples(examples: list[str] | None) -> list[re.Pattern]:
    """
    Compile room-number examples into regexes, dropping any that yield nothing.

    examples: user-provided samples; None falls back to DEFAULT_EXAMPLES
    (an empty list is respected and produces no patterns).
    """
    samples = DEFAULT_EXAMPLES if examples is None else examples
    return [rx for rx in map(example_to_regex, samples) if rx is not None]
|
||
|
||
|
||
def _clean_mtext(text: str) -> str:
|
||
text = re.sub(r"\{\\[^;]*;", "", text)
|
||
text = re.sub(r"\\[Pp]", " ", text)
|
||
text = text.replace("}", "")
|
||
text = re.sub(r"\s*\d+[,\.]\d*\s*m2\s*$", "", text)
|
||
return " ".join(text.split()).strip()
|
||
|
||
|
||
def _is_measurement(text: str) -> bool:
    """Return True if the lowercased text contains any MEASUREMENT_KW substring."""
    lowered = text.lower()
    for keyword in MEASUREMENT_KW:
        if keyword in lowered:
            return True
    return False
|
||
|
||
|
||
def _is_room_marker(text: str, regexes: list[re.Pattern]) -> re.Match | None:
|
||
s = text.strip()
|
||
for rx in regexes:
|
||
m = rx.fullmatch(s)
|
||
if m:
|
||
return m
|
||
return None
|
||
|
||
|
||
def _is_dimension(text: str, regexes: list[re.Pattern]) -> bool:
    """
    Tell whether text is a bare number rather than a room marker or a label.

    Text claimed by any active room pattern is never a dimension. Otherwise
    whitespace and the separators ``. , - x × +`` are removed, and the text
    counts as a dimension when what remains is non-empty and all digits.
    """
    if _is_room_marker(text, regexes) is not None:
        return False
    digits_only = re.sub(r"[\s.,\-x×+]", "", text)
    # str.isdigit() is already False for the empty string.
    return digits_only.isdigit()
|
||
|
||
|
||
def _all_text_entities(dxf_path: str) -> list[dict]:
    """
    Read a DXF file and collect its modelspace TEXT and MTEXT strings.

    Returns one dict per non-empty text: {"text": str, "x": float, "y": float},
    where x/y come from the entity's insert point. TEXT content is stripped;
    MTEXT content goes through _clean_mtext(). Other entity types are ignored.

    Extraction is best effort: an entity whose attributes cannot be read is
    skipped, and the failure is logged at DEBUG level instead of being
    silently discarded. Errors from opening the file itself propagate.
    """
    import logging  # local import so this block does not depend on file-level imports

    log = logging.getLogger(__name__)
    doc = ezdxf.readfile(dxf_path)
    found: list[dict] = []
    for ent in doc.modelspace():
        kind = "unknown"
        try:
            kind = ent.dxftype()
            if kind == "TEXT":
                content = ent.dxf.text.strip()
            elif kind == "MTEXT":
                content = _clean_mtext(ent.text)
            else:
                continue
            insert = ent.dxf.insert
            x, y = insert.x, insert.y
        except Exception:
            # Deliberately broad: one malformed entity must not abort the
            # whole drawing, but leave a trace for troubleshooting.
            log.debug("Skipping unreadable %s entity in %s", kind, dxf_path, exc_info=True)
            continue
        if content:
            found.append({"text": content, "x": x, "y": y})
    return found
|
||
|
||
|
||
def _nearest_description(rx_x: float, ry: float, candidates: list[dict],
                         regexes: list[re.Pattern], max_dist: float = 8000) -> str | None:
    """
    Find the text closest to the point (rx_x, ry) that can serve as a room label.

    Candidates that are measurements, dimensions, room markers, or shorter than
    two characters after stripping are ignored. Only candidates strictly closer
    than max_dist (Euclidean distance in drawing units) qualify; on equal
    distance the earlier candidate wins. Returns None if nothing qualifies.
    """
    closest_text = None
    closest_dist = max_dist
    for cand in candidates:
        label = cand["text"]
        unusable = (
            _is_measurement(label)
            or _is_dimension(label, regexes)
            or len(label.strip()) < 2
            or _is_room_marker(label, regexes)
        )
        if unusable:
            continue
        dist = math.hypot(cand["x"] - rx_x, cand["y"] - ry)
        if dist < closest_dist:
            closest_text, closest_dist = label, dist
    return closest_text
|
||
|
||
|
||
def extract_rooms(dxf_path: str, examples: list[str] | None = None) -> tuple[list[dict], list[dict]]:
    """
    Extract rooms from a DXF file using the example-derived patterns.

    examples: user-provided room-number examples; None → DEFAULT_EXAMPLES.

    Returns (rooms, unmatched_texts):
    - rooms: one dict per distinct room number (first occurrence wins) with
      keys room, description, x, y, source ("rule") and confidence
      (1.0 when a description was found nearby, otherwise 0.6).
    - unmatched_texts: remaining text entities longer than three characters
      that are neither measurements nor dimensions and whose text was not
      used as a description.
    """
    regexes = compile_examples(examples)

    # Split every text entity into room markers and everything else.
    markers: list[dict] = []
    leftovers: list[dict] = []
    for ent in _all_text_entities(dxf_path):
        hit = _is_room_marker(ent["text"], regexes)
        if hit is None:
            leftovers.append(ent)
        else:
            markers.append({"room": hit.group(1), "x": ent["x"], "y": ent["y"]})

    rooms: list[dict] = []
    known_numbers: set[str] = set()
    claimed_labels: set[str] = set()
    for marker in markers:
        number = marker["room"]
        if number in known_numbers:
            continue
        known_numbers.add(number)

        label = _nearest_description(marker["x"], marker["y"], leftovers, regexes)
        if label:
            claimed_labels.add(label)
        rooms.append({
            "room": number,
            "description": label or "",
            "x": round(marker["x"], 1),
            "y": round(marker["y"], 1),
            "source": "rule",
            "confidence": 1.0 if label else 0.6,
        })

    unmatched: list[dict] = []
    for ent in leftovers:
        content = ent["text"]
        if content in claimed_labels:
            continue
        if _is_measurement(content) or _is_dimension(content, regexes):
            continue
        if len(content) > 3:
            unmatched.append(ent)

    return rooms, unmatched
|