"""Rule-based room extraction from DXF files.""" import math import re import ezdxf # Defaults shown to user; they can remove or replace. DEFAULT_EXAMPLES = ["č.m. 0301", "01024"] MEASUREMENT_KW = ( "podlaha:", "stěny:", "strop:", "m2", "m²", "povrchová", "nátěr", "penetrační", "vrstva", "odstín", "ral ", "chodníky:", "klenba:", "portály:", "epoxidový", "beton", "dlažba", "omítka", "obklad", ) def example_to_regex(example: str) -> re.Pattern | None: """ Convert an example like '4-22408' or 'č.m. 0301' into a compiled regex. - digits become wildcards (\\d, exactly N digits where the example had N digits) - everything else is matched literally - an optional trailing letter is allowed (for variants like '0301a') """ if not example or not example.strip(): return None s = example.strip() parts: list[str] = [] run = 0 for ch in s: if ch.isdigit(): run += 1 continue if run: parts.append(rf"\d{{{run}}}") run = 0 parts.append(re.escape(ch)) if run: parts.append(rf"\d{{{run}}}") pattern = "^(" + "".join(parts) + r"[a-zA-Z]?)$" try: return re.compile(pattern, re.IGNORECASE) except re.error: return None def compile_examples(examples: list[str] | None) -> list[re.Pattern]: items = examples if examples is not None else DEFAULT_EXAMPLES out = [] for ex in items: rx = example_to_regex(ex) if rx is not None: out.append(rx) return out def _clean_mtext(text: str) -> str: text = re.sub(r"\{\\[^;]*;", "", text) text = re.sub(r"\\[Pp]", " ", text) text = text.replace("}", "") text = re.sub(r"\s*\d+[,\.]\d*\s*m2\s*$", "", text) return " ".join(text.split()).strip() def _is_measurement(text: str) -> bool: t = text.lower() return any(k in t for k in MEASUREMENT_KW) def _is_room_marker(text: str, regexes: list[re.Pattern]) -> re.Match | None: s = text.strip() for rx in regexes: m = rx.fullmatch(s) if m: return m return None def _is_dimension(text: str, regexes: list[re.Pattern]) -> bool: """ Anything that's effectively just digits (with optional separators) and is NOT claimed by any active room pattern — treat as a dimension/measurement value. """ if _is_room_marker(text, regexes): return False clean = re.sub(r"[\s.,\-x×+]", "", text) return clean.isdigit() and len(clean) > 0 def _all_text_entities(dxf_path: str) -> list[dict]: doc = ezdxf.readfile(dxf_path) msp = doc.modelspace() out = [] for ent in msp: try: if ent.dxftype() == "TEXT": t = ent.dxf.text.strip() x, y = ent.dxf.insert.x, ent.dxf.insert.y elif ent.dxftype() == "MTEXT": t = _clean_mtext(ent.text) x, y = ent.dxf.insert.x, ent.dxf.insert.y else: continue if t: out.append({"text": t, "x": x, "y": y}) except Exception: pass return out def _nearest_description(rx_x: float, ry: float, candidates: list[dict], regexes: list[re.Pattern], max_dist: float = 8000) -> str | None: best, best_d = None, max_dist for c in candidates: t = c["text"] if _is_measurement(t) or _is_dimension(t, regexes) or len(t.strip()) < 2: continue if _is_room_marker(t, regexes): continue d = math.hypot(c["x"] - rx_x, c["y"] - ry) if d < best_d: best_d, best = d, t return best def extract_rooms(dxf_path: str, examples: list[str] | None = None) -> tuple[list[dict], list[dict]]: """ Returns (rooms, unmatched_texts). examples: user-provided room-number examples; None → DEFAULT_EXAMPLES. """ regexes = compile_examples(examples) entities = _all_text_entities(dxf_path) room_markers, other = [], [] for e in entities: m = _is_room_marker(e["text"], regexes) if m: room_markers.append({"room": m.group(1), "x": e["x"], "y": e["y"]}) else: other.append(e) used: set[str] = set() seen_rooms: set[str] = set() rooms: list[dict] = [] for rm in room_markers: if rm["room"] in seen_rooms: continue seen_rooms.add(rm["room"]) desc = _nearest_description(rm["x"], rm["y"], other, regexes) rooms.append({ "room": rm["room"], "description": desc or "", "x": round(rm["x"], 1), "y": round(rm["y"], 1), "source": "rule", "confidence": 1.0 if desc else 0.6, }) if desc: used.add(desc) unmatched = [ e for e in other if e["text"] not in used and not _is_measurement(e["text"]) and not _is_dimension(e["text"], regexes) and len(e["text"]) > 3 ] return rooms, unmatched