AI_portal/dwg-rooms/extractor.py

"""Rule-based room extraction from DXF files."""
import logging
import math
import re

import ezdxf

# Example room labels offered to the user by default; the user is free to
# delete them or substitute their own.
DEFAULT_EXAMPLES = [
    "č.m. 0301",
    "01024",
]

# Lower-case substrings that mark a text as a measurement / finish annotation
# rather than a room description (Czech surface and material terms plus area
# units). Compared against lower-cased text by _is_measurement().
MEASUREMENT_KW = (
    "podlaha:",
    "stěny:",
    "strop:",
    "m2",
    "m²",
    "povrchová",
    "nátěr",
    "penetrační",
    "vrstva",
    "odstín",
    "ral ",
    "chodníky:",
    "klenba:",
    "portály:",
    "epoxidový",
    "beton",
    "dlažba",
    "omítka",
    "obklad",
)


def example_to_regex(example: str) -> re.Pattern | None:
    """
    Convert an example like '4-22408' or 'č.m. 0301' into a compiled regex.
      - digits become wildcards (\\d, exactly N digits where the example had N digits)
      - everything else is matched literally
      - an optional trailing letter is allowed (for variants like '0301a')
    """
    if not example or not example.strip():
        return None
    s = example.strip()
    parts: list[str] = []
    run = 0
    for ch in s:
        if ch.isdigit():
            run += 1
            continue
        if run:
            parts.append(rf"\d{{{run}}}")
            run = 0
        parts.append(re.escape(ch))
    if run:
        parts.append(rf"\d{{{run}}}")
    pattern = "^(" + "".join(parts) + r"[a-zA-Z]?)$"
    try:
        return re.compile(pattern, re.IGNORECASE)
    except re.error:
        return None


def compile_examples(examples: list[str] | None) -> list[re.Pattern]:
    """
    Compile room-number examples into regexes via example_to_regex().

    None selects DEFAULT_EXAMPLES; an explicit (even empty) list is used as
    given. Examples that yield no regex are dropped; order is preserved.
    """
    source = DEFAULT_EXAMPLES if examples is None else examples
    compiled = (example_to_regex(item) for item in source)
    return [rx for rx in compiled if rx is not None]


def _clean_mtext(text: str) -> str:
    """
    Reduce raw MTEXT content to a single-line plain label.

    Steps, in order:
      - drop inline-format openers of the form ``{\\...;``
      - turn ``\\P`` / ``\\p`` codes into a space
      - drop the remaining closing braces
      - strip a trailing area value such as '12,5 m2', '12 m2' or '8.25 m²'
      - collapse all whitespace runs to single spaces and trim the ends
    """
    text = re.sub(r"\{\\[^;]*;", "", text)
    text = re.sub(r"\\[Pp]", " ", text)
    text = text.replace("}", "")
    # The area suffix may be an integer or a decimal and may use either "m2"
    # or "m²" in any case; MEASUREMENT_KW treats both spellings the same way,
    # so a label that kept its suffix would later be rejected as a measurement.
    text = re.sub(
        r"\s*\d+(?:[,.]\d*)?\s*m(?:2|²)\s*$", "", text, flags=re.IGNORECASE
    )
    return " ".join(text.split())


def _is_measurement(text: str) -> bool:
    """Return True if *text*, lower-cased, contains any MEASUREMENT_KW entry."""
    lowered = text.lower()
    for keyword in MEASUREMENT_KW:
        if keyword in lowered:
            return True
    return False


def _is_room_marker(text: str, regexes: list[re.Pattern]) -> re.Match | None:
    s = text.strip()
    for rx in regexes:
        m = rx.fullmatch(s)
        if m:
            return m
    return None


def _is_dimension(text: str, regexes: list[re.Pattern]) -> bool:
    """
    Tell whether *text* is a bare numeric value rather than a label.

    A text that any active room pattern claims is never a dimension.
    Otherwise whitespace and the separators . , - x × + are removed, and the
    text counts as a dimension when what remains is non-empty and all digits.
    """
    if _is_room_marker(text, regexes) is not None:
        return False
    digits_only = re.sub(r"[\s.,\-x×+]", "", text)
    # str.isdigit() is already False for the empty string.
    return digits_only.isdigit()


def _all_text_entities(dxf_path: str) -> list[dict]:
    """
    Collect the non-empty TEXT and MTEXT entities of a DXF file's modelspace.

    Returns one dict per entity with keys "text" (stripped TEXT content, or
    MTEXT content passed through _clean_mtext), "x" and "y" (the entity's
    insert point). Entities of other types, and entities whose text is empty
    after cleaning, are omitted.

    Errors from opening the file via ezdxf.readfile() propagate. An error
    while reading a single entity skips that entity only.
    """
    doc = ezdxf.readfile(dxf_path)
    out = []
    for ent in doc.modelspace():
        try:
            kind = ent.dxftype()
            if kind == "TEXT":
                t = ent.dxf.text.strip()
            elif kind == "MTEXT":
                t = _clean_mtext(ent.text)
            else:
                continue
            if not t:
                continue
            insert = ent.dxf.insert
            out.append({"text": t, "x": insert.x, "y": insert.y})
        except Exception:
            # Best-effort extraction: one unreadable entity must not abort the
            # whole drawing, but leave a trace so the skip can be diagnosed.
            logging.getLogger(__name__).debug(
                "Skipping unreadable text entity %r", ent, exc_info=True
            )
    return out


def _nearest_description(rx_x: float, ry: float, candidates: list[dict],
                          regexes: list[re.Pattern], max_dist: float = 8000) -> str | None:
    """
    Return the text of the candidate closest to the point (rx_x, ry).

    Candidates are skipped when their text is shorter than two characters
    after stripping, or is a measurement, a dimension or a room marker.
    Only candidates strictly closer than *max_dist* qualify; on equal
    distance the earlier candidate wins. Returns None if nothing qualifies.
    """
    closest_text = None
    closest_dist = max_dist
    for cand in candidates:
        label = cand["text"]
        if len(label.strip()) < 2:
            continue
        if _is_measurement(label) or _is_dimension(label, regexes):
            continue
        if _is_room_marker(label, regexes):
            continue
        dist = math.hypot(cand["x"] - rx_x, cand["y"] - ry)
        if dist < closest_dist:
            closest_text = label
            closest_dist = dist
    return closest_text


def extract_rooms(dxf_path: str, examples: list[str] | None = None) -> tuple[list[dict], list[dict]]:
    """
    Extract rooms from the DXF file at *dxf_path*.

    Returns (rooms, unmatched_texts).
    examples: user-provided room-number examples; None → DEFAULT_EXAMPLES.

    rooms holds one dict per distinct room number (first occurrence wins) with
    keys room, description, x, y, source and confidence. confidence is 1.0
    when a nearby description was found and 0.6 otherwise.
    unmatched_texts are the non-marker text entities longer than three
    characters that are neither measurements nor dimensions and whose text
    was not used as a room description.
    """
    patterns = compile_examples(examples)

    # Partition the drawing's texts into room-number markers and the rest.
    markers: list[dict] = []
    leftovers: list[dict] = []
    for entity in _all_text_entities(dxf_path):
        hit = _is_room_marker(entity["text"], patterns)
        if hit is None:
            leftovers.append(entity)
            continue
        markers.append({"room": hit.group(1), "x": entity["x"], "y": entity["y"]})

    rooms: list[dict] = []
    claimed: set[str] = set()   # description texts already attached to a room
    emitted: set[str] = set()   # room numbers already reported
    for marker in markers:
        number = marker["room"]
        if number in emitted:
            continue
        emitted.add(number)

        label = _nearest_description(marker["x"], marker["y"], leftovers, patterns)
        if label:
            claimed.add(label)
            description, confidence = label, 1.0
        else:
            description, confidence = "", 0.6

        rooms.append({
            "room": number,
            "description": description,
            "x": round(marker["x"], 1),
            "y": round(marker["y"], 1),
            "source": "rule",
            "confidence": confidence,
        })

    unmatched: list[dict] = []
    for entity in leftovers:
        content = entity["text"]
        if len(content) <= 3 or content in claimed:
            continue
        if _is_measurement(content) or _is_dimension(content, patterns):
            continue
        unmatched.append(entity)

    return rooms, unmatched