Initial portal commit: landing + 9 AI-powered apps
Apps: - dwg-rooms: extract room numbers from DWG/DXF - dwg-counting: count symbols in PDF drawings (OpenCV template matching) - contract-check: review PDF contracts against a checklist (Claude vision + Tesseract OCR fallback) - email-drafter: bullet notes → polished Czech/English business emails - invoice-extractor: PDF/image invoice → structured data → Excel - translator: Czech-first translator across 19 languages with tone control - vv-check: find inconsistent unit prices across VV sheets in one workbook - vv-compare: diff original vs new VV files (changes / added / removed) - feature-request: portal users submit ideas + sample files Infrastructure: - LiteLLM gateway with per-app virtual keys + budgets - Langfuse observability - Geist font, shared theme, cross-subdomain back link + theme sync via cookie/URL - Caddy reverse proxy on *.klas.chat
This commit is contained in:
168
dwg-rooms/extractor.py
Normal file
168
dwg-rooms/extractor.py
Normal file
@@ -0,0 +1,168 @@
|
||||
"""Rule-based room extraction from DXF files."""
|
||||
import math
|
||||
import re
|
||||
|
||||
import ezdxf
|
||||
|
||||
# Example room labels offered to the user by default; the user may remove or
# replace them.
DEFAULT_EXAMPLES = [
    "č.m. 0301",
    "01024",
]

# Lower-case substrings checked by _is_measurement: a text containing any of
# them is treated as a measurement note. Note that "ral " deliberately keeps
# its trailing space.
MEASUREMENT_KW = (
    "podlaha:",
    "stěny:",
    "strop:",
    "m2",
    "m²",
    "povrchová",
    "nátěr",
    "penetrační",
    "vrstva",
    "odstín",
    "ral ",
    "chodníky:",
    "klenba:",
    "portály:",
    "epoxidový",
    "beton",
    "dlažba",
    "omítka",
    "obklad",
)
|
||||
|
||||
|
||||
def example_to_regex(example: str) -> re.Pattern | None:
|
||||
"""
|
||||
Convert an example like '4-22408' or 'č.m. 0301' into a compiled regex.
|
||||
- digits become wildcards (\\d, exactly N digits where the example had N digits)
|
||||
- everything else is matched literally
|
||||
- an optional trailing letter is allowed (for variants like '0301a')
|
||||
"""
|
||||
if not example or not example.strip():
|
||||
return None
|
||||
s = example.strip()
|
||||
parts: list[str] = []
|
||||
run = 0
|
||||
for ch in s:
|
||||
if ch.isdigit():
|
||||
run += 1
|
||||
continue
|
||||
if run:
|
||||
parts.append(rf"\d{{{run}}}")
|
||||
run = 0
|
||||
parts.append(re.escape(ch))
|
||||
if run:
|
||||
parts.append(rf"\d{{{run}}}")
|
||||
pattern = "^(" + "".join(parts) + r"[a-zA-Z]?)$"
|
||||
try:
|
||||
return re.compile(pattern, re.IGNORECASE)
|
||||
except re.error:
|
||||
return None
|
||||
|
||||
|
||||
def compile_examples(examples: list[str] | None) -> list[re.Pattern]:
    """Compile room-number examples into regexes, dropping unusable ones.

    ``None`` selects DEFAULT_EXAMPLES; an explicit list (even an empty one) is
    used as given. Examples for which example_to_regex returns None are
    skipped, and the order of the remaining patterns follows the input.
    """
    source = DEFAULT_EXAMPLES if examples is None else examples
    compiled = (example_to_regex(sample) for sample in source)
    return [pattern for pattern in compiled if pattern is not None]
|
||||
|
||||
|
||||
def _clean_mtext(text: str) -> str:
    """Reduce raw MTEXT content to a plain single-line label.

    Applied in order:
    - remove inline formatting openers of the form ``{\\...;``
    - replace ``\\P`` / ``\\p`` codes with a space
    - remove the closing ``}`` of formatting groups
    - strip a trailing decimal area such as ``12,5 m2`` or ``12.5 m²``
    - collapse every whitespace run to a single space and trim the ends
    """
    text = re.sub(r"\{\\[^;]*;", "", text)
    text = re.sub(r"\\[Pp]", " ", text)
    text = text.replace("}", "")
    # Accept both "m2" and "m²". MEASUREMENT_KW treats the two spellings
    # alike, so a label like "Name 12,5 m²" must lose its area here too;
    # otherwise the whole text is later rejected as a measurement.
    text = re.sub(r"\s*\d+[,\.]\d*\s*m[2²]\s*$", "", text)
    # split()/join already trims leading and trailing whitespace.
    return " ".join(text.split())
|
||||
|
||||
|
||||
def _is_measurement(text: str) -> bool:
    """Return True if the lower-cased *text* contains any MEASUREMENT_KW entry."""
    lowered = text.lower()
    for keyword in MEASUREMENT_KW:
        if keyword in lowered:
            return True
    return False
|
||||
|
||||
|
||||
def _is_room_marker(text: str, regexes: list[re.Pattern]) -> re.Match | None:
|
||||
s = text.strip()
|
||||
for rx in regexes:
|
||||
m = rx.fullmatch(s)
|
||||
if m:
|
||||
return m
|
||||
return None
|
||||
|
||||
|
||||
def _is_dimension(text: str, regexes: list[re.Pattern]) -> bool:
    """
    Decide whether *text* is a bare dimension/measurement value.

    A text counts as a dimension when no active room pattern claims it and,
    once whitespace and the separators ``. , - x × +`` are removed, only digits
    remain.
    """
    if _is_room_marker(text, regexes) is not None:
        return False
    digits_only = re.sub(r"[\s.,\-x×+]", "", text)
    # str.isdigit() is already False for "", so an empty remainder is rejected
    # without a separate length check.
    return digits_only.isdigit()
|
||||
|
||||
|
||||
def _all_text_entities(dxf_path: str) -> list[dict]:
    """Collect every non-empty TEXT/MTEXT label from the modelspace of a DXF file.

    Returns a list of ``{"text", "x", "y"}`` dicts, where x/y come from the
    entity's ``insert`` point. TEXT content is stripped; MTEXT content is
    passed through _clean_mtext. Entities of any other type are ignored.

    Reading is best-effort per entity: an entity that raises while being read
    is skipped and reported at debug level, so one bad entity does not abort
    the extraction. Errors raised while opening the file are not caught here.
    """
    import logging  # local import keeps this function self-contained

    log = logging.getLogger(__name__)
    doc = ezdxf.readfile(dxf_path)
    found: list[dict] = []
    for ent in doc.modelspace():
        try:
            kind = ent.dxftype()
            if kind == "TEXT":
                label = ent.dxf.text.strip()
            elif kind == "MTEXT":
                label = _clean_mtext(ent.text)
            else:
                continue
            point = ent.dxf.insert
            x, y = point.x, point.y
        except Exception:
            # Deliberately broad: skip whatever is wrong with this one entity,
            # but leave a trace instead of silently losing the label.
            log.debug("Skipping unreadable text entity in %s", dxf_path, exc_info=True)
            continue
        if label:
            found.append({"text": label, "x": x, "y": y})
    return found
|
||||
|
||||
|
||||
def _nearest_description(rx_x: float, ry: float, candidates: list[dict],
                         regexes: list[re.Pattern], max_dist: float = 8000) -> str | None:
    """Return the text of the closest usable candidate to the point (rx_x, ry).

    A candidate is ignored when its text is shorter than two characters after
    stripping, is a measurement note, is a bare dimension, or is itself a room
    marker. Distance is Euclidean over the candidate's ``x``/``y``; only
    candidates strictly closer than *max_dist* qualify, and on equal distance
    the earlier candidate wins. Returns None when nothing qualifies.
    """
    closest_text: str | None = None
    closest_dist = max_dist
    for cand in candidates:
        label = cand["text"]
        if len(label.strip()) < 2 or _is_measurement(label):
            continue
        if _is_room_marker(label, regexes) or _is_dimension(label, regexes):
            continue
        dist = math.hypot(cand["x"] - rx_x, cand["y"] - ry)
        if dist < closest_dist:
            closest_text, closest_dist = label, dist
    return closest_text
|
||||
|
||||
|
||||
def extract_rooms(dxf_path: str, examples: list[str] | None = None) -> tuple[list[dict], list[dict]]:
    """
    Returns (rooms, unmatched_texts).
    examples: user-provided room-number examples; None → DEFAULT_EXAMPLES.

    rooms holds one dict per distinct room label (the first occurrence wins)
    with keys room, description, x, y, source and confidence; confidence is
    1.0 when a description was found nearby and 0.6 otherwise.
    unmatched_texts holds the remaining text entities that are longer than
    three characters, are neither measurements nor dimensions, and whose text
    was not used as any room's description.
    """
    patterns = compile_examples(examples)

    # Split all texts into room markers and everything else.
    markers: list[dict] = []
    leftovers: list[dict] = []
    for entity in _all_text_entities(dxf_path):
        hit = _is_room_marker(entity["text"], patterns)
        if hit is None:
            leftovers.append(entity)
        else:
            markers.append({"room": hit.group(1), "x": entity["x"], "y": entity["y"]})

    rooms: list[dict] = []
    handled_labels: set[str] = set()
    claimed_texts: set[str] = set()
    for marker in markers:
        label = marker["room"]
        if label in handled_labels:
            # Repeated room label: keep only its first occurrence.
            continue
        handled_labels.add(label)

        description = _nearest_description(marker["x"], marker["y"], leftovers, patterns)
        if description:
            claimed_texts.add(description)
        rooms.append({
            "room": label,
            "description": description or "",
            "x": round(marker["x"], 1),
            "y": round(marker["y"], 1),
            "source": "rule",
            "confidence": 1.0 if description else 0.6,
        })

    def _still_unmatched(entity: dict) -> bool:
        text = entity["text"]
        if text in claimed_texts:
            return False
        if _is_measurement(text) or _is_dimension(text, patterns):
            return False
        return len(text) > 3

    unmatched = [entity for entity in leftovers if _still_unmatched(entity)]
    return rooms, unmatched
|
||||
Reference in New Issue
Block a user