📦 Initialize Geulbeot structure and merge Prompts & test projects

2026-03-05 11:32:29 +09:00
commit 555a954458
687 changed files with 205247 additions and 0 deletions
--- a/Code/geulbeot_8th/handlers/tools/table.py
+++ b/Code/geulbeot_8th/handlers/tools/table.py
@@ -0,0 +1,328 @@
+# -*- coding: utf-8 -*-
+"""
+§6 표(Table) 구조 추출
+
+HWPX 실제 태그 (section0.xml):
+  <hp:tbl id="..." rowCnt="5" colCnt="3" cellSpacing="0"
+          repeatHeader="1" pageBreak="CELL" ...>
+    <hp:colSz><hp:widthList>8504 8504 8504</hp:widthList></hp:colSz>
+    또는 열 수에 맞는 hp:colSz 형태
+    <hp:tr>
+      <hp:tc name="" header="0" borderFillIDRef="5" ...>
+        <hp:cellAddr colAddr="0" rowAddr="0"/>
+        <hp:cellSpan colSpan="2" rowSpan="1"/>
+        <hp:cellSz width="17008" height="2400"/>
+        <hp:cellMargin left="510" right="510" top="142" bottom="142"/>
+        <hp:subList>
+          <hp:p ...><hp:run ...><hp:t>셀 텍스트</hp:t></hp:run></hp:p>
+        </hp:subList>
+      </hp:tc>
+    </hp:tr>
+  </hp:tbl>
+
+디폴트값 생성 안 함.
+"""
+
+import re
+
+from domain.hwpx.hwpx_utils import hwpunit_to_mm
+
+
+def extract(raw_xml: dict, parsed: dict = None) -> list | None:
+    """Extract every top-level table (§6) from the section XML.
+
+    The section XML is resolved from ``parsed``/``raw_xml`` and scanned for
+    ``<hp:tbl>`` elements. Only values actually present in the XML are
+    emitted; no defaults are invented, so any key below may be missing.
+
+    Args:
+        raw_xml: Mapping of part name to XML text (a plain XML string is
+            also accepted by the section lookup).
+        parsed: Optional dict; a truthy ``"section_xml"`` entry takes
+            precedence over ``raw_xml``.
+
+    Returns:
+        ``None`` when no section XML or no table is found, otherwise a list
+        with one dict per table, e.g.::
+
+            {
+                "index": 0,
+                "rowCnt": 5, "colCnt": 3,
+                "repeatHeader": True,
+                "pageBreak": "CELL",
+                "colWidths_hu": [8504, 8504, 8504],
+                "colWidths_pct": [33, 33, 33],
+                "rows": [[{...cell...}, ...], ...],
+            }
+
+        Each percentage is rounded on its own, so ``colWidths_pct`` is not
+        guaranteed to sum to 100.
+    """
+    section_xml = _get_section_xml(raw_xml, parsed)
+    if not section_xml:
+        return None
+
+    tables = []
+    for position, (attr_text, body) in enumerate(_find_tbl_blocks(section_xml)):
+        entry = {"index": position}
+
+        # Integer table attributes.
+        for name in ["rowCnt", "colCnt"]:
+            found = re.search(rf'\b{name}="(\d+)"', attr_text)
+            if found:
+                entry[name] = int(found.group(1))
+
+        repeat = re.search(r'\brepeatHeader="(\d+)"', attr_text)
+        if repeat:
+            entry["repeatHeader"] = bool(int(repeat.group(1)))
+
+        page_break = re.search(r'\bpageBreak="([^"]+)"', attr_text)
+        if page_break:
+            entry["pageBreak"] = page_break.group(1)
+
+        # Rows are parsed before the widths because the cell sizes are the
+        # fallback source for column widths.
+        rows = _extract_rows(body)
+        if rows:
+            entry["rows"] = rows
+
+        widths = _extract_col_widths(body)
+        if rows and not widths:
+            # No usable colSz data: derive widths from the cells, preferring
+            # a row without column spans, then the first row as-is.
+            widths = (
+                _col_widths_from_rows(rows, entry.get("colCnt", 0))
+                or _col_widths_from_first_row(rows[0])
+            )
+
+        if widths:
+            entry["colWidths_hu"] = widths
+            total = sum(widths) or 1  # guard against an all-zero width list
+            entry["colWidths_pct"] = [round(w / total * 100) for w in widths]
+
+        tables.append(entry)
+
+    return tables or None
+
+
+def _find_tbl_blocks(xml: str) -> list:
+    """Return the top-level ``<hp:tbl>`` blocks of ``xml``.
+
+    Tables nested inside another table are not returned separately; they
+    stay inside the inner text of their enclosing top-level table.
+
+    The scan searches from an offset into ``xml`` rather than re-slicing the
+    remaining text for every lookup, so the document is not copied over and
+    over while walking it.
+
+    Args:
+        xml: Section XML text.
+
+    Returns:
+        A list of ``(attrs, inner)`` tuples in document order. ``attrs`` is
+        the raw attribute text of the opening tag and ``inner`` is everything
+        between that tag and its matching ``</hp:tbl>``. A table whose
+        closing tag is missing is dropped, and scanning stops there because
+        no later table can be closed either.
+    """
+    opening = re.compile(r'<hp:tbl\b([^>]*)>')
+    boundary = re.compile(r'<hp:tbl\b|</hp:tbl>')
+
+    blocks = []
+    start = 0
+    while True:
+        head = opening.search(xml, start)
+        if head is None:
+            return blocks
+
+        content_start = head.end()
+        depth = 1  # the opening tag just matched
+        for edge in boundary.finditer(xml, content_start):
+            if edge.group() != '</hp:tbl>':
+                depth += 1  # nested table opens
+                continue
+            depth -= 1
+            if depth == 0:
+                blocks.append((head.group(1), xml[content_start:edge.start()]))
+                start = edge.end()
+                break
+        else:
+            # Ran out of closing tags before the table was balanced.
+            return blocks
+
+
+def _extract_col_widths(tbl_inner: str) -> list | None:
+    """Read the column widths (HWPUNIT) declared inside a table.
+
+    Two layouts are recognised, tried in this order:
+
+    1. ``<hp:widthList>8504 8504 8504</hp:widthList>`` - whitespace-separated
+       integers. If any token is not an integer this form is abandoned and
+       the second one is tried.
+    2. One ``<hp:colSz ... width="N">`` tag per column.
+
+    Returns:
+        The widths as a list of ints, or ``None`` when neither form yields
+        anything. A width list holding only whitespace yields ``[]``.
+    """
+    width_list = re.search(r'<hp:widthList>([^<]+)</hp:widthList>', tbl_inner)
+    if width_list is not None:
+        tokens = width_list.group(1).strip().split()
+        try:
+            return list(map(int, tokens))
+        except ValueError:
+            # Malformed list: fall through to the per-column tags.
+            pass
+
+    per_column = re.findall(r'<hp:colSz\b[^>]*\bwidth="(\d+)"', tbl_inner)
+    return [int(value) for value in per_column] if per_column else None
+
+
+def _extract_rows(tbl_inner: str) -> list:
+    """Parse the ``<hp:tr>``/``<hp:tc>`` elements of a table into a 2D list.
+
+    Returns:
+        One list per ``<hp:tr>``, each holding the parsed cell dict of every
+        ``<hp:tc>`` in that row. A row without cells yields an empty list.
+
+    Note:
+        Both patterns are non-greedy and stop at the first closing tag, so a
+        row or cell that contains a nested table is cut at the nested
+        table's first ``</hp:tr>`` / ``</hp:tc>``.
+    """
+    row_bodies = re.findall(r'<hp:tr\b[^>]*>(.*?)</hp:tr>', tbl_inner, re.DOTALL)
+    return [
+        [
+            _parse_cell(tc.group(1), tc.group(2))
+            for tc in re.finditer(
+                r'<hp:tc\b([^>]*)>(.*?)</hp:tc>', row_body, re.DOTALL
+            )
+        ]
+        for row_body in row_bodies
+    ]
+
+
+def _parse_cell(tc_attrs: str, tc_inner: str) -> dict:
+    """Parse a single ``<hp:tc>`` element.
+
+    Only values found in the XML are emitted; nothing is defaulted.
+
+    Args:
+        tc_attrs: Raw attribute text of the ``<hp:tc ...>`` opening tag.
+        tc_inner: Everything between ``<hp:tc ...>`` and ``</hp:tc>``.
+
+    Returns:
+        A dict with any of these keys:
+
+        * ``borderFillIDRef`` (int), ``isHeader`` (bool) - from the tc tag.
+        * ``colAddr``, ``rowAddr`` - from ``<hp:cellAddr>``. Each attribute
+          is read on its own, so attribute order in the tag does not matter.
+        * ``colSpan``, ``rowSpan`` - from ``<hp:cellSpan>``.
+        * ``width_hu``, ``height_hu`` - from ``<hp:cellSz>``.
+        * ``cellMargin`` - dict of the ``left/right/top/bottom`` values
+          present on ``<hp:cellMargin>``.
+        * ``text`` - all non-blank ``<hp:t>`` texts joined with one space.
+        * ``charPrIDRefs`` / ``primaryCharPrIDRef`` - ``charPrIDRef`` of each
+          ``<hp:run>`` and the first of them.
+        * ``paraPrIDRefs`` / ``primaryParaPrIDRef`` / ``styleIDRefs`` - the
+          matching attributes of each ``<hp:p>``.
+        * ``lines`` - one entry per ``<hp:p>`` that has non-blank text.
+
+    Note:
+        Text is taken only from ``<hp:t>`` elements that have no attributes
+        and no child elements, and XML entities are left as written.
+    """
+
+    def int_attr(name: str, fragment: str) -> int | None:
+        # Whole-word lookup of name="digits" inside one tag's attribute text.
+        found = re.search(rf'\b{name}="(\d+)"', fragment)
+        return int(found.group(1)) if found else None
+
+    def child_attrs(tag: str) -> str | None:
+        # Attribute text of the first <hp:tag ...> child, or None if absent.
+        found = re.search(rf'<hp:{tag}\b([^/]*)/?>', tc_inner)
+        return found.group(1) if found else None
+
+    def joined_text(fragment: str) -> str:
+        # Non-blank <hp:t> contents, trimmed and joined by a single space.
+        pieces = re.findall(r'<hp:t>([^<]*)</hp:t>', fragment)
+        return " ".join(piece.strip() for piece in pieces if piece.strip())
+
+    cell = {}
+
+    # Attributes carried by the <hp:tc> tag itself.
+    border_fill = int_attr("borderFillIDRef", tc_attrs)
+    if border_fill is not None:
+        cell["borderFillIDRef"] = border_fill
+
+    header = int_attr("header", tc_attrs)
+    if header is not None:
+        cell["isHeader"] = bool(header)
+
+    # Geometry children: (tag, ((xml attribute, output key), ...)).
+    # Values are compared against None, not truthiness, so a legitimate 0
+    # (for example colAddr="0") is kept.
+    geometry = (
+        ("cellAddr", (("colAddr", "colAddr"), ("rowAddr", "rowAddr"))),
+        ("cellSpan", (("colSpan", "colSpan"), ("rowSpan", "rowSpan"))),
+        ("cellSz", (("width", "width_hu"), ("height", "height_hu"))),
+    )
+    for tag, fields in geometry:
+        attrs = child_attrs(tag)
+        if attrs is None:
+            continue
+        for xml_name, key in fields:
+            value = int_attr(xml_name, attrs)
+            if value is not None:
+                cell[key] = value
+
+    margin_attrs = child_attrs("cellMargin")
+    if margin_attrs is not None:
+        margin = {}
+        for side in ("left", "right", "top", "bottom"):
+            value = int_attr(side, margin_attrs)
+            if value is not None:
+                margin[side] = value
+        if margin:
+            cell["cellMargin"] = margin
+
+    # Whole-cell text.
+    all_text = joined_text(tc_inner)
+    if all_text:
+        cell["text"] = all_text
+
+    # v2: charPrIDRef of each run in the cell (used to link styles).
+    run_cprs = re.findall(r'<hp:run\b[^>]*\bcharPrIDRef="(\d+)"', tc_inner)
+    if run_cprs:
+        cell["charPrIDRefs"] = [int(ref) for ref in run_cprs]
+        cell["primaryCharPrIDRef"] = int(run_cprs[0])
+
+    # v2: paraPrIDRef / styleIDRef of each paragraph in the cell.
+    para_pprs = re.findall(r'<hp:p\b[^>]*\bparaPrIDRef="(\d+)"', tc_inner)
+    if para_pprs:
+        cell["paraPrIDRefs"] = [int(ref) for ref in para_pprs]
+        cell["primaryParaPrIDRef"] = int(para_pprs[0])
+
+    para_stys = re.findall(r'<hp:p\b[^>]*\bstyleIDRef="(\d+)"', tc_inner)
+    if para_stys:
+        cell["styleIDRefs"] = [int(ref) for ref in para_stys]
+
+    # One line per paragraph that carries text.
+    paragraphs = re.findall(r'<hp:p\b[^>]*>(.*?)</hp:p>', tc_inner, re.DOTALL)
+    lines = [line for line in map(joined_text, paragraphs) if line]
+    if lines:
+        cell["lines"] = lines
+
+    return cell
+
+
+def _col_widths_from_first_row(first_row: list) -> list | None:
+    """Collect the ``width_hu`` of each cell in a row, in cell order.
+
+    Used as a substitute for column widths when the table has no colSz
+    data. Cells with a missing or zero ``width_hu`` are skipped.
+
+    Returns:
+        The collected widths, or ``None`` if no cell contributed one.
+    """
+    candidates = (cell.get("width_hu") for cell in first_row)
+    widths = [width for width in candidates if width]
+    return widths or None
+
+
+def _col_widths_from_rows(rows: list, col_cnt: int) -> list | None:
+    """v2: derive column widths from a row that has no column spans.
+
+    A first row containing merged cells gives misleading widths, so every
+    row is examined and the first one that has exactly ``col_cnt`` cells,
+    all with ``colSpan`` 1 (or unset) and all with a non-zero ``width_hu``,
+    is used. Its widths are returned ordered by ``colAddr``.
+
+    Returns:
+        ``None`` when ``rows`` is empty or ``col_cnt`` is 0/None; otherwise
+        the widths of the qualifying row, or - if no row qualifies - whatever
+        the first row's cell widths give.
+    """
+    if not rows or not col_cnt:
+        return None
+
+    for candidate in rows:
+        if len(candidate) != col_cnt:
+            continue
+        if any(cell.get("colSpan", 1) != 1 for cell in candidate):
+            continue
+        ordered = sorted(candidate, key=lambda cell: cell.get("colAddr", 0))
+        widths = [cell["width_hu"] for cell in ordered if cell.get("width_hu")]
+        if len(widths) == col_cnt:
+            return widths
+
+    # No span-free row with a full set of widths: fall back to the first row.
+    return _col_widths_from_first_row(rows[0])
+
+
+def _get_section_xml(raw_xml: dict, parsed: dict = None) -> str | None:
+    """Locate the section XML text to scan.
+
+    Lookup order:
+
+    1. A truthy ``parsed["section_xml"]``.
+    2. ``raw_xml`` itself when it is already a string.
+    3. The first string value in the ``raw_xml`` dict whose key contains
+       ``"section"`` (case-insensitive). Only that first match is used.
+
+    Returns:
+        The XML text, or ``None`` if none of the above applies.
+    """
+    preparsed = parsed.get("section_xml") if parsed else None
+    if preparsed:
+        return preparsed
+
+    if isinstance(raw_xml, str):
+        return raw_xml
+    if not isinstance(raw_xml, dict):
+        return None
+
+    return next(
+        (
+            content
+            for name, content in raw_xml.items()
+            if "section" in name.lower() and isinstance(content, str)
+        ),
+        None,
+    )