# test/handlers/tools/table.py

# -*- coding: utf-8 -*-
"""
§6 표(Table) 구조 추출

HWPX 실제 태그 (section0.xml):
  <hp:tbl id="..." rowCnt="5" colCnt="3" cellSpacing="0"
          repeatHeader="1" pageBreak="CELL" ...>
    <hp:colSz><hp:widthList>8504 8504 8504</hp:widthList></hp:colSz>
    또는 열 수에 맞는 hp:colSz 형태
    <hp:tr>
      <hp:tc name="" header="0" borderFillIDRef="5" ...>
        <hp:cellAddr colAddr="0" rowAddr="0"/>
        <hp:cellSpan colSpan="2" rowSpan="1"/>
        <hp:cellSz width="17008" height="2400"/>
        <hp:cellMargin left="510" right="510" top="142" bottom="142"/>
        <hp:subList>
          <hp:p ...><hp:run ...><hp:t>셀 텍스트</hp:t></hp:run></hp:p>
        </hp:subList>
      </hp:tc>
    </hp:tr>
  </hp:tbl>

디폴트값 생성 안 함.
"""

import re

from domain.hwpx.hwpx_utils import hwpunit_to_mm


def extract(raw_xml: dict, parsed: dict = None) -> list | None:
    """Extract every table (§6) from the section XML.

    Args:
        raw_xml: Source passed to ``_get_section_xml`` to resolve the
            section markup.
        parsed: Optional dict, also passed to ``_get_section_xml``.

    Returns:
        ``None`` when no section XML or no table is found. Otherwise a list
        with one dict per table, in the order the tables were found. A key
        is only present when its value could be read from the markup; no
        default values are invented.

        [
            {
                "index": 0,
                "rowCnt": 5, "colCnt": 3,
                "repeatHeader": True,
                "pageBreak": "CELL",
                "colWidths_hu": [8504, 8504, 8504],
                # each share is rounded on its own, so the percentages
                # are not guaranteed to add up to exactly 100
                "colWidths_pct": [33, 33, 33],
                "rows": [
                    [  # row 0
                        {
                            "colAddr": 0, "rowAddr": 0,
                            "colSpan": 2, "rowSpan": 1,
                            "width_hu": 17008, "height_hu": 2400,
                            "borderFillIDRef": 5,
                            "cellMargin": {"left": 510, "right": 510,
                                           "top": 142, "bottom": 142},
                            "text": "cell text",
                            "lines": ["cell text"],
                        },
                        ...
                    ],
                    ...
                ],
            },
            ...
        ]
    """
    xml_text = _get_section_xml(raw_xml, parsed)
    if not xml_text:
        return None

    blocks = _find_tbl_blocks(xml_text)
    if not blocks:
        return None

    tables = []
    for position, (head, body) in enumerate(blocks):
        entry = {"index": position}

        # Numeric attributes of the <hp:tbl> start tag.
        for key in ("rowCnt", "colCnt"):
            count = re.search(rf'\b{key}="(\d+)"', head)
            if count is not None:
                entry[key] = int(count.group(1))

        repeat = re.search(r'\brepeatHeader="(\d+)"', head)
        if repeat is not None:
            entry["repeatHeader"] = int(repeat.group(1)) != 0

        page_break = re.search(r'\bpageBreak="([^"]+)"', head)
        if page_break is not None:
            entry["pageBreak"] = page_break.group(1)

        # Rows are parsed before the widths because the cell sizes serve as
        # the fallback source for column widths.
        grid = _extract_rows(body)
        if grid:
            entry["rows"] = grid

        widths = _extract_col_widths(body)
        if grid and not widths:
            # No explicit column sizes: derive them from the cells, preferring
            # a row without column spans and falling back to the first row.
            widths = (
                _col_widths_from_rows(grid, entry.get("colCnt", 0))
                or _col_widths_from_first_row(grid[0])
            )
        if widths:
            entry["colWidths_hu"] = widths
            denominator = sum(widths) or 1  # guard against an all-zero list
            entry["colWidths_pct"] = [
                round(w / denominator * 100) for w in widths
            ]

        tables.append(entry)

    return tables or None


def _find_tbl_blocks(xml: str) -> list:
    """중첩 표를 고려하여 최상위 tbl 블록 추출"""
    blocks = []
    start = 0
    while True:
        # <hp:tbl 시작 찾기
        m = re.search(r'<hp:tbl\b([^>]*)>', xml[start:])
        if not m:
            break

        attrs = m.group(1)
        tag_start = start + m.start()
        content_start = start + m.end()

        # 중첩 카운트로 닫는 태그 찾기
        depth = 1
        pos = content_start
        while depth > 0 and pos < len(xml):
            open_m = re.search(r'<hp:tbl\b', xml[pos:])
            close_m = re.search(r'</hp:tbl>', xml[pos:])

            if close_m is None:
                break

            if open_m and open_m.start() < close_m.start():
                depth += 1
                pos += open_m.end()
            else:
                depth -= 1
                if depth == 0:
                    inner = xml[content_start:pos + close_m.start()]
                    blocks.append((attrs, inner))
                pos += close_m.end()

        start = pos

    return blocks


def _extract_col_widths(tbl_inner: str) -> list | None:
    """열 너비 HWPUNIT 추출"""
    # 패턴 1: <hp:colSz><hp:widthList>8504 8504 8504</hp:widthList>
    wl = re.search(r'<hp:widthList>([^<]+)</hp:widthList>', tbl_inner)
    if wl:
        try:
            return [int(w) for w in wl.group(1).strip().split()]
        except ValueError:
            pass

    # 패턴 2: 개별 colSz 태그
    cols = re.findall(r'<hp:colSz\b[^>]*\bwidth="(\d+)"', tbl_inner)
    if cols:
        return [int(c) for c in cols]

    return None


def _extract_rows(tbl_inner: str) -> list:
    """tr/tc 파싱하여 2D 셀 배열 반환"""
    rows = []

    tr_blocks = re.findall(
        r'<hp:tr\b[^>]*>(.*?)</hp:tr>', tbl_inner, re.DOTALL
    )

    for tr_inner in tr_blocks:
        cells = []
        tc_blocks = re.finditer(
            r'<hp:tc\b([^>]*)>(.*?)</hp:tc>', tr_inner, re.DOTALL
        )

        for tc_match in tc_blocks:
            tc_attrs = tc_match.group(1)
            tc_inner = tc_match.group(2)
            cell = _parse_cell(tc_attrs, tc_inner)
            cells.append(cell)

        rows.append(cells)

    return rows


def _parse_cell(tc_attrs: str, tc_inner: str) -> dict:
    """개별 셀 파싱"""
    cell = {}

    # borderFillIDRef on tc tag
    bf = re.search(r'\bborderFillIDRef="(\d+)"', tc_attrs)
    if bf:
        cell["borderFillIDRef"] = int(bf.group(1))

    # header flag
    hd = re.search(r'\bheader="(\d+)"', tc_attrs)
    if hd:
        cell["isHeader"] = bool(int(hd.group(1)))

    # cellAddr
    addr = re.search(
        r'<hp:cellAddr\b[^>]*\bcolAddr="(\d+)"[^>]*\browAddr="(\d+)"',
        tc_inner
    )
    if addr:
        cell["colAddr"] = int(addr.group(1))
        cell["rowAddr"] = int(addr.group(2))

    # cellSpan
    span = re.search(r'<hp:cellSpan\b([^/]*)/?>', tc_inner)
    if span:
        cs = re.search(r'\bcolSpan="(\d+)"', span.group(1))
        rs = re.search(r'\browSpan="(\d+)"', span.group(1))
        if cs:
            cell["colSpan"] = int(cs.group(1))
        if rs:
            cell["rowSpan"] = int(rs.group(1))

    # cellSz
    sz = re.search(r'<hp:cellSz\b([^/]*)/?>', tc_inner)
    if sz:
        w = re.search(r'\bwidth="(\d+)"', sz.group(1))
        h = re.search(r'\bheight="(\d+)"', sz.group(1))
        if w:
            cell["width_hu"] = int(w.group(1))
        if h:
            cell["height_hu"] = int(h.group(1))

    # cellMargin
    cm = re.search(r'<hp:cellMargin\b([^/]*)/?>', tc_inner)
    if cm:
        margin = {}
        for side in ["left", "right", "top", "bottom"]:
            m = re.search(rf'\b{side}="(\d+)"', cm.group(1))
            if m:
                margin[side] = int(m.group(1))
        if margin:
            cell["cellMargin"] = margin

    # 셀 텍스트
    texts = re.findall(r'<hp:t>([^<]*)</hp:t>', tc_inner)
    all_text = " ".join(t.strip() for t in texts if t.strip())
    if all_text:
        cell["text"] = all_text

    # ★ v2: 셀 내 run의 charPrIDRef 추출 (스타일 연결용)
    run_cprs = re.findall(r'<hp:run\b[^>]*\bcharPrIDRef="(\d+)"', tc_inner)
    if run_cprs:
        cell["charPrIDRefs"] = [int(c) for c in run_cprs]
        cell["primaryCharPrIDRef"] = int(run_cprs[0])

    # ★ v2: 셀 내 p의 paraPrIDRef, styleIDRef 추출
    para_pprs = re.findall(r'<hp:p\b[^>]*\bparaPrIDRef="(\d+)"', tc_inner)
    if para_pprs:
        cell["paraPrIDRefs"] = [int(p) for p in para_pprs]
        cell["primaryParaPrIDRef"] = int(para_pprs[0])

    para_stys = re.findall(r'<hp:p\b[^>]*\bstyleIDRef="(\d+)"', tc_inner)
    if para_stys:
        cell["styleIDRefs"] = [int(s) for s in para_stys]

    # 다중행 (p 태그 기준)
    paras = re.findall(r'<hp:p\b[^>]*>(.*?)</hp:p>', tc_inner, re.DOTALL)
    lines = []
    for p in paras:
        p_texts = re.findall(r'<hp:t>([^<]*)</hp:t>', p)
        line = " ".join(t.strip() for t in p_texts if t.strip())
        if line:
            lines.append(line)
    if lines:
        cell["lines"] = lines

    return cell


def _col_widths_from_first_row(first_row: list) -> list | None:
    """첫 행 셀의 width_hu에서 열 너비 추출 (colSz 없을 때 대체)"""
    widths = []
    for cell in first_row:
        w = cell.get("width_hu")
        if w:
            widths.append(w)
    return widths if widths else None


def _col_widths_from_rows(rows: list, col_cnt: int) -> list | None:
    """★ v2: 모든 행을 순회하여 colspan=1인 행에서 정확한 열 너비 추출.

    첫 행에 colspan이 있으면 열 너비가 부정확하므로,
    모든 열이 colspan=1인 행을 찾아 사용.
    """
    if not rows or not col_cnt:
        return None

    # colspan=1인 셀만 있는 행 찾기 (모든 열 존재)
    for row in rows:
        # 이 행의 모든 셀이 colspan=1이고, 셀 수 == col_cnt인지
        all_single = all(cell.get("colSpan", 1) == 1 for cell in row)
        if all_single and len(row) == col_cnt:
            widths = []
            for cell in sorted(row, key=lambda c: c.get("colAddr", 0)):
                w = cell.get("width_hu")
                if w:
                    widths.append(w)
            if len(widths) == col_cnt:
                return widths

    # 못 찾으면 첫 행 폴백
    return _col_widths_from_first_row(rows[0]) if rows else None


def _get_section_xml(raw_xml: dict, parsed: dict = None) -> str | None:
    if parsed and parsed.get("section_xml"):
        return parsed["section_xml"]
    if isinstance(raw_xml, dict):
        for name, content in raw_xml.items():
            if "section" in name.lower() and isinstance(content, str):
                return content
    return raw_xml if isinstance(raw_xml, str) else None