# -*- coding: utf-8 -*- """ §6 표(Table) 구조 추출 HWPX 실제 태그 (section0.xml): 8504 8504 8504 또는 열 수에 맞는 hp:colSz 형태 셀 텍스트 디폴트값 생성 안 함. """ import re from domain.hwpx.hwpx_utils import hwpunit_to_mm def extract(raw_xml: dict, parsed: dict = None) -> list | None: """§6 모든 표 추출. Returns: [ { "index": 0, "rowCnt": 5, "colCnt": 3, "repeatHeader": True, "pageBreak": "CELL", "colWidths_hu": [8504, 8504, 8504], "colWidths_pct": [33, 34, 33], "rows": [ [ # row 0 { "colAddr": 0, "rowAddr": 0, "colSpan": 2, "rowSpan": 1, "width_hu": 17008, "height_hu": 2400, "borderFillIDRef": 5, "cellMargin": {"left": 510, "right": 510, "top": 142, "bottom": 142}, "text": "셀 텍스트", "lines": ["셀 텍스트"], }, ... ], ... ], }, ... ] """ section_xml = _get_section_xml(raw_xml, parsed) if not section_xml: return None # tbl 블록 전체 추출 tbl_blocks = _find_tbl_blocks(section_xml) if not tbl_blocks: return None result = [] for idx, (tbl_attrs, tbl_inner) in enumerate(tbl_blocks): tbl = {"index": idx} # 표 속성 for attr in ["rowCnt", "colCnt"]: m = re.search(rf'\b{attr}="(\d+)"', tbl_attrs) if m: tbl[attr] = int(m.group(1)) rh = re.search(r'\brepeatHeader="(\d+)"', tbl_attrs) if rh: tbl["repeatHeader"] = bool(int(rh.group(1))) pb = re.search(r'\bpageBreak="([^"]+)"', tbl_attrs) if pb: tbl["pageBreak"] = pb.group(1) # 행/셀 (열 너비보다 먼저 — 첫 행에서 열 너비 추출 가능) rows = _extract_rows(tbl_inner) if rows: tbl["rows"] = rows # 열 너비 col_widths = _extract_col_widths(tbl_inner) if not col_widths and rows: # colSz 없으면 행 데이터에서 추출 (colspan 고려) col_cnt = tbl.get("colCnt", 0) col_widths = _col_widths_from_rows(rows, col_cnt) if not col_widths: col_widths = _col_widths_from_first_row(rows[0]) if col_widths: tbl["colWidths_hu"] = col_widths total = sum(col_widths) or 1 tbl["colWidths_pct"] = [round(w / total * 100) for w in col_widths] result.append(tbl) return result if result else None def _find_tbl_blocks(xml: str) -> list: """중첩 표를 고려하여 최상위 tbl 블록 추출""" blocks = [] start = 0 while True: # ]*)>', xml[start:]) if not m: break attrs = m.group(1) tag_start = start + m.start() content_start = start + m.end() # 중첩 카운트로 닫는 태그 찾기 depth = 1 pos = content_start while depth > 0 and pos < len(xml): open_m = re.search(r'', xml[pos:]) if close_m is None: break if open_m and open_m.start() < close_m.start(): depth += 1 pos += open_m.end() else: depth -= 1 if depth == 0: inner = xml[content_start:pos + close_m.start()] blocks.append((attrs, inner)) pos += close_m.end() start = pos return blocks def _extract_col_widths(tbl_inner: str) -> list | None: """열 너비 HWPUNIT 추출""" # 패턴 1: 8504 8504 8504 wl = re.search(r'([^<]+)', tbl_inner) if wl: try: return [int(w) for w in wl.group(1).strip().split()] except ValueError: pass # 패턴 2: 개별 colSz 태그 cols = re.findall(r']*\bwidth="(\d+)"', tbl_inner) if cols: return [int(c) for c in cols] return None def _extract_rows(tbl_inner: str) -> list: """tr/tc 파싱하여 2D 셀 배열 반환""" rows = [] tr_blocks = re.findall( r']*>(.*?)', tbl_inner, re.DOTALL ) for tr_inner in tr_blocks: cells = [] tc_blocks = re.finditer( r']*)>(.*?)', tr_inner, re.DOTALL ) for tc_match in tc_blocks: tc_attrs = tc_match.group(1) tc_inner = tc_match.group(2) cell = _parse_cell(tc_attrs, tc_inner) cells.append(cell) rows.append(cells) return rows def _parse_cell(tc_attrs: str, tc_inner: str) -> dict: """개별 셀 파싱""" cell = {} # borderFillIDRef on tc tag bf = re.search(r'\bborderFillIDRef="(\d+)"', tc_attrs) if bf: cell["borderFillIDRef"] = int(bf.group(1)) # header flag hd = re.search(r'\bheader="(\d+)"', tc_attrs) if hd: cell["isHeader"] = bool(int(hd.group(1))) # cellAddr addr = re.search( r']*\bcolAddr="(\d+)"[^>]*\browAddr="(\d+)"', tc_inner ) if addr: cell["colAddr"] = int(addr.group(1)) cell["rowAddr"] = int(addr.group(2)) # cellSpan span = re.search(r'', tc_inner) if span: cs = re.search(r'\bcolSpan="(\d+)"', span.group(1)) rs = re.search(r'\browSpan="(\d+)"', span.group(1)) if cs: cell["colSpan"] = int(cs.group(1)) if rs: cell["rowSpan"] = int(rs.group(1)) # cellSz sz = re.search(r'', tc_inner) if sz: w = re.search(r'\bwidth="(\d+)"', sz.group(1)) h = re.search(r'\bheight="(\d+)"', sz.group(1)) if w: cell["width_hu"] = int(w.group(1)) if h: cell["height_hu"] = int(h.group(1)) # cellMargin cm = re.search(r'', tc_inner) if cm: margin = {} for side in ["left", "right", "top", "bottom"]: m = re.search(rf'\b{side}="(\d+)"', cm.group(1)) if m: margin[side] = int(m.group(1)) if margin: cell["cellMargin"] = margin # 셀 텍스트 texts = re.findall(r'([^<]*)', tc_inner) all_text = " ".join(t.strip() for t in texts if t.strip()) if all_text: cell["text"] = all_text # ★ v2: 셀 내 run의 charPrIDRef 추출 (스타일 연결용) run_cprs = re.findall(r']*\bcharPrIDRef="(\d+)"', tc_inner) if run_cprs: cell["charPrIDRefs"] = [int(c) for c in run_cprs] cell["primaryCharPrIDRef"] = int(run_cprs[0]) # ★ v2: 셀 내 p의 paraPrIDRef, styleIDRef 추출 para_pprs = re.findall(r']*\bparaPrIDRef="(\d+)"', tc_inner) if para_pprs: cell["paraPrIDRefs"] = [int(p) for p in para_pprs] cell["primaryParaPrIDRef"] = int(para_pprs[0]) para_stys = re.findall(r']*\bstyleIDRef="(\d+)"', tc_inner) if para_stys: cell["styleIDRefs"] = [int(s) for s in para_stys] # 다중행 (p 태그 기준) paras = re.findall(r']*>(.*?)', tc_inner, re.DOTALL) lines = [] for p in paras: p_texts = re.findall(r'([^<]*)', p) line = " ".join(t.strip() for t in p_texts if t.strip()) if line: lines.append(line) if lines: cell["lines"] = lines return cell def _col_widths_from_first_row(first_row: list) -> list | None: """첫 행 셀의 width_hu에서 열 너비 추출 (colSz 없을 때 대체)""" widths = [] for cell in first_row: w = cell.get("width_hu") if w: widths.append(w) return widths if widths else None def _col_widths_from_rows(rows: list, col_cnt: int) -> list | None: """★ v2: 모든 행을 순회하여 colspan=1인 행에서 정확한 열 너비 추출. 첫 행에 colspan이 있으면 열 너비가 부정확하므로, 모든 열이 colspan=1인 행을 찾아 사용. """ if not rows or not col_cnt: return None # colspan=1인 셀만 있는 행 찾기 (모든 열 존재) for row in rows: # 이 행의 모든 셀이 colspan=1이고, 셀 수 == col_cnt인지 all_single = all(cell.get("colSpan", 1) == 1 for cell in row) if all_single and len(row) == col_cnt: widths = [] for cell in sorted(row, key=lambda c: c.get("colAddr", 0)): w = cell.get("width_hu") if w: widths.append(w) if len(widths) == col_cnt: return widths # 못 찾으면 첫 행 폴백 return _col_widths_from_first_row(rows[0]) if rows else None def _get_section_xml(raw_xml: dict, parsed: dict = None) -> str | None: if parsed and parsed.get("section_xml"): return parsed["section_xml"] if isinstance(raw_xml, dict): for name, content in raw_xml.items(): if "section" in name.lower() and isinstance(content, str): return content return raw_xml if isinstance(raw_xml, str) else None