# -*- coding: utf-8 -*- """ §8 머리말/꼬리말(HeaderFooter) 추출 HWPX 실제 태그 (section0.xml): 머리말/꼬리말 안에 표가 있는 경우: - 표의 셀에 다중행 텍스트가 포함될 수 있음 - 각 셀의 colSpan, rowSpan, width, borderFillIDRef 등 추출 필요 secPr 내 속성: 디폴트값 생성 안 함. """ import re from domain.hwpx.hwpx_utils import hwpunit_to_mm def extract_header(raw_xml: dict, parsed: dict = None) -> dict | None: """머리말 구조 추출. Returns: { "exists": True, "type": "table" | "text", "hidden": False, "table": { ... } | None, # 표가 있는 경우 "texts": ["부서명", ...], } """ return _extract_hf(raw_xml, parsed, "header") def extract_footer(raw_xml: dict, parsed: dict = None) -> dict | None: """꼬리말 구조 추출.""" return _extract_hf(raw_xml, parsed, "footer") def _extract_hf(raw_xml: dict, parsed: dict, hf_type: str) -> dict | None: """header 또는 footer 추출 공통 로직""" # 1) parsed에서 직접 제공된 header/footer XML hf_xml = None if parsed: key = f"page_{hf_type}_xml" hf_xml = parsed.get(key, "") # 2) section XML에서 headerFooter 블록 탐색 section_xml = _get_section_xml(raw_xml, parsed) if not hf_xml and section_xml: # headerFooter 태그에서 header/footer 구분 hf_blocks = re.findall( r']*)>(.*?)', section_xml, re.DOTALL ) for attrs, inner in hf_blocks: # type 속성으로 구분 (HEADER / FOOTER) type_m = re.search(r'\btype="([^"]+)"', attrs) if type_m: if type_m.group(1).upper() == hf_type.upper(): hf_xml = inner break if not hf_xml or not hf_xml.strip(): return None # 해당 머리말/꼬리말 없음 result = {"exists": True} # hidden 여부 if section_xml: hide_key = f"hideFirst{'Header' if hf_type == 'header' else 'Footer'}" hide_m = re.search(rf'\b{hide_key}="(\d+)"', section_xml) if hide_m: result["hidden"] = bool(int(hide_m.group(1))) # 텍스트 추출 texts = re.findall(r'([^<]*)', hf_xml) clean_texts = [t.strip() for t in texts if t.strip()] if clean_texts: result["texts"] = clean_texts # 표 존재 여부 tbl_match = re.search( r']*)>(.*?)', hf_xml, re.DOTALL ) if tbl_match: result["type"] = "table" result["table"] = _parse_hf_table(tbl_match.group(1), tbl_match.group(2)) else: result["type"] = "text" return result def _parse_hf_table(tbl_attrs: str, tbl_inner: str) -> dict: """머리말/꼬리말 내 표 파싱""" table = {} # rowCnt, colCnt for attr in ["rowCnt", "colCnt"]: m = re.search(rf'\b{attr}="(\d+)"', tbl_attrs) if m: table[attr] = int(m.group(1)) # 열 너비 wl = re.search(r'([^<]+)', tbl_inner) if wl: try: widths = [int(w) for w in wl.group(1).strip().split()] table["colWidths_hu"] = widths total = sum(widths) or 1 table["colWidths_pct"] = [round(w / total * 100) for w in widths] except ValueError: pass # 행/셀 rows = [] tr_blocks = re.findall(r']*>(.*?)', tbl_inner, re.DOTALL) for tr in tr_blocks: cells = [] tc_blocks = re.finditer( r']*)>(.*?)', tr, re.DOTALL ) for tc in tc_blocks: cell = _parse_hf_cell(tc.group(1), tc.group(2)) cells.append(cell) rows.append(cells) if rows: table["rows"] = rows return table def _parse_hf_cell(tc_attrs: str, tc_inner: str) -> dict: """머리말/꼬리말 셀 파싱""" cell = {} # borderFillIDRef bf = re.search(r'\bborderFillIDRef="(\d+)"', tc_attrs) if bf: cell["borderFillIDRef"] = int(bf.group(1)) # cellAddr addr = re.search( r']*\bcolAddr="(\d+)"[^>]*\browAddr="(\d+)"', tc_inner ) if addr: cell["colAddr"] = int(addr.group(1)) cell["rowAddr"] = int(addr.group(2)) # cellSpan span = re.search(r'', tc_inner) if span: cs = re.search(r'\bcolSpan="(\d+)"', span.group(1)) rs = re.search(r'\browSpan="(\d+)"', span.group(1)) if cs: cell["colSpan"] = int(cs.group(1)) if rs: cell["rowSpan"] = int(rs.group(1)) # cellSz sz = re.search(r'', tc_inner) if sz: w = re.search(r'\bwidth="(\d+)"', sz.group(1)) if w: cell["width_hu"] = int(w.group(1)) # 셀 텍스트 (다중행) paras = re.findall(r']*>(.*?)', tc_inner, re.DOTALL) lines = [] for p in paras: p_texts = re.findall(r'([^<]*)', p) line = " ".join(t.strip() for t in p_texts if t.strip()) if line: lines.append(line) if lines: cell["text"] = " ".join(lines) cell["lines"] = lines return cell def _get_section_xml(raw_xml: dict, parsed: dict = None) -> str | None: if parsed and parsed.get("section_xml"): return parsed["section_xml"] if isinstance(raw_xml, dict): for name, content in raw_xml.items(): if "section" in name.lower() and isinstance(content, str): return content return raw_xml if isinstance(raw_xml, str) else None