200 lines
5.8 KiB
Python
200 lines
5.8 KiB
Python
# -*- coding: utf-8 -*-
|
|
"""
|
|
§8 머리말/꼬리말(HeaderFooter) 추출
|
|
|
|
HWPX 실제 태그 (section0.xml):
|
|
<hp:headerFooter ...>
|
|
<!-- 내용은 section XML 내 또는 별도 header/footer 영역 -->
|
|
</hp:headerFooter>
|
|
|
|
머리말/꼬리말 안에 표가 있는 경우:
|
|
- 표의 셀에 다중행 텍스트가 포함될 수 있음
|
|
- 각 셀의 colSpan, rowSpan, width, borderFillIDRef 등 추출 필요
|
|
|
|
secPr 내 속성:
|
|
<hp:visibility hideFirstHeader="0" hideFirstFooter="0" .../>
|
|
|
|
디폴트값 생성 안 함.
|
|
"""
|
|
|
|
import re
|
|
|
|
from domain.hwpx.hwpx_utils import hwpunit_to_mm
|
|
|
|
|
|
def extract_header(raw_xml: dict, parsed: dict = None) -> dict | None:
    """Extract the page header structure.

    Thin wrapper that delegates to ``_extract_hf`` with ``hf_type="header"``.

    Args:
        raw_xml: Raw XML input, forwarded unchanged to ``_extract_hf``.
        parsed: Optional pre-parsed data, forwarded unchanged.

    Returns:
        ``None`` when no header content is found. Otherwise a dict in which
        only the values actually found are present (no defaults are
        generated)::

            {
                "exists": True,
                "hidden": bool,             # only if hideFirstHeader is found
                "texts": ["...", ...],      # only if non-empty text exists
                "type": "table" | "text",
                "table": {...},             # only when type == "table"
            }
    """
    return _extract_hf(raw_xml=raw_xml, parsed=parsed, hf_type="header")
|
|
|
|
|
|
def extract_footer(raw_xml: dict, parsed: dict = None) -> dict | None:
    """Extract the page footer structure.

    Thin wrapper that delegates to ``_extract_hf`` with ``hf_type="footer"``;
    the arguments are forwarded unchanged and its result is returned as is.
    """
    return _extract_hf(raw_xml=raw_xml, parsed=parsed, hf_type="footer")
|
|
|
|
|
|
def _extract_hf(raw_xml: dict, parsed: dict, hf_type: str) -> dict | None:
    """Shared extraction logic for headers and footers.

    Args:
        raw_xml: Raw XML input, passed on to ``_get_section_xml``.
        parsed: Optional pre-parsed data. Its ``page_<hf_type>_xml`` entry,
            when non-empty, is used as the header/footer XML directly.
        hf_type: ``"header"`` or ``"footer"``.

    Returns:
        ``None`` if no non-blank header/footer XML could be located,
        otherwise a dict with ``"exists"`` and ``"type"`` plus the optional
        ``"hidden"``, ``"texts"`` and ``"table"`` keys. Optional keys are
        only set when the corresponding data is found.
    """
    # Source 1: header/footer XML handed over directly through `parsed`.
    body = parsed.get(f"page_{hf_type}_xml", "") if parsed else None

    # Source 2: scan the section XML for a headerFooter block of this type.
    section = _get_section_xml(raw_xml, parsed)
    if section and not body:
        for block in re.finditer(
            r'<hp:headerFooter\b([^>]*)>(.*?)</hp:headerFooter>',
            section, re.DOTALL,
        ):
            # The type attribute (HEADER / FOOTER) tells the blocks apart.
            kind = re.search(r'\btype="([^"]+)"', block.group(1))
            if kind is None:
                continue
            if kind.group(1).upper() == hf_type.upper():
                body = block.group(2)
                break

    # Nothing found, or only whitespace: this header/footer does not exist.
    if not (body and body.strip()):
        return None

    info = {"exists": True}

    # Hidden flag: read from the hideFirstHeader / hideFirstFooter attribute.
    if section:
        suffix = 'Header' if hf_type == 'header' else 'Footer'
        hide_attr = f"hideFirst{suffix}"
        flag = re.search(rf'\b{hide_attr}="(\d+)"', section)
        if flag:
            info["hidden"] = int(flag.group(1)) != 0

    # Plain text runs, stripped, with blank entries dropped.
    stripped = (t.strip() for t in re.findall(r'<hp:t>([^<]*)</hp:t>', body))
    found_texts = [t for t in stripped if t]
    if found_texts:
        info["texts"] = found_texts

    # A table, if present, decides the reported type.
    tbl = re.search(
        r'<hp:tbl\b([^>]*)>(.*?)</hp:tbl>',
        body, re.DOTALL,
    )
    if tbl is None:
        info["type"] = "text"
    else:
        info["type"] = "table"
        info["table"] = _parse_hf_table(*tbl.groups())

    return info
|
|
|
|
|
|
def _parse_hf_table(tbl_attrs: str, tbl_inner: str) -> dict:
|
|
"""머리말/꼬리말 내 표 파싱"""
|
|
table = {}
|
|
|
|
# rowCnt, colCnt
|
|
for attr in ["rowCnt", "colCnt"]:
|
|
m = re.search(rf'\b{attr}="(\d+)"', tbl_attrs)
|
|
if m:
|
|
table[attr] = int(m.group(1))
|
|
|
|
# 열 너비
|
|
wl = re.search(r'<hp:widthList>([^<]+)</hp:widthList>', tbl_inner)
|
|
if wl:
|
|
try:
|
|
widths = [int(w) for w in wl.group(1).strip().split()]
|
|
table["colWidths_hu"] = widths
|
|
total = sum(widths) or 1
|
|
table["colWidths_pct"] = [round(w / total * 100) for w in widths]
|
|
except ValueError:
|
|
pass
|
|
|
|
# 행/셀
|
|
rows = []
|
|
tr_blocks = re.findall(r'<hp:tr\b[^>]*>(.*?)</hp:tr>', tbl_inner, re.DOTALL)
|
|
for tr in tr_blocks:
|
|
cells = []
|
|
tc_blocks = re.finditer(
|
|
r'<hp:tc\b([^>]*)>(.*?)</hp:tc>', tr, re.DOTALL
|
|
)
|
|
for tc in tc_blocks:
|
|
cell = _parse_hf_cell(tc.group(1), tc.group(2))
|
|
cells.append(cell)
|
|
rows.append(cells)
|
|
|
|
if rows:
|
|
table["rows"] = rows
|
|
|
|
return table
|
|
|
|
|
|
def _parse_hf_cell(tc_attrs: str, tc_inner: str) -> dict:
|
|
"""머리말/꼬리말 셀 파싱"""
|
|
cell = {}
|
|
|
|
# borderFillIDRef
|
|
bf = re.search(r'\bborderFillIDRef="(\d+)"', tc_attrs)
|
|
if bf:
|
|
cell["borderFillIDRef"] = int(bf.group(1))
|
|
|
|
# cellAddr
|
|
addr = re.search(
|
|
r'<hp:cellAddr\b[^>]*\bcolAddr="(\d+)"[^>]*\browAddr="(\d+)"',
|
|
tc_inner
|
|
)
|
|
if addr:
|
|
cell["colAddr"] = int(addr.group(1))
|
|
cell["rowAddr"] = int(addr.group(2))
|
|
|
|
# cellSpan
|
|
span = re.search(r'<hp:cellSpan\b([^/]*)/?>', tc_inner)
|
|
if span:
|
|
cs = re.search(r'\bcolSpan="(\d+)"', span.group(1))
|
|
rs = re.search(r'\browSpan="(\d+)"', span.group(1))
|
|
if cs:
|
|
cell["colSpan"] = int(cs.group(1))
|
|
if rs:
|
|
cell["rowSpan"] = int(rs.group(1))
|
|
|
|
# cellSz
|
|
sz = re.search(r'<hp:cellSz\b([^/]*)/?>', tc_inner)
|
|
if sz:
|
|
w = re.search(r'\bwidth="(\d+)"', sz.group(1))
|
|
if w:
|
|
cell["width_hu"] = int(w.group(1))
|
|
|
|
# 셀 텍스트 (다중행)
|
|
paras = re.findall(r'<hp:p\b[^>]*>(.*?)</hp:p>', tc_inner, re.DOTALL)
|
|
lines = []
|
|
for p in paras:
|
|
p_texts = re.findall(r'<hp:t>([^<]*)</hp:t>', p)
|
|
line = " ".join(t.strip() for t in p_texts if t.strip())
|
|
if line:
|
|
lines.append(line)
|
|
|
|
if lines:
|
|
cell["text"] = " ".join(lines)
|
|
cell["lines"] = lines
|
|
|
|
return cell
|
|
|
|
|
|
def _get_section_xml(raw_xml: dict, parsed: dict = None) -> str | None:
|
|
if parsed and parsed.get("section_xml"):
|
|
return parsed["section_xml"]
|
|
if isinstance(raw_xml, dict):
|
|
for name, content in raw_xml.items():
|
|
if "section" in name.lower() and isinstance(content, str):
|
|
return content
|
|
return raw_xml if isinstance(raw_xml, str) else None |