Files
test/handlers/tools/table.py

328 lines
10 KiB
Python

# -*- coding: utf-8 -*-
"""
§6 표(Table) 구조 추출
HWPX 실제 태그 (section0.xml):
<hp:tbl id="..." rowCnt="5" colCnt="3" cellSpacing="0"
repeatHeader="1" pageBreak="CELL" ...>
<hp:colSz><hp:widthList>8504 8504 8504</hp:widthList></hp:colSz>
또는 열 수에 맞는 hp:colSz 형태
<hp:tr>
<hp:tc name="" header="0" borderFillIDRef="5" ...>
<hp:cellAddr colAddr="0" rowAddr="0"/>
<hp:cellSpan colSpan="2" rowSpan="1"/>
<hp:cellSz width="17008" height="2400"/>
<hp:cellMargin left="510" right="510" top="142" bottom="142"/>
<hp:subList>
<hp:p ...><hp:run ...><hp:t>셀 텍스트</hp:t></hp:run></hp:p>
</hp:subList>
</hp:tc>
</hp:tr>
</hp:tbl>
디폴트값 생성 안 함.
"""
import re
from domain.hwpx.hwpx_utils import hwpunit_to_mm
def extract(raw_xml: dict, parsed: dict = None) -> list | None:
"""§6 모든 표 추출.
Returns:
[
{
"index": 0,
"rowCnt": 5, "colCnt": 3,
"repeatHeader": True,
"pageBreak": "CELL",
"colWidths_hu": [8504, 8504, 8504],
"colWidths_pct": [33, 34, 33],
"rows": [
[ # row 0
{
"colAddr": 0, "rowAddr": 0,
"colSpan": 2, "rowSpan": 1,
"width_hu": 17008, "height_hu": 2400,
"borderFillIDRef": 5,
"cellMargin": {"left": 510, "right": 510,
"top": 142, "bottom": 142},
"text": "셀 텍스트",
"lines": ["셀 텍스트"],
},
...
],
...
],
},
...
]
"""
section_xml = _get_section_xml(raw_xml, parsed)
if not section_xml:
return None
# tbl 블록 전체 추출
tbl_blocks = _find_tbl_blocks(section_xml)
if not tbl_blocks:
return None
result = []
for idx, (tbl_attrs, tbl_inner) in enumerate(tbl_blocks):
tbl = {"index": idx}
# 표 속성
for attr in ["rowCnt", "colCnt"]:
m = re.search(rf'\b{attr}="(\d+)"', tbl_attrs)
if m:
tbl[attr] = int(m.group(1))
rh = re.search(r'\brepeatHeader="(\d+)"', tbl_attrs)
if rh:
tbl["repeatHeader"] = bool(int(rh.group(1)))
pb = re.search(r'\bpageBreak="([^"]+)"', tbl_attrs)
if pb:
tbl["pageBreak"] = pb.group(1)
# 행/셀 (열 너비보다 먼저 — 첫 행에서 열 너비 추출 가능)
rows = _extract_rows(tbl_inner)
if rows:
tbl["rows"] = rows
# 열 너비
col_widths = _extract_col_widths(tbl_inner)
if not col_widths and rows:
# colSz 없으면 행 데이터에서 추출 (colspan 고려)
col_cnt = tbl.get("colCnt", 0)
col_widths = _col_widths_from_rows(rows, col_cnt)
if not col_widths:
col_widths = _col_widths_from_first_row(rows[0])
if col_widths:
tbl["colWidths_hu"] = col_widths
total = sum(col_widths) or 1
tbl["colWidths_pct"] = [round(w / total * 100) for w in col_widths]
result.append(tbl)
return result if result else None
def _find_tbl_blocks(xml: str) -> list:
"""중첩 표를 고려하여 최상위 tbl 블록 추출"""
blocks = []
start = 0
while True:
# <hp:tbl 시작 찾기
m = re.search(r'<hp:tbl\b([^>]*)>', xml[start:])
if not m:
break
attrs = m.group(1)
tag_start = start + m.start()
content_start = start + m.end()
# 중첩 카운트로 닫는 태그 찾기
depth = 1
pos = content_start
while depth > 0 and pos < len(xml):
open_m = re.search(r'<hp:tbl\b', xml[pos:])
close_m = re.search(r'</hp:tbl>', xml[pos:])
if close_m is None:
break
if open_m and open_m.start() < close_m.start():
depth += 1
pos += open_m.end()
else:
depth -= 1
if depth == 0:
inner = xml[content_start:pos + close_m.start()]
blocks.append((attrs, inner))
pos += close_m.end()
start = pos
return blocks
def _extract_col_widths(tbl_inner: str) -> list | None:
"""열 너비 HWPUNIT 추출"""
# 패턴 1: <hp:colSz><hp:widthList>8504 8504 8504</hp:widthList>
wl = re.search(r'<hp:widthList>([^<]+)</hp:widthList>', tbl_inner)
if wl:
try:
return [int(w) for w in wl.group(1).strip().split()]
except ValueError:
pass
# 패턴 2: 개별 colSz 태그
cols = re.findall(r'<hp:colSz\b[^>]*\bwidth="(\d+)"', tbl_inner)
if cols:
return [int(c) for c in cols]
return None
def _extract_rows(tbl_inner: str) -> list:
"""tr/tc 파싱하여 2D 셀 배열 반환"""
rows = []
tr_blocks = re.findall(
r'<hp:tr\b[^>]*>(.*?)</hp:tr>', tbl_inner, re.DOTALL
)
for tr_inner in tr_blocks:
cells = []
tc_blocks = re.finditer(
r'<hp:tc\b([^>]*)>(.*?)</hp:tc>', tr_inner, re.DOTALL
)
for tc_match in tc_blocks:
tc_attrs = tc_match.group(1)
tc_inner = tc_match.group(2)
cell = _parse_cell(tc_attrs, tc_inner)
cells.append(cell)
rows.append(cells)
return rows
def _parse_cell(tc_attrs: str, tc_inner: str) -> dict:
"""개별 셀 파싱"""
cell = {}
# borderFillIDRef on tc tag
bf = re.search(r'\bborderFillIDRef="(\d+)"', tc_attrs)
if bf:
cell["borderFillIDRef"] = int(bf.group(1))
# header flag
hd = re.search(r'\bheader="(\d+)"', tc_attrs)
if hd:
cell["isHeader"] = bool(int(hd.group(1)))
# cellAddr
addr = re.search(
r'<hp:cellAddr\b[^>]*\bcolAddr="(\d+)"[^>]*\browAddr="(\d+)"',
tc_inner
)
if addr:
cell["colAddr"] = int(addr.group(1))
cell["rowAddr"] = int(addr.group(2))
# cellSpan
span = re.search(r'<hp:cellSpan\b([^/]*)/?>', tc_inner)
if span:
cs = re.search(r'\bcolSpan="(\d+)"', span.group(1))
rs = re.search(r'\browSpan="(\d+)"', span.group(1))
if cs:
cell["colSpan"] = int(cs.group(1))
if rs:
cell["rowSpan"] = int(rs.group(1))
# cellSz
sz = re.search(r'<hp:cellSz\b([^/]*)/?>', tc_inner)
if sz:
w = re.search(r'\bwidth="(\d+)"', sz.group(1))
h = re.search(r'\bheight="(\d+)"', sz.group(1))
if w:
cell["width_hu"] = int(w.group(1))
if h:
cell["height_hu"] = int(h.group(1))
# cellMargin
cm = re.search(r'<hp:cellMargin\b([^/]*)/?>', tc_inner)
if cm:
margin = {}
for side in ["left", "right", "top", "bottom"]:
m = re.search(rf'\b{side}="(\d+)"', cm.group(1))
if m:
margin[side] = int(m.group(1))
if margin:
cell["cellMargin"] = margin
# 셀 텍스트
texts = re.findall(r'<hp:t>([^<]*)</hp:t>', tc_inner)
all_text = " ".join(t.strip() for t in texts if t.strip())
if all_text:
cell["text"] = all_text
# ★ v2: 셀 내 run의 charPrIDRef 추출 (스타일 연결용)
run_cprs = re.findall(r'<hp:run\b[^>]*\bcharPrIDRef="(\d+)"', tc_inner)
if run_cprs:
cell["charPrIDRefs"] = [int(c) for c in run_cprs]
cell["primaryCharPrIDRef"] = int(run_cprs[0])
# ★ v2: 셀 내 p의 paraPrIDRef, styleIDRef 추출
para_pprs = re.findall(r'<hp:p\b[^>]*\bparaPrIDRef="(\d+)"', tc_inner)
if para_pprs:
cell["paraPrIDRefs"] = [int(p) for p in para_pprs]
cell["primaryParaPrIDRef"] = int(para_pprs[0])
para_stys = re.findall(r'<hp:p\b[^>]*\bstyleIDRef="(\d+)"', tc_inner)
if para_stys:
cell["styleIDRefs"] = [int(s) for s in para_stys]
# 다중행 (p 태그 기준)
paras = re.findall(r'<hp:p\b[^>]*>(.*?)</hp:p>', tc_inner, re.DOTALL)
lines = []
for p in paras:
p_texts = re.findall(r'<hp:t>([^<]*)</hp:t>', p)
line = " ".join(t.strip() for t in p_texts if t.strip())
if line:
lines.append(line)
if lines:
cell["lines"] = lines
return cell
def _col_widths_from_first_row(first_row: list) -> list | None:
"""첫 행 셀의 width_hu에서 열 너비 추출 (colSz 없을 때 대체)"""
widths = []
for cell in first_row:
w = cell.get("width_hu")
if w:
widths.append(w)
return widths if widths else None
def _col_widths_from_rows(rows: list, col_cnt: int) -> list | None:
"""★ v2: 모든 행을 순회하여 colspan=1인 행에서 정확한 열 너비 추출.
첫 행에 colspan이 있으면 열 너비가 부정확하므로,
모든 열이 colspan=1인 행을 찾아 사용.
"""
if not rows or not col_cnt:
return None
# colspan=1인 셀만 있는 행 찾기 (모든 열 존재)
for row in rows:
# 이 행의 모든 셀이 colspan=1이고, 셀 수 == col_cnt인지
all_single = all(cell.get("colSpan", 1) == 1 for cell in row)
if all_single and len(row) == col_cnt:
widths = []
for cell in sorted(row, key=lambda c: c.get("colAddr", 0)):
w = cell.get("width_hu")
if w:
widths.append(w)
if len(widths) == col_cnt:
return widths
# 못 찾으면 첫 행 폴백
return _col_widths_from_first_row(rows[0]) if rows else None
def _get_section_xml(raw_xml: dict, parsed: dict = None) -> str | None:
if parsed and parsed.get("section_xml"):
return parsed["section_xml"]
if isinstance(raw_xml, dict):
for name, content in raw_xml.items():
if "section" in name.lower() and isinstance(content, str):
return content
return raw_xml if isinstance(raw_xml, str) else None