328 lines
10 KiB
Python
328 lines
10 KiB
Python
# -*- coding: utf-8 -*-
|
|
"""
|
|
§6 표(Table) 구조 추출
|
|
|
|
HWPX 실제 태그 (section0.xml):
|
|
<hp:tbl id="..." rowCnt="5" colCnt="3" cellSpacing="0"
|
|
repeatHeader="1" pageBreak="CELL" ...>
|
|
<hp:colSz><hp:widthList>8504 8504 8504</hp:widthList></hp:colSz>
|
|
또는 열 수에 맞는 hp:colSz 형태
|
|
<hp:tr>
|
|
<hp:tc name="" header="0" borderFillIDRef="5" ...>
|
|
<hp:cellAddr colAddr="0" rowAddr="0"/>
|
|
<hp:cellSpan colSpan="2" rowSpan="1"/>
|
|
<hp:cellSz width="17008" height="2400"/>
|
|
<hp:cellMargin left="510" right="510" top="142" bottom="142"/>
|
|
<hp:subList>
|
|
<hp:p ...><hp:run ...><hp:t>셀 텍스트</hp:t></hp:run></hp:p>
|
|
</hp:subList>
|
|
</hp:tc>
|
|
</hp:tr>
|
|
</hp:tbl>
|
|
|
|
디폴트값 생성 안 함.
|
|
"""
|
|
|
|
import re
|
|
|
|
from domain.hwpx.hwpx_utils import hwpunit_to_mm
|
|
|
|
|
|
def extract(raw_xml: dict, parsed: dict = None) -> list | None:
    """Extract every top-level table (§6) from the section XML.

    The section XML is resolved with ``_get_section_xml(raw_xml, parsed)``.
    No default values are synthesised: a key appears in the output only
    when the matching attribute or element was found in the XML.

    Returns:
        ``None`` when there is no section XML or it contains no table
        block; otherwise a list holding one dict per table, e.g.::

            {
                "index": 0,
                "rowCnt": 5, "colCnt": 3,
                "repeatHeader": True,
                "pageBreak": "CELL",
                "rows": [[<cell dict>, ...], ...],
                "colWidths_hu": [8504, 8504, 8504],
                "colWidths_pct": [33, 33, 33],
            }

        ``index`` is the position of the table among the extracted
        blocks.  ``colWidths_pct`` holds each width as a rounded
        percentage of the summed widths, so the values need not add up
        to exactly 100.
    """
    section_xml = _get_section_xml(raw_xml, parsed)
    if not section_xml:
        return None

    tables = []
    for index, (attr_text, body) in enumerate(_find_tbl_blocks(section_xml)):
        info = {"index": index}

        # Integer attributes of the <hp:tbl> opening tag.
        for name in ("rowCnt", "colCnt"):
            found = re.search(rf'\b{name}="(\d+)"', attr_text)
            if found:
                info[name] = int(found.group(1))

        repeat = re.search(r'\brepeatHeader="(\d+)"', attr_text)
        if repeat:
            info["repeatHeader"] = int(repeat.group(1)) != 0

        page_break = re.search(r'\bpageBreak="([^"]+)"', attr_text)
        if page_break:
            info["pageBreak"] = page_break.group(1)

        # Rows are parsed before the widths because the cell sizes serve
        # as a fallback source for the column widths.
        rows = _extract_rows(body)
        if rows:
            info["rows"] = rows

        widths = _extract_col_widths(body)
        if rows and not widths:
            # No colSz information: derive the widths from the cells.
            widths = _col_widths_from_rows(rows, info.get("colCnt", 0))
            if not widths:
                widths = _col_widths_from_first_row(rows[0])

        if widths:
            info["colWidths_hu"] = widths
            denominator = sum(widths) or 1  # guard against an all-zero list
            info["colWidths_pct"] = [
                round(width / denominator * 100) for width in widths
            ]

        tables.append(info)

    return tables or None
|
|
|
|
|
|
def _find_tbl_blocks(xml: str) -> list:
|
|
"""중첩 표를 고려하여 최상위 tbl 블록 추출"""
|
|
blocks = []
|
|
start = 0
|
|
while True:
|
|
# <hp:tbl 시작 찾기
|
|
m = re.search(r'<hp:tbl\b([^>]*)>', xml[start:])
|
|
if not m:
|
|
break
|
|
|
|
attrs = m.group(1)
|
|
tag_start = start + m.start()
|
|
content_start = start + m.end()
|
|
|
|
# 중첩 카운트로 닫는 태그 찾기
|
|
depth = 1
|
|
pos = content_start
|
|
while depth > 0 and pos < len(xml):
|
|
open_m = re.search(r'<hp:tbl\b', xml[pos:])
|
|
close_m = re.search(r'</hp:tbl>', xml[pos:])
|
|
|
|
if close_m is None:
|
|
break
|
|
|
|
if open_m and open_m.start() < close_m.start():
|
|
depth += 1
|
|
pos += open_m.end()
|
|
else:
|
|
depth -= 1
|
|
if depth == 0:
|
|
inner = xml[content_start:pos + close_m.start()]
|
|
blocks.append((attrs, inner))
|
|
pos += close_m.end()
|
|
|
|
start = pos
|
|
|
|
return blocks
|
|
|
|
|
|
def _extract_col_widths(tbl_inner: str) -> list | None:
|
|
"""열 너비 HWPUNIT 추출"""
|
|
# 패턴 1: <hp:colSz><hp:widthList>8504 8504 8504</hp:widthList>
|
|
wl = re.search(r'<hp:widthList>([^<]+)</hp:widthList>', tbl_inner)
|
|
if wl:
|
|
try:
|
|
return [int(w) for w in wl.group(1).strip().split()]
|
|
except ValueError:
|
|
pass
|
|
|
|
# 패턴 2: 개별 colSz 태그
|
|
cols = re.findall(r'<hp:colSz\b[^>]*\bwidth="(\d+)"', tbl_inner)
|
|
if cols:
|
|
return [int(c) for c in cols]
|
|
|
|
return None
|
|
|
|
|
|
def _extract_rows(tbl_inner: str) -> list:
|
|
"""tr/tc 파싱하여 2D 셀 배열 반환"""
|
|
rows = []
|
|
|
|
tr_blocks = re.findall(
|
|
r'<hp:tr\b[^>]*>(.*?)</hp:tr>', tbl_inner, re.DOTALL
|
|
)
|
|
|
|
for tr_inner in tr_blocks:
|
|
cells = []
|
|
tc_blocks = re.finditer(
|
|
r'<hp:tc\b([^>]*)>(.*?)</hp:tc>', tr_inner, re.DOTALL
|
|
)
|
|
|
|
for tc_match in tc_blocks:
|
|
tc_attrs = tc_match.group(1)
|
|
tc_inner = tc_match.group(2)
|
|
cell = _parse_cell(tc_attrs, tc_inner)
|
|
cells.append(cell)
|
|
|
|
rows.append(cells)
|
|
|
|
return rows
|
|
|
|
|
|
def _parse_cell(tc_attrs: str, tc_inner: str) -> dict:
|
|
"""개별 셀 파싱"""
|
|
cell = {}
|
|
|
|
# borderFillIDRef on tc tag
|
|
bf = re.search(r'\bborderFillIDRef="(\d+)"', tc_attrs)
|
|
if bf:
|
|
cell["borderFillIDRef"] = int(bf.group(1))
|
|
|
|
# header flag
|
|
hd = re.search(r'\bheader="(\d+)"', tc_attrs)
|
|
if hd:
|
|
cell["isHeader"] = bool(int(hd.group(1)))
|
|
|
|
# cellAddr
|
|
addr = re.search(
|
|
r'<hp:cellAddr\b[^>]*\bcolAddr="(\d+)"[^>]*\browAddr="(\d+)"',
|
|
tc_inner
|
|
)
|
|
if addr:
|
|
cell["colAddr"] = int(addr.group(1))
|
|
cell["rowAddr"] = int(addr.group(2))
|
|
|
|
# cellSpan
|
|
span = re.search(r'<hp:cellSpan\b([^/]*)/?>', tc_inner)
|
|
if span:
|
|
cs = re.search(r'\bcolSpan="(\d+)"', span.group(1))
|
|
rs = re.search(r'\browSpan="(\d+)"', span.group(1))
|
|
if cs:
|
|
cell["colSpan"] = int(cs.group(1))
|
|
if rs:
|
|
cell["rowSpan"] = int(rs.group(1))
|
|
|
|
# cellSz
|
|
sz = re.search(r'<hp:cellSz\b([^/]*)/?>', tc_inner)
|
|
if sz:
|
|
w = re.search(r'\bwidth="(\d+)"', sz.group(1))
|
|
h = re.search(r'\bheight="(\d+)"', sz.group(1))
|
|
if w:
|
|
cell["width_hu"] = int(w.group(1))
|
|
if h:
|
|
cell["height_hu"] = int(h.group(1))
|
|
|
|
# cellMargin
|
|
cm = re.search(r'<hp:cellMargin\b([^/]*)/?>', tc_inner)
|
|
if cm:
|
|
margin = {}
|
|
for side in ["left", "right", "top", "bottom"]:
|
|
m = re.search(rf'\b{side}="(\d+)"', cm.group(1))
|
|
if m:
|
|
margin[side] = int(m.group(1))
|
|
if margin:
|
|
cell["cellMargin"] = margin
|
|
|
|
# 셀 텍스트
|
|
texts = re.findall(r'<hp:t>([^<]*)</hp:t>', tc_inner)
|
|
all_text = " ".join(t.strip() for t in texts if t.strip())
|
|
if all_text:
|
|
cell["text"] = all_text
|
|
|
|
# ★ v2: 셀 내 run의 charPrIDRef 추출 (스타일 연결용)
|
|
run_cprs = re.findall(r'<hp:run\b[^>]*\bcharPrIDRef="(\d+)"', tc_inner)
|
|
if run_cprs:
|
|
cell["charPrIDRefs"] = [int(c) for c in run_cprs]
|
|
cell["primaryCharPrIDRef"] = int(run_cprs[0])
|
|
|
|
# ★ v2: 셀 내 p의 paraPrIDRef, styleIDRef 추출
|
|
para_pprs = re.findall(r'<hp:p\b[^>]*\bparaPrIDRef="(\d+)"', tc_inner)
|
|
if para_pprs:
|
|
cell["paraPrIDRefs"] = [int(p) for p in para_pprs]
|
|
cell["primaryParaPrIDRef"] = int(para_pprs[0])
|
|
|
|
para_stys = re.findall(r'<hp:p\b[^>]*\bstyleIDRef="(\d+)"', tc_inner)
|
|
if para_stys:
|
|
cell["styleIDRefs"] = [int(s) for s in para_stys]
|
|
|
|
# 다중행 (p 태그 기준)
|
|
paras = re.findall(r'<hp:p\b[^>]*>(.*?)</hp:p>', tc_inner, re.DOTALL)
|
|
lines = []
|
|
for p in paras:
|
|
p_texts = re.findall(r'<hp:t>([^<]*)</hp:t>', p)
|
|
line = " ".join(t.strip() for t in p_texts if t.strip())
|
|
if line:
|
|
lines.append(line)
|
|
if lines:
|
|
cell["lines"] = lines
|
|
|
|
return cell
|
|
|
|
|
|
def _col_widths_from_first_row(first_row: list) -> list | None:
|
|
"""첫 행 셀의 width_hu에서 열 너비 추출 (colSz 없을 때 대체)"""
|
|
widths = []
|
|
for cell in first_row:
|
|
w = cell.get("width_hu")
|
|
if w:
|
|
widths.append(w)
|
|
return widths if widths else None
|
|
|
|
|
|
def _col_widths_from_rows(rows: list, col_cnt: int) -> list | None:
|
|
"""★ v2: 모든 행을 순회하여 colspan=1인 행에서 정확한 열 너비 추출.
|
|
|
|
첫 행에 colspan이 있으면 열 너비가 부정확하므로,
|
|
모든 열이 colspan=1인 행을 찾아 사용.
|
|
"""
|
|
if not rows or not col_cnt:
|
|
return None
|
|
|
|
# colspan=1인 셀만 있는 행 찾기 (모든 열 존재)
|
|
for row in rows:
|
|
# 이 행의 모든 셀이 colspan=1이고, 셀 수 == col_cnt인지
|
|
all_single = all(cell.get("colSpan", 1) == 1 for cell in row)
|
|
if all_single and len(row) == col_cnt:
|
|
widths = []
|
|
for cell in sorted(row, key=lambda c: c.get("colAddr", 0)):
|
|
w = cell.get("width_hu")
|
|
if w:
|
|
widths.append(w)
|
|
if len(widths) == col_cnt:
|
|
return widths
|
|
|
|
# 못 찾으면 첫 행 폴백
|
|
return _col_widths_from_first_row(rows[0]) if rows else None
|
|
|
|
|
|
def _get_section_xml(raw_xml: dict, parsed: dict = None) -> str | None:
|
|
if parsed and parsed.get("section_xml"):
|
|
return parsed["section_xml"]
|
|
if isinstance(raw_xml, dict):
|
|
for name, content in raw_xml.items():
|
|
if "section" in name.lower() and isinstance(content, str):
|
|
return content
|
|
return raw_xml if isinstance(raw_xml, str) else None |