📦 Initialize Geulbeot structure and merge Prompts & test projects

This commit is contained in:
2026-03-05 11:32:29 +09:00
commit 555a954458
687 changed files with 205247 additions and 0 deletions

View File

@@ -0,0 +1,51 @@
# -*- coding: utf-8 -*-
"""
HWPX 템플릿 추출 도구 모음
각 모듈은 HWPX XML에서 특정 항목을 코드 기반으로 추출한다.
- 추출 실패 시 None 반환 (디폴트값 절대 생성 안 함)
- 모든 단위 변환은 hwpx_utils 사용
- hwpx_domain_guide.md 기준 준수
모듈 목록:
page_setup : §7 용지/여백 (pagePr + margin)
font : §3 글꼴 (fontface → font)
char_style : §4 글자 모양 (charPr)
para_style : §5 문단 모양 (paraPr)
border_fill : §2 테두리/배경 (borderFill)
table : §6 표 (tbl, tc)
header_footer: §8 머리말/꼬리말 (headerFooter)
section : §9 구역 정의 (secPr)
style_def : 스타일 정의 (styles)
numbering : 번호매기기/글머리표
image : 이미지/그리기 객체
content_order: 본문 콘텐츠 순서 (section*.xml)
"""
from . import page_setup
from . import font
from . import char_style
from . import para_style
from . import border_fill
from . import table
from . import header_footer
from . import section
from . import style_def
from . import numbering
from . import image
from . import content_order
# Names exported by `from <package> import *`: one entry per extractor
# submodule imported above.
__all__ = [
"page_setup",
"font",
"char_style",
"para_style",
"border_fill",
"table",
"header_footer",
"section",
"style_def",
"numbering",
"image",
"content_order"
]

View File

@@ -0,0 +1,127 @@
# -*- coding: utf-8 -*-
"""
§2 테두리/배경(BorderFill) 추출
HWPX 실제 태그 (header.xml):
<hh:borderFill id="3" threeD="0" shadow="0" centerLine="NONE" ...>
<hh:leftBorder type="SOLID" width="0.12 mm" color="#000000"/>
<hh:rightBorder type="SOLID" width="0.12 mm" color="#000000"/>
<hh:topBorder type="SOLID" width="0.12 mm" color="#000000"/>
<hh:bottomBorder type="SOLID" width="0.12 mm" color="#000000"/>
<hh:diagonal type="SOLID" width="0.1 mm" color="#000000"/>
<hc:fillBrush>
<hc:winBrush faceColor="#EDEDED" hatchColor="#FFE7E7E7" alpha="0"/>
</hc:fillBrush>
</hh:borderFill>
디폴트값 생성 안 함.
"""
import re
from domain.hwpx.hwpx_utils import BORDER_TYPE_TO_CSS, hwpx_border_to_css
def extract(raw_xml: dict, parsed: dict = None) -> dict | None:
    """Collect every ``<hh:borderFill>`` definition, keyed by its integer id.

    Each entry holds ``"id"``, one dict per border found (``"left"``,
    ``"right"``, ``"top"``, ``"bottom"``, ``"diagonal"`` with ``type`` /
    ``width`` / ``color``), an optional ``"background"`` taken from the
    ``faceColor`` of ``<hc:winBrush>``, and a ``"css"`` dict with
    ``border-<side>`` entries plus ``background-color`` when a background
    exists.

    Width values have their spaces removed (``"0.12 mm"`` -> ``"0.12mm"``).

    NOTE(review): the module header states that no default values are
    generated, yet a border tag with a missing attribute falls back to
    ``"NONE"`` / ``"0.12mm"`` / ``"#000000"``, and a missing side is emitted
    as ``"none"`` in the CSS dict. Confirm which behaviour is intended.

    Returns:
        The id -> definition mapping, or None when the header XML is missing
        or contains no usable borderFill block.
    """
    xml_text = _get_header_xml(raw_xml, parsed)
    if not xml_text:
        return None

    edge_tags = (
        ("left", "leftBorder"),
        ("right", "rightBorder"),
        ("top", "topBorder"),
        ("bottom", "bottomBorder"),
        ("diagonal", "diagonal"),
    )

    by_id = {}
    for block in re.finditer(
        r'<hh:borderFill\b([^>]*)>(.*?)</hh:borderFill>',
        xml_text, re.DOTALL
    ):
        head, body = block.group(1), block.group(2)

        id_hit = re.search(r'\bid="(\d+)"', head)
        if id_hit is None:
            # A definition without a numeric id cannot be referenced; skip it.
            continue
        fill_id = int(id_hit.group(1))
        entry = {"id": fill_id}

        # Four sides plus the diagonal. The tag is located first and its
        # attributes are read one by one, so attribute order does not matter.
        for side, tag in edge_tags:
            found = re.search(rf'<hh:{tag}\b([^/]*?)/?>', body)
            if found is None:
                continue
            raw_attrs = found.group(1)
            kind = re.search(r'\btype="([^"]+)"', raw_attrs)
            thickness = re.search(r'\bwidth="([^"]+)"', raw_attrs)
            colour = re.search(r'\bcolor="([^"]+)"', raw_attrs)
            entry[side] = {
                "type": kind.group(1) if kind else "NONE",
                "width": thickness.group(1).replace(" ", "") if thickness else "0.12mm",
                "color": colour.group(1) if colour else "#000000",
            }

        # Background colour: fillBrush > winBrush faceColor ("none" is ignored).
        brush = re.search(r'<hc:winBrush\b[^>]*\bfaceColor="([^"]+)"', body)
        if brush is not None:
            face = brush.group(1)
            if face and face.lower() != "none":
                entry["background"] = face

        # Convenience CSS. Every side always gets a key, so the dict is
        # never empty.
        css = {}
        for side in ("left", "right", "top", "bottom"):
            spec = entry.get(side)
            css[f"border-{side}"] = hwpx_border_to_css(spec) if spec else "none"
        if "background" in entry:
            css["background-color"] = entry["background"]
        entry["css"] = css

        by_id[fill_id] = entry

    return by_id or None
def _get_header_xml(raw_xml: dict, parsed: dict = None) -> str | None:
if parsed and parsed.get("header_xml"):
return parsed["header_xml"]
if isinstance(raw_xml, dict):
for name, content in raw_xml.items():
if "header" in name.lower() and isinstance(content, str):
return content
return raw_xml if isinstance(raw_xml, str) else None

View File

@@ -0,0 +1,133 @@
# -*- coding: utf-8 -*-
"""
§4 글자 모양(CharShape) 추출
HWPX 실제 태그 (header.xml):
<hh:charPr id="0" height="1000" textColor="#000000" shadeColor="none"
useFontSpace="0" useKerning="0" symMark="NONE" borderFillIDRef="2">
<hh:fontRef hangul="7" latin="6" hanja="6" .../>
<hh:ratio hangul="100" latin="100" .../>
<hh:spacing hangul="0" latin="0" .../>
<hh:relSz hangul="100" latin="100" .../>
<hh:offset hangul="0" latin="0" .../>
<hh:bold/> <!-- 존재하면 bold -->
<hh:italic/> <!-- 존재하면 italic -->
<hh:underline type="NONE" shape="SOLID" color="#000000"/>
<hh:strikeout shape="NONE" color="#000000"/>
</hh:charPr>
디폴트값 생성 안 함.
"""
import re
from domain.hwpx.hwpx_utils import charsize_to_pt
def extract(raw_xml: dict, parsed: dict = None) -> list | None:
    """Collect every ``<hh:charPr>...</hh:charPr>`` definition from the header XML.

    Each returned dict may contain (keys are only present when the source
    attribute/tag was found, except ``bold``/``italic`` which are always set):

        id, height_pt, textColor, shadeColor (omitted when "none"),
        borderFillIDRef, bold, italic, fontRef, ratio, spacing,
        underline (its ``type``), strikeout (its ``shape``)

    ``fontRef`` / ``ratio`` / ``spacing`` are per-language integer dicts as
    produced by ``_parse_lang_attrs``.

    Returns:
        The list of definitions, or None when the header XML is missing or
        holds no charPr block.
    """
    xml_text = _get_header_xml(raw_xml, parsed)
    if not xml_text:
        return None

    definitions = []
    # Only paired (non self-closing) charPr elements are matched.
    for block in re.finditer(
        r'<hh:charPr\b([^>]*)>(.*?)</hh:charPr>',
        xml_text, re.DOTALL
    ):
        head, body = block.group(1), block.group(2)
        shape = {}

        # --- attributes on the charPr tag itself ---
        hit = re.search(r'\bid="(\d+)"', head)
        if hit:
            shape["id"] = int(hit.group(1))

        hit = re.search(r'\bheight="(\d+)"', head)
        if hit:
            shape["height_pt"] = charsize_to_pt(int(hit.group(1)))

        hit = re.search(r'\btextColor="([^"]+)"', head)
        if hit:
            shape["textColor"] = hit.group(1)

        hit = re.search(r'\bshadeColor="([^"]+)"', head)
        if hit and hit.group(1) != "none":
            shape["shadeColor"] = hit.group(1)

        hit = re.search(r'\bborderFillIDRef="(\d+)"', head)
        if hit:
            shape["borderFillIDRef"] = int(hit.group(1))

        # --- bold / italic are flagged by the mere presence of the tag ---
        shape["bold"] = re.search(r'<hh:bold\s*/?>', body) is not None
        shape["italic"] = re.search(r'<hh:italic\s*/?>', body) is not None

        # --- per-language integer tables ---
        for child in ("fontRef", "ratio", "spacing"):
            hit = re.search(rf'<hh:{child}\b([^/]*)/>', body)
            if hit:
                shape[child] = _parse_lang_attrs(hit.group(1))

        # --- underline type / strikeout shape ---
        hit = re.search(r'<hh:underline\b[^>]*\btype="([^"]+)"', body)
        if hit:
            shape["underline"] = hit.group(1)

        hit = re.search(r'<hh:strikeout\b[^>]*\bshape="([^"]+)"', body)
        if hit:
            shape["strikeout"] = hit.group(1)

        definitions.append(shape)

    return definitions or None
def _parse_lang_attrs(attrs_str: str) -> dict:
"""hangul="7" latin="6" ... → {"hangul": 7, "latin": 6, ...}"""
pairs = re.findall(r'(\w+)="(-?\d+)"', attrs_str)
return {k: int(v) for k, v in pairs}
def _get_header_xml(raw_xml: dict, parsed: dict = None) -> str | None:
if parsed and parsed.get("header_xml"):
return parsed["header_xml"]
if isinstance(raw_xml, dict):
for name, content in raw_xml.items():
if "header" in name.lower() and isinstance(content, str):
return content
return raw_xml if isinstance(raw_xml, str) else None

View File

@@ -0,0 +1,529 @@
# -*- coding: utf-8 -*-
"""
content_order.py — HWPX section*.xml 본문 콘텐츠 순서 추출
기존 12개 tool이 header.xml의 "정의(definition)"를 추출하는 반면,
이 tool은 section0.xml의 "본문(content)" 순서를 추출한다.
추출 결과는 template_manager._build_body_html()이
원본 순서 그대로 HTML을 조립하는 데 사용된다.
콘텐츠 유형:
- paragraph : 일반 텍스트 문단
- table : 표 (<hp:tbl>)
- image : 이미지 (<hp:pic>)
- empty : 빈 문단 (줄바꿈 역할)
참조: hwpx_domain_guide.md §6(표), §7(본문 구조)
"""
import re
import logging
logger = logging.getLogger(__name__)
# ================================================================
# Namespaces
# ================================================================
# HWPX uses several XML namespaces.
# section*.xml: hp: (body), ha: (attributes)
# header.xml: hh: (header definitions)
# The namespace URIs may differ in real files, so lookups by local
# element name are performed as well.
DEFAULT_NS = {
'hp': 'http://www.hancom.co.kr/hwpml/2011/paragraph',
'ha': 'http://www.hancom.co.kr/hwpml/2011/attributes',
'hh': 'http://www.hancom.co.kr/hwpml/2011/head',
'hc': 'http://www.hancom.co.kr/hwpml/2011/core',
}
# ================================================================
# 공개 API
# ================================================================
def extract(raw_xml, parsed, ns=None):
    """Extract the ordered list of body content items from a section XML.

    Args:
        raw_xml (dict | str): Raw XML strings keyed by part name, or the
            section XML string itself. For a dict, the keys ``section0``,
            ``section_xml`` and ``section0.xml`` are tried first, then the
            first key starting with ``section`` that maps to a string.
        parsed (dict | Element | None): Either a dict that may hold the
            section under ``section_xml``, ``section0``, ``section_parsed``
            or ``section0_parsed`` (as a string or as an already-parsed
            element), or the parsed section element itself.
        ns (dict, optional): Prefix -> namespace URI mapping. Detected from
            the raw XML when omitted.

    Returns:
        list[dict] | None: One dict per body paragraph, in document order.
        Every item carries ``type`` ("paragraph" | "table" | "image" |
        "empty"), ``index``, ``paraPrIDRef`` and ``styleIDRef``; tables add
        ``table_idx`` plus the table metadata, images add ``image_idx`` plus
        the image metadata, and paragraphs add ``text``, ``charPrIDRef`` and
        ``runs``. None is returned when no section XML can be found or parsed.
    """
    # -- raw section XML string ------------------------------------------
    section_raw = None
    if isinstance(raw_xml, dict):
        # Key names differ between callers: section0, section_xml, ...
        for key in ('section0', 'section_xml', 'section0.xml'):
            if key in raw_xml:
                section_raw = raw_xml[key]
                break
        # Otherwise take the first key that starts with "section".
        if section_raw is None:
            for key, val in raw_xml.items():
                if key.startswith('section') and isinstance(val, str):
                    section_raw = val
                    break
    elif isinstance(raw_xml, str):
        section_raw = raw_xml

    # -- pre-parsed section element (or another raw string) from `parsed` --
    section_parsed = None
    if isinstance(parsed, dict):
        for key in ('section_xml', 'section0', 'section_parsed', 'section0_parsed'):
            val = parsed.get(key)
            if val is None:
                continue
            if isinstance(val, str):
                # A string is only useful as raw XML; keep looking for an
                # already-parsed element under the remaining keys.
                if section_raw is None:
                    section_raw = val
                continue
            if not isinstance(val, dict):
                # Presumably an Element object.
                section_parsed = val
                break

    # -- fallback 1: parse the raw string ourselves ----------------------
    if section_parsed is None and section_raw:
        import xml.etree.ElementTree as ET
        try:
            section_parsed = ET.fromstring(section_raw)
        except ET.ParseError:
            logger.warning("section XML 파싱 실패")
            return None

    # -- fallback 2: `parsed` itself is the element (direct call) --------
    # This must only apply when nothing was found above. The previous
    # unconditional `else` replaced an element found inside the dict with
    # the dict itself, and turned "nothing found" into a dict that crashed
    # further down instead of returning None.
    if (
        section_parsed is None
        and parsed is not None
        and not isinstance(parsed, (dict, str))
    ):
        section_parsed = parsed

    if section_parsed is None:
        logger.warning("section XML을 찾을 수 없음 — content_order 추출 생략")
        return None

    if ns is None:
        ns = _detect_namespaces(section_raw or '', section_parsed)

    # Body-level <hp:p> elements only (secPr contents are excluded).
    paragraphs = _collect_body_paragraphs(section_parsed, ns)

    content_order = []
    counts = {'paragraph': 0, 'table': 0, 'image': 0, 'empty': 0}

    for p_elem in paragraphs:
        base = {
            'index': len(content_order),
            'paraPrIDRef': _get_attr(p_elem, 'paraPrIDRef'),
            'styleIDRef': _get_attr(p_elem, 'styleIDRef'),
        }

        # (1) A paragraph that hosts a table is reported as one table item.
        tbl = _find_element(p_elem, 'tbl', ns)
        if tbl is not None:
            content_order.append({
                **base,
                'type': 'table',
                'table_idx': counts['table'],
                **_extract_table_info(tbl, ns),
            })
            counts['table'] += 1
            continue

        # (2) A paragraph that hosts a picture is reported as one image item.
        pic = _find_element(p_elem, 'pic', ns)
        if pic is not None:
            content_order.append({
                **base,
                'type': 'image',
                'image_idx': counts['image'],
                **_extract_image_info(pic, p_elem, ns),
            })
            counts['image'] += 1
            continue

        # (3) Plain text paragraph, or an empty one (acts as a line break).
        text = _collect_text(p_elem, ns)
        if not text.strip():
            content_order.append({**base, 'type': 'empty'})
            counts['empty'] += 1
            continue

        runs_info = _extract_runs_info(p_elem, ns)
        content_order.append({
            **base,
            'type': 'paragraph',
            'text': text,
            'charPrIDRef': runs_info.get('first_charPrIDRef'),
            'runs': runs_info.get('runs', []),
        })
        counts['paragraph'] += 1

    logger.info(
        "content_order 추출 완료: %d items "
        "(paragraphs=%d, tables=%d, images=%d, empty=%d)",
        len(content_order),
        counts['paragraph'],
        counts['table'],
        counts['image'],
        counts['empty'],
    )
    return content_order
# ================================================================
# 본문 <hp:p> 수집 — secPr 내부 제외
# ================================================================
def _collect_body_paragraphs(root, ns):
    """Return the body-level paragraph elements of a section.

    The ``sec`` element is looked up below ``root``; when none is found,
    ``root`` itself is treated as the section. Only its direct children whose
    local tag is ``p`` are taken, which leaves out paragraphs nested inside
    other elements (e.g. table cells). If that yields nothing, the result of
    ``_collect_paragraphs_fallback`` is returned instead.
    """
    container = _find_element(root, 'sec', ns)
    if container is None:
        # The root may already be the section element.
        container = root

    direct = [node for node in container if _local_tag(node) == 'p']
    if direct:
        return direct

    # No direct paragraph children: fall back to a filtered deep search.
    return _collect_paragraphs_fallback(root, ns)
def _collect_paragraphs_fallback(root, ns):
    """Deep-search ``root`` for ``p`` elements, in document order.

    Subtrees rooted at ``secPr``, ``headerFooter``, ``subList`` or ``tc`` are
    not entered, and the search does not descend into a ``p`` once it has
    been collected (so nested paragraphs are not reported). ``ns`` is
    accepted for signature symmetry but is not used.
    """
    excluded = {'secPr', 'headerFooter', 'subList', 'tc'}
    found = []

    # Explicit stack; children are pushed in reverse so that they are
    # popped (visited) in their original order.
    pending = [root]
    while pending:
        node = pending.pop()
        name = _local_tag(node)
        if name in excluded:
            continue
        if name == 'p':
            found.append(node)
            continue
        pending.extend(list(node)[::-1])

    return found
# ================================================================
# 표 정보 추출
# ================================================================
def _extract_table_info(tbl, ns):
    """Read basic metadata from a table element.

    Returns a dict with ``rowCnt``, ``colCnt`` and ``borderFillIDRef`` (each
    the raw attribute value or None). When a ``colSz`` element containing a
    ``widthList`` element with text is found, ``colWidths`` is added as the
    whitespace-split list of that text.
    """
    meta = {
        name: _get_attr(tbl, name)
        for name in ('rowCnt', 'colCnt', 'borderFillIDRef')
    }

    # Column widths, if the table declares them.
    col_sz = _find_element(tbl, 'colSz', ns)
    width_list = None if col_sz is None else _find_element(col_sz, 'widthList', ns)
    if width_list is not None and width_list.text:
        meta['colWidths'] = width_list.text.strip().split()

    return meta
# ================================================================
# 이미지 정보 추출
# ================================================================
def _extract_image_info(pic, p_elem, ns):
    """Read the image reference and surrounding text for a picture element.

    Returns a dict with:
      - ``binaryItemIDRef``: attribute of the ``img`` element, or None.
      - ``text``: text of the hosting paragraph outside of ``pic``
        (e.g. a caption in the same paragraph).
      - ``imgRect`` (only when an ``imgRect`` element is found): the
        ``x`` / ``y`` / ``w`` / ``h`` attribute values, each possibly None.

    NOTE(review): whether ``imgRect`` actually carries x/y/w/h attributes in
    real files is not visible from here -- confirm against sample documents.
    """
    img_node = _find_element(pic, 'img', ns)
    reference = None
    if img_node is not None:
        reference = _get_attr(img_node, 'binaryItemIDRef')

    details = {'binaryItemIDRef': reference, 'text': ''}

    rect = _find_element(pic, 'imgRect', ns)
    if rect is not None:
        details['imgRect'] = {
            axis: _get_attr(rect, axis) for axis in ('x', 'y', 'w', 'h')
        }

    # Text of the same paragraph, excluding everything inside the picture.
    details['text'] = _collect_text_outside(p_elem, pic, ns)
    return details
# ================================================================
# 텍스트 수집
# ================================================================
def _collect_text(p_elem, ns):
    """Concatenate the text of every ``t`` element below ``p_elem``, in order.

    Only ``.text`` is read; ``.tail`` is deliberately ignored because it is
    treated as XML indentation whitespace rather than document text.
    """
    return ''.join(
        node.text
        for node in _find_all_elements(p_elem, 't', ns)
        if node.text
    )
def _collect_text_outside(p_elem, exclude_elem, ns):
    """Concatenate ``t`` element text below ``p_elem``, skipping one subtree.

    The subtree rooted at ``exclude_elem`` (compared by identity, e.g. a
    picture element) contributes nothing. ``ns`` is accepted for signature
    symmetry but is not used.
    """
    pieces = []

    # Explicit stack; children are pushed reversed to keep document order.
    stack = [p_elem]
    while stack:
        node = stack.pop()
        if node is exclude_elem:
            continue
        if _local_tag(node) == 't' and node.text:
            pieces.append(node.text)
        stack.extend(list(node)[::-1])

    return ''.join(pieces)
# ================================================================
# Run 정보 추출
# ================================================================
def _extract_runs_info(p_elem, ns):
    """Summarise the direct ``run`` children of a paragraph.

    Returns:
        {
            'first_charPrIDRef': str or None,   # first non-None charPrIDRef
            'runs': [
                {'charPrIDRef': '8', 'text': '...'},
                ...
            ]
        }

    Runs without any text are left out of ``runs`` but still count when
    determining ``first_charPrIDRef``.
    """
    collected = []
    leading_ref = None

    for run in _find_direct_runs(p_elem, ns):
        ref = _get_attr(run, 'charPrIDRef')
        if leading_ref is None and ref is not None:
            leading_ref = ref

        joined = ''.join(
            node.text
            for node in _find_all_elements(run, 't', ns)
            if node.text
        )
        if joined:
            collected.append({'charPrIDRef': ref, 'text': joined})

    return {'first_charPrIDRef': leading_ref, 'runs': collected}
def _find_direct_runs(p_elem, ns):
    """Return the direct children of ``p_elem`` whose local tag is ``run``.

    Runs nested deeper (for example inside a sub-list) are not returned.
    ``ns`` is accepted for signature symmetry but is not used.
    """
    return [node for node in p_elem if _local_tag(node) == 'run']
# ================================================================
# 네임스페이스 감지
# ================================================================
def _detect_namespaces(raw_xml, parsed):
    """Build the prefix -> URI mapping actually declared in ``raw_xml``.

    Starts from a copy of ``DEFAULT_NS`` and, for each of the prefixes
    ``hp``, ``ha``, ``hh`` and ``hc``, overrides the URI with the first
    double-quoted ``xmlns:<prefix>="..."`` declaration found in the raw
    text. With empty ``raw_xml`` the defaults are returned unchanged.
    ``parsed`` is not used.
    """
    resolved = dict(DEFAULT_NS)
    if not raw_xml:
        return resolved

    for prefix in ('hp', 'ha', 'hh', 'hc'):
        declared = re.search(rf'xmlns:{prefix}="([^"]+)"', raw_xml)
        if declared:
            resolved[prefix] = declared.group(1)

    return resolved
# ================================================================
# XML 유틸리티 — 네임스페이스 불가지론적 탐색
# ================================================================
def _local_tag(elem):
"""'{namespace}localname''localname'"""
tag = elem.tag
if '}' in tag:
return tag.split('}', 1)[1]
return tag
def _get_attr(elem, attr_name):
"""속성값 가져오기. 네임스페이스 유무 모두 시도."""
# 직접 속성명
val = elem.get(attr_name)
if val is not None:
return val
# 네임스페이스 접두사가 붙은 속성 시도
for full_attr in elem.attrib:
if full_attr.endswith(attr_name):
return elem.attrib[full_attr]
return None
def _find_element(parent, local_name, ns):
    """Find the first nearby element whose local name is ``local_name``.

    Search order:
      1. Direct child with a qualified tag, trying the URIs registered in
         ``ns`` for the prefixes ``hp``, ``hh``, ``hc``, ``ha`` in that order.
      2. Direct child compared by local name only.
      3. Grandchild compared by local name only (one extra level, no deeper).

    Returns:
        The matching element, or None.
    """
    # Stage 1: namespace-qualified lookup among direct children.
    for prefix in ('hp', 'hh', 'hc', 'ha'):
        hit = parent.find('{%s}%s' % (ns.get(prefix, ''), local_name))
        if hit is not None:
            return hit

    # Stage 2: direct children by local name.
    direct = next(
        (child for child in parent if _local_tag(child) == local_name),
        None,
    )
    if direct is not None:
        return direct

    # Stage 3: one level further down.
    return next(
        (
            grandchild
            for child in parent
            for grandchild in child
            if _local_tag(grandchild) == local_name
        ),
        None,
    )
def _find_all_elements(parent, local_name, ns):
    """Return every element at or below ``parent`` with the given local name.

    ``parent`` itself is included when it matches. Results are in document
    (pre-order) order. ``ns`` is accepted for signature symmetry but is not
    used.
    """
    return [
        node for node in parent.iter()
        if _local_tag(node) == local_name
    ]
# ================================================================
# 편의 함수
# ================================================================
def summarize(content_order):
    """Render a content_order list as a human-readable, one-line-per-item text.

    Line formats:
      - paragraph: index, paraPr / charPr references, first 50 characters of
        the text (followed by ``...`` when truncated)
      - table:     index, table_idx, ``rows×cols``
      - image:     index, image_idx, binary reference, first 30 caption chars
      - empty:     index and ``(empty)``

    References that are missing or None are shown as ``-``; applying the
    ``<4s`` format spec to None directly would raise TypeError.

    Returns:
        str: The lines joined with newlines ('' for an empty list).
    """
    def _ref(value):
        # None has no string format spec, so normalise before padding.
        return '-' if value is None else str(value)

    lines = []
    for item in content_order:
        idx = item['index']
        kind = item['type']
        if kind == 'paragraph':
            text = item['text']
            preview = text[:50]
            if len(text) > 50:
                preview += '...'
            lines.append(
                f"[{idx:3d}] P paraPr={_ref(item.get('paraPrIDRef')):<4s} "
                f"charPr={_ref(item.get('charPrIDRef')):<4s} "
                f'"{preview}"'
            )
        elif kind == 'table':
            lines.append(
                f"[{idx:3d}] T table_idx={item['table_idx']} "
                f"({item.get('rowCnt', '?')}×{item.get('colCnt', '?')})"
            )
        elif kind == 'image':
            ref = item.get('binaryItemIDRef', '?')
            caption = (item.get('text') or '')[:30]
            lines.append(
                f"[{idx:3d}] I image_idx={item['image_idx']} "
                f'ref={ref} "{caption}"'
            )
        elif kind == 'empty':
            lines.append(f"[{idx:3d}] _ (empty)")
    return '\n'.join(lines)
def get_stats(content_order):
    """Count content_order items per type.

    Returns a dict with ``total`` (length of the input) and the counters
    ``paragraphs``, ``tables``, ``images`` and ``empty``. Items of any other
    type only contribute to ``total``.
    """
    bucket_for = {
        'paragraph': 'paragraphs',
        'table': 'tables',
        'image': 'images',
        'empty': 'empty',
    }
    tallies = dict.fromkeys(('paragraphs', 'tables', 'images', 'empty'), 0)

    for entry in content_order:
        bucket = bucket_for.get(entry['type'])
        if bucket:
            tallies[bucket] += 1

    return {'total': len(content_order), **tallies}

View File

@@ -0,0 +1,82 @@
# -*- coding: utf-8 -*-
"""
§3 글꼴(FaceName) 추출
HWPX 실제 태그 (header.xml):
<hh:fontface lang="HANGUL" fontCnt="9">
<hh:font id="0" face="돋움" type="TTF" isEmbedded="0">
<hh:font id="1" face="맑은 고딕" type="TTF" isEmbedded="0">
</hh:fontface>
<hh:fontface lang="LATIN" fontCnt="9">
<hh:font id="0" face="돋움" type="TTF" isEmbedded="0">
</hh:fontface>
디폴트값 생성 안 함. 추출 실패 시 None 반환.
"""
import re
def extract(raw_xml: dict, parsed: dict = None) -> dict | None:
    """Extract the per-language font definitions from ``<hh:fontface>`` blocks.

    Returns:
        {
            "HANGUL": [{"id": 0, "face": "...", "type": "TTF"}, ...],
            "LATIN":  [...],
            ...
        }
        keyed by the ``lang`` attribute of each fontface block, or None when
        the header XML is missing or yields no font.

    A ``<hh:font>`` tag is reported only when it carries all of ``id``,
    ``face`` and ``type``. The attributes are read individually from the tag,
    so their order in the XML does not matter (attribute order carries no
    meaning in XML; a single ordered pattern would silently drop fonts
    written in a different order).
    """
    header_xml = _get_header_xml(raw_xml, parsed)
    if not header_xml:
        return None

    result = {}

    # One (lang, inner XML) pair per fontface block.
    fontface_blocks = re.findall(
        r'<hh:fontface\b[^>]*\blang="([^"]+)"[^>]*>(.*?)</hh:fontface>',
        header_xml, re.DOTALL
    )
    for lang, block_content in fontface_blocks:
        fonts = []
        for tag in re.finditer(r'<hh:font\b([^>]*)>', block_content):
            attrs = tag.group(1)
            id_m = re.search(r'\bid="(\d+)"', attrs)
            face_m = re.search(r'\bface="([^"]+)"', attrs)
            type_m = re.search(r'\btype="([^"]+)"', attrs)
            if not (id_m and face_m and type_m):
                # Incomplete definition: skip it rather than invent values.
                continue
            fonts.append({
                "id": int(id_m.group(1)),
                "face": face_m.group(1),
                "type": type_m.group(1),
            })
        if fonts:
            result[lang] = fonts

    return result or None
def _get_header_xml(raw_xml: dict, parsed: dict = None) -> str | None:
"""header.xml 문자열을 가져온다."""
if parsed and parsed.get("header_xml"):
return parsed["header_xml"]
if isinstance(raw_xml, dict):
for name, content in raw_xml.items():
if "header" in name.lower() and isinstance(content, str):
return content
if isinstance(raw_xml, str):
return raw_xml
return None

View File

@@ -0,0 +1,200 @@
# -*- coding: utf-8 -*-
"""
§8 머리말/꼬리말(HeaderFooter) 추출
HWPX 실제 태그 (section0.xml):
<hp:headerFooter ...>
<!-- 내용은 section XML 내 또는 별도 header/footer 영역 -->
</hp:headerFooter>
머리말/꼬리말 안에 표가 있는 경우:
- 표의 셀에 다중행 텍스트가 포함될 수 있음
- 각 셀의 colSpan, rowSpan, width, borderFillIDRef 등 추출 필요
secPr 내 속성:
<hp:visibility hideFirstHeader="0" hideFirstFooter="0" .../>
디폴트값 생성 안 함.
"""
import re
from domain.hwpx.hwpx_utils import hwpunit_to_mm
def extract_header(raw_xml: dict, parsed: dict = None) -> dict | None:
    """Extract the page header structure.

    Thin wrapper around ``_extract_hf`` with ``hf_type="header"``.

    Returns:
        {
            "exists": True,
            "type": "table" | "text",
            "hidden": False,            # only when the hide flag is found
            "table": { ... },           # only when a table is present
            "texts": ["...", ...],      # only when text is present
        }
        or None when no header is found.
    """
    header_info = _extract_hf(raw_xml, parsed, hf_type="header")
    return header_info
def extract_footer(raw_xml: dict, parsed: dict = None) -> dict | None:
    """Extract the page footer structure.

    Thin wrapper around ``_extract_hf`` with ``hf_type="footer"``; the
    result has the same layout as the header variant, or is None when no
    footer is found.
    """
    footer_info = _extract_hf(raw_xml, parsed, hf_type="footer")
    return footer_info
def _extract_hf(raw_xml: dict, parsed: dict, hf_type: str) -> dict | None:
    """Shared implementation for header and footer extraction.

    Args:
        raw_xml: Raw XML input forwarded to ``_get_section_xml``.
        parsed: Optional dict; ``parsed["page_<hf_type>_xml"]`` is used as
            the header/footer fragment when present and non-empty.
        hf_type: ``"header"`` or ``"footer"``.

    Returns:
        A dict with ``exists`` (always True), ``hidden`` (when the
        ``hideFirstHeader`` / ``hideFirstFooter`` flag is found in the section
        XML), ``texts`` (non-blank ``<hp:t>`` contents), ``type``
        (``"table"`` or ``"text"``) and ``table`` (when a table is present);
        None when no non-blank fragment is found.
    """
    # 1) Fragment supplied directly through `parsed`.
    fragment = parsed.get(f"page_{hf_type}_xml", "") if parsed else None

    # 2) Otherwise look for a headerFooter block of the wanted type.
    section_xml = _get_section_xml(raw_xml, parsed)
    if section_xml and not fragment:
        wanted = hf_type.upper()
        for attrs, inner in re.findall(
            r'<hp:headerFooter\b([^>]*)>(.*?)</hp:headerFooter>',
            section_xml, re.DOTALL
        ):
            # The type attribute (HEADER / FOOTER) tells the two apart.
            declared = re.search(r'\btype="([^"]+)"', attrs)
            if declared and declared.group(1).upper() == wanted:
                fragment = inner
                break

    if not fragment or not fragment.strip():
        return None  # no such header/footer

    info = {"exists": True}

    # Hidden-on-first-page flag, read from the section XML.
    if section_xml:
        hide_key = "hideFirstHeader" if hf_type == "header" else "hideFirstFooter"
        flag = re.search(rf'\b{hide_key}="(\d+)"', section_xml)
        if flag:
            info["hidden"] = bool(int(flag.group(1)))

    # Plain text pieces.
    stripped = [
        piece.strip()
        for piece in re.findall(r'<hp:t>([^<]*)</hp:t>', fragment)
        if piece.strip()
    ]
    if stripped:
        info["texts"] = stripped

    # Table, if any (first one only).
    table_hit = re.search(
        r'<hp:tbl\b([^>]*)>(.*?)</hp:tbl>',
        fragment, re.DOTALL
    )
    if table_hit is None:
        info["type"] = "text"
    else:
        info["type"] = "table"
        info["table"] = _parse_hf_table(table_hit.group(1), table_hit.group(2))

    return info
def _parse_hf_table(tbl_attrs: str, tbl_inner: str) -> dict:
    """Parse a table found inside a header or footer.

    Args:
        tbl_attrs: Attribute text of the ``<hp:tbl>`` start tag.
        tbl_inner: XML between the table's start and end tags.

    Returns:
        A dict that may contain ``rowCnt`` / ``colCnt`` (ints),
        ``colWidths_hu`` (ints from ``<hp:widthList>``), ``colWidths_pct``
        (each width as a rounded percentage of the total) and ``rows`` (a
        list of rows, each a list of cell dicts). Keys are only present when
        the corresponding data was found.
    """
    parsed_table = {}

    # Row / column counts.
    for name in ("rowCnt", "colCnt"):
        hit = re.search(rf'\b{name}="(\d+)"', tbl_attrs)
        if hit:
            parsed_table[name] = int(hit.group(1))

    # Column widths; a non-numeric token discards the whole list.
    width_hit = re.search(r'<hp:widthList>([^<]+)</hp:widthList>', tbl_inner)
    if width_hit:
        try:
            widths = [int(token) for token in width_hit.group(1).strip().split()]
        except ValueError:
            widths = None
        if widths is not None:
            parsed_table["colWidths_hu"] = widths
            total = sum(widths) or 1  # avoid dividing by zero
            parsed_table["colWidths_pct"] = [
                round(width / total * 100) for width in widths
            ]

    # Rows and their cells.
    rows = [
        [
            _parse_hf_cell(cell.group(1), cell.group(2))
            for cell in re.finditer(
                r'<hp:tc\b([^>]*)>(.*?)</hp:tc>', row_xml, re.DOTALL
            )
        ]
        for row_xml in re.findall(
            r'<hp:tr\b[^>]*>(.*?)</hp:tr>', tbl_inner, re.DOTALL
        )
    ]
    if rows:
        parsed_table["rows"] = rows

    return parsed_table
def _parse_hf_cell(tc_attrs: str, tc_inner: str) -> dict:
"""머리말/꼬리말 셀 파싱"""
cell = {}
# borderFillIDRef
bf = re.search(r'\bborderFillIDRef="(\d+)"', tc_attrs)
if bf:
cell["borderFillIDRef"] = int(bf.group(1))
# cellAddr
addr = re.search(
r'<hp:cellAddr\b[^>]*\bcolAddr="(\d+)"[^>]*\browAddr="(\d+)"',
tc_inner
)
if addr:
cell["colAddr"] = int(addr.group(1))
cell["rowAddr"] = int(addr.group(2))
# cellSpan
span = re.search(r'<hp:cellSpan\b([^/]*)/?>', tc_inner)
if span:
cs = re.search(r'\bcolSpan="(\d+)"', span.group(1))
rs = re.search(r'\browSpan="(\d+)"', span.group(1))
if cs:
cell["colSpan"] = int(cs.group(1))
if rs:
cell["rowSpan"] = int(rs.group(1))
# cellSz
sz = re.search(r'<hp:cellSz\b([^/]*)/?>', tc_inner)
if sz:
w = re.search(r'\bwidth="(\d+)"', sz.group(1))
if w:
cell["width_hu"] = int(w.group(1))
# 셀 텍스트 (다중행)
paras = re.findall(r'<hp:p\b[^>]*>(.*?)</hp:p>', tc_inner, re.DOTALL)
lines = []
for p in paras:
p_texts = re.findall(r'<hp:t>([^<]*)</hp:t>', p)
line = " ".join(t.strip() for t in p_texts if t.strip())
if line:
lines.append(line)
if lines:
cell["text"] = " ".join(lines)
cell["lines"] = lines
return cell
def _get_section_xml(raw_xml: dict, parsed: dict = None) -> str | None:
if parsed and parsed.get("section_xml"):
return parsed["section_xml"]
if isinstance(raw_xml, dict):
for name, content in raw_xml.items():
if "section" in name.lower() and isinstance(content, str):
return content
return raw_xml if isinstance(raw_xml, str) else None

View File

@@ -0,0 +1,98 @@
# -*- coding: utf-8 -*-
"""
이미지/그리기 객체(ShapeObject) 추출
HWPX 실제 태그 (section0.xml):
<hp:pic id="..." zOrder="..." ...>
<hp:offset x="0" y="0"/>
<hp:orgSz width="..." height="..."/>
<hp:curSz width="..." height="..."/>
<hp:imgRect>
<hp:pt x="..." y="..."/> <!-- 4개 꼭짓점 -->
</hp:imgRect>
<hp:imgClip .../>
<hp:img binaryItemIDRef="image1.JPG" .../>
</hp:pic>
또는 그리기 객체:
<hp:container id="..." ...>
<hp:offset x="..." y="..."/>
...
</hp:container>
디폴트값 생성 안 함.
"""
import re
from domain.hwpx.hwpx_utils import hwpunit_to_mm
def extract(raw_xml: dict, parsed: dict = None) -> list | None:
    """Extract image objects (``<hp:pic>`` blocks) from the section XML.

    Returns:
        [
            {
                "type": "image",
                "binaryItemRef": "image1.JPG",          # from <hp:img>
                "width_hu": ..., "height_hu": ...,      # from <hp:curSz>
                "width_mm": ..., "height_mm": ...,      # rounded to 0.1
                "offset": {"x": ..., "y": ...},         # from <hp:offset>
            },
            ...
        ]
        Keys other than ``type`` are only present when found. None is
        returned when the section XML is missing or contains no picture.

    Size and offset attributes are read individually from their tag, so the
    attribute order in the XML does not matter; both values of a pair must
    be present for the pair to be reported.
    """
    section_xml = _get_section_xml(raw_xml, parsed)
    if not section_xml:
        return None

    def _int_pair(tag, first, second, text, signed=False):
        """Return (first, second) ints from the first <hp:tag> holding both."""
        number = r'-?\d+' if signed else r'\d+'
        for tag_m in re.finditer(rf'<hp:{tag}\b([^>]*)>', text):
            attrs = tag_m.group(1)
            a = re.search(rf'\b{first}="({number})"', attrs)
            b = re.search(rf'\b{second}="({number})"', attrs)
            if a and b:
                return int(a.group(1)), int(b.group(1))
        return None

    result = []
    for pm in re.finditer(
        r'<hp:pic\b([^>]*)>(.*?)</hp:pic>',
        section_xml, re.DOTALL
    ):
        pic_inner = pm.group(2)
        item = {"type": "image"}

        # Reference to the embedded binary.
        img = re.search(r'<hp:img\b[^>]*\bbinaryItemIDRef="([^"]+)"', pic_inner)
        if img:
            item["binaryItemRef"] = img.group(1)

        # Current (displayed) size.
        size = _int_pair("curSz", "width", "height", pic_inner)
        if size:
            w, h = size
            item["width_hu"] = w
            item["height_hu"] = h
            item["width_mm"] = round(hwpunit_to_mm(w), 1)
            item["height_mm"] = round(hwpunit_to_mm(h), 1)

        # Offset (may be negative).
        off = _int_pair("offset", "x", "y", pic_inner, signed=True)
        if off:
            item["offset"] = {"x": off[0], "y": off[1]}

        result.append(item)

    return result or None
def _get_section_xml(raw_xml: dict, parsed: dict = None) -> str | None:
if parsed and parsed.get("section_xml"):
return parsed["section_xml"]
if isinstance(raw_xml, dict):
for name, content in raw_xml.items():
if "section" in name.lower() and isinstance(content, str):
return content
return raw_xml if isinstance(raw_xml, str) else None

View File

@@ -0,0 +1,136 @@
# -*- coding: utf-8 -*-
"""
번호매기기(Numbering) / 글머리표(Bullet) 추출
HWPX 실제 태그 (header.xml):
<hh:numbering id="1" start="0">
<hh:paraHead start="1" level="1" align="LEFT" useInstWidth="1"
autoIndent="1" widthAdjust="0" textOffsetType="PERCENT"
textOffset="50" numFormat="DIGIT" charPrIDRef="4294967295"
checkable="0">^1.</hh:paraHead>
<hh:paraHead start="1" level="2" ... numFormat="HANGUL_SYLLABLE">^2.</hh:paraHead>
</hh:numbering>
<hh:bullet id="1" char="-" useImage="0">
<hh:paraHead level="0" align="LEFT" .../>
</hh:bullet>
디폴트값 생성 안 함.
"""
import re
def extract(raw_xml: dict, parsed: dict = None) -> dict | None:
    """Extract numbering and bullet definitions from the header XML.

    Returns:
        {
            "numberings": [
                {
                    "id": 1, "start": 0,
                    "levels": [
                        {"level": 1, "numFormat": "DIGIT",
                         "align": "LEFT", "pattern": "^1."},
                        ...
                    ]
                }
            ],
            "bullets": [
                {"id": 1, "char": "-", "useImage": False}
            ]
        }
        Each key is only present when the corresponding attribute, tag or
        block was found. None is returned when the header XML is missing or
        contains neither kind of block.
    """
    header_xml = _get_header_xml(raw_xml, parsed)
    if not header_xml:
        return None

    def _copy_int(target, key, text):
        # Store the unsigned integer attribute `key` in `target` if present.
        hit = re.search(rf'\b{key}="(\d+)"', text)
        if hit:
            target[key] = int(hit.group(1))

    extracted = {}

    # -- numbering definitions --
    numberings = []
    for block in re.finditer(
        r'<hh:numbering\b([^>]*)>(.*?)</hh:numbering>',
        header_xml, re.DOTALL
    ):
        attrs, inner = block.group(1), block.group(2)
        entry = {}
        _copy_int(entry, "id", attrs)
        _copy_int(entry, "start", attrs)

        # One level per paired paraHead element; its text is the pattern.
        levels = []
        for head in re.finditer(
            r'<hh:paraHead\b([^>]*)>([^<]*)</hh:paraHead>',
            inner
        ):
            head_attrs = head.group(1)
            pattern = head.group(2).strip()
            spec = {}
            _copy_int(spec, "level", head_attrs)
            for key in ("numFormat", "align"):
                hit = re.search(rf'\b{key}="([^"]+)"', head_attrs)
                if hit:
                    spec[key] = hit.group(1)
            if pattern:
                spec["pattern"] = pattern
            if spec:
                levels.append(spec)

        if levels:
            entry["levels"] = levels
        numberings.append(entry)
    if numberings:
        extracted["numberings"] = numberings

    # -- bullet definitions --
    bullets = []
    for block in re.finditer(
        r'<hh:bullet\b([^>]*)>(.*?)</hh:bullet>',
        header_xml, re.DOTALL
    ):
        attrs = block.group(1)
        entry = {}
        _copy_int(entry, "id", attrs)
        glyph = re.search(r'\bchar="([^"]*)"', attrs)
        if glyph:
            entry["char"] = glyph.group(1)
        flag = re.search(r'\buseImage="(\d+)"', attrs)
        if flag:
            entry["useImage"] = bool(int(flag.group(1)))
        bullets.append(entry)
    if bullets:
        extracted["bullets"] = bullets

    return extracted or None
def _get_header_xml(raw_xml: dict, parsed: dict = None) -> str | None:
if parsed and parsed.get("header_xml"):
return parsed["header_xml"]
if isinstance(raw_xml, dict):
for name, content in raw_xml.items():
if "header" in name.lower() and isinstance(content, str):
return content
return raw_xml if isinstance(raw_xml, str) else None

View File

@@ -0,0 +1,110 @@
# -*- coding: utf-8 -*-
"""
§7 용지 설정 추출 (pagePr + margin)
HWPX 실제 태그:
<hp:pagePr landscape="WIDELY" width="59528" height="84188" gutterType="LEFT_ONLY">
<hp:margin header="4251" footer="4251" gutter="0"
left="5669" right="5669" top="2834" bottom="2834"/>
디폴트값 생성 안 함. 추출 실패 시 None 반환.
"""
import re
from domain.hwpx.hwpx_utils import hwpunit_to_mm, mm_format, detect_paper_size
def extract(raw_xml: dict, parsed: dict = None) -> dict | None:
    """Extract paper size and margins from ``<hp:pagePr>`` / ``<hp:margin>``.

    Returns:
        {
            "paper": {"name": ..., "width_mm": ..., "height_mm": ...,
                      "landscape": True/False},
            "margins": {"top": ..., "bottom": ..., "left": ..., "right": ...,
                        "header": ..., "footer": ..., "gutter": ...}
        }
        ``margins`` is omitted when no ``<hp:margin .../>`` tag or none of
        its attributes is found. None is returned when the section XML is
        missing or ``pagePr`` has no width/height pair.

    NOTE(review): ``landscape`` is True exactly when the attribute equals
    "WIDELY". The sample tag in this module's header pairs "WIDELY" with a
    width smaller than the height -- confirm the mapping against the format
    specification.
    """
    section_xml = _get_section_xml(raw_xml, parsed)
    if not section_xml:
        return None

    # -- paper size: width-before-height first, then the reverse order --
    width_first = re.search(
        r'<hp:pagePr\b[^>]*'
        r'\bwidth="(\d+)"[^>]*'
        r'\bheight="(\d+)"',
        section_xml
    )
    if width_first:
        w_hu = int(width_first.group(1))
        h_hu = int(width_first.group(2))
    else:
        height_first = re.search(
            r'<hp:pagePr\b[^>]*'
            r'\bheight="(\d+)"[^>]*'
            r'\bwidth="(\d+)"',
            section_xml
        )
        if height_first is None:
            return None
        h_hu = int(height_first.group(1))
        w_hu = int(height_first.group(2))

    orientation = re.search(
        r'<hp:pagePr\b[^>]*\blandscape="([^"]+)"', section_xml
    )
    is_landscape = bool(orientation) and orientation.group(1) == "WIDELY"

    info = {
        "paper": {
            "name": detect_paper_size(w_hu, h_hu),
            "width_mm": round(hwpunit_to_mm(w_hu), 1),
            "height_mm": round(hwpunit_to_mm(h_hu), 1),
            "landscape": is_landscape,
        }
    }

    # -- margins: the paper block alone is still a valid result --
    margin_tag = re.search(r'<hp:margin\b([^/]*)/>', section_xml)
    if margin_tag:
        attrs_str = margin_tag.group(1)
        margins = {}
        for key in ("top", "bottom", "left", "right", "header", "footer", "gutter"):
            hit = re.search(rf'\b{key}="(\d+)"', attrs_str)
            if hit:
                margins[key] = mm_format(int(hit.group(1)))
        if margins:
            info["margins"] = margins

    return info
def _get_section_xml(raw_xml: dict, parsed: dict = None) -> str | None:
"""section XML 문자열을 가져온다."""
# parsed에서 직접 제공
if parsed and parsed.get("section_xml"):
return parsed["section_xml"]
# raw_xml dict에서 section 파일 찾기
if isinstance(raw_xml, dict):
for name, content in raw_xml.items():
if "section" in name.lower() and isinstance(content, str):
return content
# raw_xml이 문자열이면 그대로
if isinstance(raw_xml, str):
return raw_xml
return None

View File

@@ -0,0 +1,185 @@
# -*- coding: utf-8 -*-
"""
§5 문단 모양(ParaShape) 추출
HWPX 실제 태그 (header.xml):
<hh:paraPr id="0" tabPrIDRef="1" condense="0" ...>
<hh:align horizontal="JUSTIFY" vertical="BASELINE"/>
<hh:heading type="NONE" idRef="0" level="0"/>
<hh:breakSetting breakLatinWord="KEEP_WORD" breakNonLatinWord="KEEP_WORD"
widowOrphan="0" keepWithNext="0" keepLines="0"
pageBreakBefore="0" lineWrap="BREAK"/>
<hp:case ...>
<hh:margin>
<hc:intent value="-1310" unit="HWPUNIT"/>
<hc:left value="0" unit="HWPUNIT"/>
<hc:right value="0" unit="HWPUNIT"/>
<hc:prev value="0" unit="HWPUNIT"/>
<hc:next value="0" unit="HWPUNIT"/>
</hh:margin>
<hh:lineSpacing type="PERCENT" value="130" unit="HWPUNIT"/>
</hp:case>
<hh:border borderFillIDRef="2" .../>
</hh:paraPr>
디폴트값 생성 안 함.
"""
import re
from domain.hwpx.hwpx_utils import hwpunit_to_mm
def extract(raw_xml: dict, parsed: dict = None) -> list | None:
    """Extract every ``hh:paraPr`` (paragraph shape) definition from the header XML (§5).

    Each entry only contains the keys that could actually be read; no default
    values are generated.

    Args:
        raw_xml: Raw input forwarded to ``_get_header_xml``.
        parsed: Optional pre-parsed data forwarded to ``_get_header_xml``.

    Returns:
        A list of dicts shaped like::

            {
                "id": 0,
                "tabPrIDRef": 1,
                "align": "JUSTIFY",
                "verticalAlign": "BASELINE",
                "heading": {"type": "NONE", "idRef": 0, "level": 0},
                "breakSetting": {
                    "widowOrphan": False, "keepWithNext": False,
                    "keepLines": False, "pageBreakBefore": False,
                    "lineWrap": "BREAK",
                    "breakLatinWord": "KEEP_WORD",
                    "breakNonLatinWord": "KEEP_WORD",
                },
                "margin": {"indent_hu": -1310, "left_hu": 0, "right_hu": 0,
                           "before_hu": 0, "after_hu": 0},
                "lineSpacing": {"type": "PERCENT", "value": 130},
                "borderFillIDRef": 2,
            }

        or ``None`` when there is no header XML or no ``hh:paraPr`` block.
        Individual ``breakSetting`` values are ``None`` when the attribute
        is missing from the tag.
    """
    header_xml = _get_header_xml(raw_xml, parsed)
    if not header_xml:
        return None

    para_blocks = re.findall(
        r'<hh:paraPr\b([^>]*)>(.*?)</hh:paraPr>',
        header_xml, re.DOTALL
    )
    if not para_blocks:
        return None

    # (child tag under hc:, output key). "intent" is the tag name used by the XML.
    margin_fields = (
        ("intent", "indent_hu"),
        ("left", "left_hu"),
        ("right", "right_hu"),
        ("prev", "before_hu"),
        ("next", "after_hu"),
    )
    bool_break_attrs = ("widowOrphan", "keepWithNext", "keepLines", "pageBreakBefore")
    text_break_attrs = ("lineWrap", "breakLatinWord", "breakNonLatinWord")

    shapes = []
    for tag_attrs, body in para_blocks:
        shape = {}

        # Attributes on the paraPr tag itself.
        id_hit = re.search(r'\bid="(\d+)"', tag_attrs)
        if id_hit:
            shape["id"] = int(id_hit.group(1))
        tab_hit = re.search(r'\btabPrIDRef="(\d+)"', tag_attrs)
        if tab_hit:
            shape["tabPrIDRef"] = int(tab_hit.group(1))

        # Horizontal / vertical alignment.
        h_align = re.search(r'<hh:align\b[^>]*\bhorizontal="([^"]+)"', body)
        if h_align:
            shape["align"] = h_align.group(1)
        v_align = re.search(r'<hh:align\b[^>]*\bvertical="([^"]+)"', body)
        if v_align:
            shape["verticalAlign"] = v_align.group(1)

        # Heading: the pattern expects type, idRef, level in that attribute order.
        heading = re.search(
            r'<hh:heading\b[^>]*\btype="([^"]+)"[^>]*'
            r'\bidRef="(\d+)"[^>]*\blevel="(\d+)"', body
        )
        if heading:
            kind, id_ref, level = heading.groups()
            shape["heading"] = {
                "type": kind,
                "idRef": int(id_ref),
                "level": int(level),
            }

        # Break settings: flags first, then the string-valued attributes.
        brk = re.search(r'<hh:breakSetting\b([^/]*)/?>', body)
        if brk:
            brk_attrs = brk.group(1)
            setting = {name: _bool_attr(brk_attrs, name) for name in bool_break_attrs}
            setting.update(
                (name, _str_attr(brk_attrs, name)) for name in text_break_attrs
            )
            shape["breakSetting"] = setting

        # Margins and line spacing are read from the HwpUnitChar hp:case block
        # when one exists, otherwise from the whole paraPr body.
        unit_case = re.search(
            r'<hp:case\b[^>]*required-namespace="[^"]*HwpUnitChar[^"]*"[^>]*>'
            r'(.*?)</hp:case>',
            body, re.DOTALL
        )
        scope = unit_case.group(1) if unit_case else body

        margin = {
            key: int(hit.group(1))
            for tag, key in margin_fields
            if (hit := re.search(rf'<hc:{tag}\b[^>]*\bvalue="(-?\d+)"', scope))
        }
        if margin:
            shape["margin"] = margin

        spacing = re.search(
            r'<hh:lineSpacing\b[^>]*\btype="([^"]+)"[^>]*\bvalue="(\d+)"',
            scope
        )
        if spacing:
            shape["lineSpacing"] = {
                "type": spacing.group(1),
                "value": int(spacing.group(2)),
            }

        # Border/fill reference is looked up in the full body, not the case block.
        border = re.search(r'<hh:border\b[^>]*\bborderFillIDRef="(\d+)"', body)
        if border:
            shape["borderFillIDRef"] = int(border.group(1))

        shapes.append(shape)

    return shapes or None
def _bool_attr(s: str, name: str) -> bool | None:
m = re.search(rf'\b{name}="(\d+)"', s)
return bool(int(m.group(1))) if m else None
def _str_attr(s: str, name: str) -> str | None:
m = re.search(rf'\b{name}="([^"]+)"', s)
return m.group(1) if m else None
def _get_header_xml(raw_xml: dict, parsed: dict = None) -> str | None:
if parsed and parsed.get("header_xml"):
return parsed["header_xml"]
if isinstance(raw_xml, dict):
for name, content in raw_xml.items():
if "header" in name.lower() and isinstance(content, str):
return content
return raw_xml if isinstance(raw_xml, str) else None

View File

@@ -0,0 +1,120 @@
# -*- coding: utf-8 -*-
"""
§9 구역 정의(Section) 추출
HWPX 실제 태그 (section0.xml):
<hp:secPr id="" textDirection="HORIZONTAL" spaceColumns="1134"
tabStop="8000" tabStopVal="4000" tabStopUnit="HWPUNIT"
outlineShapeIDRef="1" ...>
<hp:grid lineGrid="0" charGrid="0" .../>
<hp:startNum pageStartsOn="BOTH" page="0" .../>
<hp:visibility hideFirstHeader="0" hideFirstFooter="0" .../>
<hp:pagePr landscape="WIDELY" width="59528" height="84188" ...>
<hp:margin header="4251" footer="4251" left="5669" right="5669"
top="2834" bottom="2834"/>
<hp:pageNum pos="BOTTOM_CENTER" formatType="DIGIT" sideChar="-"/>
</hp:secPr>
디폴트값 생성 안 함.
"""
import re
def extract(raw_xml: dict, parsed: dict = None) -> dict | None:
    """Extract section-level properties from the first ``hp:secPr`` block (§9).

    Only values actually present in the XML are returned; no defaults are
    generated.

    Args:
        raw_xml: Raw input forwarded to ``_get_section_xml``.
        parsed: Optional pre-parsed data forwarded to ``_get_section_xml``.

    Returns:
        A dict holding whichever of the following could be read::

            {
                "textDirection": "HORIZONTAL",
                "hideFirstHeader": False,
                "hideFirstFooter": False,
                "hideFirstMasterPage": False,
                "hideFirstPageNum": False,
                "hideFirstEmptyLine": False,
                "startNum": {"pageStartsOn": "BOTH", "page": 0},
                "pageNum": {"pos": "BOTTOM_CENTER", "formatType": "DIGIT",
                            "sideChar": "-"},
                "colDef": {"count": 2, "layout": "LEFT"},
            }

        or ``None`` when there is no section XML, no ``hp:secPr`` block, or
        nothing could be extracted from it.
    """
    section_xml = _get_section_xml(raw_xml, parsed)
    if not section_xml:
        return None

    sec_match = re.search(
        r'<hp:secPr\b([^>]*)>(.*?)</hp:secPr>',
        section_xml, re.DOTALL
    )
    if not sec_match:
        return None
    attrs_str, inner = sec_match.groups()

    # Attribute run of a single tag. Quoted values are consumed as a unit so
    # that a "/" (or ">") inside a value — e.g. sideChar="/" — does not stop
    # the match; the previous `[^/]*` pattern dropped such tags entirely.
    attr_run = r'((?:[^>"/]|"[^"]*")*)/?>'

    result = {}

    # textDirection lives on the secPr tag itself.
    td = re.search(r'\btextDirection="([^"]+)"', attrs_str)
    if td:
        result["textDirection"] = td.group(1)

    # visibility: numeric flags converted to bool.
    vis = re.search(r'<hp:visibility\b' + attr_run, inner)
    if vis:
        vis_attrs = vis.group(1)
        for attr in ("hideFirstHeader", "hideFirstFooter",
                     "hideFirstMasterPage", "hideFirstPageNum",
                     "hideFirstEmptyLine"):
            m = re.search(rf'\b{attr}="(\d+)"', vis_attrs)
            if m:
                result[attr] = bool(int(m.group(1)))

    # startNum
    sn = re.search(r'<hp:startNum\b' + attr_run, inner)
    if sn:
        sn_attrs = sn.group(1)
        start = {}
        pso = re.search(r'\bpageStartsOn="([^"]+)"', sn_attrs)
        if pso:
            start["pageStartsOn"] = pso.group(1)
        pg = re.search(r'\bpage="(\d+)"', sn_attrs)
        if pg:
            start["page"] = int(pg.group(1))
        if start:
            result["startNum"] = start

    # pageNum: values are kept as strings and may be empty.
    pn = re.search(r'<hp:pageNum\b' + attr_run, inner)
    if pn:
        pn_attrs = pn.group(1)
        pagenum = {}
        for attr in ("pos", "formatType", "sideChar"):
            m = re.search(rf'\b{attr}="([^"]*)"', pn_attrs)
            if m:
                pagenum[attr] = m.group(1)
        if pagenum:
            result["pageNum"] = pagenum

    # colDef (column layout): only matched as an open/close element pair.
    cd = re.search(r'<hp:colDef\b([^>]*)>(.*?)</hp:colDef>', inner, re.DOTALL)
    if cd:
        cd_attrs = cd.group(1)
        coldef = {}
        cnt = re.search(r'\bcount="(\d+)"', cd_attrs)
        if cnt:
            coldef["count"] = int(cnt.group(1))
        layout = re.search(r'\blayout="([^"]+)"', cd_attrs)
        if layout:
            coldef["layout"] = layout.group(1)
        if coldef:
            result["colDef"] = coldef

    return result if result else None
def _get_section_xml(raw_xml: dict, parsed: dict = None) -> str | None:
if parsed and parsed.get("section_xml"):
return parsed["section_xml"]
if isinstance(raw_xml, dict):
for name, content in raw_xml.items():
if "section" in name.lower() and isinstance(content, str):
return content
return raw_xml if isinstance(raw_xml, str) else None

View File

@@ -0,0 +1,68 @@
# -*- coding: utf-8 -*-
"""
스타일 정의(Style) 추출
HWPX 실제 태그 (header.xml):
<hh:styles itemCnt="12">
<hh:style id="0" type="PARA" name="바탕글" engName="Normal"
paraPrIDRef="3" charPrIDRef="0" nextStyleIDRef="0"
langID="1042" lockForm="0"/>
<hh:style id="1" type="PARA" name="머리말" engName="Header"
paraPrIDRef="2" charPrIDRef="3" nextStyleIDRef="1" .../>
</hh:styles>
charPrIDRef → charPr(글자모양), paraPrIDRef → paraPr(문단모양) 연결.
디폴트값 생성 안 함.
"""
import re
def extract(raw_xml: dict, parsed: dict = None) -> list | None:
    """Extract the style definitions (self-closing ``hh:style`` tags) from the header XML.

    Only attributes actually present on a tag are returned; no defaults are
    generated.

    Args:
        raw_xml: Raw input forwarded to ``_get_header_xml``.
        parsed: Optional pre-parsed data forwarded to ``_get_header_xml``.

    Returns:
        A list of dicts such as::

            {
                "id": 0, "type": "PARA",
                "name": "바탕글", "engName": "Normal",
                "paraPrIDRef": 3, "charPrIDRef": 0,
                "nextStyleIDRef": 0,
            }

        or ``None`` when there is no header XML or no ``hh:style`` tag.
    """
    header_xml = _get_header_xml(raw_xml, parsed)
    if not header_xml:
        return None

    # Quoted attribute values are consumed as a unit so that a style whose
    # name/engName contains "/" is still matched; the previous `[^/]*` pattern
    # silently skipped such styles. The tag must still be self-closing.
    style_attrs = re.findall(
        r'<hh:style\b((?:[^>"/]|"[^"]*")*)/>', header_xml
    )
    if not style_attrs:
        return None

    int_attrs = ("id", "paraPrIDRef", "charPrIDRef", "nextStyleIDRef")
    text_attrs = ("type", "name", "engName")

    result = []
    for attrs in style_attrs:
        item = {}
        for attr in int_attrs:
            m = re.search(rf'\b{attr}="(\d+)"', attrs)
            if m:
                item[attr] = int(m.group(1))
        # Text attributes may legitimately be empty strings.
        for attr in text_attrs:
            m = re.search(rf'\b{attr}="([^"]*)"', attrs)
            if m:
                item[attr] = m.group(1)
        result.append(item)

    return result if result else None
def _get_header_xml(raw_xml: dict, parsed: dict = None) -> str | None:
if parsed and parsed.get("header_xml"):
return parsed["header_xml"]
if isinstance(raw_xml, dict):
for name, content in raw_xml.items():
if "header" in name.lower() and isinstance(content, str):
return content
return raw_xml if isinstance(raw_xml, str) else None

View File

@@ -0,0 +1,328 @@
# -*- coding: utf-8 -*-
"""
§6 표(Table) 구조 추출
HWPX 실제 태그 (section0.xml):
<hp:tbl id="..." rowCnt="5" colCnt="3" cellSpacing="0"
repeatHeader="1" pageBreak="CELL" ...>
<hp:colSz><hp:widthList>8504 8504 8504</hp:widthList></hp:colSz>
또는 열 수에 맞는 hp:colSz 형태
<hp:tr>
<hp:tc name="" header="0" borderFillIDRef="5" ...>
<hp:cellAddr colAddr="0" rowAddr="0"/>
<hp:cellSpan colSpan="2" rowSpan="1"/>
<hp:cellSz width="17008" height="2400"/>
<hp:cellMargin left="510" right="510" top="142" bottom="142"/>
<hp:subList>
<hp:p ...><hp:run ...><hp:t>셀 텍스트</hp:t></hp:run></hp:p>
</hp:subList>
</hp:tc>
</hp:tr>
</hp:tbl>
디폴트값 생성 안 함.
"""
import re
from domain.hwpx.hwpx_utils import hwpunit_to_mm
def extract(raw_xml: dict, parsed: dict = None) -> list | None:
    """Extract every top-level table (``hp:tbl``) from the section XML (§6).

    Only values actually present in the XML are returned; no defaults are
    generated.

    Args:
        raw_xml: Raw input forwarded to ``_get_section_xml``.
        parsed: Optional pre-parsed data forwarded to ``_get_section_xml``.

    Returns:
        A list of dicts such as::

            {
                "index": 0,
                "rowCnt": 5, "colCnt": 3,
                "repeatHeader": True,
                "pageBreak": "CELL",
                "rows": [[<cell dict>, ...], ...],
                "colWidths_hu": [8504, 8504, 8504],
                "colWidths_pct": [33, 33, 33],
            }

        where each cell dict is produced by ``_parse_cell``. Each entry of
        ``colWidths_pct`` is rounded independently, so the list does not
        necessarily sum to 100. Returns ``None`` when there is no section
        XML or no table.
    """
    section_xml = _get_section_xml(raw_xml, parsed)
    if not section_xml:
        return None

    tbl_blocks = _find_tbl_blocks(section_xml)
    if not tbl_blocks:
        return None

    tables = []
    for index, (attrs, body) in enumerate(tbl_blocks):
        table = {"index": index}

        # Attributes on the tbl tag itself.
        for count_attr in ("rowCnt", "colCnt"):
            hit = re.search(rf'\b{count_attr}="(\d+)"', attrs)
            if hit:
                table[count_attr] = int(hit.group(1))

        repeat = re.search(r'\brepeatHeader="(\d+)"', attrs)
        if repeat:
            table["repeatHeader"] = bool(int(repeat.group(1)))

        page_break = re.search(r'\bpageBreak="([^"]+)"', attrs)
        if page_break:
            table["pageBreak"] = page_break.group(1)

        # Rows are parsed before column widths because they serve as the
        # fallback source of widths when no colSz information exists.
        rows = _extract_rows(body)
        if rows:
            table["rows"] = rows

        widths = _extract_col_widths(body)
        if not widths and rows:
            # Prefer a row without column spans; otherwise use the first row.
            widths = (
                _col_widths_from_rows(rows, table.get("colCnt", 0))
                or _col_widths_from_first_row(rows[0])
            )

        if widths:
            table["colWidths_hu"] = widths
            total = sum(widths) or 1  # avoid dividing by zero
            table["colWidths_pct"] = [round(w / total * 100) for w in widths]

        tables.append(table)

    return tables or None
def _find_tbl_blocks(xml: str) -> list:
"""중첩 표를 고려하여 최상위 tbl 블록 추출"""
blocks = []
start = 0
while True:
# <hp:tbl 시작 찾기
m = re.search(r'<hp:tbl\b([^>]*)>', xml[start:])
if not m:
break
attrs = m.group(1)
tag_start = start + m.start()
content_start = start + m.end()
# 중첩 카운트로 닫는 태그 찾기
depth = 1
pos = content_start
while depth > 0 and pos < len(xml):
open_m = re.search(r'<hp:tbl\b', xml[pos:])
close_m = re.search(r'</hp:tbl>', xml[pos:])
if close_m is None:
break
if open_m and open_m.start() < close_m.start():
depth += 1
pos += open_m.end()
else:
depth -= 1
if depth == 0:
inner = xml[content_start:pos + close_m.start()]
blocks.append((attrs, inner))
pos += close_m.end()
start = pos
return blocks
def _extract_col_widths(tbl_inner: str) -> list | None:
"""열 너비 HWPUNIT 추출"""
# 패턴 1: <hp:colSz><hp:widthList>8504 8504 8504</hp:widthList>
wl = re.search(r'<hp:widthList>([^<]+)</hp:widthList>', tbl_inner)
if wl:
try:
return [int(w) for w in wl.group(1).strip().split()]
except ValueError:
pass
# 패턴 2: 개별 colSz 태그
cols = re.findall(r'<hp:colSz\b[^>]*\bwidth="(\d+)"', tbl_inner)
if cols:
return [int(c) for c in cols]
return None
def _extract_rows(tbl_inner: str) -> list:
    """Parse ``hp:tr`` / ``hp:tc`` elements into a 2-D list of cell dicts.

    Returns one inner list per ``hp:tr`` block, each holding the result of
    ``_parse_cell`` for every ``hp:tc`` in that row. The list is empty when
    no row is found.

    NOTE: both patterns are non-greedy and stop at the first closing tag, so
    a table nested inside a cell ends the enclosing row/cell match early.
    """
    row_bodies = re.findall(
        r'<hp:tr\b[^>]*>(.*?)</hp:tr>', tbl_inner, re.DOTALL
    )
    return [
        [
            _parse_cell(tc.group(1), tc.group(2))
            for tc in re.finditer(
                r'<hp:tc\b([^>]*)>(.*?)</hp:tc>', row_body, re.DOTALL
            )
        ]
        for row_body in row_bodies
    ]
def _parse_cell(tc_attrs: str, tc_inner: str) -> dict:
"""개별 셀 파싱"""
cell = {}
# borderFillIDRef on tc tag
bf = re.search(r'\bborderFillIDRef="(\d+)"', tc_attrs)
if bf:
cell["borderFillIDRef"] = int(bf.group(1))
# header flag
hd = re.search(r'\bheader="(\d+)"', tc_attrs)
if hd:
cell["isHeader"] = bool(int(hd.group(1)))
# cellAddr
addr = re.search(
r'<hp:cellAddr\b[^>]*\bcolAddr="(\d+)"[^>]*\browAddr="(\d+)"',
tc_inner
)
if addr:
cell["colAddr"] = int(addr.group(1))
cell["rowAddr"] = int(addr.group(2))
# cellSpan
span = re.search(r'<hp:cellSpan\b([^/]*)/?>', tc_inner)
if span:
cs = re.search(r'\bcolSpan="(\d+)"', span.group(1))
rs = re.search(r'\browSpan="(\d+)"', span.group(1))
if cs:
cell["colSpan"] = int(cs.group(1))
if rs:
cell["rowSpan"] = int(rs.group(1))
# cellSz
sz = re.search(r'<hp:cellSz\b([^/]*)/?>', tc_inner)
if sz:
w = re.search(r'\bwidth="(\d+)"', sz.group(1))
h = re.search(r'\bheight="(\d+)"', sz.group(1))
if w:
cell["width_hu"] = int(w.group(1))
if h:
cell["height_hu"] = int(h.group(1))
# cellMargin
cm = re.search(r'<hp:cellMargin\b([^/]*)/?>', tc_inner)
if cm:
margin = {}
for side in ["left", "right", "top", "bottom"]:
m = re.search(rf'\b{side}="(\d+)"', cm.group(1))
if m:
margin[side] = int(m.group(1))
if margin:
cell["cellMargin"] = margin
# 셀 텍스트
texts = re.findall(r'<hp:t>([^<]*)</hp:t>', tc_inner)
all_text = " ".join(t.strip() for t in texts if t.strip())
if all_text:
cell["text"] = all_text
# ★ v2: 셀 내 run의 charPrIDRef 추출 (스타일 연결용)
run_cprs = re.findall(r'<hp:run\b[^>]*\bcharPrIDRef="(\d+)"', tc_inner)
if run_cprs:
cell["charPrIDRefs"] = [int(c) for c in run_cprs]
cell["primaryCharPrIDRef"] = int(run_cprs[0])
# ★ v2: 셀 내 p의 paraPrIDRef, styleIDRef 추출
para_pprs = re.findall(r'<hp:p\b[^>]*\bparaPrIDRef="(\d+)"', tc_inner)
if para_pprs:
cell["paraPrIDRefs"] = [int(p) for p in para_pprs]
cell["primaryParaPrIDRef"] = int(para_pprs[0])
para_stys = re.findall(r'<hp:p\b[^>]*\bstyleIDRef="(\d+)"', tc_inner)
if para_stys:
cell["styleIDRefs"] = [int(s) for s in para_stys]
# 다중행 (p 태그 기준)
paras = re.findall(r'<hp:p\b[^>]*>(.*?)</hp:p>', tc_inner, re.DOTALL)
lines = []
for p in paras:
p_texts = re.findall(r'<hp:t>([^<]*)</hp:t>', p)
line = " ".join(t.strip() for t in p_texts if t.strip())
if line:
lines.append(line)
if lines:
cell["lines"] = lines
return cell
def _col_widths_from_first_row(first_row: list) -> list | None:
"""첫 행 셀의 width_hu에서 열 너비 추출 (colSz 없을 때 대체)"""
widths = []
for cell in first_row:
w = cell.get("width_hu")
if w:
widths.append(w)
return widths if widths else None
def _col_widths_from_rows(rows: list, col_cnt: int) -> list | None:
"""★ v2: 모든 행을 순회하여 colspan=1인 행에서 정확한 열 너비 추출.
첫 행에 colspan이 있으면 열 너비가 부정확하므로,
모든 열이 colspan=1인 행을 찾아 사용.
"""
if not rows or not col_cnt:
return None
# colspan=1인 셀만 있는 행 찾기 (모든 열 존재)
for row in rows:
# 이 행의 모든 셀이 colspan=1이고, 셀 수 == col_cnt인지
all_single = all(cell.get("colSpan", 1) == 1 for cell in row)
if all_single and len(row) == col_cnt:
widths = []
for cell in sorted(row, key=lambda c: c.get("colAddr", 0)):
w = cell.get("width_hu")
if w:
widths.append(w)
if len(widths) == col_cnt:
return widths
# 못 찾으면 첫 행 폴백
return _col_widths_from_first_row(rows[0]) if rows else None
def _get_section_xml(raw_xml: dict, parsed: dict = None) -> str | None:
if parsed and parsed.get("section_xml"):
return parsed["section_xml"]
if isinstance(raw_xml, dict):
for name, content in raw_xml.items():
if "section" in name.lower() and isinstance(content, str):
return content
return raw_xml if isinstance(raw_xml, str) else None