v8:문서유형 분석등록 및 추출_20260206

2026-02-20 11:46:52 +09:00
parent db6532b33c
commit c3e9e29205
57 changed files with 22138 additions and 1421 deletions
--- a/handlers/tools/section.py
+++ b/handlers/tools/section.py
@@ -0,0 +1,120 @@
+# -*- coding: utf-8 -*-
+"""
+§9 구역 정의(Section) 추출
+
+HWPX 실제 태그 (section0.xml):
+  <hp:secPr id="" textDirection="HORIZONTAL" spaceColumns="1134"
+            tabStop="8000" tabStopVal="4000" tabStopUnit="HWPUNIT"
+            outlineShapeIDRef="1" ...>
+    <hp:grid lineGrid="0" charGrid="0" .../>
+    <hp:startNum pageStartsOn="BOTH" page="0" .../>
+    <hp:visibility hideFirstHeader="0" hideFirstFooter="0" .../>
+    <hp:pagePr landscape="WIDELY" width="59528" height="84188" ...>
+      <hp:margin header="4251" footer="4251" left="5669" right="5669"
+                 top="2834" bottom="2834"/>
+    </hp:pagePr>
+    <hp:pageNum pos="BOTTOM_CENTER" formatType="DIGIT" sideChar="-"/>
+  </hp:secPr>
+
+디폴트값 생성 안 함.
+"""
+
+import re
+
+
+def extract(raw_xml: dict, parsed: dict | None = None) -> dict | None:
+    """Extract section (§9) properties from the first ``<hp:secPr>`` element.
+
+    Only attributes that are actually present in the XML are reported; no
+    default values are synthesised, so any key shown below may be missing.
+
+    Args:
+        raw_xml: Source XML container, forwarded to ``_get_section_xml``.
+        parsed: Optional pre-parsed data, forwarded to ``_get_section_xml``.
+
+    Returns:
+        ``None`` when no section XML or no ``<hp:secPr>...</hp:secPr>`` pair
+        is found, or when nothing could be extracted. Otherwise a dict such
+        as::
+
+            {
+                "textDirection": "HORIZONTAL",
+                "hideFirstHeader": False,
+                "hideFirstFooter": False,
+                "startNum": {"pageStartsOn": "BOTH", "page": 0},
+                "pageNum": {"pos": "BOTTOM_CENTER", "formatType": "DIGIT",
+                            "sideChar": "-"},
+                "colDef": {"count": 2, "layout": "LEFT"},
+            }
+    """
+    section_xml = _get_section_xml(raw_xml, parsed)
+    if not section_xml:
+        return None
+
+    sec_match = re.search(
+        r'<hp:secPr\b([^>]*)>(.*?)</hp:secPr>',
+        section_xml, re.DOTALL
+    )
+    if not sec_match:
+        return None
+    sec_attrs, inner = sec_match.groups()
+
+    result = {}
+
+    # textDirection lives on the <hp:secPr> start tag itself.
+    text_direction = _attr(sec_attrs, "textDirection", r'[^"]+')
+    if text_direction is not None:
+        result["textDirection"] = text_direction
+
+    # visibility: numeric flags ("0"/"1") are converted to booleans.
+    vis_attrs = _element_attrs(inner, "visibility")
+    if vis_attrs is not None:
+        for name in ("hideFirstHeader", "hideFirstFooter",
+                     "hideFirstMasterPage", "hideFirstPageNum",
+                     "hideFirstEmptyLine"):
+            flag = _attr(vis_attrs, name, r'\d+')
+            if flag is not None:
+                result[name] = bool(int(flag))
+
+    # startNum
+    start_attrs = _element_attrs(inner, "startNum")
+    if start_attrs is not None:
+        start = {}
+        starts_on = _attr(start_attrs, "pageStartsOn", r'[^"]+')
+        if starts_on is not None:
+            start["pageStartsOn"] = starts_on
+        page = _attr(start_attrs, "page", r'\d+')
+        if page is not None:
+            start["page"] = int(page)
+        if start:
+            result["startNum"] = start
+
+    # pageNum: values are kept as strings and may be empty (sideChar="").
+    num_attrs = _element_attrs(inner, "pageNum")
+    if num_attrs is not None:
+        pagenum = {
+            name: value
+            for name in ("pos", "formatType", "sideChar")
+            if (value := _attr(num_attrs, name)) is not None
+        }
+        if pagenum:
+            result["pageNum"] = pagenum
+
+    # colDef (column layout): accepted both as a paired tag and self-closing.
+    col_attrs = _element_attrs(inner, "colDef")
+    if col_attrs is not None:
+        coldef = {}
+        count = _attr(col_attrs, "count", r'\d+')
+        if count is not None:
+            coldef["count"] = int(count)
+        layout = _attr(col_attrs, "layout", r'[^"]+')
+        if layout is not None:
+            coldef["layout"] = layout
+        if coldef:
+            result["colDef"] = coldef
+
+    return result or None
+
+
+def _element_attrs(xml: str, tag: str) -> str | None:
+    """Return the raw attribute text of the first ``<hp:tag ...>`` in *xml*.
+
+    The capture runs up to the closing ``>`` of the start tag, so attribute
+    values containing ``/`` (e.g. ``sideChar="/"``) do not break the match
+    and both ``<hp:x .../>`` and ``<hp:x ...>`` forms are accepted. A
+    trailing ``/`` of a self-closing tag may be part of the returned text.
+    Returns ``None`` when the element is absent.
+    """
+    found = re.search(rf'<hp:{tag}\b([^>]*)>', xml)
+    return found.group(1) if found else None
+
+
+def _attr(attrs: str, name: str, value_re: str = r'[^"]*') -> str | None:
+    """Return the value of attribute *name* in *attrs* if it matches *value_re*.
+
+    Returns ``None`` when the attribute is missing or its double-quoted value
+    does not match the given pattern.
+    """
+    found = re.search(rf'\b{name}="({value_re})"', attrs)
+    return found.group(1) if found else None
+
+
+def _get_section_xml(raw_xml: dict, parsed: dict = None) -> str | None:
+    """Pick the section XML text that ``extract`` should analyse.
+
+    Lookup order:
+      1. ``parsed["section_xml"]`` when ``parsed`` is given and that entry
+         is truthy.
+      2. When ``raw_xml`` is a dict: the first string value, in dict
+         iteration order, whose key contains "section" (case-insensitive).
+      3. ``raw_xml`` itself when it is a string.
+
+    Returns ``None`` if none of the above applies.
+    """
+    preparsed = parsed.get("section_xml") if parsed else None
+    if preparsed:
+        return preparsed
+
+    if not isinstance(raw_xml, dict):
+        # A bare string is taken to be the section XML itself.
+        return raw_xml if isinstance(raw_xml, str) else None
+
+    # The key test runs before the value-type test, as a non-string value
+    # under a matching key must be skipped rather than returned.
+    return next(
+        (text for key, text in raw_xml.items()
+         if "section" in key.lower() and isinstance(text, str)),
+        None,
+    )