📦 Initialize Geulbeot structure and merge Prompts & test projects

2026-03-05 11:32:29 +09:00
commit 555a954458
687 changed files with 205247 additions and 0 deletions
--- a/Code/geulbeot_5th/converters/init.py
+++ b/Code/geulbeot_5th/converters/init.py
--- a/Code/geulbeot_5th/converters/html_to_hwp.py
+++ b/Code/geulbeot_5th/converters/html_to_hwp.py
--- a/Code/geulbeot_5th/converters/html_to_hwp_briefing.py
+++ b/Code/geulbeot_5th/converters/html_to_hwp_briefing.py
@@ -0,0 +1,616 @@
+# -*- coding: utf-8 -*-
+"""
+HTML → HWP 변환기 (기획서 전용)
+
+✅ 머리말/꼬리말: 보고서 방식 적용 (페이지 번호 포함)
+✅ lead-box, section, data-table, strategy-grid, qa-grid, bottom-box 지원
+✅ process-container (단계별 프로세스) 지원
+✅ badge 스타일 텍스트 변환
+✅ Navy 색상 테마
+
+pip install pyhwpx beautifulsoup4
+"""
+
+from pyhwpx import Hwp
+from bs4 import BeautifulSoup
+import os
+
+
+class Config:
+    """페이지 설정"""
+    PAGE_WIDTH = 210
+    PAGE_HEIGHT = 297
+    MARGIN_LEFT = 20
+    MARGIN_RIGHT = 20
+    MARGIN_TOP = 20
+    MARGIN_BOTTOM = 15
+    HEADER_LEN = 10
+    FOOTER_LEN = 10
+    CONTENT_WIDTH = 170
+
+
+class HtmlToHwpConverter:
+    """HTML → HWP 변환기 (기획서 전용)"""
+    
+    def __init__(self, visible=True):
+        self.hwp = Hwp(visible=visible)
+        self.cfg = Config()
+        self.colors = {}
+        self.is_first_h1 = True
+    
+    # ─────────────────────────────────────────────────────────
+    # 초기화 및 유틸리티
+    # ─────────────────────────────────────────────────────────
+    
+    def _init_colors(self):
+        """색상 팔레트 초기화 (Navy 계열)"""
+        self.colors = {
+            'primary-navy': self.hwp.RGBColor(26, 54, 93),      # #1a365d
+            'secondary-navy': self.hwp.RGBColor(44, 82, 130),   # #2c5282
+            'accent-navy': self.hwp.RGBColor(49, 130, 206),     # #3182ce
+            'dark-gray': self.hwp.RGBColor(45, 55, 72),         # #2d3748
+            'medium-gray': self.hwp.RGBColor(74, 85, 104),      # #4a5568
+            'light-gray': self.hwp.RGBColor(226, 232, 240),     # #e2e8f0
+            'bg-light': self.hwp.RGBColor(247, 250, 252),       # #f7fafc
+            'border-color': self.hwp.RGBColor(203, 213, 224),   # #cbd5e0
+            'badge-safe': self.hwp.RGBColor(30, 111, 63),       # #1e6f3f
+            'badge-caution': self.hwp.RGBColor(154, 91, 19),    # #9a5b13
+            'badge-risk': self.hwp.RGBColor(161, 43, 43),       # #a12b2b
+            'white': self.hwp.RGBColor(255, 255, 255),
+            'black': self.hwp.RGBColor(0, 0, 0),
+        }
+    
+    def _mm(self, mm):
+        """밀리미터를 HWP 단위로 변환"""
+        return self.hwp.MiliToHwpUnit(mm)
+    
+    def _pt(self, pt):
+        """포인트를 HWP 단위로 변환"""
+        return self.hwp.PointToHwpUnit(pt)
+    
+    def _rgb(self, hex_color):
+        """HEX 색상을 RGB로 변환"""
+        c = hex_color.lstrip('#')
+        return self.hwp.RGBColor(int(c[0:2], 16), int(c[2:4], 16), int(c[4:6], 16)) if len(c) >= 6 else self.hwp.RGBColor(0, 0, 0)
+    
+    def _font(self, size=10, color='black', bold=False):
+        """폰트 설정 (색상 이름 사용)"""
+        self.hwp.set_font(
+            FaceName='맑은 고딕',
+            Height=size,
+            Bold=bold,
+            TextColor=self.colors.get(color, self.colors['black'])
+        )
+    
+    def _set_font(self, size=11, bold=False, hex_color='#000000'):
+        """폰트 설정 (HEX 색상 사용)"""
+        self.hwp.set_font(
+            FaceName='맑은 고딕',
+            Height=size,
+            Bold=bold,
+            TextColor=self._rgb(hex_color)
+        )
+    
+    def _align(self, align):
+        """정렬 설정"""
+        actions = {
+            'left': 'ParagraphShapeAlignLeft',
+            'center': 'ParagraphShapeAlignCenter',
+            'right': 'ParagraphShapeAlignRight',
+            'justify': 'ParagraphShapeAlignJustify',
+        }
+        if align in actions:
+            self.hwp.HAction.Run(actions[align])
+    
+    def _para(self, text='', size=10, color='black', bold=False, align='left'):
+        """문단 삽입"""
+        self._align(align)
+        self._font(size, color, bold)
+        if text:
+            self.hwp.insert_text(text)
+        self.hwp.BreakPara()
+    
+    def _exit_table(self):
+        """Leave table editing and continue on a fresh paragraph.
+
+        Runs the "Cancel" and "CloseEx" actions (presumably to drop any cell
+        selection and step out of the table -- confirm against the HWP action
+        reference), moves the caret to the end of the document and starts a
+        new paragraph there.
+        """
+        self.hwp.HAction.Run("Cancel")
+        self.hwp.HAction.Run("CloseEx")
+        self.hwp.HAction.Run("MoveDocEnd")
+        self.hwp.BreakPara()
+    
+    def _setup_page(self):
+        """Apply the margins and header/footer lengths from ``self.cfg``.
+
+        All values are converted from millimetres with ``_mm`` and applied
+        through the "PageSetup" action. Any failure is reported as a printed
+        warning and otherwise ignored (best effort).
+        """
+        try:
+            self.hwp.HAction.GetDefault("PageSetup", self.hwp.HParameterSet.HSecDef.HSet)
+            s = self.hwp.HParameterSet.HSecDef
+            s.PageDef.LeftMargin = self._mm(self.cfg.MARGIN_LEFT)
+            s.PageDef.RightMargin = self._mm(self.cfg.MARGIN_RIGHT)
+            s.PageDef.TopMargin = self._mm(self.cfg.MARGIN_TOP)
+            s.PageDef.BottomMargin = self._mm(self.cfg.MARGIN_BOTTOM)
+            s.PageDef.HeaderLen = self._mm(self.cfg.HEADER_LEN)
+            s.PageDef.FooterLen = self._mm(self.cfg.FOOTER_LEN)
+            self.hwp.HAction.Execute("PageSetup", s.HSet)
+            print(f"[설정] 여백: 좌우 {self.cfg.MARGIN_LEFT}mm, 상 {self.cfg.MARGIN_TOP}mm, 하 {self.cfg.MARGIN_BOTTOM}mm")
+        except Exception as e:
+            # Best effort: conversion continues with whatever page setup is active.
+            print(f"[경고] 페이지 설정 실패: {e}")
+    
+    # ─────────────────────────────────────────────────────────
+    # 머리말 / 꼬리말 (보고서 방식)
+    # ─────────────────────────────────────────────────────────
+    
+    def _create_header(self, right_text=""):
+        """Create the page header with right-aligned text.
+
+        Args:
+            right_text: Text to place in the header; when empty the header is
+                opened and closed without inserting anything.
+
+        Any failure is reported as a printed warning and otherwise ignored.
+        """
+        print(f"  → 머리말 생성: {right_text if right_text else '(초기화)'}")
+        try:
+            # Open the header for editing (HeaderFooterCtrlType 0; the footer code uses 1).
+            self.hwp.HAction.GetDefault("HeaderFooter", self.hwp.HParameterSet.HHeaderFooter.HSet)
+            self.hwp.HParameterSet.HHeaderFooter.HSet.SetItem("HeaderFooterStyle", 0)
+            self.hwp.HParameterSet.HHeaderFooter.HSet.SetItem("HeaderFooterCtrlType", 0)
+            self.hwp.HAction.Execute("HeaderFooter", self.hwp.HParameterSet.HHeaderFooter.HSet)
+            
+            # Right-aligned, 9pt, gray (#4a5568).
+            self.hwp.HAction.Run("ParagraphShapeAlignRight")
+            self._set_font(9, False, '#4a5568')
+            if right_text:
+                self.hwp.insert_text(right_text)
+            
+            # Leave header editing.
+            self.hwp.HAction.Run("CloseEx")
+        except Exception as e:
+            print(f"    [경고] 머리말: {e}")
+    
+    def _create_footer(self, left_text=""):
+        """꼬리말 생성 (좌측 텍스트 + 우측 페이지 번호)"""
+        print(f"  → 꼬리말: {left_text}")
+        
+        # 1. 꼬리말 열기
+        self.hwp.HAction.GetDefault("HeaderFooter", self.hwp.HParameterSet.HHeaderFooter.HSet)
+        self.hwp.HParameterSet.HHeaderFooter.HSet.SetItem("HeaderFooterStyle", 0)
+        self.hwp.HParameterSet.HHeaderFooter.HSet.SetItem("HeaderFooterCtrlType", 1)
+        self.hwp.HAction.Execute("HeaderFooter", self.hwp.HParameterSet.HHeaderFooter.HSet)
+        
+        # 2. 좌측 정렬 + 제목 8pt
+        self.hwp.HAction.Run("ParagraphShapeAlignLeft")
+        self._set_font(8, False, '#4a5568')
+        self.hwp.insert_text(left_text)
+        
+        # 3. 꼬리말 닫기
+        self.hwp.HAction.Run("CloseEx")
+        
+        # 4. 쪽번호 (우측 하단)
+        self.hwp.HAction.GetDefault("PageNumPos", self.hwp.HParameterSet.HPageNumPos.HSet)
+        self.hwp.HParameterSet.HPageNumPos.DrawPos = self.hwp.PageNumPosition("BottomRight")
+        self.hwp.HAction.Execute("PageNumPos", self.hwp.HParameterSet.HPageNumPos.HSet)
+    
+    def _new_section_with_header(self, header_text):
+        """Start a new section and give it its own right-aligned header text.
+
+        Args:
+            header_text: Text written into the new section's header.
+
+        Any failure is reported as a printed warning and otherwise ignored.
+        """
+        print(f"    → 새 구역 머리말: {header_text}")
+        try:
+            self.hwp.HAction.Run("BreakSection")
+            
+            # Open the header for editing (HeaderFooterCtrlType 0).
+            self.hwp.HAction.GetDefault("HeaderFooter", self.hwp.HParameterSet.HHeaderFooter.HSet)
+            self.hwp.HParameterSet.HHeaderFooter.HSet.SetItem("HeaderFooterStyle", 0)
+            self.hwp.HParameterSet.HHeaderFooter.HSet.SetItem("HeaderFooterCtrlType", 0)
+            self.hwp.HAction.Execute("HeaderFooter", self.hwp.HParameterSet.HHeaderFooter.HSet)
+            
+            # Presumably clears header content carried over from the previous
+            # section before writing the new text -- TODO confirm that
+            # SelectAll is scoped to the header while it is being edited.
+            self.hwp.HAction.Run("SelectAll")
+            self.hwp.HAction.Run("Delete")
+            
+            # Right-aligned, 9pt, gray (#4a5568).
+            self.hwp.HAction.Run("ParagraphShapeAlignRight")
+            self._set_font(9, False, '#4a5568')
+            self.hwp.insert_text(header_text)
+            
+            self.hwp.HAction.Run("CloseEx")
+        except Exception as e:
+            print(f"    [경고] 구역 머리말: {e}")
+    
+    # ─────────────────────────────────────────────────────────
+    # 셀 배경색 설정
+    # ─────────────────────────────────────────────────────────
+    
+    def _set_cell_bg(self, color_name):
+        """Fill the current table cell with a palette colour.
+
+        Args:
+            color_name: Key into ``self.colors``; unknown names fall back to
+                the ``'white'`` entry.
+        """
+        self.hwp.HAction.GetDefault("CellBorderFill", self.hwp.HParameterSet.HCellBorderFill.HSet)
+        pset = self.hwp.HParameterSet.HCellBorderFill
+        # Solid fill: no hatch pattern, face colour taken from the palette.
+        pset.FillAttr.type = self.hwp.BrushType("NullBrush|WinBrush")
+        pset.FillAttr.WinBrushFaceStyle = self.hwp.HatchStyle("None")
+        pset.FillAttr.WinBrushHatchColor = self.hwp.RGBColor(0, 0, 0)
+        pset.FillAttr.WinBrushFaceColor = self.colors.get(color_name, self.colors['white'])
+        pset.FillAttr.WindowsBrush = 1
+        self.hwp.HAction.Execute("CellBorderFill", pset.HSet)
+    
+    # ─────────────────────────────────────────────────────────
+    # HTML 요소 변환 (기획서 전용)
+    # ─────────────────────────────────────────────────────────
+    
+    def _convert_lead_box(self, elem):
+        """lead-box 변환 (핵심 기조 박스)"""
+        content = elem.find("div")
+        if not content:
+            return
+        
+        text = content.get_text(strip=True)
+        text = ' '.join(text.split())
+        print(f"  → lead-box")
+        
+        self.hwp.create_table(1, 1, treat_as_char=True)
+        self._set_cell_bg('bg-light')
+        self._font(11.5, 'dark-gray', False)
+        self.hwp.insert_text(text)
+        self._exit_table()
+    
+    def _convert_strategy_grid(self, elem):
+        """strategy-grid 변환 (2x2 전략 박스)"""
+        items = elem.find_all(class_="strategy-item")
+        if not items:
+            return
+        
+        print(f"  → strategy-grid: {len(items)} items")
+        
+        self.hwp.create_table(2, 2, treat_as_char=True)
+        
+        for i, item in enumerate(items[:4]):
+            if i > 0:
+                self.hwp.HAction.Run("MoveRight")
+            
+            self._set_cell_bg('bg-light')
+            
+            title = item.find(class_="strategy-title")
+            if title:
+                self._font(10, 'primary-navy', True)
+                self.hwp.insert_text(title.get_text(strip=True))
+                self.hwp.BreakPara()
+            
+            p = item.find("p")
+            if p:
+                self._font(9.5, 'dark-gray', False)
+                self.hwp.insert_text(p.get_text(strip=True))
+        
+        self._exit_table()
+    
+    def _convert_process_container(self, elem):
+        """process-container 변환 (단계별 프로세스)"""
+        steps = elem.find_all(class_="process-step")
+        if not steps:
+            return
+        
+        print(f"  → process-container: {len(steps)} steps")
+        
+        rows = len(steps)
+        self.hwp.create_table(rows, 2, treat_as_char=True)
+        
+        for i, step in enumerate(steps):
+            if i > 0:
+                self.hwp.HAction.Run("MoveRight")
+            
+            # 번호 셀
+            num = step.find(class_="step-num")
+            self._set_cell_bg('primary-navy')
+            self._font(10, 'white', True)
+            self._align('center')
+            if num:
+                self.hwp.insert_text(num.get_text(strip=True))
+            
+            self.hwp.HAction.Run("MoveRight")
+            
+            # 내용 셀
+            content = step.find(class_="step-content")
+            self._set_cell_bg('bg-light')
+            self._font(10.5, 'dark-gray', False)
+            self._align('left')
+            if content:
+                self.hwp.insert_text(content.get_text(strip=True))
+        
+        self._exit_table()
+    
+    def _convert_data_table(self, table):
+        """data-table 변환 (badge 포함)"""
+        data = []
+        
+        thead = table.find("thead")
+        if thead:
+            ths = thead.find_all("th")
+            data.append([th.get_text(strip=True) for th in ths])
+        
+        tbody = table.find("tbody")
+        if tbody:
+            for tr in tbody.find_all("tr"):
+                row = []
+                for td in tr.find_all("td"):
+                    badge = td.find(class_="badge")
+                    if badge:
+                        badge_class = ' '.join(badge.get('class', []))
+                        badge_text = badge.get_text(strip=True)
+                        if 'badge-safe' in badge_class:
+                            row.append(f"[✓ {badge_text}]")
+                        elif 'badge-caution' in badge_class:
+                            row.append(f"[△ {badge_text}]")
+                        elif 'badge-risk' in badge_class:
+                            row.append(f"[✗ {badge_text}]")
+                        else:
+                            row.append(f"[{badge_text}]")
+                    else:
+                        row.append(td.get_text(strip=True))
+                data.append(row)
+        
+        if not data:
+            return
+        
+        rows = len(data)
+        cols = len(data[0]) if data else 0
+        print(f"  → data-table: {rows}×{cols}")
+        
+        self.hwp.create_table(rows, cols, treat_as_char=True)
+        
+        for row_idx, row in enumerate(data):
+            for col_idx, cell_text in enumerate(row):
+                is_header = (row_idx == 0)
+                is_first_col = (col_idx == 0 and not is_header)
+                
+                is_safe = '[✓' in str(cell_text)
+                is_caution = '[△' in str(cell_text)
+                is_risk = '[✗' in str(cell_text)
+                
+                if is_header:
+                    self._set_cell_bg('primary-navy')
+                    self._font(9, 'white', True)
+                elif is_first_col:
+                    self._set_cell_bg('bg-light')
+                    self._font(9.5, 'primary-navy', True)
+                elif is_safe:
+                    self._font(9.5, 'badge-safe', True)
+                elif is_caution:
+                    self._font(9.5, 'badge-caution', True)
+                elif is_risk:
+                    self._font(9.5, 'badge-risk', True)
+                else:
+                    self._font(9.5, 'dark-gray', False)
+                
+                self._align('center')
+                self.hwp.insert_text(str(cell_text))
+                
+                if not (row_idx == rows - 1 and col_idx == cols - 1):
+                    self.hwp.HAction.Run("MoveRight")
+        
+        self._exit_table()
+    
+    def _convert_qa_grid(self, elem):
+        """qa-grid 변환 (Q&A 2단 박스)"""
+        items = elem.find_all(class_="qa-item")
+        if not items:
+            return
+        
+        print(f"  → qa-grid: {len(items)} items")
+        
+        self.hwp.create_table(1, 2, treat_as_char=True)
+        
+        for i, item in enumerate(items[:2]):
+            if i > 0:
+                self.hwp.HAction.Run("MoveRight")
+            
+            self._set_cell_bg('bg-light')
+            
+            text = item.get_text(strip=True)
+            strong = item.find("strong")
+            if strong:
+                q_text = strong.get_text(strip=True)
+                a_text = text.replace(q_text, '').strip()
+                
+                self._font(9.5, 'primary-navy', True)
+                self.hwp.insert_text(q_text)
+                self.hwp.BreakPara()
+                self._font(9.5, 'dark-gray', False)
+                self.hwp.insert_text(a_text)
+            else:
+                self._font(9.5, 'dark-gray', False)
+                self.hwp.insert_text(text)
+        
+        self._exit_table()
+    
+    def _convert_bottom_box(self, elem):
+        """bottom-box 변환 (핵심 결론 박스)"""
+        left = elem.find(class_="bottom-left")
+        right = elem.find(class_="bottom-right")
+        
+        if not left or not right:
+            return
+        
+        left_text = ' '.join(left.get_text().split())
+        right_text = right.get_text(strip=True)
+        print(f"  → bottom-box")
+        
+        self.hwp.create_table(1, 2, treat_as_char=True)
+        
+        # 좌측 (Navy 배경)
+        self._set_cell_bg('primary-navy')
+        self._font(10.5, 'white', True)
+        self._align('center')
+        self.hwp.insert_text(left_text)
+        
+        self.hwp.HAction.Run("MoveRight")
+        
+        # 우측 (연한 배경)
+        self._set_cell_bg('bg-light')
+        self._font(10.5, 'primary-navy', True)
+        self._align('center')
+        self.hwp.insert_text(right_text)
+        
+        self._exit_table()
+    
+    def _convert_section(self, section):
+        """Render one ``section`` element.
+
+        Emits, in this fixed order and only when present: the section title,
+        the first strategy grid, the first process container, the first
+        ``data-table``, the items of the first ``<ul>``, and the first Q&A
+        grid. A blank paragraph is always appended at the end.
+        """
+        title = section.find(class_="section-title")
+        if title:
+            self._para("■ " + title.get_text(strip=True), 12, 'primary-navy', True)
+        
+        strategy_grid = section.find(class_="strategy-grid")
+        if strategy_grid:
+            self._convert_strategy_grid(strategy_grid)
+        
+        process = section.find(class_="process-container")
+        if process:
+            self._convert_process_container(process)
+        
+        table = section.find("table", class_="data-table")
+        if table:
+            self._convert_data_table(table)
+        
+        ul = section.find("ul")
+        if ul:
+            # Only direct <li> children; nested lists are not walked separately.
+            for li in ul.find_all("li", recursive=False):
+                keyword = li.find(class_="keyword")
+                if keyword:
+                    # Bullet with a bold navy keyword followed by the rest of
+                    # the item text (first occurrence of the keyword removed).
+                    kw_text = keyword.get_text(strip=True)
+                    full = li.get_text(strip=True)
+                    rest = full.replace(kw_text, '', 1).strip()
+                    
+                    self._font(10.5, 'primary-navy', True)
+                    self.hwp.insert_text("  • " + kw_text + " ")
+                    self._font(10.5, 'dark-gray', False)
+                    self.hwp.insert_text(rest)
+                    self.hwp.BreakPara()
+                else:
+                    self._para("  • " + li.get_text(strip=True), 10.5, 'dark-gray')
+        
+        qa_grid = section.find(class_="qa-grid")
+        if qa_grid:
+            self._convert_qa_grid(qa_grid)
+        
+        # Blank spacer paragraph after every section.
+        self._para()
+    
+    def _convert_sheet(self, sheet, is_first_page=False, footer_title=""):
+        """한 페이지(sheet) 변환"""
+        
+        # 첫 페이지에서만 머리말/꼬리말 설정
+        if is_first_page:
+            # 머리말: page-header에서 텍스트 추출
+            header = sheet.find(class_="page-header")
+            if header:
+                left = header.find(class_="header-left")
+                right = header.find(class_="header-right")
+                # 우측 텍스트 사용 (부서명 등)
+                header_text = right.get_text(strip=True) if right else ""
+                if header_text:
+                    self._create_header(header_text)
+            
+            # 꼬리말: 제목 + 페이지번호
+            self._create_footer(footer_title)
+        
+        # 대제목
+        title = sheet.find(class_="header-title")
+        if title:
+            title_text = title.get_text(strip=True)
+            if '[첨부]' in title_text:
+                self._para(title_text, 15, 'primary-navy', True, 'left')
+                self._font(10, 'secondary-navy', False)
+                self._align('left')
+                self.hwp.insert_text("─" * 60)
+                self.hwp.BreakPara()
+            else:
+                self._para(title_text, 23, 'primary-navy', True, 'center')
+                self._font(10, 'secondary-navy', False)
+                self._align('center')
+                self.hwp.insert_text("━" * 45)
+                self.hwp.BreakPara()
+        
+        self._para()
+        
+        # 리드 박스
+        lead_box = sheet.find(class_="lead-box")
+        if lead_box:
+            self._convert_lead_box(lead_box)
+            self._para()
+        
+        # 섹션들
+        for section in sheet.find_all(class_="section"):
+            self._convert_section(section)
+        
+        # 하단 박스
+        bottom_box = sheet.find(class_="bottom-box")
+        if bottom_box:
+            self._para()
+            self._convert_bottom_box(bottom_box)
+    
+    # ─────────────────────────────────────────────────────────
+    # 메인 변환 함수
+    # ─────────────────────────────────────────────────────────
+    
+    def convert(self, html_path, output_path):
+        """HTML → HWP 변환 실행"""
+        
+        print("=" * 60)
+        print("HTML → HWP 변환기 (기획서 전용)")
+        print("  ✓ 머리말/꼬리말: 보고서 방식")
+        print("  ✓ Navy 테마, 기획서 요소")
+        print("=" * 60)
+        
+        print(f"\n[입력] {html_path}")
+        
+        with open(html_path, 'r', encoding='utf-8') as f:
+            soup = BeautifulSoup(f.read(), 'html.parser')
+        
+        # 제목 추출 (꼬리말용)
+        title_tag = soup.find('title')
+        if title_tag:
+            full_title = title_tag.get_text(strip=True)
+            footer_title = full_title.split(':')[0].strip()
+        else:
+            footer_title = ""
+        
+        self.hwp.FileNew()
+        self._init_colors()
+        self._setup_page()
+        
+        # 페이지별 변환
+        sheets = soup.find_all(class_="sheet")
+        total = len(sheets)
+        print(f"[변환] 총 {total} 페이지\n")
+        
+        for i, sheet in enumerate(sheets, 1):
+            print(f"[{i}/{total}] 페이지 처리 중...")
+            self._convert_sheet(sheet, is_first_page=(i == 1), footer_title=footer_title)
+            
+            if i < total:
+                self.hwp.HAction.Run("BreakPage")
+        
+        # 저장
+        self.hwp.SaveAs(output_path)
+        print(f"\n✅ 저장 완료: {output_path}")
+    
+    def close(self):
+        """HWP 종료"""
+        try:
+            self.hwp.Quit()
+        except:
+            pass
+
+
+def main():
+    """메인 실행"""
+    
+    html_path = r"D:\for python\geulbeot-light\geulbeot-light\output\briefing.html"
+    output_path = r"D:\for python\geulbeot-light\geulbeot-light\output\briefing.hwp"
+    
+    print("=" * 60)
+    print("HTML → HWP 변환기 (기획서)")
+    print("=" * 60)
+    print()
+    
+    try:
+        converter = HtmlToHwpConverter(visible=True)
+        converter.convert(html_path, output_path)
+        
+        print("\n" + "=" * 60)
+        print("✅ 변환 완료!")
+        print("=" * 60)
+        
+        input("\nEnter를 누르면 HWP가 닫힙니다...")
+        converter.close()
+        
+    except FileNotFoundError:
+        print(f"\n[에러] 파일을 찾을 수 없습니다: {html_path}")
+        print("경로를 확인해주세요.")
+    except Exception as e:
+        print(f"\n[에러] {e}")
+        import traceback
+        traceback.print_exc()
+
+
+if __name__ == "__main__":
+    main()
--- a/Code/geulbeot_5th/converters/hwp_style_mapping.py
+++ b/Code/geulbeot_5th/converters/hwp_style_mapping.py
@@ -0,0 +1,434 @@
+# -*- coding: utf-8 -*-
+"""
+HWP 스타일 매핑 모듈 v2.0
+HTML 역할(Role) → HWP 스타일 매핑
+
+✅ v2.0 변경사항:
+- pyhwpx API에 맞게 apply_to_hwp() 재작성
+- CharShape/ParaShape 직접 설정 방식
+- 역할 → 개요 스타일 매핑
+"""
+
+from dataclasses import dataclass
+from typing import Dict, Optional
+from enum import Enum
+
+
+class HwpStyleType(Enum):
+    """Kind of HWP style: paragraph-level or character-level."""
+    PARAGRAPH = "paragraph"
+    CHARACTER = "character"  # not used by any style defined in the visible code
+
+
+@dataclass
+class HwpStyle:
+    """Definition of one HWP style (font, alignment and spacing settings)."""
+    id: int                      # numeric style identifier
+    name: str                    # HWP style name, e.g. "개요 1"
+    type: HwpStyleType
+    font_size: float             # converted with PointToHwpUnit in _create_or_update_style
+    font_bold: bool = False
+    font_color: str = "000000"   # RRGGBB hex; a leading '#' is stripped by the consumers
+    align: str = "justify"       # 'left' | 'center' | 'right' | 'justify'
+    line_spacing: float = 160    # presumably a percentage -- TODO confirm
+    space_before: float = 0      # converted with PointToHwpUnit in _create_or_update_style
+    space_after: float = 0       # converted with PointToHwpUnit in _create_or_update_style
+    indent_left: float = 0       # unit not established by the visible code
+    indent_first: float = 0      # unit not established by the visible code
+    bg_color: Optional[str] = None  # RRGGBB hex background, or None for no background
+
+
+# =============================================================================
+# 기본 스타일 템플릿
+# =============================================================================
+# Built-in role -> HwpStyle table. HwpStyleMapper copies it as its starting
+# point and HwpStyGenerator uses it as the base for HTML-derived overrides.
+DEFAULT_STYLES: Dict[str, HwpStyle] = {
+    # Cover page
+    "COVER_TITLE": HwpStyle(
+        id=100, name="표지제목", type=HwpStyleType.PARAGRAPH,
+        font_size=32, font_bold=True, align="center",
+        space_before=20, space_after=10, font_color="1a365d"
+    ),
+    "COVER_SUBTITLE": HwpStyle(
+        id=101, name="표지부제", type=HwpStyleType.PARAGRAPH,
+        font_size=18, font_bold=False, align="center",
+        font_color="555555"
+    ),
+    "COVER_INFO": HwpStyle(
+        id=102, name="표지정보", type=HwpStyleType.PARAGRAPH,
+        font_size=12, align="center", font_color="666666"
+    ),
+    
+    # Table of contents
+    "TOC_H1": HwpStyle(
+        id=110, name="목차1수준", type=HwpStyleType.PARAGRAPH,
+        font_size=12, font_bold=True, indent_left=0
+    ),
+    "TOC_H2": HwpStyle(
+        id=111, name="목차2수준", type=HwpStyleType.PARAGRAPH,
+        font_size=11, indent_left=20
+    ),
+    "TOC_H3": HwpStyle(
+        id=112, name="목차3수준", type=HwpStyleType.PARAGRAPH,
+        font_size=10, indent_left=40, font_color="666666"
+    ),
+    
+    # Heading hierarchy (mapped to the styles named "개요 1" .. "개요 7")
+    "H1": HwpStyle(
+        id=1, name="개요 1", type=HwpStyleType.PARAGRAPH,
+        font_size=20, font_bold=True, align="left",
+        space_before=30, space_after=15, font_color="1a365d"
+    ),
+    "H2": HwpStyle(
+        id=2, name="개요 2", type=HwpStyleType.PARAGRAPH,
+        font_size=16, font_bold=True, align="left",
+        space_before=20, space_after=10, font_color="2c5282"
+    ),
+    "H3": HwpStyle(
+        id=3, name="개요 3", type=HwpStyleType.PARAGRAPH,
+        font_size=14, font_bold=True, align="left",
+        space_before=15, space_after=8, font_color="2b6cb0"
+    ),
+    "H4": HwpStyle(
+        id=4, name="개요 4", type=HwpStyleType.PARAGRAPH,
+        font_size=12, font_bold=True, align="left",
+        space_before=10, space_after=5, indent_left=10
+    ),
+    "H5": HwpStyle(
+        id=5, name="개요 5", type=HwpStyleType.PARAGRAPH,
+        font_size=11, font_bold=True, align="left",
+        space_before=8, space_after=4, indent_left=20
+    ),
+    "H6": HwpStyle(
+        id=6, name="개요 6", type=HwpStyleType.PARAGRAPH,
+        font_size=11, font_bold=False, align="left",
+        indent_left=30
+    ),
+    "H7": HwpStyle(
+        id=7, name="개요 7", type=HwpStyleType.PARAGRAPH,
+        font_size=10.5, font_bold=False, align="left",
+        indent_left=40
+    ),
+    
+    # Body text
+    "BODY": HwpStyle(
+        id=20, name="바탕글", type=HwpStyleType.PARAGRAPH,
+        font_size=11, align="justify",
+        line_spacing=180, indent_first=10
+    ),
+    "LIST_ITEM": HwpStyle(
+        id=8, name="개요 8", type=HwpStyleType.PARAGRAPH,
+        font_size=11, align="left",
+        indent_left=15, line_spacing=160
+    ),
+    "HIGHLIGHT_BOX": HwpStyle(
+        id=21, name="강조박스", type=HwpStyleType.PARAGRAPH,
+        font_size=10.5, align="left",
+        bg_color="f7fafc", indent_left=10, indent_first=0
+    ),
+    
+    # Tables
+    "TABLE": HwpStyle(
+        id=30, name="표", type=HwpStyleType.PARAGRAPH,
+        font_size=10, align="center"
+    ),
+    "TH": HwpStyle(
+        id=11, name="표제목", type=HwpStyleType.PARAGRAPH,
+        font_size=10, font_bold=True, align="center",
+        bg_color="e2e8f0"
+    ),
+    "TD": HwpStyle(
+        id=31, name="표내용", type=HwpStyleType.PARAGRAPH,
+        font_size=10, align="left"
+    ),
+    "TABLE_CAPTION": HwpStyle(
+        id=19, name="표캡션", type=HwpStyleType.PARAGRAPH,
+        font_size=10, font_bold=True, align="center",
+        space_before=5, space_after=3
+    ),
+    
+    # Figures
+    "FIGURE": HwpStyle(
+        id=32, name="그림", type=HwpStyleType.PARAGRAPH,
+        font_size=10, align="center"
+    ),
+    "FIGURE_CAPTION": HwpStyle(
+        id=18, name="그림캡션", type=HwpStyleType.PARAGRAPH,
+        font_size=9.5, align="center",
+        font_color="666666", space_before=5
+    ),
+    
+    # Fallback for roles that have no entry of their own
+    "UNKNOWN": HwpStyle(
+        id=0, name="바탕글", type=HwpStyleType.PARAGRAPH,
+        font_size=10, align="left"
+    ),
+}
+
+# Role -> outline number mapping (intended for StyleShortcut).
+# Heading roles H1..H7 map to 1..7 and LIST_ITEM to 8; body and cover
+# roles map to 0.
+ROLE_TO_OUTLINE_NUM = {
+    "H1": 1,
+    "H2": 2,
+    "H3": 3,
+    "H4": 4,
+    "H5": 5,
+    "H6": 6,
+    "H7": 7,
+    "LIST_ITEM": 8,
+    "BODY": 0,  # 0 stands for the "바탕글" style
+    "COVER_TITLE": 0,
+    "COVER_SUBTITLE": 0,
+    "COVER_INFO": 0,
+}
+
+# Role -> HWP style name mapping.
+# NOTE(review): not every DEFAULT_STYLES role is listed (e.g. COVER_INFO,
+# TOC_H1..TOC_H3, TABLE, FIGURE, HIGHLIGHT_BOX) -- confirm whether that is
+# intentional.
+ROLE_TO_STYLE_NAME = {
+    "H1": "개요 1",
+    "H2": "개요 2",
+    "H3": "개요 3",
+    "H4": "개요 4",
+    "H5": "개요 5",
+    "H6": "개요 6",
+    "H7": "개요 7",
+    "LIST_ITEM": "개요 8",
+    "BODY": "바탕글",
+    "COVER_TITLE": "표지제목",
+    "COVER_SUBTITLE": "표지부제",
+    "TH": "표제목",
+    "TD": "표내용",
+    "TABLE_CAPTION": "표캡션",
+    "FIGURE_CAPTION": "그림캡션",
+    "UNKNOWN": "바탕글",
+}
+
+
+class HwpStyleMapper:
+    """HTML 역할 → HWP 스타일 매퍼"""
+    
+    def __init__(self, custom_styles: Optional[Dict[str, HwpStyle]] = None):
+        self.styles = DEFAULT_STYLES.copy()
+        if custom_styles:
+            self.styles.update(custom_styles)
+    
+    def get_style(self, role: str) -> HwpStyle:
+        return self.styles.get(role, self.styles["UNKNOWN"])
+    
+    def get_style_id(self, role: str) -> int:
+        return self.get_style(role).id
+    
+    def get_all_styles(self) -> Dict[str, HwpStyle]:
+        return self.styles
+
+
+class HwpStyGenerator:
+    """
+    HTML 스타일 → HWP 스타일 적용기
+    
+    pyhwpx API를 사용하여:
+    1. 역할별 스타일 정보 저장
+    2. 텍스트 삽입 시 CharShape/ParaShape 직접 적용
+    3. 개요 스타일 번호 매핑 반환
+    """
+    
+    def __init__(self):
+        self.styles: Dict[str, HwpStyle] = {}
+        self.hwp = None
+    
+    def update_from_html(self, html_styles: Dict[str, Dict]):
+        """HTML에서 추출한 스타일로 업데이트"""
+        for role, style_dict in html_styles.items():
+            if role in DEFAULT_STYLES:
+                base = DEFAULT_STYLES[role]
+                
+                # color 처리 - # 제거
+                color = style_dict.get('color', base.font_color)
+                if isinstance(color, str):
+                    color = color.lstrip('#')
+                
+                self.styles[role] = HwpStyle(
+                    id=base.id,
+                    name=base.name,
+                    type=base.type,
+                    font_size=style_dict.get('font_size', base.font_size),
+                    font_bold=style_dict.get('bold', base.font_bold),
+                    font_color=color,
+                    align=style_dict.get('align', base.align),
+                    line_spacing=style_dict.get('line_spacing', base.line_spacing),
+                    space_before=style_dict.get('space_before', base.space_before),
+                    space_after=style_dict.get('space_after', base.space_after),
+                    indent_left=style_dict.get('indent_left', base.indent_left),
+                    indent_first=style_dict.get('indent_first', base.indent_first),
+                    bg_color=style_dict.get('bg_color', base.bg_color),
+                )
+            else:
+                # 기본 스타일 사용
+                self.styles[role] = DEFAULT_STYLES.get('UNKNOWN')
+        
+        # 누락된 역할은 기본값으로 채움
+        for role in DEFAULT_STYLES:
+            if role not in self.styles:
+                self.styles[role] = DEFAULT_STYLES[role]
+    
+    def apply_to_hwp(self, hwp) -> Dict[str, HwpStyle]:
+        """역할 → HwpStyle 매핑 반환"""
+        self.hwp = hwp
+        
+        # 🚫 스타일 생성 비활성화 (API 문제)
+        # for role, style in self.styles.items():
+        #     self._create_or_update_style(hwp, role, style)
+        
+        if not self.styles:
+            self.styles = DEFAULT_STYLES.copy()
+        
+        print(f"  ✅ 스타일 매핑 완료: {len(self.styles)}개")
+        return self.styles
+
+    def _create_or_update_style(self, hwp, role: str, style: HwpStyle):
+        """Create or modify one style inside HWP through the "ModifyStyle" action.
+
+        The only visible call site (in ``apply_to_hwp``) is commented out.
+        ``role`` is accepted but not used in the body. Any failure is
+        reported as a printed warning and otherwise ignored.
+
+        Args:
+            hwp: The HWP automation object.
+            role: Role name the style belongs to (unused here).
+            style: Style definition to push into HWP.
+        """
+        try:
+            # 1. Enter style editing for the style with this name.
+            hwp.HAction.GetDefault("ModifyStyle", hwp.HParameterSet.HStyle.HSet)
+            hwp.HParameterSet.HStyle.StyleName = style.name
+            
+            # 2. Character shape: anything but a 6-digit hex colour becomes black.
+            color_hex = style.font_color.lstrip('#')
+            if len(color_hex) == 6:
+                r, g, b = int(color_hex[0:2], 16), int(color_hex[2:4], 16), int(color_hex[4:6], 16)
+                text_color = hwp.RGBColor(r, g, b)
+            else:
+                text_color = hwp.RGBColor(0, 0, 0)
+            
+            hwp.HParameterSet.HStyle.CharShape.Height = hwp.PointToHwpUnit(style.font_size)
+            hwp.HParameterSet.HStyle.CharShape.Bold = style.font_bold
+            hwp.HParameterSet.HStyle.CharShape.TextColor = text_color
+            
+            # 3. Paragraph shape: unknown alignment names fall back to 3 (the 'justify' code).
+            align_map = {'left': 0, 'center': 1, 'right': 2, 'justify': 3}
+            hwp.HParameterSet.HStyle.ParaShape.Align = align_map.get(style.align, 3)
+            hwp.HParameterSet.HStyle.ParaShape.LineSpacing = int(style.line_spacing)
+            hwp.HParameterSet.HStyle.ParaShape.SpaceBeforePara = hwp.PointToHwpUnit(style.space_before)
+            hwp.HParameterSet.HStyle.ParaShape.SpaceAfterPara = hwp.PointToHwpUnit(style.space_after)
+            
+            # 4. Execute the action.
+            hwp.HAction.Execute("ModifyStyle", hwp.HParameterSet.HStyle.HSet)
+            print(f"    ✓ 스타일 '{style.name}' 정의됨")
+            
+        except Exception as e:
+            print(f"    [경고] 스타일 '{style.name}' 생성 실패: {e}")
+        
+    def get_style(self, role: str) -> HwpStyle:
+        """Look up the style mapped to ``role``.
+
+        Returns:
+            The entry of ``self.styles`` for ``role``, or the ``'UNKNOWN'``
+            entry of ``DEFAULT_STYLES`` when the role has no mapping.
+        """
+        fallback = DEFAULT_STYLES.get('UNKNOWN')
+        return self.styles.get(role, fallback)
+    
+    def apply_char_shape(self, hwp, role: str):
+        """Apply the role's character shape through the HWP ``CharShape`` action.
+
+        Sets height, bold and text colour from the style returned by
+        ``get_style(role)``. Failures are printed as a warning and swallowed.
+
+        Args:
+            hwp: HWP automation object.
+            role: Role key whose style should be applied.
+        """
+        style = self.get_style(role)
+
+        try:
+            # Hex colour -> HWP colour; a missing colour counts as black and
+            # anything that is not exactly six digits is rendered black too
+            hex_digits = style.font_color.lstrip('#') if style.font_color else '000000'
+            if len(hex_digits) != 6:
+                text_color = hwp.RGBColor(0, 0, 0)
+            else:
+                red, green, blue = (int(hex_digits[i:i + 2], 16) for i in (0, 2, 4))
+                text_color = hwp.RGBColor(red, green, blue)
+
+            # Fill the parameter set, then run the action
+            char_shape = hwp.HParameterSet.HCharShape
+            hwp.HAction.GetDefault("CharShape", char_shape.HSet)
+            char_shape.Height = hwp.PointToHwpUnit(style.font_size)
+            char_shape.Bold = style.font_bold
+            char_shape.TextColor = text_color
+            hwp.HAction.Execute("CharShape", char_shape.HSet)
+
+        except Exception as e:
+            print(f"    [경고] 글자 모양 적용 실패 ({role}): {e}")
+    
+    def apply_para_shape(self, hwp, role: str):
+        """Apply the role's paragraph shape through HWP actions.
+
+        Runs the matching alignment action (when the style's alignment is one
+        of left/center/right/justify), then sets line spacing, margins and
+        paragraph spacing via the ``ParagraphShape`` action. Failures are
+        printed as a warning and swallowed.
+
+        Args:
+            hwp: HWP automation object.
+            role: Role key whose style should be applied.
+        """
+        style = self.get_style(role)
+
+        try:
+            # Alignment is applied with a dedicated action per direction
+            align_action = {
+                'left': "ParagraphShapeAlignLeft",
+                'center': "ParagraphShapeAlignCenter",
+                'right': "ParagraphShapeAlignRight",
+                'justify': "ParagraphShapeAlignJustify"
+            }.get(style.align)
+            if align_action is not None:
+                hwp.HAction.Run(align_action)
+
+            # Detailed paragraph settings
+            para_shape = hwp.HParameterSet.HParaShape
+            hwp.HAction.GetDefault("ParagraphShape", para_shape.HSet)
+            para_shape.LineSpaceType = 0  # percent
+            para_shape.LineSpacing = int(style.line_spacing)
+            # Indents go through MiliToHwpUnit, spacing through PointToHwpUnit
+            para_shape.LeftMargin = hwp.MiliToHwpUnit(style.indent_left)
+            para_shape.IndentMargin = hwp.MiliToHwpUnit(style.indent_first)
+            para_shape.SpaceBeforePara = hwp.PointToHwpUnit(style.space_before)
+            para_shape.SpaceAfterPara = hwp.PointToHwpUnit(style.space_after)
+            hwp.HAction.Execute("ParagraphShape", para_shape.HSet)
+
+        except Exception as e:
+            print(f"    [경고] 문단 모양 적용 실패 ({role}): {e}")
+    
+    def apply_style(self, hwp, role: str):
+        """Apply the complete style for ``role``: character shape, then paragraph shape."""
+        for apply_part in (self.apply_char_shape, self.apply_para_shape):
+            apply_part(hwp, role)
+    
+    def export_sty(self, hwp, output_path: str) -> bool:
+        """Placeholder for exporting a .sty style file (not supported yet).
+
+        Prints a notice and reports failure; ``hwp`` and ``output_path`` are
+        accepted but ignored.
+
+        Returns:
+            Always ``False``.
+        """
+        notice = "  [알림] .sty 내보내기는 현재 미지원"
+        print(notice)
+        return False
+
+
+# =============================================================================
+# 번호 제거 유틸리티
+# =============================================================================
+import re
+
+# Leading numbering/bullet pattern per role. Each pattern is anchored at the
+# start of the text and also consumes the whitespace that follows the marker.
+NUMBERING_PATTERNS = {
+    'H1': re.compile(r'^(\d+)\.\s*'),                # "1. "
+    'H2': re.compile(r'^(\d+)\.(\d+)\s*'),           # "1.1 "
+    'H3': re.compile(r'^(\d+)\.(\d+)\.(\d+)\s*'),    # "1.1.1 "
+    # Only the fourteen syllables used as outline markers. The former class
+    # [가-하] was a code-point range covering thousands of unrelated Hangul
+    # syllables, so a heading such as "끝. ..." lost its first word.
+    'H4': re.compile(r'^[가나다라마바사아자차카타파하]\.\s*'),  # "가. "
+    'H5': re.compile(r'^(\d+)\)\s*'),                # "1) "
+    'H6': re.compile(r'^\((\d+)\)\s*'),              # "(1) "
+    'H7': re.compile(r'^[①②③④⑤⑥⑦⑧⑨⑩]\s*'),        # "① "
+    'LIST_ITEM': re.compile(r'^[•\-○]\s*'),          # "• "
+}
+
+def strip_numbering(text: str, role: str) -> str:
+    """Remove the leading number or bullet that belongs to ``role``.
+
+    HWP's outline feature generates the numbering itself, so the marker
+    already present in the source text would otherwise appear twice.
+
+    Args:
+        text: Heading or list-item text; may be empty or ``None``.
+        role: Role key selecting the pattern in ``NUMBERING_PATTERNS``.
+
+    Returns:
+        The text without surrounding whitespace and without its leading
+        marker. Falsy input is returned unchanged; roles without a pattern
+        are only stripped of whitespace.
+    """
+    if not text:
+        return text
+
+    # Strip first: the patterns are anchored with ^, so leading whitespace
+    # (common in text extracted from HTML) used to prevent any match.
+    stripped = text.strip()
+
+    pattern = NUMBERING_PATTERNS.get(role)
+    if pattern is None:
+        return stripped
+
+    return pattern.sub('', stripped, count=1).strip()
+
+
+if __name__ == "__main__":
+    # Manual smoke test: map a few HTML-derived styles and print the result
+    print("=== 스타일 매핑 테스트 ===")
+
+    generator = HwpStyGenerator()
+
+    # Stand-in for the styles an HTML analysis step would produce
+    sample_html_styles = dict(
+        H1=dict(font_size=20, color='#1a365d', bold=True),
+        H2=dict(font_size=16, color='#2c5282', bold=True),
+        BODY=dict(font_size=11, align='justify'),
+    )
+    generator.update_from_html(sample_html_styles)
+
+    for role, style in generator.styles.items():
+        summary = f"{role:15} → size={style.font_size}pt, bold={style.font_bold}, color=#{style.font_color}"
+        print(summary)
--- a/Code/geulbeot_5th/converters/hwpx_generator.py
+++ b/Code/geulbeot_5th/converters/hwpx_generator.py
@@ -0,0 +1,431 @@
+"""
+HWPX 파일 생성기
+StyleAnalyzer 결과를 받아 스타일이 적용된 HWPX 파일 생성
+"""
+
+import os
+import zipfile
+import xml.etree.ElementTree as ET
+from typing import List, Dict, Optional
+from dataclasses import dataclass
+from pathlib import Path
+
+from style_analyzer import StyleAnalyzer, StyledElement
+from hwp_style_mapping import HwpStyleMapper, HwpStyle, ROLE_TO_STYLE_NAME
+
+
+@dataclass
+class HwpxConfig:
+    """Settings for HWPX generation (page geometry and default font)."""
+    paper_width: int = 59528      # A4 width (hwpunit, 1/7200 inch)
+    paper_height: int = 84188     # A4 height
+    # Page margins; presumably hwpunit like the paper size - TODO confirm
+    margin_left: int = 8504
+    margin_right: int = 8504
+    margin_top: int = 5668
+    margin_bottom: int = 4252
+    default_font: str = "함초롬바탕"
+    default_font_size: int = 1000  # 10pt (hwpunit)
+
+
+class HwpxGenerator:
+    """HWPX 파일 생성기"""
+    
+    def __init__(self, config: Optional[HwpxConfig] = None):
+        """Set up the generator.
+
+        Args:
+            config: Generation settings; a default ``HwpxConfig`` is used
+                when omitted.
+        """
+        if not config:
+            config = HwpxConfig()
+        self.config = config
+        self.mapper = HwpStyleMapper()
+        # Roles seen in the most recent generate() call
+        self.used_styles: set = set()
+    
+    def generate(self, elements: List[StyledElement], output_path: str) -> str:
+        """Build an HWPX file from a list of classified elements.
+
+        The package parts are written into a private staging directory and
+        then zipped to ``output_path``. The staging directory is removed
+        afterwards, whether or not generation succeeded.
+
+        Args:
+            elements: Elements classified by StyleAnalyzer.
+            output_path: Destination file path (.hwpx).
+
+        Returns:
+            ``output_path``.
+        """
+        import tempfile  # local import, like the local `shutil` import it replaces
+
+        # Collect the roles in use; the header defines one style per role
+        self.used_styles = {e.role for e in elements}
+
+        # Stage in a fresh, uniquely named directory. The previous
+        # "<output>.temp" sibling was created with exist_ok=True and then
+        # rmtree'd, so an unrelated directory of that name was packed into
+        # the archive and deleted.
+        with tempfile.TemporaryDirectory(prefix="hwpx_") as staging:
+            temp_dir = Path(staging)
+
+            # Package parts
+            self._create_mimetype(temp_dir)
+            self._create_meta_inf(temp_dir)
+            self._create_version(temp_dir)
+            self._create_header(temp_dir)
+            self._create_content(temp_dir, elements)
+            self._create_settings(temp_dir)
+
+            # Zip everything into the final file
+            self._create_hwpx(temp_dir, output_path)
+
+        return output_path
+    
+    def _create_mimetype(self, temp_dir: Path):
+        """Write the ``mimetype`` file into the staging directory."""
+        (temp_dir / "mimetype").write_text("application/hwp+zip")
+    
+    def _create_meta_inf(self, temp_dir: Path):
+        """Write ``META-INF/manifest.xml`` listing the parts of the package.
+
+        Creates the ``META-INF`` directory under ``temp_dir`` if needed.
+        """
+        manifest_xml = """<?xml version="1.0" encoding="UTF-8"?>
+<manifest:manifest xmlns:manifest="urn:oasis:names:tc:opendocument:xmlns:manifest:1.0">
+    <manifest:file-entry manifest:full-path="/" manifest:media-type="application/hwp+zip"/>
+    <manifest:file-entry manifest:full-path="version.xml" manifest:media-type="application/xml"/>
+    <manifest:file-entry manifest:full-path="Contents/header.xml" manifest:media-type="application/xml"/>
+    <manifest:file-entry manifest:full-path="Contents/section0.xml" manifest:media-type="application/xml"/>
+    <manifest:file-entry manifest:full-path="settings.xml" manifest:media-type="application/xml"/>
+</manifest:manifest>"""
+
+        manifest_dir = temp_dir / "META-INF"
+        manifest_dir.mkdir(exist_ok=True)
+        manifest_dir.joinpath("manifest.xml").write_text(manifest_xml, encoding='utf-8')
+    
+    def _create_version(self, temp_dir: Path):
+        """Write ``version.xml`` (HWPML version marker) into the staging directory."""
+        version_xml = """<?xml version="1.0" encoding="UTF-8"?>
+<hh:HWPMLVersion xmlns:hh="http://www.hancom.co.kr/hwpml/2011/head" version="1.1"/>"""
+
+        temp_dir.joinpath("version.xml").write_text(version_xml, encoding='utf-8')
+    
+    def _create_header(self, temp_dir: Path):
+        """Write ``Contents/header.xml``, including the style definitions.
+
+        The header holds a fixed list of font faces (seven language groups)
+        and two border fills, followed by the character properties, paragraph
+        properties and styles produced by the ``_generate_*`` helpers for the
+        roles in ``self.used_styles``. Creates the ``Contents`` directory
+        under ``temp_dir`` if needed.
+        """
+        contents_dir = temp_dir / "Contents"
+        contents_dir.mkdir(exist_ok=True)
+        
+        # Build the per-style sections
+        char_props_xml = self._generate_char_properties()
+        para_props_xml = self._generate_para_properties()
+        styles_xml = self._generate_styles_xml()
+        
+        # The hh/hc/hp prefixes declared on the root are the ones used by the
+        # generated sections spliced in below
+        header = f"""<?xml version="1.0" encoding="UTF-8"?>
+<hh:head xmlns:hh="http://www.hancom.co.kr/hwpml/2011/head"
+         xmlns:hc="http://www.hancom.co.kr/hwpml/2011/core"
+         xmlns:hp="http://www.hancom.co.kr/hwpml/2011/paragraph"
+         version="1.5" secCnt="1">
+    <hh:beginNum page="1" footnote="1" endnote="1" pic="1" tbl="1" equation="1"/>
+    <hh:refList>
+        <hh:fontfaces itemCnt="7">
+            <hh:fontface lang="HANGUL" fontCnt="2">
+                <hh:font id="0" face="맑은 고딕" type="TTF" isEmbedded="0"/>
+                <hh:font id="1" face="함초롬돋움" type="TTF" isEmbedded="0"/>
+            </hh:fontface>
+            <hh:fontface lang="LATIN" fontCnt="2">
+                <hh:font id="0" face="맑은 고딕" type="TTF" isEmbedded="0"/>
+                <hh:font id="1" face="함초롬돋움" type="TTF" isEmbedded="0"/>
+            </hh:fontface>
+            <hh:fontface lang="HANJA" fontCnt="2">
+                <hh:font id="0" face="맑은 고딕" type="TTF" isEmbedded="0"/>
+                <hh:font id="1" face="함초롬돋움" type="TTF" isEmbedded="0"/>
+            </hh:fontface>
+            <hh:fontface lang="JAPANESE" fontCnt="1">
+                <hh:font id="0" face="맑은 고딕" type="TTF" isEmbedded="0"/>
+            </hh:fontface>
+            <hh:fontface lang="OTHER" fontCnt="1">
+                <hh:font id="0" face="맑은 고딕" type="TTF" isEmbedded="0"/>
+            </hh:fontface>
+            <hh:fontface lang="SYMBOL" fontCnt="1">
+                <hh:font id="0" face="맑은 고딕" type="TTF" isEmbedded="0"/>
+            </hh:fontface>
+            <hh:fontface lang="USER" fontCnt="1">
+                <hh:font id="0" face="맑은 고딕" type="TTF" isEmbedded="0"/>
+            </hh:fontface>
+        </hh:fontfaces>
+        <hh:borderFills itemCnt="2">
+            <hh:borderFill id="1" threeD="0" shadow="0" centerLine="NONE">
+                <hh:slash type="NONE" Crooked="0" isCounter="0"/>
+                <hh:backSlash type="NONE" Crooked="0" isCounter="0"/>
+                <hh:leftBorder type="NONE" width="0.1 mm" color="#000000"/>
+                <hh:rightBorder type="NONE" width="0.1 mm" color="#000000"/>
+                <hh:topBorder type="NONE" width="0.1 mm" color="#000000"/>
+                <hh:bottomBorder type="NONE" width="0.1 mm" color="#000000"/>
+            </hh:borderFill>
+            <hh:borderFill id="2" threeD="0" shadow="0" centerLine="NONE">
+                <hh:slash type="NONE" Crooked="0" isCounter="0"/>
+                <hh:backSlash type="NONE" Crooked="0" isCounter="0"/>
+                <hh:leftBorder type="NONE" width="0.1 mm" color="#000000"/>
+                <hh:rightBorder type="NONE" width="0.1 mm" color="#000000"/>
+                <hh:topBorder type="NONE" width="0.1 mm" color="#000000"/>
+                <hh:bottomBorder type="NONE" width="0.1 mm" color="#000000"/>
+                <hc:fillBrush><hc:winBrush faceColor="none" hatchColor="#000000" alpha="0"/></hc:fillBrush>
+            </hh:borderFill>
+        </hh:borderFills>
+{char_props_xml}
+{para_props_xml}
+{styles_xml}
+    </hh:refList>
+    <hh:compatibleDocument targetProgram="HWP201X"/>
+    <hh:docOption>
+        <hh:linkinfo path="" pageInherit="1" footnoteInherit="0"/>
+    </hh:docOption>
+</hh:head>"""
+        
+        (contents_dir / "header.xml").write_text(header, encoding='utf-8')
+    
+    def _generate_char_properties(self) -> str:
+        """Build the ``<hh:charProperties>`` section of the header.
+
+        Entry 0 is the default character shape (height 1000, black, font 0).
+        Entries 1..n follow the sorted order of ``self.used_styles``, so the
+        ids line up with the style and paragraph-property ids generated from
+        the same ordering.
+
+        Returns:
+            The XML fragment as a single string.
+        """
+        def char_pr(pr_id, height, color, font_id) -> str:
+            # One <hh:charPr> entry; only id, height, colour and font vary
+            return f'''            <hh:charPr id="{pr_id}" height="{height}" textColor="#{color}" shadeColor="none" useFontSpace="0" useKerning="0" symMark="NONE" borderFillIDRef="1">
+                <hh:fontRef hangul="{font_id}" latin="{font_id}" hanja="{font_id}" japanese="{font_id}" other="{font_id}" symbol="{font_id}" user="{font_id}"/>
+                <hh:ratio hangul="100" latin="100" hanja="100" japanese="100" other="100" symbol="100" user="100"/>
+                <hh:spacing hangul="0" latin="0" hanja="0" japanese="0" other="0" symbol="0" user="0"/>
+                <hh:relSz hangul="100" latin="100" hanja="100" japanese="100" other="100" symbol="100" user="100"/>
+                <hh:offset hangul="0" latin="0" hanja="0" japanese="0" other="0" symbol="0" user="0"/>
+                <hh:underline type="NONE" shape="SOLID" color="#000000"/>
+                <hh:strikeout shape="NONE" color="#000000"/>
+                <hh:outline type="NONE"/>
+                <hh:shadow type="NONE" color="#B2B2B2" offsetX="10" offsetY="10"/>
+            </hh:charPr>'''
+
+        opening = f'        <hh:charProperties itemCnt="{len(self.used_styles) + 1}">'
+
+        # Default character shape (id=0)
+        entries = [char_pr(0, 1000, "000000", "0")]
+
+        # One entry per role
+        for idx, role in enumerate(sorted(self.used_styles), start=1):
+            style = self.mapper.get_style(role)
+            entries.append(char_pr(
+                idx,
+                int(style.font_size * 100),       # pt -> hwpunit
+                style.font_color.lstrip('#'),
+                "1" if style.font_bold else "0",  # bold roles use font id 1 (함초롬돋움)
+            ))
+
+        return '\n'.join([opening, *entries, '        </hh:charProperties>'])
+    
+    def _generate_para_properties(self) -> str:
+        """Build the ``<hh:paraProperties>`` section of the header.
+
+        Entry 0 is the default paragraph shape (justified, 160% line spacing,
+        no margins). Entries 1..n follow the sorted order of
+        ``self.used_styles``. Margin and spacing values of a style are
+        multiplied by 100 before being written.
+
+        Returns:
+            The XML fragment as a single string.
+        """
+        def para_pr(pr_id, align, spacing, left, first, before, after) -> str:
+            # One <hh:paraPr> entry; the same margin/line-spacing values are
+            # written into both the <hp:case> and the <hp:default> branch
+            return f'''            <hh:paraPr id="{pr_id}" tabPrIDRef="0" condense="0" fontLineHeight="0" snapToGrid="0" suppressLineNumbers="0" checked="0">
+                <hh:align horizontal="{align}" vertical="BASELINE"/>
+                <hh:heading type="NONE" idRef="0" level="0"/>
+                <hh:breakSetting breakLatinWord="KEEP_WORD" breakNonLatinWord="KEEP_WORD" widowOrphan="0" keepWithNext="0" keepLines="0" pageBreakBefore="0" lineWrap="BREAK"/>
+                <hh:autoSpacing eAsianEng="0" eAsianNum="0"/>
+                <hp:switch xmlns:hp="http://www.hancom.co.kr/hwpml/2011/paragraph">
+                    <hp:case hp:required-namespace="http://www.hancom.co.kr/hwpml/2016/HwpUnitChar">
+                        <hh:margin><hc:intent value="{first}" unit="HWPUNIT"/><hc:left value="{left}" unit="HWPUNIT"/><hc:right value="0" unit="HWPUNIT"/><hc:prev value="{before}" unit="HWPUNIT"/><hc:next value="{after}" unit="HWPUNIT"/></hh:margin>
+                        <hh:lineSpacing type="PERCENT" value="{spacing}" unit="HWPUNIT"/>
+                    </hp:case>
+                    <hp:default>
+                        <hh:margin><hc:intent value="{first}" unit="HWPUNIT"/><hc:left value="{left}" unit="HWPUNIT"/><hc:right value="0" unit="HWPUNIT"/><hc:prev value="{before}" unit="HWPUNIT"/><hc:next value="{after}" unit="HWPUNIT"/></hh:margin>
+                        <hh:lineSpacing type="PERCENT" value="{spacing}" unit="HWPUNIT"/>
+                    </hp:default>
+                </hp:switch>
+                <hh:border borderFillIDRef="1" offsetLeft="0" offsetRight="0" offsetTop="0" offsetBottom="0" connect="0" ignoreMargin="0"/>
+            </hh:paraPr>'''
+
+        opening = f'        <hh:paraProperties itemCnt="{len(self.used_styles) + 1}">'
+
+        # Default paragraph shape (id=0)
+        entries = [para_pr(0, "JUSTIFY", 160, 0, 0, 0, 0)]
+
+        # One entry per role; unknown alignment names fall back to JUSTIFY
+        align_names = {"left": "LEFT", "center": "CENTER", "right": "RIGHT", "justify": "JUSTIFY"}
+        for idx, role in enumerate(sorted(self.used_styles), start=1):
+            style = self.mapper.get_style(role)
+            entries.append(para_pr(
+                idx,
+                align_names.get(style.align, "JUSTIFY"),
+                int(style.line_spacing),
+                int(style.indent_left * 100),
+                int(style.indent_first * 100),
+                int(style.space_before * 100),
+                int(style.space_after * 100),
+            ))
+
+        return '\n'.join([opening, *entries, '        </hh:paraProperties>'])
+    
+    def _generate_styles_xml(self) -> str:
+        """Build the ``<hh:styles>`` section of the header.
+
+        Style 0 is the base style "바탕글". Styles 1..n follow the sorted
+        order of ``self.used_styles``; each references the character and
+        paragraph property with the same id and names itself as next style.
+
+        Returns:
+            The XML fragment as a single string.
+        """
+        lines = [f'        <hh:styles itemCnt="{len(self.used_styles) + 1}">']
+
+        # Base style (id=0)
+        lines.append('            <hh:style id="0" type="PARA" name="바탕글" engName="Normal" paraPrIDRef="0" charPrIDRef="0" nextStyleIDRef="0" langID="1042" lockForm="0"/>')
+
+        # One style per role
+        for idx, role in enumerate(sorted(self.used_styles), start=1):
+            style = self.mapper.get_style(role)
+            # The name lands inside a double-quoted attribute, so '&' and '"'
+            # must be escaped as well; only '<' and '>' were handled before,
+            # which produced malformed XML for names containing either.
+            style_name = self._escape_xml(style.name)
+
+            lines.append(f'            <hh:style id="{idx}" type="PARA" name="{style_name}" engName="" paraPrIDRef="{idx}" charPrIDRef="{idx}" nextStyleIDRef="{idx}" langID="1042" lockForm="0"/>')
+
+        lines.append('        </hh:styles>')
+        return '\n'.join(lines)
+    
+    def _create_content(self, temp_dir: Path, elements: List[StyledElement]):
+        """Write ``Contents/section0.xml``: one paragraph per body element.
+
+        Each paragraph references the style index of its role, i.e. the
+        role's 1-based position in the sorted ``self.used_styles`` (0 when
+        the role is unknown). Table and figure roles are not emitted; they
+        need separate handling that does not exist yet.
+
+        Args:
+            temp_dir: Staging directory of the package.
+            elements: Classified elements in document order.
+        """
+        # Roles left out of the body for now
+        skipped_roles = ("TH", "TD", "TABLE_CAPTION", "TABLE", "FIGURE")
+
+        # Role -> style index, using the same ordering as the header
+        role_to_idx = {role: idx for idx, role in enumerate(sorted(self.used_styles), start=1)}
+
+        paragraphs = []
+        for elem in elements:
+            # Decide first, so skipped elements cost no style lookup
+            if elem.role in skipped_roles:
+                continue
+            paragraphs.append(self._create_paragraph(
+                elem.text,
+                self.mapper.get_style(elem.role),
+                role_to_idx.get(elem.role, 0),
+            ))
+
+        body = "".join(paragraphs)
+        section = f"""<?xml version="1.0" encoding="UTF-8"?>
+<hs:sec xmlns:hs="http://www.hancom.co.kr/hwpml/2011/section"
+        xmlns:hc="http://www.hancom.co.kr/hwpml/2011/core">
+{body}
+</hs:sec>"""
+
+        # Do not depend on _create_header having created Contents/ already
+        contents_dir = temp_dir / "Contents"
+        contents_dir.mkdir(exist_ok=True)
+        (contents_dir / "section0.xml").write_text(section, encoding='utf-8')
+    
+    def _create_paragraph(self, text: str, style: HwpStyle, style_idx: int) -> str:
+        """Render one ``<hp:p>`` paragraph holding a single text run.
+
+        Args:
+            text: Raw paragraph text; XML-escaped before insertion.
+            style: Style of the element; not used by the body.
+            style_idx: Id used for the paragraph property, style and
+                character property references alike.
+
+        Returns:
+            The paragraph XML, starting with a newline.
+        """
+        escaped = self._escape_xml(text)
+
+        # Joined with newlines; the leading '' yields the initial newline and
+        # the first tag line intentionally ends with a space
+        return '\n'.join([
+            '',
+            '    <hp:p xmlns:hp="http://www.hancom.co.kr/hwpml/2011/paragraph" ',
+            f'          paraPrIDRef="{style_idx}" styleIDRef="{style_idx}" pageBreak="0" columnBreak="0" merged="0">',
+            f'        <hp:run charPrIDRef="{style_idx}">',
+            f'            <hp:t>{escaped}</hp:t>',
+            '        </hp:run>',
+            '    </hp:p>',
+        ])
+    
+    def _escape_xml(self, text: str) -> str:
+        """Make ``text`` safe for XML element content and attribute values.
+
+        Escapes the five XML special characters and removes control
+        characters that XML 1.0 does not allow (everything below U+0020
+        except TAB, LF and CR). Such characters can survive HTML extraction
+        and would make the generated part unparsable.
+
+        Args:
+            text: Raw text; ``None`` and ``""`` both give ``""``.
+
+        Returns:
+            The escaped text.
+        """
+        if not text:
+            return ""
+
+        # Delete the C0 controls that XML 1.0 forbids
+        forbidden = {cp: None for cp in range(0x20) if cp not in (0x09, 0x0A, 0x0D)}
+        cleaned = text.translate(forbidden)
+
+        # '&' must be replaced first so the entities added below stay intact
+        return (cleaned
+            .replace("&", "&amp;")
+            .replace("<", "&lt;")
+            .replace(">", "&gt;")
+            .replace('"', "&quot;")
+            .replace("'", "&apos;"))
+    
+    def _create_settings(self, temp_dir: Path):
+        """Write ``settings.xml`` (print view at 100% zoom) into the staging directory."""
+        settings_xml = """<?xml version="1.0" encoding="UTF-8"?>
+<hs:settings xmlns:hs="http://www.hancom.co.kr/hwpml/2011/settings">
+    <hs:viewSetting>
+        <hs:viewType val="printView"/>
+        <hs:zoom val="100"/>
+    </hs:viewSetting>
+</hs:settings>"""
+
+        temp_dir.joinpath("settings.xml").write_text(settings_xml, encoding='utf-8')
+    
+    def _create_hwpx(self, temp_dir: Path, output_path: str):
+        """Zip the staging directory into the final HWPX file.
+
+        ``mimetype`` is written first and stored uncompressed; every other
+        file follows deflated, ordered by archive name so the same input
+        always yields the same entry order (the order used to depend on the
+        filesystem's directory listing).
+
+        Args:
+            temp_dir: Staging directory holding the package parts.
+            output_path: Destination archive path.
+        """
+        with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zf:
+            # mimetype goes first and is not compressed
+            zf.write(temp_dir / "mimetype", "mimetype", compress_type=zipfile.ZIP_STORED)
+
+            # Everything else, under '/'-separated names relative to temp_dir
+            members = sorted(
+                (path.relative_to(temp_dir).as_posix(), path)
+                for path in temp_dir.rglob("*")
+                if path.is_file()
+            )
+            for arcname, path in members:
+                # Skip only the root mimetype that was already added; files
+                # with that name in subdirectories used to be dropped too
+                if arcname == "mimetype":
+                    continue
+                zf.write(path, arcname)
+
+
+def convert_html_to_hwpx(html: str, output_path: str) -> str:
+    """Convert an HTML string into an HWPX file.
+
+    Classifies the HTML with ``StyleAnalyzer``, prints the element count and
+    a per-role summary, then writes the file with ``HwpxGenerator``.
+
+    Args:
+        html: HTML source text.
+        output_path: Destination file path.
+
+    Returns:
+        Path of the generated file.
+    """
+    # Step 1: analyse the HTML and classify elements by role
+    analyzer = StyleAnalyzer()
+    elements = analyzer.analyze(html)
+
+    print(f"📊 분석 완료: {len(elements)}개 요소")
+    role_summary = analyzer.get_role_summary()
+    for role, count in role_summary.items():
+        print(f"   {role}: {count}")
+
+    # Step 2: generate the HWPX package
+    result_path = HwpxGenerator().generate(elements, output_path)
+
+    print(f"✅ 생성 완료: {result_path}")
+    return result_path
+
+
+if __name__ == "__main__":
+    # Manual smoke test: convert a small sample document
+    import sys
+
+    test_html = """
+    <html>
+    <body>
+        <div class="box-cover">
+            <h1>건설·토목 측량 DX 실무지침</h1>
+            <h2>드론/UAV·GIS·지형/지반 모델 기반</h2>
+            <p>2024년 1월</p>
+        </div>
+        
+        <h1>1. 개요</h1>
+        <p>본 보고서는 건설 및 토목 분야의 측량 디지털 전환에 대한 실무 지침을 제공합니다.</p>
+        
+        <h2>1.1 배경</h2>
+        <p>최근 드론과 GIS 기술의 발전으로 측량 업무가 크게 변화하고 있습니다.</p>
+        
+        <h3>1.1.1 기술 동향</h3>
+        <p>1) <strong>드론 측량의 발전</strong></p>
+        <p>드론을 활용한 측량은 기존 방식 대비 효율성이 크게 향상되었습니다.</p>
+        
+        <p>(1) <strong>RTK 드론</strong></p>
+        <p>실시간 보정 기능을 갖춘 RTK 드론이 보급되고 있습니다.</p>
+        
+        <ul>
+            <li>고정밀 GPS 수신기 내장</li>
+            <li>센티미터 단위 정확도</li>
+        </ul>
+    </body>
+    </html>
+    """
+
+    # Output goes to the first command-line argument, or to the working
+    # directory by default. The former hard-coded /home/claude/... path only
+    # existed on the original author's machine.
+    output = sys.argv[1] if len(sys.argv) > 1 else "test_output.hwpx"
+    convert_html_to_hwpx(test_html, output)
--- a/Code/geulbeot_5th/converters/hwpx_style_injector.py
+++ b/Code/geulbeot_5th/converters/hwpx_style_injector.py
@@ -0,0 +1,750 @@
+"""
+HWPX 스타일 주입기
+pyhwpx로 생성된 HWPX 파일에 커스텀 스타일을 후처리로 주입
+
+워크플로우:
+1. HWPX 압축 해제
+2. header.xml에 커스텀 스타일 정의 추가
+3. section*.xml에서 역할별 styleIDRef 매핑
+4. 다시 압축
+"""
+
+import os
+import re
+import zipfile
+import shutil
+import tempfile
+from pathlib import Path
+from typing import Dict, List, Optional
+from dataclasses import dataclass
+
+
+@dataclass
+class StyleDefinition:
+    """Paragraph and character formatting for one document role.
+
+    Instances are the values of the ``ROLE_STYLES`` table.
+    """
+    id: int             # identifier stored with the definition
+    name: str           # style name
+    font_size: int      # hwpunit (pt * 100)
+    font_bold: bool     # whether the text is bold
+    font_color: str     # #RRGGBB
+    align: str          # LEFT, CENTER, RIGHT, JUSTIFY
+    line_spacing: int   # percent (160 = 160%)
+    indent_left: int    # hwpunit
+    indent_first: int   # hwpunit
+    space_before: int   # hwpunit
+    space_after: int    # hwpunit
+    outline_level: int = -1  # outline level (-1 = none, 0 = level 1, 1 = level 2, ...)
+
+
+# Mapping from role name to the style definition applied to that role.
+# NOTE(review): HwpxStyleInjector (below) assigns style ids sequentially and
+# does not read the ``id`` values given here.
+ROLE_STYLES: Dict[str, StyleDefinition] = {
+    # Outline paragraphs (outline_level >= 0, i.e. automatically numbered).
+    'H1': StyleDefinition(
+        id=101, name='제1장 제목', font_size=2200, font_bold=True,
+        font_color='#006400', align='CENTER', line_spacing=200,
+        indent_left=0, indent_first=0, space_before=400, space_after=200,
+        outline_level=0  # numbering pattern: 제^1장
+    ),
+    'H2': StyleDefinition(
+        id=102, name='1.1 제목', font_size=1500, font_bold=True,
+        font_color='#03581d', align='LEFT', line_spacing=200,
+        indent_left=0, indent_first=0, space_before=300, space_after=100,
+        outline_level=1  # numbering pattern: ^1.^2
+    ),
+    'H3': StyleDefinition(
+        id=103, name='1.1.1 제목', font_size=1400, font_bold=True,
+        font_color='#228B22', align='LEFT', line_spacing=200,
+        indent_left=500, indent_first=0, space_before=200, space_after=100,
+        outline_level=2  # numbering pattern: ^1.^2.^3
+    ),
+    'H4': StyleDefinition(
+        id=104, name='가. 제목', font_size=1300, font_bold=True,
+        font_color='#000000', align='LEFT', line_spacing=200,
+        indent_left=1000, indent_first=0, space_before=150, space_after=50,
+        outline_level=3  # numbering pattern: ^4.
+    ),
+    'H5': StyleDefinition(
+        id=105, name='1) 제목', font_size=1200, font_bold=True,
+        font_color='#000000', align='LEFT', line_spacing=200,
+        indent_left=1500, indent_first=0, space_before=100, space_after=50,
+        outline_level=4  # numbering pattern: ^5)
+    ),
+    'H6': StyleDefinition(
+        id=106, name='가) 제목', font_size=1150, font_bold=True,
+        font_color='#000000', align='LEFT', line_spacing=200,
+        indent_left=2000, indent_first=0, space_before=100, space_after=50,
+        outline_level=5  # numbering pattern: ^6)
+    ),
+    'H7': StyleDefinition(
+        id=115, name='① 제목', font_size=1100, font_bold=True,
+        font_color='#000000', align='LEFT', line_spacing=200,
+        indent_left=2300, indent_first=0, space_before=100, space_after=50,
+        outline_level=6  # numbering pattern: ^7 (circled digit)
+    ),
+    # Body styles (not outline paragraphs; outline_level keeps its default).
+    'BODY': StyleDefinition(
+        id=107, name='○본문', font_size=1100, font_bold=False,
+        font_color='#000000', align='JUSTIFY', line_spacing=200,
+        indent_left=1500, indent_first=0, space_before=0, space_after=0
+    ),
+    'LIST_ITEM': StyleDefinition(
+        id=108, name='●본문', font_size=1050, font_bold=False,
+        font_color='#000000', align='JUSTIFY', line_spacing=200,
+        indent_left=2500, indent_first=0, space_before=0, space_after=0
+    ),
+    'TABLE_CAPTION': StyleDefinition(
+        id=109, name='<표 제목>', font_size=1100, font_bold=True,
+        font_color='#000000', align='LEFT', line_spacing=130,
+        indent_left=0, indent_first=0, space_before=200, space_after=100
+    ),
+    'FIGURE_CAPTION': StyleDefinition(
+        id=110, name='<그림 제목>', font_size=1100, font_bold=True,
+        font_color='#000000', align='CENTER', line_spacing=130,
+        indent_left=0, indent_first=0, space_before=100, space_after=200
+    ),
+    'COVER_TITLE': StyleDefinition(
+        id=111, name='표지제목', font_size=2800, font_bold=True,
+        font_color='#1a365d', align='CENTER', line_spacing=150,
+        indent_left=0, indent_first=0, space_before=0, space_after=200
+    ),
+    'COVER_SUBTITLE': StyleDefinition(
+        id=112, name='표지부제', font_size=1800, font_bold=False,
+        font_color='#2d3748', align='CENTER', line_spacing=150,
+        indent_left=0, indent_first=0, space_before=0, space_after=100
+    ),
+    'TOC_1': StyleDefinition(
+        id=113, name='목차1수준', font_size=1200, font_bold=True,
+        font_color='#000000', align='LEFT', line_spacing=180,
+        indent_left=0, indent_first=0, space_before=100, space_after=50
+    ),
+    'TOC_2': StyleDefinition(
+        id=114, name='목차2수준', font_size=1100, font_bold=False,
+        font_color='#000000', align='LEFT', line_spacing=180,
+        indent_left=500, indent_first=0, space_before=0, space_after=0
+    ),
+}
+
+# ⚠️ 개요 자동 번호 기능 활성화!
+# idRef="0"은 numbering id=1을 참조하므로, 해당 패턴을 교체하면 동작함
+
+
+class HwpxStyleInjector:
+    """HWPX 스타일 주입기"""
+    
+    def __init__(self):
+        """Create an injector with no work directory and empty ID tables."""
+        # Directory the HWPX archive is unpacked into; assigned by inject().
+        self.temp_dir: Optional[Path] = None
+
+        # Next ids to hand out for generated charPr / paraPr / style entries.
+        self.next_char_id = self.next_para_id = self.next_style_id = 0
+
+        # Role name -> generated id, one table per kind of definition.
+        self.role_to_style_id: Dict[str, int] = {}
+        self.role_to_para_id: Dict[str, int] = {}
+        self.role_to_char_id: Dict[str, int] = {}
+    
+    def _find_max_ids(self):
+        """Strip the stock styles from header.xml and choose the starting ids.
+
+        Despite its name this method also rewrites ``Contents/header.xml``:
+        every self-closing ``<hh:style>`` element with id 1-30 is removed, so
+        only the id-0 style survives and generated styles can start at id 1.
+
+        Afterwards:
+            * ``next_char_id`` / ``next_para_id`` are one past the largest
+              existing ``charPr`` / ``paraPr`` id (20 when none are found), so
+              existing definitions keep their ids and references stay valid.
+            * ``next_style_id`` is 1.
+
+        If header.xml does not exist, all three counters are set to 1 and
+        nothing is written.
+        """
+        header_path = self.temp_dir / "Contents" / "header.xml"
+        if not header_path.exists():
+            self.next_char_id = 1
+            self.next_para_id = 1
+            self.next_style_id = 1
+            return
+
+        content = header_path.read_text(encoding='utf-8')
+
+        # Remove styles with id 1-30, keeping only id 0.
+        # ``[^>]*/>`` (rather than ``[^/]*/>``) lets the match run through
+        # attribute values that contain a '/', which would otherwise leave the
+        # old style in place with an id that collides with a generated one.
+        content = re.sub(r'<hh:style id="([1-9]|[12]\d|30)"[^>]*/>\s*', '', content)
+
+        # The itemCnt attributes are refreshed later by _update_item_counts.
+        header_path.write_text(content, encoding='utf-8')
+        print(f"   [INFO] 기존 스타일(본문, 개요1~10 등) 제거 완료")
+
+        # charPr / paraPr ids continue after the existing ones so that
+        # references already present in the document are not broken.
+        char_ids = [int(m) for m in re.findall(r'<hh:charPr id="(\d+)"', content)]
+        self.next_char_id = max(char_ids) + 1 if char_ids else 20
+
+        para_ids = [int(m) for m in re.findall(r'<hh:paraPr id="(\d+)"', content)]
+        self.next_para_id = max(para_ids) + 1 if para_ids else 20
+
+        # Styles are numbered from 1 (original author's note: Ctrl+2 = id 1,
+        # Ctrl+3 = id 2, ...).
+        self.next_style_id = 1
+    
+    def inject(self, hwpx_path: str, role_positions: Dict[str, List[tuple]]) -> str:
+        """Inject the custom styles into an HWPX file, overwriting it in place.
+
+        The archive is unpacked into a temporary directory, header.xml and the
+        section XML files are rewritten there, and the result is zipped back
+        over ``hwpx_path``. The temporary directory is removed in all cases.
+
+        Args:
+            hwpx_path: Path of the HWPX file to modify.
+            role_positions: ``{role: [(section_idx, para_idx), ...]}``. Its keys
+                are handed to ``_inject_header_styles`` and the mapping itself
+                to ``_inject_section_styles``.
+
+        Returns:
+            Path of the rewritten file. This is ``hwpx_path`` unless
+            ``_repack_hwpx`` reports that it had to write somewhere else.
+        """
+        print(f"\n🎨 HWPX 스타일 주입 시작...")
+        print(f"   입력: {hwpx_path}")
+
+        # 1. Unpack the archive into a temporary directory.
+        self.temp_dir = Path(tempfile.mkdtemp(prefix='hwpx_inject_'))
+        print(f"   임시 폴더: {self.temp_dir}")
+
+        try:
+            with zipfile.ZipFile(hwpx_path, 'r') as zf:
+                zf.extractall(self.temp_dir)
+
+            # Report the size of every section file right after unpacking
+            # (globbed, so documents with more than three sections are covered).
+            print(f"   [DEBUG] After unzip:")
+            for sec_path in sorted((self.temp_dir / "Contents").glob("section*.xml")):
+                print(f"   [DEBUG] {sec_path.name} size: {sec_path.stat().st_size} bytes")
+
+            # Clear the stock styles and pick the first ids to allocate.
+            self._find_max_ids()
+            print(f"   [DEBUG] Starting IDs: char={self.next_char_id}, para={self.next_para_id}, style={self.next_style_id}")
+
+            # 2. Add the style definitions to header.xml.
+            used_roles = set(role_positions.keys())
+            self._inject_header_styles(used_roles)
+
+            # 3. Point the paragraphs in section*.xml at the new styles.
+            self._inject_section_styles(role_positions)
+
+            # 4. Zip everything back over the original file. If the repack
+            #    step returns a path (e.g. a fallback it had to use), report
+            #    that one instead of claiming the original was updated.
+            output_path = self._repack_hwpx(hwpx_path) or hwpx_path
+
+            print(f"   ✅ 스타일 주입 완료: {output_path}")
+            return output_path
+
+        finally:
+            # Always clean up the temporary directory.
+            if self.temp_dir and self.temp_dir.exists():
+                shutil.rmtree(self.temp_dir)
+    
+    def _inject_header_styles(self, used_roles: set):
+        """Append charPr / paraPr / style definitions to header.xml.
+
+        One definition of each kind is generated for every entry of
+        ``ROLE_STYLES`` (``used_roles`` is accepted but not consulted), using
+        consecutive ids taken from ``next_char_id`` / ``next_para_id`` /
+        ``next_style_id``. The ids chosen for each role are recorded in the
+        ``role_to_*`` tables. The default numbering and the itemCnt attributes
+        are then refreshed and the file is written back.
+
+        Args:
+            used_roles: Roles present in the document; currently ignored.
+        """
+        header_file = self.temp_dir / "Contents" / "header.xml"
+        if not header_file.exists():
+            print("   [경고] header.xml 없음")
+            return
+
+        xml = header_file.read_text(encoding='utf-8')
+
+        char_xml, para_xml, style_xml = [], [], []
+        for role_name, definition in ROLE_STYLES.items():
+            cid, pid, sid = self.next_char_id, self.next_para_id, self.next_style_id
+
+            # Remember which ids this role received.
+            self.role_to_style_id[role_name] = sid
+            self.role_to_para_id[role_name] = pid
+            self.role_to_char_id[role_name] = cid
+
+            char_xml.append(self._make_char_pr(cid, definition))
+            para_xml.append(self._make_para_pr(pid, definition))
+            style_xml.append(self._make_style(sid, definition.name, pid, cid))
+
+            self.next_char_id = cid + 1
+            self.next_para_id = pid + 1
+            self.next_style_id = sid + 1
+
+        if not style_xml:
+            print("   [정보] 주입할 스타일 없음")
+            return
+
+        # Each group of generated elements goes just before the closing tag
+        # of its container.
+        insertions = (
+            ('</hh:charProperties>', char_xml),
+            ('</hh:paraProperties>', para_xml),
+            ('</hh:styles>', style_xml),
+        )
+        for closing_tag, fragments in insertions:
+            xml = self._insert_before_tag(xml, closing_tag, '\n'.join(fragments) + '\n')
+
+        # Rewrite the numbering id=1 patterns so outline numbers follow the
+        # chapter / section scheme, then bring the itemCnt attributes in line.
+        xml = self._replace_default_numbering(xml)
+        xml = self._update_item_counts(xml)
+
+        header_file.write_text(xml, encoding='utf-8')
+        print(f"   → header.xml 수정 완료 ({len(style_xml)}개 스타일 추가)")
+    
+    def _make_char_pr(self, id: int, style: StyleDefinition) -> str:
+        """Render one ``<hh:charPr>`` element as a single-line XML string.
+
+        Args:
+            id: Value of the element's ``id`` attribute.
+            style: Source of the size (``font_size``), colour (``font_color``)
+                and font reference (``"1"`` when ``font_bold`` is set, else
+                ``"0"``).
+
+        Returns:
+            The XML text with no line breaks.
+        """
+        rgb = style.font_color.lstrip('#')
+        face = "1" if style.font_bold else "0"
+        scripts = ('hangul', 'latin', 'hanja', 'japanese', 'other', 'symbol', 'user')
+
+        def per_script(value) -> str:
+            # The same value is repeated for every script attribute.
+            return ' '.join(f'{script}="{value}"' for script in scripts)
+
+        pieces = [
+            f'<hh:charPr id="{id}" height="{style.font_size}" textColor="#{rgb}" shadeColor="none" useFontSpace="0" useKerning="0" symMark="NONE" borderFillIDRef="1">',
+            f'<hh:fontRef {per_script(face)}/>',
+            f'<hh:ratio {per_script(100)}/>',
+            f'<hh:spacing {per_script(0)}/>',
+            f'<hh:relSz {per_script(100)}/>',
+            f'<hh:offset {per_script(0)}/>',
+            '<hh:underline type="NONE" shape="SOLID" color="#000000"/>',
+            '<hh:strikeout shape="NONE" color="#000000"/>',
+            '<hh:outline type="NONE"/>',
+            '<hh:shadow type="NONE" color="#B2B2B2" offsetX="10" offsetY="10"/>',
+            '</hh:charPr>',
+        ]
+        return ''.join(pieces)
+    
+    def _make_para_pr(self, id: int, style: StyleDefinition) -> str:
+        """Render one ``<hh:paraPr>`` element as a single-line XML string.
+
+        A style with ``outline_level >= 0`` gets an ``OUTLINE`` heading at that
+        level; any other style gets a ``NONE`` heading at level 0. Both use
+        ``idRef="0"``.
+
+        Args:
+            id: Value of the element's ``id`` attribute.
+            style: Source of alignment, margins and line spacing.
+
+        Returns:
+            The XML text with no line breaks.
+        """
+        if style.outline_level >= 0:
+            heading_type, heading_level = "OUTLINE", style.outline_level
+        else:
+            heading_type, heading_level = "NONE", 0
+
+        # Margin children in the order they are written out.
+        margin_values = (
+            ('intent', style.indent_first),
+            ('left', style.indent_left),
+            ('right', 0),
+            ('prev', style.space_before),
+            ('next', style.space_after),
+        )
+        margin_xml = ''.join(
+            f'<hc:{tag} value="{value}" unit="HWPUNIT"/>' for tag, value in margin_values
+        )
+
+        pieces = [
+            f'<hh:paraPr id="{id}" tabPrIDRef="0" condense="0" fontLineHeight="0" snapToGrid="0" suppressLineNumbers="0" checked="0">',
+            f'<hh:align horizontal="{style.align}" vertical="BASELINE"/>',
+            f'<hh:heading type="{heading_type}" idRef="0" level="{heading_level}"/>',
+            '<hh:breakSetting breakLatinWord="KEEP_WORD" breakNonLatinWord="KEEP_WORD" widowOrphan="0" keepWithNext="0" keepLines="0" pageBreakBefore="0" lineWrap="BREAK"/>',
+            '<hh:autoSpacing eAsianEng="0" eAsianNum="0"/>',
+            f'<hh:margin>{margin_xml}</hh:margin>',
+            f'<hh:lineSpacing type="PERCENT" value="{style.line_spacing}" unit="HWPUNIT"/>',
+            '<hh:border borderFillIDRef="1" offsetLeft="0" offsetRight="0" offsetTop="0" offsetBottom="0" connect="0" ignoreMargin="0"/>',
+            '</hh:paraPr>',
+        ]
+        return ''.join(pieces)
+    
+    def _make_style(self, id: int, name: str, para_id: int, char_id: int) -> str:
+        """Render one paragraph ``<hh:style>`` element.
+
+        Args:
+            id: Style id; also used as ``nextStyleIDRef``.
+            name: Display name. It is escaped for use inside a double-quoted
+                XML attribute (``&``, ``<``, ``>`` and ``"``).
+            para_id: Id of the paraPr the style refers to.
+            char_id: Id of the charPr the style refers to.
+
+        Returns:
+            The self-closing element as a single-line string.
+        """
+        # '&' is replaced first so the entities produced by the later
+        # replacements are not escaped a second time.
+        safe_name = (
+            name.replace('&', '&amp;')
+            .replace('<', '&lt;')
+            .replace('>', '&gt;')
+            .replace('"', '&quot;')
+        )
+        return f'<hh:style id="{id}" type="PARA" name="{safe_name}" engName="" paraPrIDRef="{para_id}" charPrIDRef="{char_id}" nextStyleIDRef="{id}" langID="1042" lockForm="0"/>'
+    
+    def _insert_before_tag(self, content: str, tag: str, insert_text: str) -> str:
+        """Return ``content`` with ``insert_text`` placed before each ``tag``.
+
+        Every occurrence of ``tag`` is affected; when ``tag`` does not occur
+        the text is returned unchanged.
+        """
+        prefixed_tag = f'{insert_text}{tag}'
+        return content.replace(tag, prefixed_tag)
+    
+    def _update_item_counts(self, content: str) -> str:
+        """Synchronise the ``itemCnt`` attributes with the actual child counts.
+
+        For each container element (charProperties, paraProperties, styles,
+        numberings) the number of child opening tags in ``content`` is counted
+        and written into that container's ``itemCnt`` attribute.
+
+        Args:
+            content: Text of header.xml.
+
+        Returns:
+            The text with the four ``itemCnt`` values rewritten.
+        """
+        # (container element, child element) pairs, processed in this order.
+        containers = (
+            ('charProperties', 'charPr'),
+            ('paraProperties', 'paraPr'),
+            ('styles', 'style'),
+            ('numberings', 'numbering'),
+        )
+        for container, item in containers:
+            # The trailing space keeps '<hh:style ' from matching '<hh:styles'.
+            total = content.count(f'<hh:{item} ')
+            content = re.sub(
+                rf'<hh:{container} itemCnt="(\d+)"',
+                f'<hh:{container} itemCnt="{total}"',
+                content,
+            )
+        return content
+    
+    def _replace_default_numbering(self, content: str) -> str:
+        """Rewrite the paraHead entries of ``<hh:numbering id="1">``.
+
+        For outline levels 1-7 the matching ``<hh:paraHead>`` gets a new
+        ``numFormat`` attribute and new pattern text (``제^1장``, ``^1.^2``,
+        ``^1.^2.^3``, ...). Heads whose text spans several lines are handled
+        too. Only the numbering element that was actually matched is replaced.
+
+        Args:
+            content: Text of header.xml.
+
+        Returns:
+            The updated text, or ``content`` unchanged when no numbering with
+            id 1 exists.
+        """
+        # (level, numFormat, pattern text) for each outline level.
+        level_specs = (
+            ('1', 'DIGIT', '제^1장'),
+            ('2', 'DIGIT', '^1.^2'),
+            ('3', 'DIGIT', '^1.^2.^3'),
+            ('4', 'HANGUL_SYLLABLE', '^4.'),
+            ('5', 'DIGIT', '^5)'),
+            ('6', 'HANGUL_SYLLABLE', '^6)'),
+            ('7', 'CIRCLED_DIGIT', '^7'),
+        )
+
+        numbering = re.search(
+            r'(<hh:numbering id="1"[^>]*>)(.*?)(</hh:numbering>)', content, re.DOTALL
+        )
+        if not numbering:
+            print("   [경고] numbering id=1 없음, 교체 건너뜀")
+            return content
+
+        open_tag, body, close_tag = numbering.groups()
+        replaced = 0
+
+        for level, num_format, pattern in level_specs:
+            # Defaults bind the current loop values to the callback.
+            def rewrite_head(m, num_format=num_format, pattern=pattern):
+                head = re.sub(r'numFormat="[^"]*"', f'numFormat="{num_format}"', m.group(0))
+                # A callable replacement keeps the pattern text literal.
+                return re.sub(
+                    r'>([^<]*)</hh:paraHead>',
+                    lambda _: f'>{pattern}</hh:paraHead>',
+                    head,
+                )
+
+            # DOTALL so that a paraHead whose text contains line breaks is
+            # still matched.
+            body, hits = re.subn(
+                rf'<hh:paraHead[^>]*level="{level}"[^>]*>.*?</hh:paraHead>',
+                rewrite_head,
+                body,
+                flags=re.DOTALL,
+            )
+            replaced += hits
+
+        if replaced:
+            print("   [INFO] numbering id=1 패턴 교체 완료 (제^1장, ^1.^2, ^1.^2.^3...)")
+        else:
+            # Do not claim success when no paraHead matched any level.
+            print("   [경고] numbering id=1에 교체할 paraHead 없음")
+
+        # Splice by position so identical text elsewhere is left alone.
+        return (
+            content[:numbering.start()]
+            + open_tag + body + close_tag
+            + content[numbering.end():]
+        )
+
+    def _adjust_tables(self, content: str, min_height: int = 800) -> str:
+        """Raise every table cell to a minimum height.
+
+        Each ``<hp:cellSz width=".." height=".."/>`` element whose height is
+        below ``min_height`` has its height set to ``min_height``; widths are
+        left as they are. Column widths are not redistributed.
+
+        The substitution is applied to every ``<hp:cellSz>`` in ``content``
+        rather than table by table, so cells that follow a nested table are
+        covered as well.
+        NOTE(review): this assumes ``<hp:cellSz>`` only occurs inside table
+        cells; confirm against the HWPX schema.
+
+        Args:
+            content: Text of a section XML file.
+            min_height: Smallest allowed cell height in hwpunit.
+
+        Returns:
+            The text with the adjusted cell sizes.
+        """
+        def enforce_min_height(cell):
+            width = int(cell.group(1))
+            height = int(cell.group(2))
+            return f'<hp:cellSz width="{width}" height="{max(height, min_height)}"/>'
+
+        return re.sub(
+            r'<hp:cellSz width="(\d+)" height="(\d+)"/>',
+            enforce_min_height,
+            content,
+        )
+
+    def _inject_section_styles(self, role_positions: Dict[str, List[tuple]]):
+        """Assign styles to the paragraphs of every section*.xml by text matching.
+
+        For each ``<hp:p>`` outside headers and footers, the leading text is
+        tested against a fixed sequence of patterns to pick a role. Matching
+        paragraphs get ``styleIDRef``, ``paraPrIDRef`` and the runs'
+        ``charPrIDRef`` rewritten to the ids recorded for that role; outline
+        roles also have their hand-typed number stripped. Afterwards table cell
+        heights are adjusted and every ``outlineShapeIDRef`` is set to "1".
+        A section file is rewritten only if its text changed.
+
+        Args:
+            role_positions: Accepted for interface compatibility.
+                NOTE(review): this method never reads it; roles are derived
+                from the paragraph text instead.
+        """
+        contents_dir = self.temp_dir / "Contents"
+
+        # Debug: show the role -> style id table.
+        print(f"   [DEBUG] role_to_style_id: {self.role_to_style_id}")
+
+        # Collect the section files.
+        section_files = sorted(contents_dir.glob("section*.xml"))
+        print(f"   [DEBUG] section files: {[f.name for f in section_files]}")
+
+        total_modified = 0
+
+        for section_file in section_files:
+            print(f"   [DEBUG] Processing: {section_file.name}")
+            original_content = section_file.read_text(encoding='utf-8')
+            # NOTE(review): len() of a str counts characters, not bytes.
+            print(f"   [DEBUG] File size: {len(original_content)} bytes")
+
+            content = original_content  # working copy
+
+            # Protect header/footer regions by swapping them for placeholders
+            # so the paragraph rewriting below cannot touch them.
+            header_footer_map = {}
+            placeholder_idx = 0
+
+            def save_header_footer(match):
+                nonlocal placeholder_idx
+                key = f"__HF_PLACEHOLDER_{placeholder_idx}__"
+                header_footer_map[key] = match.group(0)
+                placeholder_idx += 1
+                return key
+
+            # Temporarily replace headers and footers.
+            content = re.sub(r'<hp:header[^>]*>.*?</hp:header>', save_header_footer, content, flags=re.DOTALL)
+            content = re.sub(r'<hp:footer[^>]*>.*?</hp:footer>', save_header_footer, content, flags=re.DOTALL)
+
+            # Opening tag, inner XML and closing tag of each <hp:p>.
+            # NOTE(review): the non-greedy match ends at the first </hp:p>, so
+            # a paragraph containing nested paragraphs (e.g. inside a table)
+            # is presumably cut at the first inner one; verify with real data.
+            para_pattern = r'(<hp:p [^>]*>)(.*?)(</hp:p>)'
+
+            section_modified = 0
+
+            def replace_style(match):
+                nonlocal total_modified, section_modified
+                open_tag = match.group(1)
+                inner = match.group(2)
+                close_tag = match.group(3)
+
+                # Plain text of the paragraph (all tags removed).
+                text = re.sub(r'<[^>]+>', '', inner).strip()
+                if not text:
+                    return match.group(0)
+
+                # The role is decided from the first 50 characters only.
+                text_start = text[:50]
+
+                matched_role = None
+                matched_style_id = None
+                matched_para_id = None
+                matched_char_id = None
+
+                # Heading patterns may be preceded by bullet-like symbols.
+                # Unicode: ■\u25a0 ▸\u25b8 ◆\u25c6 ▶\u25b6 ●\u25cf ○\u25cb ▪\u25aa ►\u25ba ☞\u261e ★\u2605 ※\u203b ·\u00b7
+                prefix = r'^[\u25a0\u25b8\u25c6\u25b6\u25cf\u25cb\u25aa\u25ba\u261e\u2605\u203b\u00b7\s]*'
+
+                # FIGURE_CAPTION: "[그림 1-1]", "[그림 1-2]", ... (checked first)
+                # \uadf8\ub9bc is the word "그림" (figure).
+                if re.match(r'^\[\uadf8\ub9bc\s*[\d-]+\]', text_start):
+                    matched_role = 'FIGURE_CAPTION'
+                # TABLE_CAPTION: "<표 1-1>", "[표 1-1]", ...
+                # \ud45c is the word "표" (table).
+                elif re.match(r'^[<\[]\ud45c\s*[\d-]+[>\]]', text_start):
+                    matched_role = 'TABLE_CAPTION'
+                # H1: "제1장 ...", "1 개요", ...
+                # NOTE(review): _remove_manual_numbering only strips the
+                # "제N장" form for H1, so a bare "1 " prefix stays in the text.
+                elif re.match(prefix + r'\uc81c?\s*\d+\uc7a5?\s', text_start) or re.match(prefix + r'[1-9]\s+[\uac00-\ud7a3]', text_start):
+                    matched_role = 'H1'
+                # H3: "1.1.1 " (must be tested before H2)
+                elif re.match(prefix + r'\d+\.\d+\.\d+\s', text_start):
+                    matched_role = 'H3'
+                # H2: "1.1 "
+                elif re.match(prefix + r'\d+\.\d+\s', text_start):
+                    matched_role = 'H2'
+                # H4: one Hangul syllable followed by ". " (e.g. "가. ")
+                elif re.match(prefix + r'[\uac00-\ud7a3]\.\s', text_start):
+                    matched_role = 'H4'
+                # H5: "1) "
+                elif re.match(prefix + r'\d+\)\s', text_start):
+                    matched_role = 'H5'
+                # H6: "(1) " or a Hangul syllable followed by ") " (e.g. "가) ")
+                elif re.match(prefix + r'\(\d+\)\s', text_start):
+                    matched_role = 'H6'
+                elif re.match(prefix + r'[\uac00-\ud7a3]\)\s', text_start):
+                    matched_role = 'H6'
+                # LIST_ITEM: a bullet symbol ("○ ", "● ", "• ", ...) or a dash
+                elif re.match(r'^[\u25cb\u25cf\u25e6\u2022\u2023\u25b8]\s', text_start):
+                    matched_role = 'LIST_ITEM'
+                elif re.match(r'^[-\u2013\u2014]\s', text_start):
+                    matched_role = 'LIST_ITEM'
+
+                # Look up the ids for the matched role, if it has any.
+                if matched_role and matched_role in self.role_to_style_id:
+                    matched_style_id = self.role_to_style_id[matched_role]
+                    matched_para_id = self.role_to_para_id[matched_role]
+                    matched_char_id = self.role_to_char_id[matched_role]
+                elif 'BODY' in self.role_to_style_id and len(text) > 20:
+                    # Anything else longer than 20 characters is treated as body text.
+                    matched_role = 'BODY'
+                    matched_style_id = self.role_to_style_id['BODY']
+                    matched_para_id = self.role_to_para_id['BODY']
+                    matched_char_id = self.role_to_char_id['BODY']
+
+                # NOTE(review): truthiness test, so a style id of 0 would be
+                # treated as "no match".
+                if matched_style_id:
+                    # 1. Set (or add) styleIDRef on the <hp:p> tag.
+                    if 'styleIDRef="' in open_tag:
+                        new_open = re.sub(r'styleIDRef="[^"]*"', f'styleIDRef="{matched_style_id}"', open_tag)
+                    else:
+                        new_open = open_tag.replace('<hp:p ', f'<hp:p styleIDRef="{matched_style_id}" ')
+
+                    # 2. Keep paraPrIDRef in step with the style's paraPr.
+                    new_open = re.sub(r'paraPrIDRef="[^"]*"', f'paraPrIDRef="{matched_para_id}"', new_open)
+
+                    # 3. Keep each run's charPrIDRef in step with the style's charPr.
+                    new_inner = re.sub(r'(<hp:run[^>]*charPrIDRef=")[^"]*(")', f'\\g<1>{matched_char_id}\\2', inner)
+
+                    # 4. Outline paragraphs are numbered automatically, so the
+                    #    hand-typed number is removed from the text.
+                    if matched_role in ROLE_STYLES and ROLE_STYLES[matched_role].outline_level >= 0:
+                        new_inner = self._remove_manual_numbering(new_inner, matched_role)
+
+                    total_modified += 1
+                    section_modified += 1
+                    return new_open + new_inner + close_tag
+
+                return match.group(0)
+
+            new_content = re.sub(para_pattern, replace_style, content, flags=re.DOTALL)
+
+            # Adjust table cell sizes.
+            new_content = self._adjust_tables(new_content)
+
+            # Point every outlineShapeIDRef at numbering id 1, the numbering
+            # whose patterns _replace_default_numbering rewrites.
+            new_content = re.sub(
+                r'outlineShapeIDRef="[^"]*"',
+                'outlineShapeIDRef="1"',
+                new_content
+            )
+
+
+            # Put the saved headers and footers back.
+            for key, original in header_footer_map.items():
+                new_content = new_content.replace(key, original)
+
+            print(f"   [DEBUG] {section_file.name}: {section_modified} paras modified, content changed: {new_content != original_content}")
+
+            # Write the file only when something actually changed.
+            if new_content != original_content:
+                section_file.write_text(new_content, encoding='utf-8')
+                print(f"   -> {section_file.name} saved")
+
+        print(f"   -> Total {total_modified} paragraphs styled")
+    
+    def _update_para_style(self, content: str, para_idx: int, style_id: int) -> str:
+        """Set ``styleIDRef`` on the ``para_idx``-th ``<hp:p>`` opening tag.
+
+        Args:
+            content: Section XML text.
+            para_idx: Index into the list of ``<hp:p ...>`` opening tags.
+            style_id: Style id to write.
+
+        Returns:
+            The updated text; ``content`` unchanged when ``para_idx`` is not
+            below the number of opening tags found.
+        """
+        openings = list(re.finditer(r'<hp:p\s[^>]*>', content))
+        if not para_idx < len(openings):
+            return content
+
+        target = openings[para_idx]
+        tag = target.group(0)
+        style_attr = f'styleIDRef="{style_id}"'
+
+        if 'styleIDRef=' not in tag:
+            # No such attribute yet: add it as the first attribute.
+            updated = tag.replace('<hp:p ', f'<hp:p {style_attr} ')
+        else:
+            updated = re.sub(r'styleIDRef="[^"]*"', style_attr, tag)
+
+        return ''.join((content[:target.start()], updated, content[target.end():]))
+    
+    def _remove_manual_numbering(self, inner: str, role: str) -> str:
+        """Strip a hand-typed outline number from a paragraph's first text run.
+
+        Only the first attribute-less ``<hp:t>...</hp:t>`` element in ``inner``
+        is examined, and only a number at the very start of its text is
+        removed. For example an H1 text "제1장 DX 개요" becomes "DX 개요" and
+        an H2 text "1.1 측량 DX" becomes "측량 DX".
+
+        Args:
+            inner: XML between a paragraph's opening and closing tags.
+            role: Role name; roles other than H1-H7 leave ``inner`` untouched.
+
+        Returns:
+            ``inner`` with the leading number removed where one matched.
+        """
+        # Leading-number pattern for each outline role.
+        numbering_prefixes = {
+            'H1': r'^(제\s*\d+\s*장\s*)',          # chapter form, e.g. "제1장 "
+            'H2': r'^(\d+\.\d+\s+)',               # "1.1 "
+            'H3': r'^(\d+\.\d+\.\d+\s+)',          # "1.1.1 "
+            'H4': r'^([가-힣]\.\s+)',              # Hangul syllable + ". "
+            'H5': r'^(\d+\)\s+)',                  # "1) "
+            'H6': r'^([가-힣]\)\s+|\(\d+\)\s+)',   # Hangul syllable + ") " or "(1) "
+            'H7': r'^([①②③④⑤⑥⑦⑧⑨⑩]+\s*)',   # circled digits
+        }
+
+        prefix_re = numbering_prefixes.get(role)
+        if prefix_re is None:
+            return inner
+
+        # count=1 on the outer substitution limits the change to the first
+        # <hp:t> element.
+        return re.sub(
+            r'<hp:t>([^<]*)</hp:t>',
+            lambda t: '<hp:t>' + re.sub(prefix_re, '', t.group(1), count=1) + '</hp:t>',
+            inner,
+            count=1,
+        )
+    
+    def _repack_hwpx(self, output_path: str):
+        """Zip the work directory back into an HWPX archive.
+
+        The archive is first written to ``output_path + ".tmp"`` and then moved
+        over ``output_path`` with ``os.replace``, so the existing file is never
+        deleted before its replacement is in place. The move is attempted up
+        to three times when a ``PermissionError`` is raised.
+
+        ``mimetype`` is written first and uncompressed; all other files are
+        deflated and added in sorted order.
+
+        Args:
+            output_path: Destination path of the archive.
+
+        Returns:
+            The path the archive ended up at: ``output_path`` normally, or the
+            ``.tmp`` path when the destination could not be replaced.
+        """
+        import time  # local import: only needed for the retry back-off below
+
+        print(f"   [DEBUG] Repacking to: {output_path}")
+        print(f"   [DEBUG] Source dir: {self.temp_dir}")
+
+        # Report section file sizes before zipping.
+        for sec in ['section0.xml', 'section1.xml', 'section2.xml']:
+            sec_path = self.temp_dir / "Contents" / sec
+            if sec_path.exists():
+                print(f"   [DEBUG] {sec} size before zip: {sec_path.stat().st_size} bytes")
+
+        # Write to a temporary file first, so a locked destination does not
+        # break the zip step itself.
+        temp_output = output_path + ".tmp"
+
+        with zipfile.ZipFile(temp_output, 'w', zipfile.ZIP_DEFLATED) as zf:
+            # mimetype goes first and is stored without compression.
+            mimetype_path = self.temp_dir / "mimetype"
+            if mimetype_path.exists():
+                zf.write(mimetype_path, "mimetype", compress_type=zipfile.ZIP_STORED)
+
+            # Everything else, in a deterministic order.
+            file_count = 0
+            for root, dirs, files in os.walk(self.temp_dir):
+                dirs.sort()
+                for file in sorted(files):
+                    if file == "mimetype":
+                        continue
+                    file_path = Path(root) / file
+                    arcname = file_path.relative_to(self.temp_dir)
+                    zf.write(file_path, arcname)
+                    file_count += 1
+
+            print(f"   [DEBUG] Total files zipped: {file_count}")
+
+        # Move the temporary file over the destination. os.replace overwrites
+        # in one step, so a failure leaves the previous file untouched.
+        for attempt in range(3):
+            try:
+                os.replace(temp_output, output_path)
+                break
+            except PermissionError:
+                print(f"   [DEBUG] 파일 잠금 대기 중... ({attempt + 1}/3)")
+                time.sleep(0.5)
+        else:
+            # All three attempts failed: keep the archive under its temporary name.
+            print(f"   [경고] 원본 덮어쓰기 실패, 임시 파일 사용: {temp_output}")
+            output_path = temp_output
+
+        # Report the size of the resulting file.
+        print(f"   [DEBUG] Output file size: {Path(output_path).stat().st_size} bytes")
+        return output_path
+
+
+def inject_styles_to_hwpx(hwpx_path: str, elements: list) -> str:
+    """Build the role-position table from styled elements and inject styles.
+
+    Each element is recorded under its ``role`` as ``(0, index)``: the section
+    index is always 0 and the paragraph index is the element's position in
+    ``elements``.
+
+    Args:
+        hwpx_path: Path of the HWPX file to modify.
+        elements: Styled elements, each exposing a ``role`` attribute.
+
+    Returns:
+        The path returned by ``HwpxStyleInjector.inject``.
+    """
+    positions_by_role: Dict[str, List[tuple]] = {}
+    for para_idx, element in enumerate(elements):
+        # Section index is fixed at 0 for now.
+        positions_by_role.setdefault(element.role, []).append((0, para_idx))
+
+    return HwpxStyleInjector().inject(hwpx_path, positions_by_role)
+
+
+# Manual check when the module is run as a script.
+if __name__ == "__main__":
+    # Sample role positions in the {role: [(section_idx, para_idx), ...]} form
+    # that HwpxStyleInjector.inject expects. They are only used by the
+    # commented-out call below.
+    test_positions = {
+        'H1': [(0, 0), (0, 5)],
+        'H2': [(0, 1), (0, 6)],
+        'BODY': [(0, 2), (0, 3), (0, 4)],
+    }
+    
+    # injector = HwpxStyleInjector()
+    # injector.inject("test.hwpx", test_positions)
+    print("HwpxStyleInjector 모듈 로드 완료")
--- a/Code/geulbeot_5th/converters/hwpx_table_injector.py
+++ b/Code/geulbeot_5th/converters/hwpx_table_injector.py
@@ -0,0 +1,174 @@
+# -*- coding: utf-8 -*-
+"""
+HWPX 표 열 너비 수정기 v2
+표 생성 후 HWPX 파일을 직접 수정하여 열 너비 적용
+"""
+
+import zipfile
+import re
+from pathlib import Path
+import tempfile
+import shutil
+
+# mm -> HWPML unit conversion: 7200 units per 25.4 mm (1 mm ≈ 283.46 HWPML units)
+MM_TO_HWPML = 7200 / 25.4  # ≈ 283.46
+
+
+def inject_table_widths(hwpx_path: str, table_widths_list: list):
+    """
+    HWPX 파일의 표 열 너비를 수정
+    
+    Args:
+        hwpx_path: HWPX 파일 경로
+        table_widths_list: [[w1, w2, w3], [w1, w2], ...] 형태 (mm 단위)
+    """
+    if not table_widths_list:
+        print("   [INFO] 수정할 표 없음")
+        return
+    
+    print(f"📐 HWPX 표 열 너비 수정 시작... ({len(table_widths_list)}개 표)")
+    
+    # HWPX 압축 해제
+    temp_dir = Path(tempfile.mkdtemp(prefix="hwpx_table_"))
+    
+    with zipfile.ZipFile(hwpx_path, 'r') as zf:
+        zf.extractall(temp_dir)
+    
+    # section*.xml 파일들에서 표 찾기
+    contents_dir = temp_dir / "Contents"
+    
+    table_idx = 0
+    total_modified = 0
+    
+    for section_file in sorted(contents_dir.glob("section*.xml")):
+        with open(section_file, 'r', encoding='utf-8') as f:
+            content = f.read()
+        
+        original_content = content
+        
+        # 모든 표(<hp:tbl>...</hp:tbl>) 찾기
+        tbl_pattern = re.compile(r'(<hp:tbl\b[^>]*>)(.*?)(</hp:tbl>)', re.DOTALL)
+        
+        def process_table(match):
+            nonlocal table_idx, total_modified
+            
+            if table_idx >= len(table_widths_list):
+                return match.group(0)
+            
+            tbl_open = match.group(1)
+            tbl_content = match.group(2)
+            tbl_close = match.group(3)
+            
+            col_widths_mm = table_widths_list[table_idx]
+            col_widths_hwpml = [int(w * MM_TO_HWPML) for w in col_widths_mm]
+            
+            # 표 전체 너비 수정 (hp:sz width="...")
+            total_width = int(sum(col_widths_mm) * MM_TO_HWPML)
+            tbl_content = re.sub(
+                r'(<hp:sz\s+width=")(\d+)(")',
+                lambda m: f'{m.group(1)}{total_width}{m.group(3)}',
+                tbl_content,
+                count=1
+            )
+            
+            # 각 셀의 cellSz width 수정
+            # 방법: colAddr별로 너비 매핑
+            def replace_cell_width(tc_match):
+                tc_content = tc_match.group(0)
+                
+                # colAddr 추출
+                col_addr_match = re.search(r'<hp:cellAddr\s+colAddr="(\d+)"', tc_content)
+                if not col_addr_match:
+                    return tc_content
+                
+                col_idx = int(col_addr_match.group(1))
+                if col_idx >= len(col_widths_hwpml):
+                    return tc_content
+                
+                new_width = col_widths_hwpml[col_idx]
+                
+                # cellSz width 교체
+                tc_content = re.sub(
+                    r'(<hp:cellSz\s+width=")(\d+)(")',
+                    lambda m: f'{m.group(1)}{new_width}{m.group(3)}',
+                    tc_content
+                )
+                
+                return tc_content
+            
+            # 각 <hp:tc>...</hp:tc> 블록 처리
+            tbl_content = re.sub(
+                r'<hp:tc\b[^>]*>.*?</hp:tc>',
+                replace_cell_width,
+                tbl_content,
+                flags=re.DOTALL
+            )
+            
+            print(f"   ✅ 표 #{table_idx + 1}: {col_widths_mm} mm → HWPML 적용")
+            table_idx += 1
+            total_modified += 1
+            
+            return tbl_open + tbl_content + tbl_close
+        
+        # 표 처리
+        new_content = tbl_pattern.sub(process_table, content)
+        
+        # 변경사항 있으면 저장
+        if new_content != original_content:
+            with open(section_file, 'w', encoding='utf-8') as f:
+                f.write(new_content)
+            print(f"   → {section_file.name} 저장됨")
+    
+    # 다시 압축
+    repack_hwpx(temp_dir, hwpx_path)
+    
+    # 임시 폴더 삭제
+    shutil.rmtree(temp_dir)
+    
+    print(f"   ✅ 총 {total_modified}개 표 열 너비 수정 완료")
+
+
+def repack_hwpx(source_dir: Path, output_path: str):
+    """HWPX 파일 다시 압축"""
+    import os
+    import time
+    
+    temp_output = output_path + ".tmp"
+    
+    with zipfile.ZipFile(temp_output, 'w', zipfile.ZIP_DEFLATED) as zf:
+        # mimetype은 압축 없이 첫 번째로
+        mimetype_path = source_dir / "mimetype"
+        if mimetype_path.exists():
+            zf.write(mimetype_path, "mimetype", compress_type=zipfile.ZIP_STORED)
+        
+        # 나머지 파일들
+        for root, dirs, files in os.walk(source_dir):
+            for file in files:
+                if file == "mimetype":
+                    continue
+                file_path = Path(root) / file
+                arcname = file_path.relative_to(source_dir)
+                zf.write(file_path, arcname)
+    
+    # 원본 교체
+    for attempt in range(3):
+        try:
+            if os.path.exists(output_path):
+                os.remove(output_path)
+            os.rename(temp_output, output_path)
+            break
+        except PermissionError:
+            time.sleep(0.5)
+
+
+# Manual test entry point
+if __name__ == "__main__":
+    # Sample column widths in mm, one inner list per table
+    test_widths = [
+        [18.2, 38.9, 42.8, 70.1],
+        [19.9, 79.6, 70.5],
+        [28.7, 81.4, 59.9],
+        [19.2, 61.4, 89.5],
+    ]
+    
+    # NOTE(review): hard-coded, machine-specific path - adjust before running
+    hwpx_path = r"C:\Users\User\AppData\Local\Temp\geulbeot_output.hwpx"
+    inject_table_widths(hwpx_path, test_widths)
--- a/Code/geulbeot_5th/converters/pipeline/init.py
+++ b/Code/geulbeot_5th/converters/pipeline/init.py
@@ -0,0 +1 @@
+from .router import process_document, is_long_document
--- a/Code/geulbeot_5th/converters/pipeline/router.py
+++ b/Code/geulbeot_5th/converters/pipeline/router.py
@@ -0,0 +1,139 @@
+# -*- coding: utf-8 -*-
+"""
+router.py
+
+기능:
+- HTML 입력의 분량을 판단하여 적절한 파이프라인으로 분기
+- 긴 문서 (5000자 이상): RAG 파이프라인 (step3→4→5→6→7→8→9)
+- 짧은 문서 (5000자 미만): 직접 생성 (step7→8→9)
+"""
+
+import re
+import os
+from typing import Dict, Any
+
+# Length threshold used to classify a document
+LONG_DOC_THRESHOLD = 5000  # 5000 characters or more counts as a long document
+
+# Image assets path (fixed for development) - the r prefix is required!
+ASSETS_BASE_PATH = r"D:\for python\geulbeot-light\geulbeot-light\output\assets"
+
+def count_characters(html_content: str) -> int:
+    """HTML 태그 제외한 순수 텍스트 글자 수 계산"""
+    # HTML 태그 제거
+    text_only = re.sub(r'<[^>]+>', '', html_content)
+    # 공백 정리
+    text_only = ' '.join(text_only.split())
+    return len(text_only)
+
+
+def is_long_document(html_content: str) -> bool:
+    """긴 문서 여부 판단"""
+    char_count = count_characters(html_content)
+    return char_count >= LONG_DOC_THRESHOLD
+
+def convert_image_paths(html_content: str) -> str:
+    """
+    HTML 내 상대 이미지 경로를 서버 경로로 변환
+    assets/xxx.png → /assets/xxx.png
+    """
+    result = re.sub(r'src="assets/', 'src="/assets/', html_content)
+    return result
+    
+    def replace_src(match):
+        original_path = match.group(1)
+        # 이미 절대 경로이거나 URL이면 그대로
+        if original_path.startswith(('http://', 'https://', 'file://', 'D:', 'C:')):
+            return match.group(0)
+        
+        # assets/로 시작하면 절대 경로로 변환
+        if original_path.startswith('assets/'):
+            filename = original_path.replace('assets/', '')
+            absolute_path = os.path.join(ASSETS_BASE_PATH, filename)
+            return f'src="{absolute_path}"'
+        
+        return match.group(0)
+    
+    # src="..." 패턴 찾아서 변환
+    result = re.sub(r'src="([^"]+)"', replace_src, html_content)
+    return result
+
+def run_short_pipeline(html_content: str, options: dict) -> Dict[str, Any]:
+    """
+    짧은 문서 파이프라인 (5000자 미만)
+    """
+    try:
+        # 이미지 경로 변환
+        processed_html = convert_image_paths(html_content)
+        
+        # TODO: step7, step8, step9 연동
+        return {
+            'success': True,
+            'pipeline': 'short',
+            'char_count': count_characters(html_content),
+            'html': processed_html
+        }
+    except Exception as e:
+        return {
+            'success': False,
+            'error': str(e),
+            'pipeline': 'short'
+        }
+
+
+def run_long_pipeline(html_content: str, options: dict) -> Dict[str, Any]:
+    """
+    긴 문서 파이프라인 (5000자 이상)
+    """
+    try:
+        # 이미지 경로 변환
+        processed_html = convert_image_paths(html_content)
+        
+        # TODO: step3~9 순차 실행
+        return {
+            'success': True,
+            'pipeline': 'long',
+            'char_count': count_characters(html_content),
+            'html': processed_html
+        }
+    except Exception as e:
+        return {
+            'success': False,
+            'error': str(e),
+            'pipeline': 'long'
+        }
+
+
+def process_document(content: str, options: dict = None) -> Dict[str, Any]:
+    """
+    메인 라우터 함수
+    - 분량에 따라 적절한 파이프라인으로 분기
+    
+    Args:
+        content: HTML 문자열
+        options: 추가 옵션 (page_option, instruction 등)
+    
+    Returns:
+        {'success': bool, 'html': str, 'pipeline': str, ...}
+    """
+    if options is None:
+        options = {}
+    
+    if not content or not content.strip():
+        return {
+            'success': False,
+            'error': '내용이 비어있습니다.'
+        }
+    
+    char_count = count_characters(content)
+    
+    if is_long_document(content):
+        result = run_long_pipeline(content, options)
+    else:
+        result = run_short_pipeline(content, options)
+    
+    # 공통 정보 추가
+    result['char_count'] = char_count
+    result['threshold'] = LONG_DOC_THRESHOLD
+    
+    return result
--- a/Code/geulbeot_5th/converters/pipeline/step1_convert.py
+++ b/Code/geulbeot_5th/converters/pipeline/step1_convert.py
@@ -0,0 +1,784 @@
+"""
+측량/GIS/드론 관련 자료 PDF 변환 및 정리 시스템
+- 모든 파일 형식을 PDF로 변환
+- DWG 파일: DWG TrueView를 사용한 자동 PDF 변환
+- 동영상 파일: Whisper를 사용한 음성→텍스트 변환 후 PDF 생성
+- 원본 경로와 변환 파일 경로를 엑셀로 관리
+"""
+
+import os
+import shutil
+from pathlib import Path
+from datetime import datetime
+import openpyxl
+from openpyxl.styles import Font, PatternFill, Alignment
+import win32com.client
+import pythoncom
+from PIL import Image
+import subprocess
+import json
+
+class SurveyingFileConverter:
+    def _dbg(self, msg):
+        if getattr(self, "debug", False):
+            print(msg)
+
+    def _ensure_ffmpeg_on_path(self):
+        """
+        Make an ``ffmpeg`` executable resolvable through PATH.
+
+        If ``shutil.which("ffmpeg")`` already finds one, it is recorded in
+        ``self.ffmpeg_exe``. Otherwise the binary reported by
+        ``imageio_ffmpeg`` is copied to ``<output_dir>/tools_ffmpeg/ffmpeg.exe``
+        and that directory is prepended to ``os.environ["PATH"]``.
+
+        Returns:
+            True when ffmpeg is resolvable afterwards, False otherwise
+            (including any exception, e.g. ``imageio_ffmpeg`` being absent).
+        """
+        import os
+        import shutil
+        from pathlib import Path
+
+        # Fast path: ffmpeg is already on PATH.
+        found = shutil.which("ffmpeg")
+        self._dbg(f"DEBUG ffmpeg which before: {found}")
+        if found:
+            self.ffmpeg_exe = found
+            return True
+
+        try:
+            import imageio_ffmpeg
+            
+            src = Path(imageio_ffmpeg.get_ffmpeg_exe())
+            self._dbg(f"DEBUG imageio ffmpeg exe: {src}")
+            self._dbg(f"DEBUG imageio ffmpeg exists: {src.exists()}")
+
+            if not src.exists():
+                return False
+
+            tools_dir = Path(self.output_dir) / "tools_ffmpeg"
+            tools_dir.mkdir(parents=True, exist_ok=True)
+
+            # Copy under the plain name "ffmpeg.exe" so which("ffmpeg") can find it.
+            dst = tools_dir / "ffmpeg.exe"
+
+            if not dst.exists():
+                shutil.copyfile(str(src), str(dst))
+
+            # Prepend so the copied binary is found first.
+            os.environ["PATH"] = str(tools_dir) + os.pathsep + os.environ.get("PATH", "")
+
+            found2 = shutil.which("ffmpeg")
+            self._dbg(f"DEBUG ffmpeg which after: {found2}")
+
+            if found2:
+                self.ffmpeg_exe = found2
+                return True
+
+            return False
+
+        except Exception as e:
+            self._dbg(f"DEBUG ensure ffmpeg error: {e}")
+            return False
+
+
+    def __init__(self, source_dir, output_dir):
+        """
+        Set up the converter.
+
+        Args:
+            source_dir: Root directory of the files to convert.
+            output_dir: Root directory for converted output; created if missing.
+        """
+        self.source_dir = Path(source_dir)
+        self.output_dir = Path(output_dir)
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+
+        # Debug output is switched on unconditionally here.
+        self.debug = True
+        self.ffmpeg_exe = None
+        # Needs self.output_dir (set above) for the fallback ffmpeg copy.
+        ok = self._ensure_ffmpeg_on_path()
+        self._dbg(f"DEBUG ensure_ffmpeg_on_path result: {ok}")
+
+        # List collecting one log entry per processed file
+        self.conversion_log = []
+        
+        # Domain term glossary (passed to Whisper as initial_prompt when non-empty)
+        self.domain_terms = ""
+
+        # Candidate HWP security module names, tried in order
+        self.hwp_security_modules = [
+            "FilePathCheckerModuleExample",
+            "SecurityModule",
+            ""
+        ]
+    
+        # Supported file extensions, grouped by conversion strategy
+        self.image_extensions = {'.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tiff', '.tif', '.webp'}
+        self.office_extensions = {'.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.hwp', '.hwpx'}
+        self.video_extensions = {'.mp4', '.avi', '.mov', '.wmv', '.flv', '.mkv', '.m4v'}
+        self.text_extensions = {'.txt', '.csv', '.log', '.md'}
+        self.pdf_extension = {'.pdf'}
+        self.dwg_extensions = {'.dwg', '.dxf'}
+        
+        # DWG TrueView executable path (None when no known install is found)
+        self.trueview_path = self._find_trueview()
+        
+    def _find_trueview(self):
+        """DWG TrueView 설치 경로 자동 탐색"""
+        possible_paths = [
+            r"C:\Program Files\Autodesk\DWG TrueView 2025\dwgviewr.exe",
+            r"C:\Program Files\Autodesk\DWG TrueView 2024\dwgviewr.exe",
+            r"C:\Program Files\Autodesk\DWG TrueView 2023\dwgviewr.exe",
+            r"C:\Program Files (x86)\Autodesk\DWG TrueView 2025\dwgviewr.exe",
+            r"C:\Program Files (x86)\Autodesk\DWG TrueView 2024\dwgviewr.exe",
+        ]
+        
+        for path in possible_paths:
+            if Path(path).exists():
+                return path
+        
+        return None
+        
+    def get_all_files(self):
+        """하위 모든 폴더의 파일 목록 가져오기"""
+        all_files = []
+        for file_path in self.source_dir.rglob('*'):
+            if file_path.is_file():
+                all_files.append(file_path)
+        return all_files
+        
+    def extract_audio_from_video(self, video_path, audio_output_path):
+        try:
+            import imageio_ffmpeg
+            from pathlib import Path
+
+            ffmpeg_exe = imageio_ffmpeg.get_ffmpeg_exe()
+            self._dbg(f"DEBUG extract ffmpeg_exe: {ffmpeg_exe}")
+            self._dbg(f"DEBUG extract ffmpeg_exe exists: {Path(ffmpeg_exe).exists()}")
+            self._dbg(f"DEBUG extract input exists: {Path(video_path).exists()}")
+            self._dbg(f"DEBUG extract out path: {audio_output_path}")
+
+            cmd = [
+                ffmpeg_exe,
+                "-i", str(video_path),
+                "-vn",
+                "-acodec", "pcm_s16le",
+                "-ar", "16000",
+                "-ac", "1",
+                "-y",
+                str(audio_output_path),
+            ]
+            self._dbg("DEBUG extract cmd: " + " ".join(cmd))
+
+            result = subprocess.run(cmd, capture_output=True, timeout=300, check=True, text=True)
+            self._dbg(f"DEBUG extract returncode: {result.returncode}")
+            self._dbg(f"DEBUG extract stderr tail: {(result.stderr or '')[-300:]}")
+            return True
+
+        except subprocess.CalledProcessError as e:
+            self._dbg(f"DEBUG extract CalledProcessError returncode: {e.returncode}")
+            self._dbg(f"DEBUG extract stderr tail: {(e.stderr or '')[-300:]}")
+            return False
+        except Exception as e:
+            self._dbg(f"DEBUG extract exception: {e}")
+            return False
+    
+    def transcribe_audio_with_whisper(self, audio_path):
+        """
+        Transcribe an audio file with Whisper (``language="ko"``).
+
+        Args:
+            audio_path: Path of the audio file to transcribe.
+
+        Returns:
+            The transcript after ``clean_transcript`` post-processing, or a
+            bracketed Korean message when the audio file is missing/empty or
+            when any exception occurs (no exception is raised).
+        """
+        try:
+            self._ensure_ffmpeg_on_path()
+            
+            import shutil
+            from pathlib import Path
+            
+            ffmpeg_path = shutil.which("ffmpeg")
+            self._dbg(f"DEBUG whisper ffmpeg which: {ffmpeg_path}")
+            
+            # Fallback: put the directory of the known ffmpeg binary on PATH.
+            if not ffmpeg_path:
+                if self.ffmpeg_exe:
+                    import os
+                    os.environ["PATH"] = str(Path(self.ffmpeg_exe).parent) + os.pathsep + os.environ.get("PATH", "")
+            
+            audio_file = Path(audio_path)
+            self._dbg(f"DEBUG whisper audio exists: {audio_file.exists()}")
+            self._dbg(f"DEBUG whisper audio size: {audio_file.stat().st_size if audio_file.exists() else 'NA'}")
+            
+            if not audio_file.exists() or audio_file.stat().st_size == 0:
+                return "[오디오 파일이 비어있거나 존재하지 않음]"
+            
+            import whisper
+            # NOTE(review): the model is loaded on every call - consider
+            # caching it on the instance if many videos are processed.
+            model = whisper.load_model("medium")  # changed from "base" to "medium"
+            
+            # Use domain_terms as the initial_prompt when it is non-empty
+            result = model.transcribe(
+                str(audio_path),
+                language="ko",
+                task="transcribe",
+                initial_prompt=self.domain_terms if self.domain_terms else None,
+                condition_on_previous_text=True,  # set back to True
+            )
+
+            # Post-processing: remove repetitions and stray text
+            text = result["text"]
+            text = self.clean_transcript(text)
+            return text
+
+        except Exception as e:
+            import traceback
+            self._dbg(f"DEBUG whisper traceback: {traceback.format_exc()}")
+            return f"[음성 인식 실패: {str(e)}]"
+
+    def clean_transcript(self, text):
+        """Whisper 결과 후처리 - 반복/환각 제거"""
+        import re
+            
+        # 1. 영어/일본어/중국어 환각 제거
+        text = re.sub(r'[A-Za-z]{3,}', '', text)  # 3글자 이상 영어 제거
+        text = re.sub(r'[\u3040-\u309F\u30A0-\u30FF]+', '', text)  # 일본어 제거
+        text = re.sub(r'[\u4E00-\u9FFF]+', '', text)  # 한자 제거 (필요시)
+            
+        # 2. 반복 문장 제거
+        sentences = text.split('.')
+        seen = set()
+        unique_sentences = []
+        for s in sentences:
+            s_clean = s.strip()
+            if s_clean and s_clean not in seen:
+                seen.add(s_clean)
+                unique_sentences.append(s_clean)
+            
+        text = '. '.join(unique_sentences)
+            
+        # 3. 이상한 문자 정리
+        text = re.sub(r'\s+', ' ', text)  # 다중 공백 제거
+        text = text.strip()
+            
+        return text
+
+    def get_video_transcript(self, video_path):
+        """동영상 파일의 음성을 텍스트로 변환"""
+        try:
+            # 임시 오디오 파일 경로
+            temp_audio = video_path.parent / f"{video_path.stem}_temp_audio.wav"
+            
+            # 1. 동영상에서 오디오 추출
+            if not self.extract_audio_from_video(video_path, temp_audio):
+                return self.get_basic_file_info(video_path) + "\n\n[오디오 추출 실패]"
+            if (not temp_audio.exists()) or temp_audio.stat().st_size == 0:
+                return self.get_basic_file_info(video_path) + "\n\n[오디오 파일 생성 실패]"
+            
+            # 2. Whisper로 음성 인식
+            transcript = self.transcribe_audio_with_whisper(temp_audio)
+            
+            # 3. 임시 오디오 파일 삭제
+            if temp_audio.exists():
+                temp_audio.unlink()
+            
+            # 4. 결과 포맷팅
+            stat = video_path.stat()
+            lines = []
+            lines.append(f"동영상 파일 음성 전사 (Speech-to-Text)")
+            lines.append(f"=" * 60)
+            lines.append(f"파일명: {video_path.name}")
+            lines.append(f"경로: {video_path}")
+            lines.append(f"파일 크기: {self.format_file_size(stat.st_size)}")
+            lines.append(f"생성일: {datetime.fromtimestamp(stat.st_ctime).strftime('%Y-%m-%d %H:%M:%S')}")
+            lines.append("")
+            lines.append("=" * 60)
+            lines.append("음성 내용:")
+            lines.append("=" * 60)
+            lines.append("")
+            lines.append(transcript)
+            
+            return "\n".join(lines)
+                
+        except Exception as e:
+            return self.get_basic_file_info(video_path) + f"\n\n[음성 인식 오류: {str(e)}]"
+    
+    def convert_dwg_to_pdf_trueview(self, dwg_path, pdf_path):
+        """DWG TrueView를 사용한 DWG → PDF 변환"""
+        if not self.trueview_path:
+            return False, "DWG TrueView가 설치되지 않음"
+        
+        try:
+            # AutoCAD 스크립트 생성
+            script_content = f"""_-EXPORT_PDF{pdf_path}_Y"""            
+            script_path = dwg_path.parent / f"{dwg_path.stem}_plot.scr"
+            with open(script_path, 'w') as f:
+                f.write(script_content)
+            
+            # TrueView 실행
+            cmd = [
+                self.trueview_path,
+                str(dwg_path.absolute()),
+                "/b", str(script_path.absolute()),
+                "/nologo"
+            ]
+            
+            result = subprocess.run(cmd, timeout=120, capture_output=True)
+            
+            # 스크립트 파일 삭제
+            if script_path.exists():
+                try:
+                    script_path.unlink()
+                except:
+                    pass
+            
+            # PDF 생성 확인
+            if pdf_path.exists():
+                return True, "성공"
+            else:
+                return False, "PDF 생성 실패"
+                
+        except subprocess.TimeoutExpired:
+            return False, "변환 시간 초과"
+        except Exception as e:
+            return False, f"DWG 변환 실패: {str(e)}"
+    
+    def get_basic_file_info(self, file_path):
+        """기본 파일 정보 반환"""
+        stat = file_path.stat()
+        lines = []
+        lines.append(f"파일 정보")
+        lines.append(f"=" * 60)
+        lines.append(f"파일명: {file_path.name}")
+        lines.append(f"경로: {file_path}")
+        lines.append(f"파일 크기: {self.format_file_size(stat.st_size)}")
+        lines.append(f"생성일: {datetime.fromtimestamp(stat.st_ctime).strftime('%Y-%m-%d %H:%M:%S')}")
+        lines.append(f"수정일: {datetime.fromtimestamp(stat.st_mtime).strftime('%Y-%m-%d %H:%M:%S')}")
+        return "\n".join(lines)
+    
+    def format_file_size(self, size_bytes):
+        """파일 크기를 읽기 쉬운 형식으로 변환"""
+        for unit in ['B', 'KB', 'MB', 'GB']:
+            if size_bytes < 1024.0:
+                return f"{size_bytes:.2f} {unit}"
+            size_bytes /= 1024.0
+        return f"{size_bytes:.2f} TB"
+    
+    def convert_image_to_pdf(self, image_path, output_path):
+        """이미지 파일을 PDF로 변환"""
+        try:
+            img = Image.open(image_path)
+            # RGB 모드로 변환 (RGBA나 다른 모드 처리)
+            if img.mode in ('RGBA', 'LA', 'P'):
+                # 흰색 배경 생성
+                background = Image.new('RGB', img.size, (255, 255, 255))
+                if img.mode == 'P':
+                    img = img.convert('RGBA')
+                background.paste(img, mask=img.split()[-1] if img.mode == 'RGBA' else None)
+                img = background
+            elif img.mode != 'RGB':
+                img = img.convert('RGB')
+            
+            img.save(output_path, 'PDF', resolution=100.0)
+            return True, "성공"
+        except Exception as e:
+            return False, f"이미지 변환 실패: {str(e)}"
+    
+    def convert_office_to_pdf(self, file_path, output_path):
+        """Office 문서를 PDF로 변환"""
+        pythoncom.CoInitialize()
+        try:
+            ext = file_path.suffix.lower()
+            
+            if ext in {'.hwp', '.hwpx'}:
+                return self.convert_hwp_to_pdf(file_path, output_path)
+            elif ext in {'.doc', '.docx'}:
+                return self.convert_word_to_pdf(file_path, output_path)
+            elif ext in {'.xls', '.xlsx'}:
+                return self.convert_excel_to_pdf(file_path, output_path)
+            elif ext in {'.ppt', '.pptx'}:
+                return self.convert_ppt_to_pdf(file_path, output_path)
+            else:
+                return False, "지원하지 않는 Office 형식"
+                
+        except Exception as e:
+            return False, f"Office 변환 실패: {str(e)}"
+        finally:
+            pythoncom.CoUninitialize()
+    
+    def convert_word_to_pdf(self, file_path, output_path):
+        """Word 문서를 PDF로 변환"""
+        try:
+            word = win32com.client.Dispatch("Word.Application")
+            word.Visible = False
+            doc = word.Documents.Open(str(file_path.absolute()))
+            doc.SaveAs(str(output_path.absolute()), FileFormat=17)  # 17 = PDF
+            doc.Close()
+            word.Quit()
+            return True, "성공"
+        except Exception as e:
+            return False, f"Word 변환 실패: {str(e)}"
+    
+    def convert_excel_to_pdf(self, file_path, output_path):
+        """Excel 파일을 PDF로 변환 - 열 너비에 맞춰 출력"""
+        try:
+            excel = win32com.client.Dispatch("Excel.Application")
+            excel.Visible = False
+            wb = excel.Workbooks.Open(str(file_path.absolute()))
+            
+            # 모든 시트에 대해 페이지 설정
+            for ws in wb.Worksheets:
+                # 페이지 설정
+                ws.PageSetup.Zoom = False  # 자동 크기 조정 비활성화
+                ws.PageSetup.FitToPagesWide = 1  # 너비를 1페이지에 맞춤
+                ws.PageSetup.FitToPagesTall = False  # 높이는 자동 (내용에 따라)
+                
+                # 여백 최소화 (단위: 포인트, 1cm ≈ 28.35 포인트)
+                ws.PageSetup.LeftMargin = excel.CentimetersToPoints(1)
+                ws.PageSetup.RightMargin = excel.CentimetersToPoints(1)
+                ws.PageSetup.TopMargin = excel.CentimetersToPoints(1)
+                ws.PageSetup.BottomMargin = excel.CentimetersToPoints(1)
+                
+                # 용지 방향 자동 결정 (가로가 긴 경우 가로 방향)
+                used_range = ws.UsedRange
+                if used_range.Columns.Count > used_range.Rows.Count:
+                    ws.PageSetup.Orientation = 2  # xlLandscape (가로)
+                else:
+                    ws.PageSetup.Orientation = 1  # xlPortrait (세로)
+            
+            # PDF로 저장
+            wb.ExportAsFixedFormat(0, str(output_path.absolute()))  # 0 = PDF
+            wb.Close()
+            excel.Quit()
+            return True, "성공"
+        except Exception as e:
+            return False, f"Excel 변환 실패: {str(e)}"
+            
+
+    def convert_ppt_to_pdf(self, file_path, output_path):
+        """PowerPoint 파일을 PDF로 변환"""
+        try:
+            ppt = win32com.client.Dispatch("PowerPoint.Application")
+            ppt.Visible = True
+            presentation = ppt.Presentations.Open(str(file_path.absolute()))
+            presentation.SaveAs(str(output_path.absolute()), 32)  # 32 = PDF
+            presentation.Close()
+            ppt.Quit()
+            return True, "성공"
+        except Exception as e:
+            return False, f"PowerPoint 변환 실패: {str(e)}"
+    
+    def convert_hwp_to_pdf(self, file_path, output_path):
+        """
+        Convert an HWP/HWPX document to PDF through the HWP COM object.
+
+        Args:
+            file_path: ``Path`` of the source document.
+            output_path: ``Path`` of the PDF to write; its parent directory is
+                created if missing.
+
+        Returns:
+            ``(True, "성공")`` when a non-empty PDF exists afterwards,
+            otherwise ``(False, <reason>)``. No exception is raised.
+        """
+        hwp = None
+        try:
+            output_path.parent.mkdir(parents=True, exist_ok=True)
+
+            # Try the gencache (early-bound) wrapper first, then plain Dispatch.
+            try:
+                hwp = win32com.client.gencache.EnsureDispatch("HWPFrame.HwpObject")
+            except Exception:
+                hwp = win32com.client.Dispatch("HWPFrame.HwpObject")
+
+            registered = False
+            last_reg_error = None
+
+            # Try each candidate security module name until one registers
+            # without raising.
+            # NOTE(review): only exceptions count as failure here; the return
+            # value of RegisterModule is not checked - confirm that is enough.
+            for module_name in getattr(self, "hwp_security_modules", [""]):
+                try:
+                    hwp.RegisterModule("FilePathCheckDLL", module_name)
+                    registered = True
+                    break
+                except Exception as e:
+                    last_reg_error = e
+
+            if not registered:
+                return False, f"HWP 보안 모듈 등록 실패: {last_reg_error}"
+
+            hwp.Open(str(file_path.absolute()), "", "")
+
+            # Run the "FileSaveAsPdf" action with the target file name and format.
+            hwp.HAction.GetDefault("FileSaveAsPdf", hwp.HParameterSet.HFileOpenSave.HSet)
+            hwp.HParameterSet.HFileOpenSave.filename = str(output_path.absolute())
+            hwp.HParameterSet.HFileOpenSave.Format = "PDF"
+            hwp.HAction.Execute("FileSaveAsPdf", hwp.HParameterSet.HFileOpenSave.HSet)
+
+            # Success is judged by a non-empty output file.
+            if output_path.exists() and output_path.stat().st_size > 0:
+                return True, "성공"
+            return False, "PDF 생성 확인 실패"
+
+        except Exception as e:
+            return False, f"HWP 변환 실패: {str(e)}"
+        finally:
+            # Best-effort shutdown of the HWP instance; errors are ignored.
+            try:
+                if hwp:
+                    try:
+                        hwp.Clear(1)
+                    except Exception:
+                        pass
+                    try:
+                        hwp.Quit()
+                    except Exception:
+                        pass
+            except Exception:
+                pass
+
+
+
+    def convert_text_to_pdf(self, text_path, output_path):
+        """텍스트 파일을 PDF로 변환 (reportlab 사용)"""
+        try:
+            from reportlab.lib.pagesizes import A4
+            from reportlab.pdfgen import canvas
+            from reportlab.pdfbase import pdfmetrics
+            from reportlab.pdfbase.ttfonts import TTFont
+            
+            # 한글 폰트 등록 (시스템에 설치된 폰트 사용)
+            try:
+                pdfmetrics.registerFont(TTFont('Malgun', 'malgun.ttf'))
+                font_name = 'Malgun'
+            except:
+                font_name = 'Helvetica'
+            
+            # 텍스트 읽기
+            with open(text_path, 'r', encoding='utf-8', errors='ignore') as f:
+                content = f.read()
+            
+            # PDF 생성
+            c = canvas.Canvas(str(output_path), pagesize=A4)
+            width, height = A4
+            
+            c.setFont(font_name, 10)
+            
+            # 여백 설정
+            margin = 50
+            y = height - margin
+            line_height = 14
+            
+            # 줄 단위로 처리
+            for line in content.split('\n'):
+                if y < margin:  # 페이지 넘김
+                    c.showPage()
+                    c.setFont(font_name, 10)
+                    y = height - margin
+                
+                # 긴 줄은 자동으로 줄바꿈
+                if len(line) > 100:
+                    chunks = [line[i:i+100] for i in range(0, len(line), 100)]
+                    for chunk in chunks:
+                        c.drawString(margin, y, chunk)
+                        y -= line_height
+                else:
+                    c.drawString(margin, y, line)
+                    y -= line_height
+            
+            c.save()
+            return True, "성공"
+        except Exception as e:
+            return False, f"텍스트 변환 실패: {str(e)}"
+    
+    def process_file(self, file_path):
+        """개별 파일 처리"""
+        ext = file_path.suffix.lower()
+        
+        # 출력 파일명 생성 (원본 경로 구조 유지)
+        relative_path = file_path.relative_to(self.source_dir)
+        output_subdir = self.output_dir / relative_path.parent
+        output_subdir.mkdir(parents=True, exist_ok=True)
+        
+        # PDF 파일명
+        output_pdf = output_subdir / f"{file_path.stem}.pdf"
+        
+        success = False
+        message = ""
+        
+        try:
+            # 이미 PDF인 경우
+            if ext in self.pdf_extension:
+                shutil.copy2(file_path, output_pdf)
+                success = True
+                message = "PDF 복사 완료"
+            
+            # DWG/DXF 파일
+            elif ext in self.dwg_extensions:
+                success, message = self.convert_dwg_to_pdf_trueview(file_path, output_pdf)
+            
+            # 이미지 파일
+            elif ext in self.image_extensions:
+                success, message = self.convert_image_to_pdf(file_path, output_pdf)
+            
+            # Office 문서
+            elif ext in self.office_extensions:
+                success, message = self.convert_office_to_pdf(file_path, output_pdf)
+            
+            # 동영상 파일 - 음성을 텍스트로 변환 후 PDF 생성
+            elif ext in self.video_extensions:
+                # 음성→텍스트 변환
+                transcript_text = self.get_video_transcript(file_path)
+                
+                # 임시 txt 파일 생성
+                temp_txt = output_subdir / f"{file_path.stem}_transcript.txt"
+                with open(temp_txt, 'w', encoding='utf-8') as f:
+                    f.write(transcript_text)
+                
+                # txt를 PDF로 변환
+                success, message = self.convert_text_to_pdf(temp_txt, output_pdf)
+                
+                if success:
+                    message = "성공 (음성 인식 완료)"
+                
+                # 임시 txt 파일은 남겨둠 (참고용)
+            
+            # 텍스트 파일
+            elif ext in self.text_extensions:
+                success, message = self.convert_text_to_pdf(file_path, output_pdf)
+            
+            else:
+                message = f"지원하지 않는 파일 형식: {ext}"
+        
+        except Exception as e:
+            message = f"처리 중 오류: {str(e)}"
+        
+        # 로그 기록
+        self.conversion_log.append({
+            '원본 경로': str(file_path),
+            '파일명': file_path.name,
+            '파일 형식': ext,
+            '변환 PDF 경로': str(output_pdf) if success else "",
+            '상태': "성공" if success else "실패",
+            '메시지': message,
+            '처리 시간': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+        })
+        
+        return success, message
+    
+    def create_excel_report(self, excel_path):
+        """변환 결과를 엑셀로 저장"""
+        wb = openpyxl.Workbook()
+        ws = wb.active
+        ws.title = "변환 결과"
+        
+        # 헤더 스타일
+        header_fill = PatternFill(start_color="366092", end_color="366092", fill_type="solid")
+        header_font = Font(bold=True, color="FFFFFF")
+        
+        # 헤더 작성
+        headers = ['번호', '원본 경로', '파일명', '파일 형식', '변환 PDF 경로', '상태', '메시지', '처리 시간']
+        for col, header in enumerate(headers, 1):
+            cell = ws.cell(row=1, column=col, value=header)
+            cell.fill = header_fill
+            cell.font = header_font
+            cell.alignment = Alignment(horizontal='center', vertical='center')
+        
+        # 데이터 작성
+        for idx, log in enumerate(self.conversion_log, 2):
+            ws.cell(row=idx, column=1, value=idx-1)
+            ws.cell(row=idx, column=2, value=log['원본 경로'])
+            ws.cell(row=idx, column=3, value=log['파일명'])
+            ws.cell(row=idx, column=4, value=log['파일 형식'])
+            ws.cell(row=idx, column=5, value=log['변환 PDF 경로'])
+            
+            # 상태에 따라 색상 표시
+            status_cell = ws.cell(row=idx, column=6, value=log['상태'])
+            if log['상태'] == "성공":
+                status_cell.fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
+                status_cell.font = Font(color="006100")
+            else:
+                status_cell.fill = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid")
+                status_cell.font = Font(color="9C0006")
+            
+            ws.cell(row=idx, column=7, value=log['메시지'])
+            ws.cell(row=idx, column=8, value=log['처리 시간'])
+        
+        # 열 너비 자동 조정
+        for column in ws.columns:
+            max_length = 0
+            column_letter = column[0].column_letter
+            for cell in column:
+                try:
+                    if len(str(cell.value)) > max_length:
+                        max_length = len(str(cell.value))
+                except:
+                    pass
+            adjusted_width = min(max_length + 2, 50)
+            ws.column_dimensions[column_letter].width = adjusted_width
+        
+        # 요약 시트 추가
+        summary_ws = wb.create_sheet(title="요약")
+        
+        total_files = len(self.conversion_log)
+        success_count = sum(1 for log in self.conversion_log if log['상태'] == "성공")
+        fail_count = total_files - success_count
+        
+        summary_data = [
+            ['항목', '값'],
+            ['총 파일 수', total_files],
+            ['변환 성공', success_count],
+            ['변환 실패', fail_count],
+            ['성공률', f"{(success_count/total_files*100):.1f}%" if total_files > 0 else "0%"],
+            ['', ''],
+            ['원본 폴더', str(self.source_dir)],
+            ['출력 폴더', str(self.output_dir)],
+            ['작업 완료 시간', datetime.now().strftime('%Y-%m-%d %H:%M:%S')]
+        ]
+        
+        for row_idx, row_data in enumerate(summary_data, 1):
+            for col_idx, value in enumerate(row_data, 1):
+                cell = summary_ws.cell(row=row_idx, column=col_idx, value=value)
+                if row_idx == 1:
+                    cell.fill = header_fill
+                    cell.font = header_font
+                cell.alignment = Alignment(horizontal='center' if col_idx == 1 else 'left')
+        
+        summary_ws.column_dimensions['A'].width = 20
+        summary_ws.column_dimensions['B'].width = 60
+        
+        # 저장
+        wb.save(excel_path)
+        print(f"\n엑셀 보고서 생성 완료: {excel_path}")
+    
+    def run(self):
+        """전체 변환 작업 실행"""
+        print(f"작업 시작: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+        print(f"원본 폴더: {self.source_dir}")
+        print(f"출력 폴더: {self.output_dir}")
+        
+        # DWG TrueView 확인
+        if self.trueview_path:
+            print(f"DWG TrueView 발견: {self.trueview_path}")
+        else:
+            print("경고: DWG TrueView를 찾을 수 없습니다. DWG 파일 변환이 불가능합니다.")
+        
+        print("-" * 80)
+        
+        # 모든 파일 가져오기
+        all_files = self.get_all_files()
+        total_files = len(all_files)
+        
+        # ★ 파일 분류: 동영상 vs 나머지
+        video_files = []
+        other_files = []
+        
+        for file_path in all_files:
+            if file_path.suffix.lower() in self.video_extensions:
+                video_files.append(file_path)
+            else:
+                other_files.append(file_path)
+        
+        print(f"\n총 {total_files}개 파일 발견")
+        print(f"  - 문서/이미지 등: {len(other_files)}개")
+        print(f"  - 동영상: {len(video_files)}개")
+        print("\n[1단계] 문서 파일 변환 시작...\n")
+        
+        # ★ 1단계: 문서 파일 먼저 처리
+        for idx, file_path in enumerate(other_files, 1):
+            print(f"[{idx}/{len(other_files)}] {file_path.name} 처리 중...", end=' ')
+            success, message = self.process_file(file_path)
+            print(f"{'✓' if success else '✗'} {message}")
+        
+        # ★ 2단계: domain.txt 로드
+        domain_path = self.source_dir.parent / "domain.txt"  # D:\for python\테스트 중(측량)\domain.txt
+        if domain_path.exists():
+            self.domain_terms = domain_path.read_text(encoding='utf-8')
+            print(f"\n[2단계] 도메인 용어 사전 로드 완료: {domain_path}")
+            print(f"  - 용어 수: 약 {len(self.domain_terms.split())}개 단어")
+        else:
+            print(f"\n[2단계] 도메인 용어 사전 없음: {domain_path}")
+            print("  - 기본 음성 인식으로 진행합니다.")
+        
+        # ★ 3단계: 동영상 파일 처리
+        if video_files:
+            print(f"\n[3단계] 동영상 음성 인식 시작...\n")
+            for idx, file_path in enumerate(video_files, 1):
+                print(f"[{idx}/{len(video_files)}] {file_path.name} 처리 중...", end=' ')
+                success, message = self.process_file(file_path)
+                print(f"{'✓' if success else '✗'} {message}")
+        
+        # 엑셀 보고서 생성
+        excel_path = self.output_dir / f"변환_결과_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx"
+        self.create_excel_report(excel_path)
+        
+        # 최종 요약
+        success_count = sum(1 for log in self.conversion_log if log['상태'] == "성공")
+        print("\n" + "=" * 80)
+        print(f"작업 완료!")
+        print(f"총 파일: {total_files}개")
+        print(f"성공: {success_count}개")
+        print(f"실패: {total_files - success_count}개")
+        print(f"성공률: {(success_count/total_files*100):.1f}%" if total_files > 0 else "0%")
+        print("=" * 80)
+
+if __name__ == "__main__":
+    # Path settings: folder to scan and folder that receives the PDFs.
+    SOURCE_DIR = r"D:\for python\테스트 중(측량)\측량_GIS_드론 관련 자료들"
+    OUTPUT_DIR = r"D:\for python\테스트 중(측량)\추출"
+    
+    # Build the converter and run the full job.
+    converter = SurveyingFileConverter(SOURCE_DIR, OUTPUT_DIR)
+    converter.run()
--- a/Code/geulbeot_5th/converters/pipeline/step2_extract.py
+++ b/Code/geulbeot_5th/converters/pipeline/step2_extract.py
@@ -0,0 +1,789 @@
+# -*- coding: utf-8 -*-
+"""
+extract_1_v2.py
+
+PDF에서 텍스트(md)와 이미지(png)를 추출
+- 하위 폴더 구조 유지
+- 이미지 메타데이터 JSON 생성 (폴더경로, 파일명, 페이지, 위치, 캡션 등)
+"""
+
+import fitz  # PyMuPDF
+import os
+import re
+import json
+import numpy as np
+from pathlib import Path
+from datetime import datetime
+from PIL import Image
+import io
+
+# ===== OCR setup (optional) =====
+# pytesseract is optional: when the import fails, TESSERACT_AVAILABLE is set
+# to False and the cut-text filter is disabled instead of aborting.
+try:
+    import pytesseract
+    pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
+    TESSERACT_AVAILABLE = True
+except ImportError:
+    TESSERACT_AVAILABLE = False
+    print("[INFO] pytesseract 미설치 - 텍스트 잘림 필터 비활성화")
+
+# ===== Path settings =====
+BASE_DIR = Path(r"D:\for python\survey_test\extract")        # location of the source PDFs
+OUTPUT_BASE = Path(r"D:\for python\survey_test\process")   # output location
+
+# Matches a figure caption at the start of a text block: an optional opening
+# bracket, the keyword 그림/figure/fig (case-insensitive), then a number with
+# an optional "-N" suffix.
+CAPTION_PATTERN = re.compile(
+    r'^\s*(?:[<\[\(\{]\s*)?(그림|figure|fig)\s*\.?\s*(?:[<\[\(\{]\s*)?0*\d+(?:\s*[-–]\s*\d+)?',
+    re.IGNORECASE
+)
+
+
+def get_figure_rects(page):
+    """
+    Identifies figure regions based on '<그림 N>' captions and vector drawings.
+
+    Text blocks matching CAPTION_PATTERN seed one cluster each.  Vector
+    drawing rects and image-block rects are then attached to the nearest
+    cluster, and the union of each cluster's attached rects becomes the
+    figure region.
+
+    Returns a list of dicts with the keys:
+        'rect'          fitz.Rect of the figure region
+        'caption_index' index of the caption in page.get_text("blocks")
+        'caption_rect'  fitz.Rect of the caption block
+        'caption_text'  stripped caption text
+    An empty list is returned when the page has no caption block.
+    """
+    drawings = page.get_drawings()
+
+    blocks = page.get_text("blocks")
+    captions = []
+     
+    # Collect every text block that looks like a figure caption.
+    for i, b in enumerate(blocks):
+        text = b[4]
+        if CAPTION_PATTERN.search(text):
+            captions.append({'rect': fitz.Rect(b[:4]), 'index': i, 'text': text, 'drawings': []})
+
+    if not captions:
+        return []
+
+    # Drop drawings that are tall and very thin, or nearly as wide as the
+    # page (presumably rules/borders rather than figure content).
+    filtered_drawings_rects = []
+    for d in drawings:
+        r = d["rect"]
+        if r.height > page.rect.height / 3 and r.width < 5:
+            continue
+        if r.width > page.rect.width * 0.9:
+            continue
+        filtered_drawings_rects.append(r)
+
+    # Image blocks (type 1) also count as figure content, unless they cover
+    # less than 1% of the page area.
+    page_area = page.rect.get_area()
+    img_rects = []
+    for b in page.get_text("dict")["blocks"]:
+        if b.get("type") == 1:
+            ir = fitz.Rect(b["bbox"])
+            if ir.get_area() < page_area * 0.01:
+                continue
+            img_rects.append(ir)
+
+    remaining_drawings = filtered_drawings_rects + img_rects
+    # Each cluster starts with the caption rect itself at position 0.
+    caption_clusters = {cap['index']: [cap['rect']] for cap in captions}
+    
+    def is_text_between(r1, r2, text_blocks):
+        # True when a text block of at least 20 characters intersects the
+        # union of r1 and r2 without intersecting either of them.
+        if r1.intersects(r2):
+            return False
+        union = r1 | r2
+        for b in text_blocks:
+            b_rect = fitz.Rect(b[:4])
+            text_content = b[4]
+            if len(text_content.strip()) < 20: 
+                continue
+            if not b_rect.intersects(union):
+                continue
+            if b_rect.intersects(r1) or b_rect.intersects(r2):
+                continue
+            return True
+        return False
+
+    # Grow the clusters until a full pass attaches nothing new.  Rects added
+    # in one pass become attachment targets in the next pass.
+    changed = True
+    while changed:
+        changed = False
+        to_remove = []
+        
+        for d_rect in remaining_drawings:
+            best_cluster_key = None
+            min_dist = float('inf')
+            
+            for cap_index, cluster_rects in caption_clusters.items():
+                for r in cluster_rects:
+                    dist = 0
+                    if d_rect.intersects(r):
+                        dist = 0
+                    else:
+                        # Gap between the two rects along each axis.
+                        x_dist = 0
+                        if d_rect.x1 < r.x0: x_dist = r.x0 - d_rect.x1
+                        elif d_rect.x0 > r.x1: x_dist = d_rect.x0 - r.x1
+                        
+                        y_dist = 0
+                        if d_rect.y1 < r.y0: y_dist = r.y0 - d_rect.y1
+                        elif d_rect.y0 > r.y1: y_dist = d_rect.y0 - r.y1
+                        
+                        # Only rects within 150 units on both axes qualify;
+                        # the +0.1 keeps this distance above an intersection's 0.
+                        if x_dist < 150 and y_dist < 150:
+                            dist = max(x_dist, y_dist) + 0.1 
+                        else:
+                            dist = float('inf')
+                    
+                    if dist < min_dist:
+                         if not is_text_between(r, d_rect, blocks):
+                             min_dist = dist
+                             best_cluster_key = cap_index
+                
+                # An intersecting match cannot be beaten; stop searching.
+                if min_dist == 0: 
+                    break
+            
+            if best_cluster_key is not None and min_dist < 150:
+                caption_clusters[best_cluster_key].append(d_rect)
+                to_remove.append(d_rect)
+                changed = True
+        
+        for r in to_remove:
+            remaining_drawings.remove(r)
+            
+    figure_regions = []
+    
+    for cap in captions:
+        cluster_rects = caption_clusters[cap['index']]
+        # Skip index 0, which is the caption rect itself.
+        content_rects = cluster_rects[1:] 
+        
+        if not content_rects:
+            continue
+            
+        union_rect = content_rects[0]
+        for r in content_rects[1:]:
+            union_rect = union_rect | r
+            
+        # Pad the region by 5 units, clamped to the page width/height.
+        union_rect.x0 = max(0, union_rect.x0 - 5)
+        union_rect.x1 = min(page.rect.width, union_rect.x1 + 5)
+        union_rect.y0 = max(0, union_rect.y0 - 5)
+        union_rect.y1 = min(page.rect.height, union_rect.y1 + 5)
+        
+        cap_rect = cap['rect']
+        
+        # Trim the region so it does not overlap the caption: compare the
+        # vertical centers to decide whether the caption is above or below.
+        if cap_rect.y0 + cap_rect.height/2 < union_rect.y0 + union_rect.height/2:
+             if union_rect.y0 < cap_rect.y1: union_rect.y0 = cap_rect.y1 + 2 
+        else:
+             if union_rect.y1 > cap_rect.y0: union_rect.y1 = cap_rect.y0 - 2 
+             
+        area = union_rect.get_area()
+        page_area = page.rect.get_area()
+
+        # Discard regions smaller than 1% of the page.
+        if area < page_area * 0.01:
+            continue
+
+        # Discard long thin strips, horizontal or vertical.
+        if union_rect.height < 20 and union_rect.width > page.rect.width * 0.6:
+            continue
+        if union_rect.width < 20 and union_rect.height > page.rect.height * 0.6:
+            continue
+
+        # Count text blocks (5+ characters) that intersect the region.
+        text_blocks = page.get_text("blocks")
+        text_count = 0
+
+        for b in text_blocks:
+            b_rect = fitz.Rect(b[:4])
+            if not b_rect.intersects(union_rect):
+                continue
+            text = b[4].strip()
+            if len(text) < 5:
+                continue
+            text_count += 1
+
+        # NOTE(review): text_count is never negative, so this check never
+        # skips a region; the intended threshold is unclear - confirm.
+        if text_count < 0:
+            continue
+
+        figure_regions.append({
+            'rect': union_rect,
+            'caption_index': cap['index'],
+            'caption_rect': cap['rect'],
+            'caption_text': cap['text'].strip()  # keep the caption text
+        })
+
+    return figure_regions
+
+
+def pixmap_metrics(pix):
+    arr = np.frombuffer(pix.samples, dtype=np.uint8)
+    c = 4 if pix.alpha else 3
+    arr = arr.reshape(pix.height, pix.width, c)[:, :, :3]
+    gray = (0.299 * arr[:, :, 0] + 0.587 * arr[:, :, 1] + 0.114 * arr[:, :, 2]).astype(np.uint8)
+    white = gray > 245
+    nonwhite_ratio = float(1.0 - white.mean())
+    gx = np.abs(np.diff(gray.astype(np.int16), axis=1))
+    gy = np.abs(np.diff(gray.astype(np.int16), axis=0))
+    edge = (gx[:-1, :] + gy[:, :-1]) > 40
+    edge_ratio = float(edge.mean())
+    var = float(gray.var())
+    return nonwhite_ratio, edge_ratio, var
+
+
+def keep_figure(pix):
+    nonwhite_ratio, edge_ratio, var = pixmap_metrics(pix)
+    if nonwhite_ratio < 0.004:
+        return False, nonwhite_ratio, edge_ratio, var
+    if nonwhite_ratio < 0.012 and edge_ratio < 0.004 and var < 20:
+        return False, nonwhite_ratio, edge_ratio, var
+    return True, nonwhite_ratio, edge_ratio, var
+
+
+# ===== 추가 이미지 필터 함수들 (v2.1) =====
+
+def pix_to_pil(pix):
+    """PyMuPDF Pixmap을 PIL Image로 변환"""
+    img_data = pix.tobytes("png")
+    return Image.open(io.BytesIO(img_data))
+
+
+def has_cut_text_at_boundary(pix, margin=5):
+    """
+    이미지 경계에서 텍스트가 잘렸는지 감지
+    - 이미지 테두리 근처에 텍스트 박스가 있으면 잘린 것으로 판단
+    
+    Args:
+        pix: PyMuPDF Pixmap
+        margin: 경계로부터의 여유 픽셀 (기본 5px)
+    
+    Returns:
+        bool: 텍스트가 잘렸으면 True
+    """
+    if not TESSERACT_AVAILABLE:
+        return False  # OCR 없으면 필터 비활성화
+    
+    try:
+        img = pix_to_pil(pix)
+        width, height = img.size
+        
+        # OCR로 텍스트 위치 추출
+        data = pytesseract.image_to_data(img, lang='kor+eng', output_type=pytesseract.Output.DICT)
+        
+        for i, text in enumerate(data['text']):
+            text = str(text).strip()
+            if len(text) < 2:  # 너무 짧은 텍스트는 무시
+                continue
+            
+            x = data['left'][i]
+            y = data['top'][i]
+            w = data['width'][i]
+            h = data['height'][i]
+            
+            # 텍스트가 이미지 경계에 너무 가까우면 = 잘린 것
+            # 왼쪽 경계
+            if x <= margin:
+                return True
+            # 오른쪽 경계
+            if x + w >= width - margin:
+                return True
+            # 상단 경계 (헤더 제외를 위해 좀 더 여유)
+            if y <= margin and h < height * 0.3:
+                return True
+            # 하단 경계
+            if y + h >= height - margin:
+                return True
+        
+        return False
+        
+    except Exception as e:
+        # OCR 실패 시 필터 통과 (이미지 유지)
+        return False
+
+
+def is_decorative_background(pix, edge_threshold=0.02, color_var_threshold=500):
+    """
+    배경 패턴 + 텍스트만 있는 장식용 이미지인지 감지
+    - 엣지가 적고 (복잡한 도표/사진이 아님)
+    - 색상 다양성이 낮으면 (단순 그라데이션 배경)
+    
+    Args:
+        pix: PyMuPDF Pixmap
+        edge_threshold: 엣지 비율 임계값 (기본 0.02 = 2%)
+        color_var_threshold: 색상 분산 임계값
+    
+    Returns:
+        bool: 장식용 배경이면 True
+    """
+    try:
+        nonwhite_ratio, edge_ratio, var = pixmap_metrics(pix)
+        
+        # 엣지가 거의 없고 (단순한 이미지)
+        # 색상 분산도 낮으면 (배경 패턴)
+        if edge_ratio < edge_threshold and var < color_var_threshold:
+            # 추가 확인: 텍스트만 있는지 OCR로 체크
+            if TESSERACT_AVAILABLE:
+                try:
+                    img = pix_to_pil(pix)
+                    text = pytesseract.image_to_string(img, lang='kor+eng').strip()
+                    
+                    # 텍스트가 있고, 이미지가 단순하면 = 텍스트 배경
+                    if len(text) > 3 and edge_ratio < 0.015:
+                        return True
+                except:
+                    pass
+            
+            return True
+        
+        return False
+        
+    except Exception:
+        return False
+
+
+def is_header_footer_region(rect, page_rect, height_threshold=0.12):
+    """
+    헤더/푸터 영역에 있는 이미지인지 감지
+    - 페이지 상단 12% 또는 하단 12%에 위치
+    - 높이가 낮은 strip 형태
+    
+    Args:
+        rect: 이미지 영역 (fitz.Rect)
+        page_rect: 페이지 전체 영역 (fitz.Rect)
+        height_threshold: 헤더/푸터 영역 비율 (기본 12%)
+    
+    Returns:
+        bool: 헤더/푸터 영역이면 True
+    """
+    page_height = page_rect.height
+    img_height = rect.height
+    
+    # 상단 영역 체크
+    if rect.y0 < page_height * height_threshold:
+        # 높이가 페이지의 15% 미만인 strip이면 헤더
+        if img_height < page_height * 0.15:
+            return True
+    
+    # 하단 영역 체크
+    if rect.y1 > page_height * (1 - height_threshold):
+        # 높이가 페이지의 15% 미만인 strip이면 푸터
+        if img_height < page_height * 0.15:
+            return True
+    
+    return False
+
+
+def should_filter_image(pix, rect, page_rect):
+    """
+    이미지를 필터링해야 하는지 종합 판단
+    
+    Args:
+        pix: PyMuPDF Pixmap
+        rect: 이미지 영역
+        page_rect: 페이지 전체 영역
+    
+    Returns:
+        tuple: (필터링 여부, 필터링 사유)
+    """
+    # 1. 헤더/푸터 영역 체크
+    if is_header_footer_region(rect, page_rect):
+        return True, "header_footer"
+    
+    # 2. 텍스트 잘림 체크
+    if has_cut_text_at_boundary(pix):
+        return True, "cut_text"
+    
+    # 3. 장식용 배경 체크
+    if is_decorative_background(pix):
+        return True, "decorative_background"
+    
+    return False, None
+
+
+def extract_pdf_content(pdf_path, output_md_path, img_dir, metadata):
+    """
+    PDF 내용 추출
+    
+    Args:
+        pdf_path: PDF 파일 경로
+        output_md_path: 출력 MD 파일 경로
+        img_dir: 이미지 저장 폴더
+        metadata: 메타데이터 딕셔너리 (폴더 경로, 파일명 등)
+    
+    Returns:
+        image_metadata_list: 추출된 이미지들의 메타데이터 리스트
+    """
+    os.makedirs(img_dir, exist_ok=True)
+    
+    image_metadata_list = []  # ★ 이미지 메타데이터 수집
+    
+    doc = fitz.open(pdf_path)
+    total_pages = len(doc)
+    
+    with open(output_md_path, "w", encoding="utf-8") as md_file:
+        # ★ 메타데이터 헤더 추가
+        md_file.write(f"---\n")
+        md_file.write(f"source_pdf: {metadata['pdf_name']}\n")
+        md_file.write(f"source_folder: {metadata['relative_folder']}\n")
+        md_file.write(f"total_pages: {total_pages}\n")
+        md_file.write(f"extracted_at: {datetime.now().isoformat()}\n")
+        md_file.write(f"---\n\n")
+        md_file.write(f"# {metadata['pdf_name']}\n\n")
+        
+        for page_num, page in enumerate(doc):
+            md_file.write(f"\n## Page {page_num + 1}\n\n")
+            img_rel_dir = os.path.basename(img_dir)
+            
+            figure_regions = get_figure_rects(page)
+            
+            kept_figures = []
+            for i, fig in enumerate(figure_regions):
+                rect = fig['rect']
+                pix_preview = page.get_pixmap(clip=rect, dpi=100, colorspace=fitz.csRGB)
+                ok, nonwhite_ratio, edge_ratio, var = keep_figure(pix_preview)
+                if not ok:
+                    continue
+
+                pix = page.get_pixmap(clip=rect, dpi=150, colorspace=fitz.csRGB)
+                
+                # ★ 추가 필터 적용 (v2.1)
+                should_filter, filter_reason = should_filter_image(pix, rect, page.rect)
+                if should_filter:
+                    continue
+                
+                img_name = f"p{page_num + 1:03d}_fig{len(kept_figures):02d}.png"
+                img_path = os.path.join(img_dir, img_name)
+                pix.save(img_path)
+
+                fig['img_path'] = os.path.join(img_rel_dir, img_name).replace("\\", "/")
+                fig['img_name'] = img_name
+                kept_figures.append(fig)
+                
+                # ★ 이미지 메타데이터 수집
+                image_metadata_list.append({
+                    "image_file": img_name,
+                    "image_path": str(Path(img_dir) / img_name),
+                    "type": "figure",
+                    "source_pdf": metadata['pdf_name'],
+                    "source_folder": metadata['relative_folder'],
+                    "full_path": metadata['full_path'],
+                    "page": page_num + 1,
+                    "total_pages": total_pages,
+                    "caption": fig.get('caption_text', ''),
+                    "rect": {
+                        "x0": round(rect.x0, 2),
+                        "y0": round(rect.y0, 2),
+                        "x1": round(rect.x1, 2),
+                        "y1": round(rect.y1, 2)
+                    }
+                })
+
+            figure_regions = kept_figures
+
+            caption_present = any(
+                CAPTION_PATTERN.search((tb[4] or "")) for tb in page.get_text("blocks")
+            )
+            uncaptioned_idx = 0
+
+            items = []
+
+            def inside_any_figure(block_rect, figures):
+                for fig in figures:
+                    intersect = block_rect & fig["rect"]
+                    if intersect.get_area() > 0.5 * block_rect.get_area():
+                        return True
+                return False
+
+            def is_full_width_rect(r, page_rect):
+                return r.width >= page_rect.width * 0.78
+
+            def figure_anchor_rect(fig, page_rect):
+                cap = fig["caption_rect"]
+                rect = fig["rect"]
+                if cap.y0 >= rect.y0:
+                    y = max(0.0, cap.y0 - 0.02)
+                else:
+                    y = min(page_rect.height - 0.02, cap.y1 + 0.02)
+                return fitz.Rect(cap.x0, y, cap.x1, y + 0.02)
+
+            for fig in figure_regions:
+                anchor = figure_anchor_rect(fig, page.rect)
+                md = (
+                    f"\n![{fig.get('caption_text', 'Figure')}]({fig['img_path']})\n"
+                    f"*{fig.get('caption_text', '')}*\n\n"
+                )
+                items.append({
+                    "kind": "figure",
+                    "rect": anchor,
+                    "kind_order": 0,
+                    "md": md,
+                })
+
+            raw_blocks = page.get_text("dict")["blocks"]
+
+            for block in raw_blocks:
+                block_rect = fitz.Rect(block["bbox"])
+
+                if block.get("type") == 0:
+                    if inside_any_figure(block_rect, figure_regions):
+                        continue
+                    items.append({
+                        "kind": "text",
+                        "rect": block_rect,
+                        "kind_order": 2,
+                        "block": block,
+                    })
+                    continue
+
+                if block.get("type") == 1:
+                    if inside_any_figure(block_rect, figure_regions):
+                        continue
+                    if caption_present:
+                        continue
+
+                    page_area = page.rect.get_area()
+                    if block_rect.get_area() < page_area * 0.005:
+                        continue
+
+                    ratio = block_rect.width / max(1.0, block_rect.height)
+                    if ratio < 0.25 or ratio > 4.0:
+                        continue
+
+                    pix_preview = page.get_pixmap(
+                        clip=block_rect, dpi=80, colorspace=fitz.csRGB
+                    )
+                    ok, nonwhite_ratio, edge_ratio, var = keep_figure(pix_preview)
+                    if not ok:
+                        continue
+
+                    pix = page.get_pixmap(
+                        clip=block_rect, dpi=150, colorspace=fitz.csRGB
+                    )
+                    
+                    # ★ 추가 필터 적용 (v2.1)
+                    should_filter, filter_reason = should_filter_image(pix, block_rect, page.rect)
+                    if should_filter:
+                        continue
+                    
+                    img_name = f"p{page_num + 1:03d}_photo{uncaptioned_idx:02d}.png"
+                    img_path = os.path.join(img_dir, img_name)
+                    pix.save(img_path)
+
+                    rel = os.path.join(img_rel_dir, img_name).replace("\\", "/")
+                    r = block_rect
+                    md = (
+                        f'\n![Photo]({rel})\n'
+                        f'*Page {page_num + 1} Photo*\n\n'
+                    )
+
+                    items.append({
+                        "kind": "raster",
+                        "rect": block_rect,
+                        "kind_order": 1,
+                        "md": md,
+                    })
+                    
+                    # ★ 캡션 없는 이미지 메타데이터
+                    image_metadata_list.append({
+                        "image_file": img_name,
+                        "image_path": str(Path(img_dir) / img_name),
+                        "type": "photo",
+                        "source_pdf": metadata['pdf_name'],
+                        "source_folder": metadata['relative_folder'],
+                        "full_path": metadata['full_path'],
+                        "page": page_num + 1,
+                        "total_pages": total_pages,
+                        "caption": "",
+                        "rect": {
+                            "x0": round(r.x0, 2),
+                            "y0": round(r.y0, 2),
+                            "x1": round(r.x1, 2),
+                            "y1": round(r.y1, 2)
+                        }
+                    })
+
+                    uncaptioned_idx += 1
+                    continue
+
+            # 읽기 순서 정렬
+            text_items = [it for it in items if it["kind"] == "text"]
+            page_w = page.rect.width
+            mid = page_w / 2.0
+
+            candidates = []
+            for it in text_items:
+                r = it["rect"]
+                if is_full_width_rect(r, page.rect):
+                    continue
+                if r.width < page_w * 0.2:
+                    continue
+                candidates.append(it)
+
+            left = [it for it in candidates if it["rect"].x0 < mid * 0.95]
+            right = [it for it in candidates if it["rect"].x0 > mid * 1.05]
+            two_cols = len(left) >= 3 and len(right) >= 3
+
+            col_y0 = None
+            col_y1 = None
+            seps = []
+
+            if two_cols and left and right:
+                col_y0 = min(
+                    min(it["rect"].y0 for it in left),
+                    min(it["rect"].y0 for it in right),
+                )
+                col_y1 = max(
+                    max(it["rect"].y1 for it in left),
+                    max(it["rect"].y1 for it in right),
+                )
+                for it in text_items:
+                    r = it["rect"]
+                    if col_y0 < r.y0 < col_y1 and is_full_width_rect(r, page.rect):
+                        seps.append(r.y0)
+                seps = sorted(set(seps))
+
+            def seg_index(y0, separators):
+                if not separators:
+                    return 0
+                n = 0
+                for s in separators:
+                    if y0 >= s:
+                        n += 1
+                    else:
+                        break
+                return n
+
+            def order_key(it):
+                r = it["rect"]
+                if not two_cols:
+                    return (r.y0, r.x0, it["kind_order"])
+                if col_y0 is not None and r.y1 <= col_y0:
+                    return (0, r.y0, r.x0, it["kind_order"])
+                if col_y1 is not None and r.y0 >= col_y1:
+                    return (2, r.y0, r.x0, it["kind_order"])
+                seg = seg_index(r.y0, seps)
+                if is_full_width_rect(r, page.rect):
+                    col = 2
+                else:
+                    col = 0 if r.x0 < mid else 1
+                return (1, seg, col, r.y0, r.x0, it["kind_order"])
+
+            items.sort(key=order_key)
+
+            for it in items:
+                if it["kind"] in ("figure", "raster"):
+                    md_file.write(it["md"])
+                    continue
+
+                block = it["block"]
+                for line in block.get("lines", []):
+                    for span in line.get("spans", []):
+                        md_file.write(span.get("text", "") + " ")
+                    md_file.write("\n")
+                md_file.write("\n")
+
+    doc.close()
+    return image_metadata_list
+
+
+def process_all_pdfs():
+    """
+    BASE_DIR 하위의 모든 PDF를 재귀적으로 처리
+    폴더 구조를 유지하면서 OUTPUT_BASE에 저장
+    """
+    # 출력 폴더 생성
+    OUTPUT_BASE.mkdir(parents=True, exist_ok=True)
+    
+    # 전체 이미지 메타데이터 수집
+    all_image_metadata = []
+    
+    # 처리 통계
+    stats = {
+        "total_pdfs": 0,
+        "success": 0,
+        "failed": 0,
+        "total_images": 0
+    }
+    
+    # 실패 로그
+    failed_files = []
+    
+    print(f"=" * 60)
+    print(f"PDF 추출 시작")
+    print(f"원본 폴더: {BASE_DIR}")
+    print(f"출력 폴더: {OUTPUT_BASE}")
+    print(f"=" * 60)
+    
+    # 모든 PDF 파일 찾기
+    pdf_files = list(BASE_DIR.rglob("*.pdf"))
+    stats["total_pdfs"] = len(pdf_files)
+    
+    print(f"\n총 {len(pdf_files)}개 PDF 발견\n")
+    
+    for idx, pdf_path in enumerate(pdf_files, 1):
+        try:
+            # 상대 경로 계산
+            relative_path = pdf_path.relative_to(BASE_DIR)
+            relative_folder = str(relative_path.parent)
+            if relative_folder == ".":
+                relative_folder = ""
+            
+            pdf_name = pdf_path.name
+            pdf_stem = pdf_path.stem
+            
+            # 출력 경로 설정 (폴더 구조 유지)
+            output_folder = OUTPUT_BASE / relative_path.parent
+            output_folder.mkdir(parents=True, exist_ok=True)
+            
+            output_md = output_folder / f"{pdf_stem}.md"
+            img_folder = output_folder / f"{pdf_stem}_img"
+            
+            # 메타데이터 준비
+            metadata = {
+                "pdf_name": pdf_name,
+                "pdf_stem": pdf_stem,
+                "relative_folder": relative_folder,
+                "full_path": str(relative_path),
+            }
+            
+            print(f"[{idx}/{len(pdf_files)}] {relative_path}")
+            
+            # PDF 처리
+            image_metas = extract_pdf_content(
+                str(pdf_path),
+                str(output_md),
+                str(img_folder),
+                metadata
+            )
+            
+            all_image_metadata.extend(image_metas)
+            stats["success"] += 1
+            stats["total_images"] += len(image_metas)
+            
+            print(f"    ✓ 완료 (이미지 {len(image_metas)}개)")
+            
+        except Exception as e:
+            stats["failed"] += 1
+            failed_files.append({
+                "file": str(pdf_path),
+                "error": str(e)
+            })
+            print(f"    ✗ 실패: {e}")
+    
+    # 전체 이미지 메타데이터 저장
+    meta_output_path = OUTPUT_BASE / "image_metadata.json"
+    with open(meta_output_path, "w", encoding="utf-8") as f:
+        json.dump(all_image_metadata, f, ensure_ascii=False, indent=2)
+    
+    # 처리 요약 저장
+    summary = {
+        "processed_at": datetime.now().isoformat(),
+        "source_dir": str(BASE_DIR),
+        "output_dir": str(OUTPUT_BASE),
+        "statistics": stats,
+        "failed_files": failed_files
+    }
+    
+    summary_path = OUTPUT_BASE / "extraction_summary.json"
+    with open(summary_path, "w", encoding="utf-8") as f:
+        json.dump(summary, f, ensure_ascii=False, indent=2)
+    
+    # 결과 출력
+    print(f"\n" + "=" * 60)
+    print(f"추출 완료!")
+    print(f"=" * 60)
+    print(f"총 PDF: {stats['total_pdfs']}개")
+    print(f"성공: {stats['success']}개")
+    print(f"실패: {stats['failed']}개")
+    print(f"추출된 이미지: {stats['total_images']}개")
+    print(f"\n이미지 메타데이터: {meta_output_path}")
+    print(f"처리 요약: {summary_path}")
+    
+    if failed_files:
+        print(f"\n실패한 파일:")
+        for f in failed_files:
+            print(f"  - {f['file']}: {f['error']}")
+
+
+# Script entry point: extract every PDF under BASE_DIR into OUTPUT_BASE.
+if __name__ == "__main__":
+    process_all_pdfs()
--- a/Code/geulbeot_5th/converters/pipeline/step3_domain.py
+++ b/Code/geulbeot_5th/converters/pipeline/step3_domain.py
@@ -0,0 +1,265 @@
+# -*- coding: utf-8 -*-
+"""
+domain_prompt.py
+
+기능:
+- D:\\test\\report 아래의 pdf/xlsx/png/txt/md 파일들의
+  파일명과 내용 일부를 샘플링한다.
+- 이 샘플을 기반으로, 문서 묶음의 분야/업무 맥락을 파악하고
+  "너는 ~~ 분야의 전문가이다. 나는 ~~를 하고 싶다..." 형식의
+  도메인 전용 시스템 프롬프트를 자동 생성한다.
+- 결과는 output/context/domain_prompt.txt 로 저장된다.
+
+이 domain_prompt.txt 내용은 이후 모든 GPT 호출(system role)에 공통으로 붙여 사용할 수 있다.
+"""
+
+import os
+import sys
+import json
+from pathlib import Path
+
+import pdfplumber
+import fitz  # PyMuPDF
+from PIL import Image
+import pytesseract
+import pandas as pd
+from openai import OpenAI
+import pytesseract
+from api_config import API_KEYS
+pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
+
+# ===== Path settings =====
+# Hard-coded absolute Windows paths. DATA_ROOT is the tree that gets sampled;
+# everything this script writes goes under OUTPUT_ROOT.
+DATA_ROOT   = Path(r"D:\for python\survey_test\extract")
+OUTPUT_ROOT = Path(r"D:\for python\survey_test\output")
+CONTEXT_DIR = OUTPUT_ROOT / "context"
+LOG_DIR     = OUTPUT_ROOT / "logs"
+
+# Import-time side effect: the output directories are created as soon as this
+# module is loaded.
+for d in [OUTPUT_ROOT, CONTEXT_DIR, LOG_DIR]:
+    d.mkdir(parents=True, exist_ok=True)
+
+# ===== OpenAI settings (structure only; the key itself comes from api_config) =====
+# Falls back to an empty string when 'GPT_API_KEY' is absent from API_KEYS.
+OPENAI_API_KEY = API_KEYS.get('GPT_API_KEY', '')
+GPT_MODEL      = "gpt-5-2025-08-07"
+
+client = OpenAI(api_key=OPENAI_API_KEY)
+
+# ===== OCR settings =====
+# Language string passed to pytesseract.image_to_string in ocr_image().
+OCR_LANG = "kor+eng"
+
+# Directory names that the os.walk traversal must not descend into.
+SKIP_DIR_NAMES = {"System Volume Information", "$RECYCLE.BIN", ".git", "__pycache__"}
+
+
+def log(msg: str):
+    """Print *msg* (flushed) and append it as one line to domain_prompt_log.txt."""
+    print(msg, flush=True)
+    log_file = LOG_DIR / "domain_prompt_log.txt"
+    with open(log_file, "a", encoding="utf-8") as handle:
+        handle.write(msg + "\n")
+
+
+def safe_rel(p: Path) -> str:
+    """Return *p* relative to DATA_ROOT as a string.
+
+    When *p* cannot be expressed relative to DATA_ROOT (or anything else goes
+    wrong), the path is returned unchanged, converted to ``str``.
+    """
+    try:
+        relative = p.relative_to(DATA_ROOT)
+    except Exception:
+        return str(p)
+    return str(relative)
+
+
+def ocr_image(img_path: Path) -> str:
+    """Run OCR on one image file and return the recognised text, stripped.
+
+    Args:
+        img_path: Image file to read.
+
+    Returns:
+        The OCR text, or ``""`` when opening the image or running OCR fails
+        (the failure is logged as a warning).
+    """
+    try:
+        # Open the image as a context manager so the file handle is released
+        # right after OCR instead of lingering until garbage collection.
+        with Image.open(img_path) as img:
+            return pytesseract.image_to_string(img, lang=OCR_LANG).strip()
+    except Exception as e:
+        log(f"[WARN] OCR 실패: {safe_rel(img_path)} | {e}")
+        return ""
+
+
+def sample_from_pdf(p: Path, max_chars: int = 1000) -> str:
+    """Return up to *max_chars* characters of text from the first pages of a PDF.
+
+    At most three pages are read, and reading stops as soon as the collected
+    text reaches *max_chars*. A failure is logged and whatever was collected
+    before it is still returned.
+    """
+    collected = []
+    total_len = 0
+    try:
+        with pdfplumber.open(str(p)) as pdf:
+            # Only the leading pages are sampled.
+            for page in pdf.pages[:3]:
+                page_text = page.extract_text() or ""
+                if page_text:
+                    collected.append(page_text)
+                    total_len += len(page_text)
+                if total_len >= max_chars:
+                    break
+    except Exception as e:
+        log(f"[WARN] PDF 샘플 추출 실패: {safe_rel(p)} | {e}")
+    return "\n".join(collected)[:max_chars]
+
+
+def sample_from_xlsx(p: Path, max_chars: int = 1000) -> str:
+    """Return a short textual sample of an Excel workbook.
+
+    The sample starts with the file name and, for each of the first three
+    sheets, adds the sheet name, its column names and its first five rows.
+    Sampling stops once the collected text reaches *max_chars*.
+
+    Args:
+        p: Workbook path.
+        max_chars: Upper bound on the length of the returned string.
+
+    Returns:
+        The joined sample, cut to *max_chars*. Sheets that fail to parse are
+        skipped with a warning; if the workbook cannot be opened at all, only
+        the file-name line is returned.
+    """
+    texts = [f"[파일명] {p.name}"]
+    try:
+        # The context manager closes the workbook handle even when a sheet
+        # raises; the original left the ExcelFile open.
+        with pd.ExcelFile(str(p)) as xls:
+            for sheet_name in xls.sheet_names[:3]:
+                try:
+                    df = xls.parse(sheet_name)
+                except Exception as e:
+                    log(f"[WARN] 시트 로딩 실패: {safe_rel(p)} | {sheet_name} | {e}")
+                    continue
+                texts.append(f"\n[시트] {sheet_name}")
+                texts.append("컬럼: " + ", ".join(map(str, df.columns)))
+                head = df.head(5)
+                texts.append(head.to_string(index=False))
+                if sum(len(x) for x in texts) >= max_chars:
+                    break
+    except Exception as e:
+        log(f"[WARN] XLSX 샘플 추출 실패: {safe_rel(p)} | {e}")
+    joined = "\n".join(texts)
+    return joined[:max_chars]
+
+
+def sample_from_text_file(p: Path, max_chars: int = 1000) -> str:
+    """Return the first *max_chars* characters of a text file.
+
+    Decoding is attempted strictly as UTF-8, then strictly as cp949. Only when
+    both fail is the file read as UTF-8 with undecodable bytes dropped.
+
+    The original read with ``errors="ignore"`` on the first attempt, which
+    never raises a decoding error, so its cp949 fallback could not trigger for
+    cp949-encoded files and their text was silently mangled.
+
+    Args:
+        p: File to read.
+        max_chars: Upper bound on the length of the returned string.
+
+    Returns:
+        The leading slice of the decoded text.
+
+    Raises:
+        OSError: If the file cannot be read.
+    """
+    for encoding in ("utf-8", "cp949"):
+        try:
+            return p.read_text(encoding=encoding)[:max_chars]
+        except UnicodeDecodeError:
+            continue
+    # Last resort: keep whatever decodes as UTF-8.
+    return p.read_text(encoding="utf-8", errors="ignore")[:max_chars]
+
+
+def gather_file_samples(
+    max_files_per_type: int = 100,
+    max_total_samples: int = 300,
+    max_chars_per_sample: int = 1000,
+):
+    """Walk DATA_ROOT and collect file names plus content samples.
+
+    Every file name (relative to DATA_ROOT) is recorded. Content is sampled
+    only for pdf / xlsx / image / text files, limited to *max_files_per_type*
+    samples per category and *max_total_samples* overall.
+
+    Returns:
+        ``(file_names, samples)``, where each sample is a string of the form
+        ``"[TAG] <relative path>\\n<sampled text>"``.
+    """
+    file_names = []
+    samples = []
+    taken = {"pdf": 0, "xlsx": 0, "img": 0, "txt": 0}
+
+    for root, dirs, files in os.walk(DATA_ROOT):
+        # Prune skipped and hidden directories in place so os.walk ignores them.
+        dirs[:] = [name for name in dirs if not (name in SKIP_DIR_NAMES or name.startswith("."))]
+        base = Path(root)
+
+        for fname in files:
+            fpath = base / fname
+            rel = safe_rel(fpath)
+
+            # All file names are kept; only content sampling is capped.
+            file_names.append(rel)
+            if len(samples) >= max_total_samples:
+                continue
+
+            ext = fpath.suffix.lower()
+            if ext == ".pdf":
+                kind, tag = "pdf", "PDF"
+            elif ext in {".xlsx", ".xls"}:
+                kind, tag = "xlsx", "XLSX"
+            elif ext in {".png", ".jpg", ".jpeg"}:
+                kind, tag = "img", "IMG"
+            elif ext in {".txt", ".md"}:
+                kind, tag = "txt", "TEXT"
+            else:
+                continue
+
+            if taken[kind] >= max_files_per_type:
+                continue
+
+            try:
+                if kind == "pdf":
+                    raw = sample_from_pdf(fpath, max_chars=max_chars_per_sample)
+                    body = raw
+                elif kind == "xlsx":
+                    raw = sample_from_xlsx(fpath, max_chars=max_chars_per_sample)
+                    body = raw
+                elif kind == "img":
+                    # OCR text is checked in full and truncated only for output.
+                    raw = ocr_image(fpath)
+                    body = raw[:max_chars_per_sample]
+                else:
+                    raw = sample_from_text_file(fpath, max_chars=max_chars_per_sample)
+                    body = raw
+
+                if raw.strip():
+                    samples.append(f"[{tag}] {rel}\n{body}")
+                    taken[kind] += 1
+            except Exception as e:
+                log(f"[WARN] 샘플 추출 실패: {safe_rel(fpath)} | {e}")
+
+    return file_names, samples
+
+
+def build_domain_prompt():
+    """Ask the chat model for a domain-specific system prompt and save it.
+
+    File names and content samples from gather_file_samples() are embedded in
+    a prompt (at most 80 names and 30 samples). The model's reply is written
+    to ``CONTEXT_DIR / "domain_prompt.txt"`` and returned. The process exits
+    with status 1 when there is nothing to sample.
+    """
+    log("도메인 프롬프트 생성을 위한 샘플 수집 중...")
+    file_names, samples = gather_file_samples()
+
+    if not (file_names or samples):
+        log("파일 샘플이 없어 도메인 프롬프트를 생성할 수 없습니다.")
+        sys.exit(1)
+
+    file_names_text = "\n".join(file_names[:80])
+    sample_text = "\n\n".join(samples[:30])
+
+    prompt = f"""
+다음은 한 기업의 '이슈 리포트 및 시스템 관련 자료'로 추정되는 파일들의 목록과,
+각 파일에서 일부 추출한 내용 샘플이다.
+
+[파일명 목록]
+{file_names_text}
+
+[내용 샘플]
+{sample_text}
+
+위 자료를 바탕으로 다음을 수행하라.
+
+1) 이 문서 묶음이 어떤 산업, 업무, 분야에 대한 것인지,
+   핵심 키워드를 포함해 2~3줄 정도로 설명하라.
+
+2) 이후, 이 문서들을 다루는 AI에게 사용할 "프롬프트 머리말"을 작성하라.
+   이 머리말은 모든 후속 프롬프트 앞에 항상 붙일 예정이며,
+   다음 조건을 만족해야 한다.
+
+   - 첫 문단: "너는 ~~ 분야의 전문가이다." 형식으로, 이 문서 묶음의 분야와 역할을 정의한다.
+   - 두 번째 문단 이후: "나는 ~~을 하고 싶다.", "우리는 ~~ 의 문제를 분석하고 개선방안을 찾고자 한다." 등
+     사용자가 AI에게 요구하는 전반적 목적과 관점을 정리한다.
+   - 총 5~7줄 정도의 한국어 문장으로 작성한다.
+   - 이후에 붙을 프롬프트(청킹, 요약, RAG, 보고서 작성 등)와 자연스럽게 연결될 수 있도록,
+     역할(role), 목적, 기준(추측 금지, 사실 기반, 근거 명시 등)을 모두 포함한다.
+
+출력 형식:
+- 설명과 머리말을 한 번에 출력하되,
+  별도의 마크다운 없이 순수 텍스트로만 작성하라.
+- 이 출력 전체를 domain_prompt.txt에 그대로 저장할 것이다.
+"""
+
+    system_message = {
+        "role": "system",
+        "content": "너는 문서 묶음의 분야를 식별하고, 그에 맞는 AI 시스템 프롬프트와 컨텍스트를 설계하는 컨설턴트이다."
+    }
+    user_message = {"role": "user", "content": prompt}
+
+    resp = client.chat.completions.create(
+        model=GPT_MODEL,
+        messages=[system_message, user_message],
+    )
+
+    # A missing message body is stored as an empty file rather than failing.
+    content = (resp.choices[0].message.content or "").strip()
+    out_path = CONTEXT_DIR / "domain_prompt.txt"
+    out_path.write_text(content, encoding="utf-8")
+
+    log(f"도메인 프롬프트 생성 완료: {out_path}")
+    return content
+
+
+def main():
+    """Generate domain_prompt.txt unless it already exists.
+
+    An existing file is left untouched; the log tells the operator to delete
+    it first if a fresh prompt is wanted.
+    """
+    log("=== 도메인 프롬프트 생성 시작 ===")
+    out_path = CONTEXT_DIR / "domain_prompt.txt"
+    if not out_path.exists():
+        build_domain_prompt()
+    else:
+        log(f"이미 domain_prompt.txt가 존재합니다: {out_path}")
+        log("기존 파일을 사용하려면 종료하고, 재생성이 필요하면 파일을 삭제한 뒤 다시 실행하십시오.")
+    log("=== 도메인 프롬프트 작업 종료 ===")
+
+
+if __name__ == "__main__":
+    main()
--- a/Code/geulbeot_5th/converters/pipeline/step4_chunk.py
+++ b/Code/geulbeot_5th/converters/pipeline/step4_chunk.py
@@ -0,0 +1,357 @@
+# -*- coding: utf-8 -*-
+"""
+chunk_and_summary_v2.py
+
+기능:
+- 정리중 폴더 아래의 .md 파일들을 대상으로
+  1) domain_prompt.txt 기반 GPT 의미 청킹
+  2) 청크별 요약 생성
+  3) 청크 내 이미지 참조 보존
+  4) JSON 저장 (원문+청크+요약+이미지)
+  5) RAG용 *_chunks.json 저장
+
+전제:
+- extract_1_v2.py 실행 후 .md 파일들이 존재할 것
+- step1_domainprompt.py 실행 후 domain_prompt.txt가 존재할 것
+"""
+
+import os
+import sys
+import json
+import re
+from pathlib import Path
+from datetime import datetime
+
+from openai import OpenAI
+from api_config import API_KEYS
+
+# ===== Paths =====
+# Hard-coded absolute Windows paths. DATA_ROOT is walked for .md files;
+# all results are written under OUTPUT_ROOT.
+DATA_ROOT      = Path(r"D:\for python\survey_test\process")
+OUTPUT_ROOT    = Path(r"D:\for python\survey_test\output")
+
+TEXT_DIR       = OUTPUT_ROOT / "text"
+JSON_DIR       = OUTPUT_ROOT / "json"
+RAG_DIR        = OUTPUT_ROOT / "rag"
+CONTEXT_DIR    = OUTPUT_ROOT / "context"
+LOG_DIR        = OUTPUT_ROOT / "logs"
+
+# Import-time side effect: output directories are created on module load.
+for d in [TEXT_DIR, JSON_DIR, RAG_DIR, CONTEXT_DIR, LOG_DIR]:
+    d.mkdir(parents=True, exist_ok=True)
+
+# ===== OpenAI settings =====
+# Falls back to an empty string when 'GPT_API_KEY' is absent from API_KEYS.
+OPENAI_API_KEY = API_KEYS.get('GPT_API_KEY', '')
+GPT_MODEL      = "gpt-5-2025-08-07"
+
+client = OpenAI(api_key=OPENAI_API_KEY)
+
+# ===== Directories to skip =====
+# Directory names that the os.walk traversal in main() does not descend into.
+SKIP_DIR_NAMES = {"System Volume Information", "$RECYCLE.BIN", ".git", "__pycache__", "output"}
+
+# ===== Image reference pattern =====
+# Matches markdown image syntax ![alt](path); group 1 is the alt text and
+# group 2 the path.
+IMAGE_PATTERN = re.compile(r'!\[([^\]]*)\]\(([^)]+)\)')
+
+
+def log(msg: str):
+    """Print *msg* (flushed) and append it, prefixed with HH:MM:SS, to the chunking log."""
+    print(msg, flush=True)
+    log_file = LOG_DIR / "chunk_and_summary_log.txt"
+    with open(log_file, "a", encoding="utf-8") as handle:
+        stamp = datetime.now().strftime('%H:%M:%S')
+        handle.write(f"[{stamp}] {msg}\n")
+
+
+def load_domain_prompt() -> str:
+    """Return the stripped contents of domain_prompt.txt, or exit(1) when it is missing."""
+    prompt_file = CONTEXT_DIR / "domain_prompt.txt"
+    if prompt_file.exists():
+        return prompt_file.read_text(encoding="utf-8", errors="ignore").strip()
+    log(f"domain_prompt.txt가 없습니다: {prompt_file}")
+    log("먼저 step1_domainprompt.py를 실행해야 합니다.")
+    sys.exit(1)
+
+
+def safe_rel(p: Path) -> str:
+    """Return *p* relative to DATA_ROOT as a string, or ``str(p)`` when that is not possible."""
+    try:
+        relative = p.relative_to(DATA_ROOT)
+    except Exception:
+        return str(p)
+    return str(relative)
+
+
+def extract_text_md(p: Path) -> str:
+    """Read a markdown file and return its full text.
+
+    Decoding is attempted strictly as UTF-8, then strictly as cp949. Only when
+    both fail is the file read as UTF-8 with undecodable bytes dropped.
+
+    The original used ``errors="ignore"`` on the first attempt, which never
+    raises a decoding error, so the cp949 fallback was unreachable for
+    cp949-encoded files and their text was silently mangled.
+
+    Raises:
+        OSError: If the file cannot be read.
+    """
+    for encoding in ("utf-8", "cp949"):
+        try:
+            return p.read_text(encoding=encoding)
+        except UnicodeDecodeError:
+            continue
+    # Last resort: keep whatever decodes as UTF-8.
+    return p.read_text(encoding="utf-8", errors="ignore")
+
+
+def find_images_in_text(text: str) -> list:
+    """Return one ``{"alt": ..., "path": ...}`` dict per markdown image reference in *text*."""
+    return [
+        {"alt": match.group(1), "path": match.group(2)}
+        for match in IMAGE_PATTERN.finditer(text)
+    ]
+
+
+def semantic_chunk(domain_prompt: str, text: str, source_name: str):
+    """Split *text* into semantic chunks using the chat model.
+
+    Args:
+        domain_prompt: Domain system prompt prepended to the system message.
+        text: Document text to split.
+        source_name: Name used in log messages only.
+
+    Returns:
+        A list of dicts with ``title`` / ``keywords`` / ``content``.
+        Blank text yields ``[]``; text shorter than 500 characters becomes a
+        single chunk without calling the model. When the model call fails, or
+        its reply contains no usable chunk, fallback_chunk(text) is returned.
+
+    NOTE: only the first 12000 characters of *text* are sent to the model, so
+    chunks produced by the model never contain content past that point.
+    """
+    if not text.strip():
+        return []
+
+    # Very short text is kept as a single chunk.
+    if len(text) < 500:
+        return [{
+            "title": "전체 내용",
+            "keywords": "",
+            "content": text
+        }]
+
+    user_prompt = f"""
+아래 문서를 의미 단위(문단/항목/섹션 등)로 분리하고,
+각 청크는 title / keywords / content 를 포함한 JSON 배열로 출력하라.
+
+규칙:
+1. 추측 금지, 문서 내용 기반으로만 분리
+2. 이미지 참조(![...](...))는 관련 텍스트와 같은 청크에 포함
+3. 각 청크는 최소 100자 이상
+4. keywords는 쉼표로 구분된 핵심 키워드 3~5개
+
+문서:
+{text[:12000]}
+
+JSON 배열만 출력하라. 다른 설명 없이.
+"""
+
+    try:
+        resp = client.chat.completions.create(
+            model=GPT_MODEL,
+            messages=[
+                {"role": "system", "content": domain_prompt + "\n\n너는 의미 기반 청킹 전문가이다. JSON 배열만 출력한다."},
+                {"role": "user", "content": user_prompt},
+            ],
+        )
+        # A missing message body is treated as an empty reply instead of
+        # surfacing as an AttributeError logged as an API failure.
+        data = (resp.choices[0].message.content or "").strip()
+
+        # Unwrap a ```json ... ``` or plain ``` ... ``` fence if present.
+        if "```json" in data:
+            data = data.split("```json")[1].split("```")[0].strip()
+        elif "```" in data:
+            data = data.split("```")[1].split("```")[0].strip()
+
+        if data.startswith("["):
+            parsed = json.loads(data)
+            # Keep only well-formed chunks: callers use ch.get(...) and treat
+            # content as a string, so anything else would crash them.
+            chunks = [
+                ch for ch in parsed
+                if isinstance(ch, dict)
+                and isinstance(ch.get("content"), str)
+                and ch["content"].strip()
+            ]
+            if chunks:
+                return chunks
+            log(f"[WARN] 유효한 청크 없음 ({source_name})")
+
+    except json.JSONDecodeError as e:
+        log(f"[WARN] JSON 파싱 실패 ({source_name}): {e}")
+    except Exception as e:
+        log(f"[WARN] semantic_chunk API 실패 ({source_name}): {e}")
+
+    # Fallback: page/section based splitting of the whole text.
+    log(f"[INFO] Fallback 청킹 적용: {source_name}")
+    return fallback_chunk(text)
+
+
+def fallback_chunk(text: str) -> list:
+    """Split *text* without the model, by page headers or by blank lines.
+
+    If the text contains lines of the form ``## Page N``, one chunk is
+    produced per page and titled with the page number taken from the header.
+    Any text before the first header becomes its own chunk. The original
+    titled chunks by split index, so such a preamble shifted every page label
+    by one; it also missed back-to-back headers.
+
+    Otherwise the text is split on runs of three or more newlines, and
+    sections of 50 characters or fewer (after stripping) are dropped.
+
+    Returns:
+        A list of ``{"title", "keywords", "content"}`` dicts. When nothing
+        qualifies, the whole stripped text is returned as a single chunk.
+    """
+    chunks = []
+
+    # With one capture group, re.split returns
+    # [preamble, page_no, body, page_no, body, ...].
+    parts = re.split(r'(?m)^## Page (\d+)[ \t]*$', text)
+
+    if len(parts) > 1:
+        preamble = parts[0].strip()
+        if preamble:
+            chunks.append({
+                "title": "머리말",
+                "keywords": "",
+                "content": preamble
+            })
+        for page_no, page_content in zip(parts[1::2], parts[2::2]):
+            page_content = page_content.strip()
+            if page_content:
+                chunks.append({
+                    "title": f"Page {page_no}",
+                    "keywords": "",
+                    "content": page_content
+                })
+    else:
+        # Split where two or more blank lines separate sections.
+        sections = re.split(r'\n{3,}', text)
+        for i, section in enumerate(sections):
+            section = section.strip()
+            if len(section) > 50:
+                chunks.append({
+                    "title": f"섹션 {i+1}",
+                    "keywords": "",
+                    "content": section
+                })
+
+    # Nothing qualified: return the whole text as one chunk.
+    if not chunks:
+        chunks.append({
+            "title": "전체 내용",
+            "keywords": "",
+            "content": text.strip()
+        })
+
+    return chunks
+
+
+def summary_chunk(domain_prompt: str, text: str, limit: int = 300) -> str:
+    """Summarise one chunk with the chat model.
+
+    Image references are removed before summarising. Blank input returns
+    ``""``; text shorter than 100 characters (after removing images) is
+    returned as-is. When the model call fails, the first *limit* characters
+    of the image-free text are returned instead.
+    """
+    if not text.strip():
+        return ""
+
+    # Summarise the text only, without image references.
+    text_only = IMAGE_PATTERN.sub('', text).strip()
+    if len(text_only) < 100:
+        return text_only
+
+    prompt = f"""
+아래 텍스트를 {limit}자 이내로 사실 기반으로 요약하라.
+추측 금지, 고유명사와 수치는 보존.
+
+{text_only[:8000]}
+"""
+    system_content = domain_prompt + "\n\n너는 사실만 요약하는 전문가이다."
+    try:
+        resp = client.chat.completions.create(
+            model=GPT_MODEL,
+            messages=[
+                {"role": "system", "content": system_content},
+                {"role": "user", "content": prompt},
+            ],
+        )
+        return resp.choices[0].message.content.strip()
+    except Exception as e:
+        log(f"[WARN] summary 실패: {e}")
+        return text_only[:limit]
+
+
+def save_chunk_files(src: Path, text: str, domain_prompt: str) -> int:
+    """Chunk one document, summarise each chunk and write the results.
+
+    Writes ``<stem>_text.txt`` to TEXT_DIR, then the same chunk list as
+    ``<stem>.json`` to JSON_DIR and ``<stem>_chunks.json`` to RAG_DIR.
+
+    Returns:
+        The number of chunks produced (0 when chunking returned nothing, in
+        which case only the raw text file is written).
+    """
+    stem = src.stem
+    folder_ctx = safe_rel(src.parent)
+
+    # Keep a copy of the source text.
+    (TEXT_DIR / f"{stem}_text.txt").write_text(text, encoding="utf-8", errors="ignore")
+
+    chunks = semantic_chunk(domain_prompt, text, src.name)
+    if not chunks:
+        log(f"[WARN] 청크 없음: {src.name}")
+        return 0
+
+    total = len(chunks)
+    rag_items = []
+    for position, chunk in enumerate(chunks, start=1):
+        body = chunk.get("content", "")
+        summary = summary_chunk(domain_prompt, body, 300)
+        # Image references that appear inside this chunk.
+        images = find_images_in_text(body)
+
+        rag_items.append({
+            "source": src.name,
+            "source_path": safe_rel(src),
+            "chunk": position,
+            "total_chunks": total,
+            "title": chunk.get("title", ""),
+            "keywords": chunk.get("keywords", ""),
+            "text": body,
+            "summary": summary,
+            "folder_context": folder_ctx,
+            "images": images,
+            "has_images": bool(images)
+        })
+
+    # Both output files carry the same serialised payload.
+    payload = json.dumps(rag_items, ensure_ascii=False, indent=2)
+    (JSON_DIR / f"{stem}.json").write_text(payload, encoding="utf-8")
+    (RAG_DIR / f"{stem}_chunks.json").write_text(payload, encoding="utf-8")
+
+    return total
+
+
+def main():
+    """Run chunking and summarising over every .md file under DATA_ROOT.
+
+    Per-file failures are counted and logged without stopping the run. Run
+    statistics are written to ``LOG_DIR / "chunk_summary_stats.json"``.
+    """
+    banner = "=" * 60
+    log(banner)
+    log("청킹/요약 파이프라인 시작")
+    log(f"데이터 폴더: {DATA_ROOT}")
+    log(f"출력 폴더: {OUTPUT_ROOT}")
+    log(banner)
+
+    domain_prompt = load_domain_prompt()
+    log(f"도메인 프롬프트 로드 완료 ({len(domain_prompt)}자)")
+
+    stats = {"docs": 0, "chunks": 0, "images": 0, "errors": 0}
+
+    # Collect .md files, pruning skipped and hidden directories in place.
+    md_files = []
+    for root, dirs, files in os.walk(DATA_ROOT):
+        dirs[:] = [name for name in dirs if not (name in SKIP_DIR_NAMES or name.startswith("."))]
+        md_files.extend(Path(root) / fname for fname in files if fname.lower().endswith(".md"))
+
+    total = len(md_files)
+    log(f"\n총 {total}개 .md 파일 발견\n")
+
+    for idx, fpath in enumerate(md_files, 1):
+        try:
+            log(f"[{idx}/{total}] {safe_rel(fpath)}")
+
+            text = extract_text_md(fpath)
+            if not text.strip():
+                log(f"    ⚠ 빈 파일, 스킵")
+                continue
+
+            image_count = len(find_images_in_text(text))
+            stats["images"] += image_count
+
+            chunk_count = save_chunk_files(fpath, text, domain_prompt)
+            stats["docs"] += 1
+            stats["chunks"] += chunk_count
+
+            log(f"    ✓ {chunk_count}개 청크, {image_count}개 이미지")
+        except Exception as e:
+            stats["errors"] += 1
+            log(f"    ✗ 오류: {e}")
+
+    # Persist the overall statistics.
+    summary = {
+        "processed_at": datetime.now().isoformat(),
+        "data_root": str(DATA_ROOT),
+        "output_root": str(OUTPUT_ROOT),
+        "statistics": stats
+    }
+    stats_file = LOG_DIR / "chunk_summary_stats.json"
+    stats_file.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8")
+
+    log("\n" + banner)
+    log("청킹/요약 완료!")
+    log(banner)
+    log(f"처리된 문서: {stats['docs']}개")
+    log(f"생성된 청크: {stats['chunks']}개")
+    log(f"포함된 이미지: {stats['images']}개")
+    log(f"오류: {stats['errors']}개")
+    log(f"\n결과 저장 위치:")
+    log(f"  - 원문: {TEXT_DIR}")
+    log(f"  - JSON: {JSON_DIR}")
+    log(f"  - RAG: {RAG_DIR}")
+
+
+if __name__ == "__main__":
+    main()
--- a/Code/geulbeot_5th/converters/pipeline/step5_rag.py
+++ b/Code/geulbeot_5th/converters/pipeline/step5_rag.py
@@ -0,0 +1,141 @@
+# -*- coding: utf-8 -*-
+"""
+build_rag.py
+
+기능:
+- chunk_and_summary.py 에서 생성된 output/rag/*_chunks.json 파일들을 읽어서
+  text + summary 를 임베딩(text-embedding-3-small)한다.
+- FAISS IndexFlatIP 인덱스를 구축하여
+  output/rag/faiss.index, meta.json, vectors.npy 를 생성한다.
+"""
+
+import os
+import sys
+import json
+from pathlib import Path
+
+import numpy as np
+import faiss
+from openai import OpenAI
+from api_config import API_KEYS
+
+# ===== Path settings =====
+# Hard-coded absolute Windows paths. RAG_DIR is both the input location
+# (*_chunks.json) and the output location of this script.
+DATA_ROOT   = Path(r"D:\for python\survey_test\process")
+OUTPUT_ROOT = Path(r"D:\for python\survey_test\output")
+RAG_DIR     = OUTPUT_ROOT / "rag"
+LOG_DIR     = OUTPUT_ROOT / "logs"
+
+# Import-time side effect: directories are created on module load.
+for d in [RAG_DIR, LOG_DIR]:
+    d.mkdir(parents=True, exist_ok=True)
+
+# ===== OpenAI settings (structure kept) =====
+# Falls back to an empty string when 'GPT_API_KEY' is absent from API_KEYS.
+OPENAI_API_KEY = API_KEYS.get('GPT_API_KEY', '')
+GPT_MODEL      = "gpt-5-2025-08-07"
+EMBED_MODEL    = "text-embedding-3-small"
+
+client = OpenAI(api_key=OPENAI_API_KEY)
+
+
+def log(msg: str):
+    """Print *msg* (flushed) and append it as one line to build_rag_log.txt."""
+    print(msg, flush=True)
+    log_file = LOG_DIR / "build_rag_log.txt"
+    with open(log_file, "a", encoding="utf-8") as handle:
+        handle.write(msg + "\n")
+
+
+def embed_texts(texts):
+    """Embed *texts* with EMBED_MODEL and return a float32 matrix, one row per text.
+
+    Requests are sent in batches of 96 inputs. An empty input returns an
+    array of shape (0, 1536) without calling the API.
+    """
+    if not texts:
+        return np.zeros((0, 1536), dtype="float32")
+
+    batch_size = 96
+    rows = []
+    for start in range(0, len(texts), batch_size):
+        resp = client.embeddings.create(
+            model=EMBED_MODEL,
+            input=texts[start:start + batch_size],
+        )
+        rows.extend(np.array(item.embedding, dtype="float32") for item in resp.data)
+    return np.vstack(rows)
+
+
+def _build_embed_input(u: dict) -> str:
+    """Build the embedding input for one chunk from its text and summary.
+
+    - Missing or empty ``text`` / ``summary`` values are omitted.
+    - Whitespace runs are collapsed to single spaces.
+    - The result is at most 4000 characters long.
+
+    When both parts are present the result is ``"<text> 요약: <summary>"``,
+    with the summary limited to its first 1000 characters. If the total would
+    exceed 4000 characters, the *text* is shortened so the summary survives.
+    The original truncated after concatenation, which dropped the summary
+    entirely for long chunks.
+
+    Args:
+        u: One chunk record; only the "text" and "summary" keys are read.
+
+    Returns:
+        The merged string, or ``""`` when both fields are empty.
+    """
+    max_len = 4000
+    sum_ = (u.get("summary") or "").strip()
+    txt = (u.get("text") or "").strip()
+
+    if not (txt and sum_):
+        # Only one part (or neither) is available: normalise and cap it.
+        return " ".join((txt or sum_).split())[:max_len]
+
+    suffix = " 요약: " + " ".join(sum_[:1000].split())
+    body = " ".join(txt.split())
+
+    # Reserve room for the summary; cut the text rather than the summary.
+    room = max_len - len(suffix)
+    if len(body) > room:
+        body = body[:room].rstrip()
+    return body + suffix
+
+
+def build_faiss_index():
+    """Embed all chunk records in RAG_DIR and write the FAISS index files.
+
+    Reads every ``*_chunks.json`` in RAG_DIR and skips records whose embedding
+    input is shorter than 40 characters. The remaining inputs are embedded and
+    L2-normalised, then three files are written to RAG_DIR: ``vectors.npy``,
+    ``meta.json`` and ``faiss.index`` (an IndexFlatIP). The process exits with
+    status 1 when there are no input files or nothing to embed.
+    """
+    rag_files = list(RAG_DIR.glob("*_chunks.json"))
+    if not rag_files:
+        log("RAG 파일(*_chunks.json)이 없습니다. 먼저 chunk_and_summary.py를 실행해야 합니다.")
+        sys.exit(1)
+
+    docs = []
+    metas = []
+    for rag_file in rag_files:
+        try:
+            units = json.loads(rag_file.read_text(encoding="utf-8", errors="ignore"))
+        except Exception as e:
+            log(f"[WARN] RAG 파일 읽기 실패: {rag_file.name} | {e}")
+            continue
+
+        for unit in units:
+            embed_input = _build_embed_input(unit)
+            # Also covers the empty string.
+            if len(embed_input) < 40:
+                continue
+            docs.append(embed_input)
+            metas.append({
+                "source": unit.get("source", ""),
+                "chunk": int(unit.get("chunk", 0)),
+                "folder_context": unit.get("folder_context", "")
+            })
+
+    if not docs:
+        log("임베딩할 텍스트가 없습니다.")
+        sys.exit(1)
+
+    log(f"임베딩 대상 텍스트 수: {len(docs)}")
+
+    vectors = embed_texts(docs)
+    if vectors.shape[0] != len(docs):
+        log(f"[WARN] 임베딩 수 불일치: E={vectors.shape[0]}, docs={len(docs)}")
+
+    # Normalise in place before building the inner-product index.
+    faiss.normalize_L2(vectors)
+    index = faiss.IndexFlatIP(vectors.shape[1])
+    index.add(vectors)
+
+    np.save(str(RAG_DIR / "vectors.npy"), vectors)
+    meta_json = json.dumps(metas, ensure_ascii=False, indent=2)
+    (RAG_DIR / "meta.json").write_text(meta_json, encoding="utf-8")
+    faiss.write_index(index, str(RAG_DIR / "faiss.index"))
+
+    log(f"FAISS 인덱스 구축 완료: 벡터 수={len(metas)}")
+
+
+def main():
+    """Script entry point: build the FAISS index, logging a start and an end marker."""
+    log("=== FAISS RAG 인덱스 구축 시작 ===")
+    build_faiss_index()
+    log("=== FAISS RAG 인덱스 구축 종료 ===")
+
+
+if __name__ == "__main__":
+    main()
--- a/Code/geulbeot_5th/converters/pipeline/step6_corpus.py
+++ b/Code/geulbeot_5th/converters/pipeline/step6_corpus.py
@@ -0,0 +1,232 @@
+# -*- coding: utf-8 -*-
+"""
+make_corpus_v2.py
+
+기능:
+- output/rag/*_chunks.json 에서 모든 청크의 summary를 모아
+- AI가 CEL 목적(교육+자사솔루션 홍보)에 맞게 압축 정리
+- 중복은 빈도 표시, 희귀하지만 중요한 건 [핵심] 표시
+- 결과를 output/context/corpus.txt 로 저장
+
+전제:
+- chunk_and_summary.py 실행 후 *_chunks.json 들이 존재해야 한다.
+- domain_prompt.txt가 존재해야 한다.
+"""
+
+import os
+import sys
+import json
+from pathlib import Path
+from datetime import datetime
+
+from openai import OpenAI
+from api_config import API_KEYS
+
+# ===== Path settings =====
+# Hard-coded absolute Windows paths. Chunk files are read from RAG_DIR and
+# the corpus files are written to CONTEXT_DIR.
+DATA_ROOT   = Path(r"D:\for python\survey_test\process")
+OUTPUT_ROOT = Path(r"D:\for python\survey_test\output")
+RAG_DIR     = OUTPUT_ROOT / "rag"
+CONTEXT_DIR = OUTPUT_ROOT / "context"
+LOG_DIR     = OUTPUT_ROOT / "logs"
+
+# Import-time side effect: directories are created on module load.
+for d in [RAG_DIR, CONTEXT_DIR, LOG_DIR]:
+    d.mkdir(parents=True, exist_ok=True)
+
+# ===== OpenAI settings =====
+# Falls back to an empty string when 'GPT_API_KEY' is absent from API_KEYS.
+OPENAI_API_KEY = API_KEYS.get('GPT_API_KEY', '')
+GPT_MODEL      = "gpt-5-2025-08-07"
+
+client = OpenAI(api_key=OPENAI_API_KEY)
+
+# ===== Compression settings =====
+BATCH_SIZE = 80  # number of summaries handled per request
+MAX_CHARS_PER_BATCH = 3000  # character limit requested in the prompt for each batch result
+MAX_FINAL_CHARS = 8000  # character limit requested in the prompt for the final corpus
+
+
+def log(msg: str):
+    """Print *msg* (flushed) and append it, prefixed with HH:MM:SS, to make_corpus_log.txt."""
+    print(msg, flush=True)
+    log_file = LOG_DIR / "make_corpus_log.txt"
+    with open(log_file, "a", encoding="utf-8") as handle:
+        stamp = datetime.now().strftime('%H:%M:%S')
+        handle.write(f"[{stamp}] {msg}\n")
+
+
+def load_domain_prompt() -> str:
+    """Return the stripped contents of domain_prompt.txt, or exit(1) when it is missing."""
+    prompt_file = CONTEXT_DIR / "domain_prompt.txt"
+    if prompt_file.exists():
+        return prompt_file.read_text(encoding="utf-8", errors="ignore").strip()
+    log("domain_prompt.txt가 없습니다. 먼저 step1을 실행해야 합니다.")
+    sys.exit(1)
+
+
+def load_all_summaries() -> list:
+    """Collect one entry per chunk summary from every ``*_chunks.json`` in RAG_DIR.
+
+    Each entry has the form ``"[<source>] <summary>"``, followed by
+    ``" (키워드: <keywords>)"`` when the chunk has keywords. Chunks without a
+    summary are skipped, unreadable files are logged and skipped, and the
+    process exits with status 1 when no chunk files exist.
+    """
+    rag_files = sorted(RAG_DIR.glob("*_chunks.json"))
+    if not rag_files:
+        log("RAG 파일(*_chunks.json)이 없습니다. 먼저 chunk_and_summary.py를 실행해야 합니다.")
+        sys.exit(1)
+
+    summaries = []
+    for rag_file in rag_files:
+        try:
+            units = json.loads(rag_file.read_text(encoding="utf-8", errors="ignore"))
+        except Exception as e:
+            log(f"[WARN] RAG 파일 읽기 실패: {rag_file.name} | {e}")
+            continue
+
+        for unit in units:
+            summ = (unit.get("summary") or "").strip()
+            source = (unit.get("source") or "").strip()
+            keywords = unit.get("keywords") or ""
+            if not summ:
+                continue
+
+            # Source first, keywords appended only when present.
+            keyword_note = f" (키워드: {keywords})" if keywords else ""
+            summaries.append(f"[{source}] {summ}{keyword_note}")
+
+    return summaries
+
+
+def compress_batch(domain_prompt: str, batch: list, batch_num: int, total_batches: int) -> str:
+    """Have the chat model condense one batch of summaries.
+
+    The batch is sent as a numbered list. On any failure the error is logged
+    and the first ten summaries of the batch are returned, newline-joined.
+    """
+    batch_text = "\n".join(f"{pos}. {item}" for pos, item in enumerate(batch, start=1))
+
+    prompt = f"""
+아래는 문서에서 추출한 요약 {len(batch)}개이다. (배치 {batch_num}/{total_batches})
+
+[요약 목록]
+{batch_text}
+
+다음 기준으로 이 요약들을 압축 정리하라:
+
+1) 중복/유사 내용: 하나로 통합하되, 여러 문서에서 언급되면 "(N회 언급)" 표시
+2) domain_prompt에 명시된 핵심 솔루션/시스템: 반드시 보존하고 [솔루션] 표시
+3) domain_prompt의 목적에 중요한 내용 우선 보존:
+   - 해당 분야의 기초 개념
+   - 기존 방식의 한계점과 문제점
+   - 새로운 기술/방식의 장점
+4) 단순 나열/절차만 있는 내용: 과감히 축약
+5) 희귀하지만 핵심적인 인사이트: [핵심] 표시
+
+출력 형식:
+- 주제별로 그룹핑
+- 각 항목은 1~2문장으로 간결하게
+- 전체 {MAX_CHARS_PER_BATCH}자 이내
+- 마크다운 없이 순수 텍스트로
+"""
+
+    system_content = domain_prompt + "\n\n너는 문서 요약을 주제별로 압축 정리하는 전문가이다."
+    try:
+        resp = client.chat.completions.create(
+            model=GPT_MODEL,
+            messages=[
+                {"role": "system", "content": system_content},
+                {"role": "user", "content": prompt}
+            ]
+        )
+        result = resp.choices[0].message.content.strip()
+        log(f"    배치 {batch_num}/{total_batches} 압축 완료 ({len(result)}자)")
+        return result
+    except Exception as e:
+        log(f"[ERROR] 배치 {batch_num} 압축 실패: {e}")
+        # On failure, return part of the original summaries.
+        return "\n".join(batch[:10])
+
+
+def merge_compressed_parts(domain_prompt: str, parts: list) -> str:
+    """Merge the per-batch results into the final corpus text.
+
+    A single part is returned unchanged without calling the model. Otherwise
+    the parts are labelled, sent to the chat model, and its stripped reply is
+    returned. If that call fails, the error is logged and the parts are
+    returned joined by blank lines.
+    """
+    if len(parts) == 1:
+        return parts[0]
+
+    labelled = [f"[파트 {pos}]\n{part}" for pos, part in enumerate(parts, start=1)]
+    all_parts = "\n\n---\n\n".join(labelled)
+
+    prompt = f"""
+아래는 대량의 문서 요약을 배치별로 압축한 결과이다.
+이것을 최종 corpus로 통합하라.
+
+[배치별 압축 결과]
+{all_parts}
+
+통합 기준:
+1) 파트 간 중복 내용 제거 및 통합
+2) domain_prompt에 명시된 목적과 흐름에 맞게 재구성
+3) [솔루션], [핵심], (N회 언급) 표시는 유지
+4) 전체 {MAX_FINAL_CHARS}자 이내
+
+출력: 주제별로 정리된 최종 corpus (마크다운 없이)
+"""
+
+    system_content = domain_prompt + "\n\n너는 CEL 교육 콘텐츠 기획을 위한 corpus를 설계하는 전문가이다."
+    try:
+        resp = client.chat.completions.create(
+            model=GPT_MODEL,
+            messages=[
+                {"role": "system", "content": system_content},
+                {"role": "user", "content": prompt}
+            ]
+        )
+        return resp.choices[0].message.content.strip()
+    except Exception as e:
+        log(f"[ERROR] 최종 통합 실패: {e}")
+        return "\n\n".join(parts)
+
+
+def main():
+    """Build corpus.txt from all chunk summaries.
+
+    Steps: load the domain prompt, collect the summaries, save them unchanged
+    as corpus_raw.txt, compress them in batches of BATCH_SIZE, merge the batch
+    results and write corpus.txt. Exits with status 1 when there are no
+    summaries.
+    """
+    banner = "=" * 60
+    log(banner)
+    log("corpus 생성 시작 (AI 압축 버전)")
+    log(banner)
+
+    domain_prompt = load_domain_prompt()
+    log(f"도메인 프롬프트 로드 완료 ({len(domain_prompt)}자)")
+
+    summaries = load_all_summaries()
+    if not summaries:
+        log("summary가 없습니다. corpus를 생성할 수 없습니다.")
+        sys.exit(1)
+
+    summary_count = len(summaries)
+    log(f"원본 요약 수집 완료: {summary_count}개")
+
+    # Keep the uncompressed summaries as a backup.
+    raw_corpus = "\n".join(summaries)
+    raw_path = CONTEXT_DIR / "corpus_raw.txt"
+    raw_path.write_text(raw_corpus, encoding="utf-8")
+    log(f"원본 corpus 백업: {raw_path} ({len(raw_corpus)}자)")
+
+    # Ceiling division: number of batches needed for all summaries.
+    total_batches = (summary_count + BATCH_SIZE - 1) // BATCH_SIZE
+    log(f"\n배치 압축 시작 ({BATCH_SIZE}개씩, 총 {total_batches}배치)")
+
+    compressed_parts = []
+    for batch_num, start in enumerate(range(0, summary_count, BATCH_SIZE), start=1):
+        batch = summaries[start:start + BATCH_SIZE]
+        compressed_parts.append(
+            compress_batch(domain_prompt, batch, batch_num, total_batches)
+        )
+
+    log(f"\n최종 통합 시작 ({len(compressed_parts)}개 파트)")
+    final_corpus = merge_compressed_parts(domain_prompt, compressed_parts)
+
+    out_path = CONTEXT_DIR / "corpus.txt"
+    out_path.write_text(final_corpus, encoding="utf-8")
+
+    log("\n" + banner)
+    log("corpus 생성 완료!")
+    log(banner)
+    log(f"원본 요약: {summary_count}개 ({len(raw_corpus)}자)")
+    log(f"압축 corpus: {len(final_corpus)}자")
+    log(f"압축률: {100 - (len(final_corpus) / len(raw_corpus) * 100):.1f}%")
+    log(f"\n저장 위치:")
+    log(f"  - 원본: {raw_path}")
+    log(f"  - 압축: {out_path}")
+
+
+if __name__ == "__main__":
+    main()
--- a/Code/geulbeot_5th/converters/pipeline/step7_index.py
+++ b/Code/geulbeot_5th/converters/pipeline/step7_index.py
@@ -0,0 +1,504 @@
+# -*- coding: utf-8 -*-
+"""
+make_outline.py
+
+기능:
+- output_context/context/domain_prompt.txt
+- output_context/context/corpus.txt
+을 기반으로 목차를 생성하고,
+
+1) outline_issue_report.txt 저장
+2) outline_issue_report.html 저장 (테스트.html 레이아웃 기반 표 형태)
+"""
+
+import os
+import sys
+import re
+from pathlib import Path
+from datetime import datetime
+from typing import List, Dict, Any, Tuple
+
+from openai import OpenAI
+from api_config import API_KEYS
+
+# ===== Path settings =====
+# NOTE: absolute, machine-specific Windows paths.
+DATA_ROOT   = Path(r"D:\for python\survey_test\process")
+OUTPUT_ROOT = Path(r"D:\for python\survey_test\output")
+CONTEXT_DIR = OUTPUT_ROOT / "context"
+LOG_DIR     = OUTPUT_ROOT / "logs"
+
+# Create the output directories at import time so later writes cannot fail
+# on a missing folder.
+for d in [CONTEXT_DIR, LOG_DIR]:
+    d.mkdir(parents=True, exist_ok=True)
+
+# ===== OpenAI settings (structure kept as-is) =====
+# Falls back to an empty key when 'GPT_API_KEY' is absent from API_KEYS.
+OPENAI_API_KEY = API_KEYS.get('GPT_API_KEY', '')
+GPT_MODEL      = "gpt-5-2025-08-07"
+
+client = OpenAI(api_key=OPENAI_API_KEY)
+
+# ===== Regexes for outline parsing =====
+# RE_KEYWORDS: "#tag" tokens; RE_L1: "1. title"; RE_L2: "1.1 title";
+# RE_L3: "1.1.1 title".
+RE_KEYWORDS = re.compile(r"(#\S+)")
+RE_L1 = re.compile(r"^\s*(\d+)\.\s+(.+?)\s*$")
+RE_L2 = re.compile(r"^\s*(\d+\.\d+)\s+(.+?)\s*$")
+RE_L3 = re.compile(r"^\s*(\d+\.\d+\.\d+)\s+(.+?)\s*$")
+
+def log(msg: str):
+    """Print *msg* (flushed immediately) and append it to the outline log file."""
+    print(msg, flush=True)
+    log_file = LOG_DIR / "make_outline_log.txt"
+    with log_file.open("a", encoding="utf-8") as handle:
+        handle.write(msg + "\n")
+
+def load_domain_prompt() -> str:
+    """Return the stripped text of ``domain_prompt.txt`` from the context directory.
+
+    Logs a message and exits with status 1 when the file does not exist.
+    Undecodable bytes are ignored while reading.
+    """
+    prompt_path = CONTEXT_DIR / "domain_prompt.txt"
+    if prompt_path.exists():
+        return prompt_path.read_text(encoding="utf-8", errors="ignore").strip()
+    log("domain_prompt.txt가 없습니다. 먼저 domain_prompt.py를 실행해야 합니다.")
+    sys.exit(1)
+
+def load_corpus() -> str:
+    """Return the stripped text of ``corpus.txt`` from the context directory.
+
+    Logs a message and exits with status 1 when the file does not exist.
+    Undecodable bytes are ignored while reading.
+    """
+    corpus_path = CONTEXT_DIR / "corpus.txt"
+    if corpus_path.exists():
+        return corpus_path.read_text(encoding="utf-8", errors="ignore").strip()
+    log("corpus.txt가 없습니다. 먼저 make_corpus.py를 실행해야 합니다.")
+    sys.exit(1)
+
+
+# The existing RE_L1 / RE_L2 are kept; the two patterns below are added for
+# third-level sections.
+# RE_L3_HEAD:  "1.1.1 <title>" heading line.
+# RE_L3_TOPIC: "- <topic> | <keywords> | [<type>] | <guide>" bullet line
+#              (bullet may be "-" or "*").
+RE_L3_HEAD = re.compile(r"^\s*(\d+\.\d+\.\d+)\s+(.+)$") 
+RE_L3_TOPIC = re.compile(r"^\s*[\-\*]\s+(.+?)\s*\|\s*(.+?)\s*\|\s*(\[.+?\])\s*\|\s*(.+)$") 
+
+def generate_outline(domain_prompt: str, corpus: str) -> str:
+    """Ask the chat model for a report title and a numbered outline.
+
+    The system message is *domain_prompt* followed by a fixed role
+    instruction; the user message embeds *corpus* and the formatting
+    requirements.  Returns the stripped reply text, or an empty string when
+    the reply carries no content.
+    """
+    system_content = (
+        domain_prompt + "\n\n"
+        "너는 건설/측량 DX 기술 보고서의 구조를 설계하는 시니어 기술사이다. "
+        "주어진 corpus를 분석하여, 실무자가 즉시 활용 가능한 고밀도 지침서 목차를 설계하라."
+    )
+
+    user_content = f"""
+아래 [corpus]를 바탕으로 보고서 제목과 전략적 목차를 설계하라.
+
+[corpus]
+{corpus}
+
+요구 사항:
+1) 첫 줄에 보고서 제목 1개를 작성하라.
+2) 그 아래 목차를 번호 기반 계측 구조로 작성하라.
+   - 대목차: 1. / 2. / 3. ...
+   - 중목차: 1.1 / 1.2 / ...
+   - 소목차: 1.1.1 / 1.1.2 / ...
+3) **수량 제약 (중요)**:
+   - 대목차(1.)는 5~8개로 구성하라.
+   - **중목차(1.1) 하나당 소목차(1.1.1, 1.1.2...)는 반드시 2개에서 4개 사이로 구성하라.** (절대 1개만 만들지 말 것)
+   - 소목차(1.1.1) 하나당 '핵심주제(꼭지)'는 반드시 2개에서 3개 사이로 구성하라.
+
+[소목차 작성 형식]
+1.1.1 소목차 제목 
+ - 핵심주제 1 | #키워드 | [유형] | 집필가이드(데이터/표 구성 지침) 
+ - 핵심주제 2 | #키워드 | [유형] | 집필가이드(데이터/표 구성 지침)
+
+5) [유형] 분류 가이드:
+   - [비교형]: 기존 vs DX 방식의 비교표(Table)가 필수적인 경우
+   - [기술형]: RMSE, GSD, 중복도 등 정밀 수치와 사양 설명이 핵심인 경우
+   - [절차형]: 단계별 워크플로 및 체크리스트가 중심인 경우
+   - [인사이트형]: 한계점 분석 및 전문가 제언(☞)이 중심인 경우
+6) 집필가이드는 50자 내외로, "어떤 데이터를 검색해서 어떤 표를 그려라"와 같이 구체적으로 지시하라.
+7) 대목차는 최대 8개 이내로 구성하라.
+"""
+
+    conversation = [
+        {"role": "system", "content": system_content},
+        {"role": "user", "content": user_content},
+    ]
+    completion = client.chat.completions.create(
+        model=GPT_MODEL,
+        messages=conversation,
+    )
+    answer = completion.choices[0].message.content
+    return (answer or "").strip()
+
+
+
+def parse_outline(outline_text: str) -> Tuple[str, List[Dict[str, Any]]]:
+    """Split model outline text into a report title and structured rows.
+
+    The first non-blank line is the title.  Every later line is tried, in
+    order, as a third-level heading ("1.1.1 ..."), a topic bullet belonging
+    to the most recent third-level heading, a first-level heading ("1. ...")
+    and a second-level heading ("1.1 ...").  Lines matching none of these
+    are ignored.
+
+    Returns ``(title, rows)``; rows are dicts with ``depth``, ``num`` and
+    ``title``, and depth-3 rows additionally carry a ``sub_topics`` list.
+    Blank input yields ``("", [])``.
+    """
+    non_blank = [line.rstrip() for line in outline_text.splitlines() if line.strip()]
+    if not non_blank:
+        return "", []
+
+    report_title = non_blank[0].strip()
+    entries: List[Dict[str, Any]] = []
+    open_section = None  # third-level section currently collecting topics
+
+    for line in non_blank[1:]:
+        stripped = line.strip()
+
+        # Third-level heading: start a new section that collects topics.
+        head_match = RE_L3_HEAD.match(stripped)
+        if head_match:
+            open_section = {
+                "depth": 3,
+                "num": head_match.group(1),
+                "title": head_match.group(2),
+                "sub_topics": [],
+            }
+            entries.append(open_section)
+            continue
+
+        # Topic bullet: only meaningful while a section is open.
+        topic_match = RE_L3_TOPIC.match(stripped)
+        if topic_match and open_section:
+            topic_title, keyword_field, topic_type, guide = topic_match.groups()
+            tags = [tag.lstrip("#").strip() for tag in RE_KEYWORDS.findall(keyword_field)]
+            open_section["sub_topics"].append({
+                "topic_title": topic_title,
+                "keywords": tags,
+                "type": topic_type,
+                "guide": guide
+            })
+            continue
+
+        # First-level, then second-level heading; either closes the section.
+        for depth, pattern in ((1, RE_L1), (2, RE_L2)):
+            heading = pattern.match(stripped)
+            if heading:
+                entries.append({
+                    "depth": depth,
+                    "num": heading.group(1).strip(),
+                    "title": heading.group(2).strip(),
+                })
+                open_section = None
+                break
+
+    return report_title, entries
+
+def html_escape(s: str) -> str:
+    """Escape ``& < > " '`` for safe inclusion in HTML; falsy input gives ""."""
+    entity_table = str.maketrans({
+        "&": "&amp;",
+        "<": "&lt;",
+        ">": "&gt;",
+        '"': "&quot;",
+        "'": "&#39;",
+    })
+    # One pass over the text; entities produced here are never re-escaped.
+    return (s or "").translate(entity_table)
+
+def chunk_rows(rows: List[Dict[str, Any]], max_rows_per_page: int = 26) -> List[List[Dict[str, Any]]]:
+    """
+    Split *rows* into consecutive pages by plain row count, because a long
+    table would overflow a single A4 sheet.  Each page holds up to
+    *max_rows_per_page* rows; empty input yields no pages.
+    """
+    pages: List[List[Dict[str, Any]]] = []
+    for row in rows:
+        # Open a new page at the start and whenever the last one is full.
+        if not pages or len(pages[-1]) >= max_rows_per_page:
+            pages.append([])
+        pages[-1].append(row)
+    return pages
+
+def build_outline_table_html(rows: List[Dict[str, Any]]) -> str:
+    """
+    Render outline rows as an HTML ``<table>`` (it relies on the table
+    styles of the surrounding page).
+
+    Each row dict needs ``depth``, ``num`` and ``title``.  Depth 1 and 2 are
+    labelled as first- and second-level headings with an empty keyword cell;
+    any other depth is labelled as a third-level heading and shows its
+    keywords.  Keywords come from the row's ``keywords`` list or, when that
+    is missing or empty, from the keywords of its ``sub_topics`` (first
+    occurrence order, duplicates removed).  ``num``, ``title`` and the
+    keyword text are HTML-escaped.
+    """
+    head = """
+    <table>
+        <thead>
+            <tr>
+                <th>구분</th>
+                <th>번호</th>
+                <th>제목</th>
+                <th>키워드</th>
+            </tr>
+        </thead>
+        <tbody>
+    """
+
+    depth_labels = {1: "대목차", 2: "중목차"}
+
+    body_parts = []
+    for r in rows:
+        depth = r["depth"]
+        num = html_escape(r["num"])
+        title = html_escape(r["title"])
+
+        keywords = list(r.get("keywords") or [])
+        if not keywords:
+            # Rows that only carry "sub_topics" keep their keywords per
+            # topic; without this fallback the keyword column stays empty.
+            for topic in r.get("sub_topics") or []:
+                for k in topic.get("keywords") or []:
+                    if k not in keywords:
+                        keywords.append(k)
+        kw = html_escape(" ".join(f"#{k}" for k in keywords if k))
+
+        label = depth_labels.get(depth, "소목차")
+        # Only third-level rows display keywords.
+        kw_cell = "" if depth in depth_labels else kw
+
+        body_parts.append(
+            f"""
+                <tr>
+                    <td class="group-cell">{label}</td>
+                    <td>{num}</td>
+                    <td>{title}</td>
+                    <td>{kw_cell}</td>
+                </tr>
+                """
+        )
+
+    tail = """
+        </tbody>
+    </table>
+    """
+    return head + "\n".join(body_parts) + tail
+
+def build_outline_html(report_title: str, rows: List[Dict[str, Any]]) -> str:
+    """
+    Build a complete HTML document that shows the outline as A4-style sheets.
+
+    *rows* are split into pages of at most 26 rows; every page repeats the
+    header, the escaped *report_title*, the outline table for that page and
+    a footer with "page / total".  When *rows* is empty a single sheet with
+    an empty table is still rendered, so the document is never blank.
+    """
+    css = r"""
+        @import url('https://fonts.googleapis.com/css2?family=Noto+Sans+KR:wght@300;400;500;700;900&display=swap');
+
+        :root {
+            --primary-blue: #3057B9;
+            --gray-light: #F2F2F2;
+            --gray-medium: #E6E6E6;
+            --gray-dark: #666666;
+            --border-light: #DDDDDD;
+            --text-black: #000000;
+        }
+
+        * {
+            margin: 0;
+            padding: 0;
+            box-sizing: border-box;
+            -webkit-print-color-adjust: exact;
+        }
+
+        body {
+            font-family: 'Noto Sans KR', sans-serif;
+            background-color: #f0f0f0;
+            color: var(--text-black);
+            line-height: 1.35;
+            display: flex;
+            justify-content: center;
+            padding: 10px 0;
+        }
+
+        .sheet {
+            background-color: white;
+            width: 210mm;
+            height: 297mm;
+            padding: 20mm 20mm;
+            box-shadow: 0 0 10px rgba(0,0,0,0.1);
+            position: relative;
+            display: flex;
+            flex-direction: column;
+            overflow: hidden;
+            margin-bottom: 12px;
+        }
+
+        @media print {
+            body { background: none; padding: 0; }
+            .sheet { box-shadow: none; margin: 0; border: none; page-break-after: always; }
+        }
+
+        .page-header {
+            display: flex;
+            justify-content: space-between;
+            align-items: flex-start;
+            margin-bottom: 15px;
+            font-size: 8.5pt;
+            color: var(--gray-dark);
+        }
+
+        .header-title {
+            font-size: 24pt;
+            font-weight: 900;
+            margin-bottom: 8px;
+            letter-spacing: -1.5px;
+            color: #111;
+        }
+
+        .title-divider {
+            height: 4px;
+            background-color: var(--primary-blue);
+            width: 100%;
+            margin-bottom: 20px;
+        }
+
+        .lead-box {
+            background-color: var(--gray-light);
+            padding: 18px 20px;
+            margin-bottom: 5px;
+            border-radius: 2px;
+            text-align: center;
+        }
+
+        .lead-box div {
+            font-size: 13pt;
+            font-weight: 700;
+            color: var(--primary-blue);
+            letter-spacing: -0.5px;
+        }
+
+        .lead-notes {
+            font-size: 8.5pt;
+            color: #777;
+            margin-bottom: 20px;
+            padding-left: 5px;
+            text-align: right;
+        }
+
+        .body-content { flex: 1; }
+
+        .section { margin-bottom: 22px; }
+
+        .section-title {
+            font-size: 13pt;
+            font-weight: 700;
+            display: flex;
+            align-items: center;
+            margin-bottom: 10px;
+            color: #111;
+        }
+
+        .section-title::before {
+            content: "";
+            display: inline-block;
+            width: 10px;
+            height: 10px;
+            background-color: #999;
+            margin-right: 10px;
+        }
+
+        table {
+            width: 100%;
+            border-collapse: collapse;
+            margin: 8px 0;
+            font-size: 9.5pt;
+            border-top: 1.5px solid #333;
+        }
+
+        th {
+            background-color: var(--gray-medium);
+            font-weight: 700;
+            padding: 10px;
+            border: 1px solid var(--border-light);
+        }
+
+        td {
+            padding: 10px;
+            border: 1px solid var(--border-light);
+            vertical-align: middle;
+        }
+
+        .group-cell {
+            background-color: #F9F9F9;
+            font-weight: 700;
+            width: 16%;
+            text-align: center;
+            color: var(--primary-blue);
+            white-space: nowrap;
+        }
+
+        .page-footer {
+            margin-top: 15px;
+            padding-top: 10px;
+            display: flex;
+            justify-content: space-between;
+            font-size: 8.5pt;
+            color: var(--gray-dark);
+            border-top: 1px solid #EEE;
+        }
+
+        .footer-page { flex: 1; text-align: center; }
+    """
+
+    # An empty outline still gets one (empty) page; previously no sheet was
+    # rendered at all even though the page total was reported as 1.
+    pages = chunk_rows(rows, max_rows_per_page=26) or [[]]
+    total_pages = len(pages)
+
+    # Computed once so every sheet shows the same date.
+    created_on = datetime.now().strftime("%Y. %m. %d.")
+    escaped_title = html_escape(report_title)
+
+    html_pages = []
+    for i, page_rows in enumerate(pages, start=1):
+        table_html = build_outline_table_html(page_rows)
+
+        html_pages.append(f"""
+        <div class="sheet">
+            <header class="page-header">
+                <div class="header-left">
+                    보고서: 목차 자동 생성 결과
+                </div>
+                <div class="header-right">
+                    작성일자: {created_on}
+                </div>
+            </header>
+
+            <div class="title-block">
+                <h1 class="header-title">{escaped_title}</h1>
+                <div class="title-divider"></div>
+            </div>
+
+            <div class="body-content">
+                <div class="lead-box">
+                    <div>확정 목차 표 형태 정리본</div>
+                </div>
+                <div class="lead-notes">목차는 outline_issue_report.txt를 기반으로 표로 재구성됨</div>
+
+                <div class="section">
+                    <div class="section-title">목차</div>
+                    {table_html}
+                </div>
+            </div>
+
+            <footer class="page-footer">
+                <div class="footer-slogan">Word Style v2 Outline</div>
+                <div class="footer-page">- {i} / {total_pages} -</div>
+                <div class="footer-info">outline_issue_report.html</div>
+            </footer>
+        </div>
+        """)
+
+    return f"""<!DOCTYPE html>
+<html lang="ko">
+<head>
+  <meta charset="UTF-8">
+  <title>{escaped_title} - Outline</title>
+  <style>{css}</style>
+</head>
+<body>
+  {''.join(html_pages)}
+</body>
+</html>
+"""
+
+def main():
+    """Generate the outline and save it as both TXT and HTML.
+
+    Loads the domain prompt and the corpus, requests an outline from the
+    model, writes the raw text to ``outline_issue_report.txt``, then parses
+    it and writes the table rendering to ``outline_issue_report.html``.
+    """
+    log("=== 목차 생성 시작 ===")
+    prompt_text = load_domain_prompt()
+    corpus_text = load_corpus()
+
+    outline_text = generate_outline(prompt_text, corpus_text)
+
+    # Raw text output.
+    txt_path = CONTEXT_DIR / "outline_issue_report.txt"
+    txt_path.write_text(outline_text, encoding="utf-8")
+    log(f"목차 TXT 저장 완료: {txt_path}")
+
+    # HTML rendering of the parsed outline.
+    report_title, outline_rows = parse_outline(outline_text)
+    html_path = CONTEXT_DIR / "outline_issue_report.html"
+    html_document = build_outline_html(report_title, outline_rows)
+    html_path.write_text(html_document, encoding="utf-8")
+    log(f"목차 HTML 저장 완료: {html_path}")
+
+    log("=== 목차 생성 종료 ===")
+
+# Generate the outline only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
--- a/Code/geulbeot_5th/converters/pipeline/step8_content.py
+++ b/Code/geulbeot_5th/converters/pipeline/step8_content.py
--- a/Code/geulbeot_5th/converters/pipeline/step9_html.py
+++ b/Code/geulbeot_5th/converters/pipeline/step9_html.py
--- a/Code/geulbeot_5th/converters/style_analyzer.py
+++ b/Code/geulbeot_5th/converters/style_analyzer.py
@@ -0,0 +1,935 @@
+"""
+HTML 스타일 분석기 v3.0
+HTML 요소를 분석하여 역할(Role)을 자동 분류
+
+✅ v3.0 변경사항:
+- 글벗 HTML 구조 완벽 지원 (.sheet, .body-content)
+- 머리말/꼬리말/페이지번호 제거
+- 강력한 중복 콘텐츠 필터링
+- 제목 계층 구조 정확한 인식
+"""
+
+import re
+from bs4 import BeautifulSoup, Tag, NavigableString
+from dataclasses import dataclass
+from typing import List, Dict, Optional, Tuple, Set
+from enum import Enum
+
+
+class DocumentSection(Enum):
+    """Kinds of document section an element can belong to."""
+    COVER = "cover"      # cover page
+    TOC = "toc"          # table of contents
+    CONTENT = "content"  # main body
+
+
+@dataclass
+class StyledElement:
+    """One extracted document element together with its assigned role."""
+    role: str           # assigned role (H1, BODY, TH, ...)
+    text: str           # text content
+    tag: str            # original HTML tag name
+    html: str           # original HTML markup
+    section: str        # section name (cover, toc, content)
+    attributes: Dict    # extra attributes (e.g. image src)
+
+    def __repr__(self):
+        # Show at most the first 30 characters of the text.
+        if len(self.text) > 30:
+            shown = f"{self.text[:30]}..."
+        else:
+            shown = self.text
+        return f"<{self.role}> {shown}"
+
+
+class StyleAnalyzer:
+    """HTML 문서를 분석하여 역할 분류"""
+    
+    # Numbering patterns, keyed by the heading/list style they recognise.
+    PATTERNS = {
+        # Chapter number: "제1장", "제2장"
+        "chapter": re.compile(r'^제\s*\d+\s*장'),
+        # Level-1 title: "1 ", "2 " (digit + space, no dot) followed by Hangul
+        "h1_num": re.compile(r'^(\d+)\s+[가-힣]'),
+        # Major item: "1.", "2."
+        "h2_num": re.compile(r'^(\d+)\.\s'),
+        # Middle item: "1.1 ", "1.2 "
+        "h3_num": re.compile(r'^(\d+)\.(\d+)\s'),
+        # Minor item: "1.1.1"
+        "h4_num": re.compile(r'^(\d+)\.(\d+)\.(\d+)'),
+        # Detail: "1)", "2)"
+        "h5_paren": re.compile(r'^(\d+)\)\s*'),
+        # Sub-detail: "(1)", "(2)"
+        "h6_paren": re.compile(r'^\((\d+)\)\s*'),
+        # Hangul letter with dot: "가.", "나."
+        "h4_korean": re.compile(r'^[가-하]\.\s'),
+        # Hangul letter with parenthesis: "가)", "나)"
+        "h5_korean": re.compile(r'^[가-하]\)\s'),
+        # Circled digits: "①", "②"
+        "h6_circle": re.compile(r'^[①②③④⑤⑥⑦⑧⑨⑩]'),
+        # List bullets: "•", "-", "○"
+        "list_bullet": re.compile(r'^[•\-○]\s'),
+        # Page-number pattern: "- 1 -", "- 12 -"
+        "page_number": re.compile(r'^-\s*\d+\s*-$'),
+        # Footer pattern: "<document title>- 1 -"
+        "footer_pattern": re.compile(r'.+[-–]\s*\d+\s*[-–]$'),
+    }
+    
+    # Text patterns that should be dropped from the output
+    REMOVE_PATTERNS = [
+        re.compile(r'^-\s*\d+\s*-$'),                    # "- 1 -"
+        re.compile(r'[-–]\s*\d+\s*[-–]\s*$'),           # "<document title>- 1 -"
+        re.compile(r'^\d+\s*×\s*\d+$'),                  # "643 × 236" (image size)
+        re.compile(r'^\[이미지 없음:.*\]$'),              # "[이미지 없음: xxx]" (missing-image placeholder)
+        re.compile(r'^\[그림\s*\d+-\d+\]$'),              # "[그림 1-1]" (figure label)
+    ]
+    
+    def __init__(self):
+        """Create an analyzer with empty results and default state."""
+        self.elements: List[StyledElement] = []
+        self.current_section = DocumentSection.CONTENT
+        self.seen_texts: Set[str] = set()  # keys of texts already emitted (duplicate filter)
+        self.document_title = ""  # document title (used for footer removal)
+    
+    def analyze(self, html: str) -> List[StyledElement]:
+        """Analyze an HTML document and return its role-classified elements.
+
+        Parses *html* with the built-in ``html.parser``, then runs the
+        pipeline: preprocessing, document-title extraction, section
+        detection/processing and post-processing (``_postprocess`` is
+        defined elsewhere in this class).
+
+        All per-document state is reset first, so one analyzer instance can
+        be reused for several documents.
+        """
+        soup = BeautifulSoup(html, 'html.parser')
+        self.elements = []
+        self.seen_texts = set()
+        # Also reset the remaining per-document state; otherwise the title
+        # and section of a previously analyzed document leak into this run
+        # (e.g. the old title would drive header/footer detection).
+        self.document_title = ""
+        self.current_section = DocumentSection.CONTENT
+
+        # 1. Preprocess: remove unnecessary elements
+        self._preprocess(soup)
+
+        # 2. Extract the document title (used to detect footer patterns)
+        self._extract_document_title(soup)
+
+        # 3. Detect sections and walk them
+        self._detect_and_process_sections(soup)
+
+        # 4. Post-process: remove duplicates and unnecessary elements
+        self._postprocess()
+
+        return self.elements
+    
+    def _preprocess(self, soup: BeautifulSoup):
+        """Strip non-content markup from *soup* in place.
+
+        Removes, in order: script/style-like tags, header/footer elements
+        whose text looks like page chrome, inline-hidden elements, and (when
+        a ``#raw-container`` exists) every ``.sheet`` outside of it.
+        """
+        print("  🔧 HTML 전처리 중...")
+
+        # 1. Script/style and other non-content tags
+        non_content = soup(['script', 'style', 'noscript', 'meta', 'link', 'head'])
+        for node in non_content:
+            node.decompose()
+
+        if non_content:
+            print(f"     - script/style 등 {len(non_content)}개 제거")
+
+        # 2. Header/footer areas (Geulbeot HTML structure)
+        chrome_selectors = (
+            '.page-header', '.page-footer', '.header', '.footer',
+            '[class*="header"]', '[class*="footer"]',
+            '.running-header', '.running-footer',
+        )
+        chrome_removed = 0
+        for css in chrome_selectors:
+            for node in soup.select(css):
+                # Only page headers/footers go; real content headers stay.
+                if self._is_header_footer_text(node.get_text(strip=True)):
+                    node.decompose()
+                    chrome_removed += 1
+
+        if chrome_removed > 0:
+            print(f"     - 머리말/꼬리말 {chrome_removed}개 제거")
+
+        # 3. Elements hidden through inline styles
+        hidden_selectors = (
+            '[style*="display:none"], [style*="display: none"]',
+            '[style*="visibility:hidden"], [style*="visibility: hidden"]',
+        )
+        for css in hidden_selectors:
+            for node in soup.select(css):
+                node.decompose()
+
+        # 4. Drop .sheet elements outside #raw-container (Geulbeot structure)
+        raw_container = soup.find(id='raw-container')
+        if not raw_container:
+            return
+
+        print("     - 글벗 구조 감지: #raw-container 우선 사용")
+        for sheet in soup.select('.sheet'):
+            if self._is_descendant_of(sheet, raw_container):
+                continue
+            sheet.decompose()
+    
+    def _extract_document_title(self, soup: BeautifulSoup):
+        """Store the cover's ``<h1>`` text as the document title, if present.
+
+        The cover is looked up by id ``box-cover`` first, then by class
+        ``box-cover``.  Nothing changes when no cover or no ``<h1>`` exists.
+        """
+        cover = soup.find(id='box-cover') or soup.find(class_='box-cover')
+        heading = cover.find('h1') if cover else None
+        if not heading:
+            return
+
+        self.document_title = heading.get_text(strip=True)
+        print(f"     - 문서 제목 감지: {self.document_title[:30]}...")
+    
+    def _is_header_footer_text(self, text: str) -> bool:
+        """Return True when *text* looks like a running header or footer.
+
+        That is: a bare page number ("- 1 -"), text ending in a page-number
+        marker ("title- 1 -"), or text that contains the known document
+        title together with a page-number marker.  Empty text is False.
+        """
+        if not text:
+            return False
+
+        patterns = self.PATTERNS
+        if patterns['page_number'].match(text) or patterns['footer_pattern'].match(text):
+            return True
+
+        # Document title combined with a page number anywhere in the text.
+        title = self.document_title
+        return bool(
+            title
+            and title in text
+            and re.search(r'[-–]\s*\d+\s*[-–]', text)
+        )
+    
+    def _should_skip_text(self, text: str) -> bool:
+        """Return True when *text* should not become an element.
+
+        Skipped are: empty text, text hit by any ``REMOVE_PATTERNS`` entry,
+        header/footer text, and a line equal to the document title once a
+        ``COVER_TITLE`` element containing that title has been recorded.
+        """
+        if not text:
+            return True
+
+        # Removal patterns.  ``search`` is required here: the "title- 1 -"
+        # pattern is intentionally unanchored at the start, so ``match``
+        # could never hit it when a title precedes the page number.  The
+        # other patterns carry their own ``^`` anchor and behave the same.
+        if any(pattern.search(text) for pattern in self.REMOVE_PATTERNS):
+            return True
+
+        # Running header/footer text.
+        if self._is_header_footer_text(text):
+            return True
+
+        # A line holding only the document title (it came from a footer):
+        # skip it when the cover has already produced the title.
+        if self.document_title and text.strip() == self.document_title:
+            if any(e.role == 'COVER_TITLE' and self.document_title in e.text
+                   for e in self.elements):
+                return True
+
+        return False
+    
+    def _is_descendant_of(self, element: Tag, ancestor: Tag) -> bool:
+        """Return True when *ancestor* is on *element*'s parent chain.
+
+        The comparison uses identity (``is``).  Beautiful Soup's ``==``
+        compares markup, so a different node with identical markup would
+        otherwise be accepted as the ancestor.
+        """
+        parent = element.parent
+        while parent is not None:
+            if parent is ancestor:
+                return True
+            parent = parent.parent
+        return False
+    
+    def _detect_and_process_sections(self, soup: BeautifulSoup):
+        """Pick the matching document layout and process it.
+
+        Priority: Geulbeot ``#raw-container``, then rendered ``.sheet``
+        pages, then generic HTML.
+        """
+        raw_container = soup.find(id='raw-container')
+        # Sheets are only looked up when there is no raw container.
+        sheets = None if raw_container else soup.select('.sheet')
+
+        if raw_container:
+            self._process_geulbeot_structure(raw_container)
+        elif sheets:
+            self._process_sheet_structure(sheets)
+        else:
+            self._process_generic_html(soup)
+    
+    def _process_geulbeot_structure(self, raw: Tag):
+        """Process the Geulbeot ``#raw-container`` layout.
+
+        Handles the cover, table of contents, summary and body boxes in that
+        order; each box is optional.  ``current_section`` is set before the
+        box's handler runs.
+        """
+        print("  📄 글벗 #raw-container 구조 처리 중...")
+
+        # (box id, progress message, section, handler), in processing order.
+        plan = (
+            ('box-cover', "     - 표지 섹션", DocumentSection.COVER, self._process_cover),
+            ('box-toc', "     - 목차 섹션", DocumentSection.TOC, self._process_toc),
+            ('box-summary', "     - 요약 섹션", DocumentSection.CONTENT, self._process_content_element),
+            ('box-content', "     - 본문 섹션", DocumentSection.CONTENT, self._process_content_element),
+        )
+        for box_id, message, section, handler in plan:
+            box = raw.find(id=box_id)
+            if box:
+                print(message)
+                self.current_section = section
+                handler(box)
+    
+    def _process_sheet_structure(self, sheets: List[Tag]):
+        """Process rendered ``.sheet`` pages.
+
+        For each sheet only its ``.body-content`` is processed.  When a sheet
+        has none, its direct child tags are processed instead, skipping any
+        whose class names contain "header" or "footer".
+        """
+        print(f"  📄 .sheet 페이지 구조 처리 중... ({len(sheets)}페이지)")
+
+        for sheet in sheets:
+            body_content = sheet.select_one('.body-content')
+            if body_content:
+                self._process_content_element(body_content)
+                continue
+
+            # No body-content: walk the children, leaving out page chrome.
+            for child in sheet.children:
+                if not isinstance(child, Tag):
+                    continue
+
+                css_classes = child.get('class', [])
+                joined = ' '.join(css_classes) if css_classes else ''
+                lowered = joined.lower()
+                if 'header' in lowered or 'footer' in lowered:
+                    continue
+
+                self._process_content_element(child)
+    
+    def _process_generic_html(self, soup: BeautifulSoup):
+        """Process HTML that follows no Geulbeot-specific layout.
+
+        A cover and a table of contents are located by class-name regex and
+        processed first; then every direct child tag of ``<main>``,
+        ``<article>``, ``<body>`` or the document itself (first one found)
+        is processed as body content.
+        """
+        print("  📄 일반 HTML 구조 처리 중...")
+
+        # Cover first, then table of contents.
+        front_matter = (
+            (DocumentSection.COVER, r'cover|title-page|box-cover', self._process_cover),
+            (DocumentSection.TOC, r'toc|table-of-contents', self._process_toc),
+        )
+        for section, class_pattern, handler in front_matter:
+            block = soup.find(class_=re.compile(class_pattern))
+            if block:
+                self.current_section = section
+                handler(block)
+
+        # Body
+        self.current_section = DocumentSection.CONTENT
+        root = soup.find('main') or soup.find('article') or soup.find('body') or soup
+
+        for node in root.children:
+            if isinstance(node, Tag):
+                self._process_content_element(node)
+    
+    def _process_cover(self, cover: Tag):
+        """Record the cover's title, subtitle and info paragraphs.
+
+        The first ``<h1>`` becomes ``COVER_TITLE``, the first ``<h2>``
+        ``COVER_SUBTITLE`` and every ``<p>`` ``COVER_INFO``.  Empty or
+        duplicate texts are not recorded.
+        """
+        def emit(node: Tag, role: str, tag_name: str):
+            # Append one cover element unless its text is empty/duplicate.
+            text = node.get_text(strip=True)
+            if text and not self._is_duplicate(text):
+                self.elements.append(StyledElement(
+                    role=role,
+                    text=text,
+                    tag=tag_name,
+                    html=str(node)[:200],
+                    section="cover",
+                    attributes={}
+                ))
+
+        # H1 = title
+        title_node = cover.find('h1')
+        if title_node:
+            emit(title_node, "COVER_TITLE", "h1")
+
+        # H2 = subtitle
+        subtitle_node = cover.find('h2')
+        if subtitle_node:
+            emit(subtitle_node, "COVER_SUBTITLE", "h2")
+
+        # P = info lines
+        for paragraph in cover.find_all('p'):
+            emit(paragraph, "COVER_INFO", "p")
+    
+    def _process_toc(self, toc: Tag):
+        """Record every ``<li>`` of the table of contents with a TOC level.
+
+        The level comes from a ``lvl-N`` class marker when present,
+        otherwise from the numbering of the text (most specific pattern
+        first); anything else is treated as level 1.  Empty or duplicate
+        entries are skipped.
+        """
+        class_roles = (
+            ('lvl-1', "TOC_H1"),
+            ('lvl-2', "TOC_H2"),
+            ('lvl-3', "TOC_H3"),
+        )
+        # Specific before general: "1.1.1" must be tested before "1.1".
+        numbering_roles = (
+            ('h4_num', "TOC_H3"),
+            ('h3_num', "TOC_H2"),
+        )
+
+        for item in toc.find_all('li'):
+            text = item.get_text(strip=True)
+            if not text or self._is_duplicate(text):
+                continue
+
+            css_classes = item.get('class', [])
+            class_str = ' '.join(css_classes) if css_classes else ''
+
+            # "toc-lvl-N" contains "lvl-N", so one substring test covers both.
+            role = next((r for marker, r in class_roles if marker in class_str), None)
+            if role is None:
+                role = next(
+                    (r for key, r in numbering_roles if self.PATTERNS[key].match(text)),
+                    "TOC_H1",
+                )
+
+            self.elements.append(StyledElement(
+                role=role,
+                text=text,
+                tag="li",
+                html=str(item)[:200],
+                section="toc",
+                attributes={}
+            ))
+    
+    def _process_content_element(self, element: Tag):
+        """Recursively turn a body element into classified elements.
+
+        Elements whose class names contain "header", "footer" or "page-num"
+        are ignored entirely.  Tables and figures/images are delegated to
+        their own handlers.  Otherwise the element's text is recorded when
+        it is neither skippable nor a duplicate and a role can be assigned;
+        container tags then have their child tags processed the same way.
+        """
+        if not isinstance(element, Tag):
+            return
+
+        tag_name = element.name.lower() if element.name else ""
+        classes = element.get('class', [])
+        class_str = ' '.join(classes) if classes else ''
+        lowered = class_str.lower()
+
+        # Page chrome is skipped together with everything inside it.
+        if 'header' in lowered or 'footer' in lowered or 'page-num' in lowered:
+            return
+
+        # Tables and figures have dedicated handlers.
+        if tag_name == 'table':
+            self._process_table(element)
+            return
+        if tag_name in ('figure', 'img'):
+            self._process_figure(element)
+            return
+
+        text = self._get_direct_text(element)
+
+        # Skippable text only suppresses this element; its children are
+        # still visited below.
+        if text and not self._should_skip_text(text) and not self._is_duplicate(text):
+            role = self._classify_role(element, tag_name, classes, text)
+            if role:
+                self.elements.append(StyledElement(
+                    role=role,
+                    text=text,
+                    tag=tag_name,
+                    html=str(element)[:200],
+                    section=self.current_section.value,
+                    attributes=dict(element.attrs) if element.attrs else {}
+                ))
+
+        # Recurse into container tags only.
+        container_tags = ('div', 'section', 'article', 'aside', 'main', 'body',
+                          'ul', 'ol', 'dl', 'blockquote')
+        if tag_name in container_tags:
+            for node in element.children:
+                if isinstance(node, Tag):
+                    self._process_content_element(node)
+    
+    def _get_direct_text(self, element: Tag) -> str:
+        """Return the text that belongs to *element* itself.
+
+        Headings, paragraphs, list items, table cells and captions yield
+        their full stripped text.  Any other tag yields only its own string
+        children (stripped, empties dropped) joined by single spaces, so
+        text inside child tags is excluded.
+        """
+        full_text_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'li', 'td', 'th', 'caption']
+        if element.name in full_text_tags:
+            return element.get_text(strip=True)
+
+        own_strings = (
+            str(node).strip()
+            for node in element.children
+            if isinstance(node, NavigableString)
+        )
+        return ' '.join(piece for piece in own_strings if piece)
+    
+    def _is_duplicate(self, text: str) -> bool:
+        """Return True when *text* is empty or was already seen.
+
+        Whitespace runs are collapsed before comparing.  Texts shorter than
+        10 characters are never treated as duplicates (and not remembered);
+        longer texts are keyed by their first 50 characters, and a new key
+        is remembered as a side effect.
+        """
+        if not text:
+            return True
+
+        collapsed = re.sub(r'\s+', ' ', text.strip())
+
+        # Short texts (numbers and the like) may legitimately repeat.
+        if len(collapsed) < 10:
+            return False
+
+        fingerprint = collapsed[:50]
+        already_seen = fingerprint in self.seen_texts
+        if not already_seen:
+            self.seen_texts.add(fingerprint)
+        return already_seen
+    
+    def _classify_role(self, element: Tag, tag: str, classes: List[str], text: str) -> Optional[str]:
+        """Classify an element into a role name ("H1".."H6", "BODY", ...).
+
+        Native heading tags map directly to their level. For p/div/span the
+        leading numbering of ``text`` decides the role, using the compiled
+        patterns in ``self.PATTERNS``.
+
+        IMPORTANT: patterns must be tried from the most specific form to the
+        most general one, otherwise the general form shadows the specific one:
+            1.1.1 -> 1.1 -> 1. -> 1
+            (1) -> 1)
+            가) -> 가.
+        """
+        class_str = ' '.join(classes) if classes else ''
+
+        # Real heading tags take precedence over any text pattern.
+        heading_roles = {
+            'h1': "H1", 'h2': "H2", 'h3': "H3",
+            'h4': "H4", 'h5': "H5", 'h6': "H6",
+        }
+        if tag in heading_roles:
+            return heading_roles[tag]
+
+        if tag in ('p', 'div', 'span'):
+            patterns = self.PATTERNS
+
+            # Dotted numbering, deepest level first ("1.1.1", "1.1", "1.").
+            # Only short text counts as a heading; long text is body copy.
+            for key, role in (('h4_num', "H3"), ('h3_num', "H2"), ('h2_num', "H1")):
+                if patterns[key].match(text):
+                    return role if len(text) < 100 else "BODY"
+
+            # "1 <Korean text>" form (digit, space, Hangul).
+            if patterns['h1_num'].match(text):
+                return "H1"
+
+            # Parenthesised numbering: "(1)" before "1)". Bold content or
+            # short text marks a heading.
+            for key, role in (('h6_paren', "H5"), ('h5_paren', "H4")):
+                if patterns[key].match(text):
+                    if element.find('strong') or len(text) < 80:
+                        return role
+                    return "BODY"
+
+            # Korean-letter numbering ("가)" before "가."), then circled
+            # digits; these are headings regardless of length.
+            for key, role in (('h5_korean', "H5"), ('h4_korean', "H4"), ('h6_circle', "H6")):
+                if patterns[key].match(text):
+                    return role
+
+            # Emphasis boxes are recognised by a class-name fragment.
+            box_markers = ('highlight', 'box', 'note', 'tip')
+            if any(marker in class_str for marker in box_markers):
+                return "HIGHLIGHT_BOX"
+
+            return "BODY"
+
+        if tag == 'li':
+            return "LIST_ITEM"
+
+        # Definition lists: the term is a heading; <dd> and every other tag
+        # fall through to body text.
+        if tag == 'dt':
+            return "H5"
+
+        return "BODY"
+    
+    def _process_table(self, table: Tag):
+        """Record a <table> as one TABLE element that carries its structure.
+
+        A non-empty <caption> that has not been seen before is appended first
+        as a TABLE_CAPTION element. Each <tr> is then flattened into a list
+        of cell dicts (``text``, ``is_header``, ``colspan``, ``rowspan``,
+        ``bg_color``) and the whole grid is stored under
+        ``attributes['table_data']`` of a single TABLE element. A table
+        without any cell produces no TABLE element.
+
+        Malformed ``colspan``/``rowspan`` values (empty, non-numeric, zero or
+        negative) fall back to 1 instead of raising ``ValueError``.
+        """
+        def span_of(cell: Tag, attr: str) -> int:
+            # Real-world HTML carries values such as "" or "2px"; a bad
+            # attribute must not abort the whole analysis.
+            try:
+                span = int(str(cell.get(attr, 1)).strip())
+            except (TypeError, ValueError):
+                return 1
+            return span if span >= 1 else 1
+
+        # Caption
+        caption = table.find('caption')
+        caption_text = ""
+        if caption:
+            caption_text = caption.get_text(strip=True)
+            if caption_text and not self._is_duplicate(caption_text):
+                self.elements.append(StyledElement(
+                    role="TABLE_CAPTION",
+                    text=caption_text,
+                    tag="caption",
+                    html=str(caption)[:100],
+                    section=self.current_section.value,
+                    attributes={}
+                ))
+
+        # Collect the table structure row by row.
+        table_data = {'rows': [], 'caption': caption_text}
+
+        for tr in table.find_all('tr'):
+            row = [
+                {
+                    'text': cell.get_text(strip=True),
+                    'is_header': cell.name == 'th',
+                    'colspan': span_of(cell, 'colspan'),
+                    'rowspan': span_of(cell, 'rowspan'),
+                    'bg_color': self._extract_bg_color(cell),
+                }
+                for cell in tr.find_all(['th', 'td'])
+            ]
+            if row:
+                table_data['rows'].append(row)
+
+        # Emit one TABLE element instead of individual TH/TD elements.
+        if table_data['rows']:
+            self.elements.append(StyledElement(
+                role="TABLE",
+                text=f"[표: {len(table_data['rows'])}행]",
+                tag="table",
+                html=str(table)[:200],
+                section=self.current_section.value,
+                attributes={'table_data': table_data}
+            ))
+
+    def _extract_bg_color(self, element: Tag) -> str:
+        """Return the element's background colour as '#RRGGBB', or ''.
+
+        The inline ``style`` attribute (``background-color``) is consulted
+        first, then the legacy ``bgcolor`` attribute.
+
+        NOTE(review): this class defines ``_extract_bg_color`` a second time
+        further down. In Python the later definition replaces this one, so
+        this copy is effectively dead code; one of the two should be removed.
+        The ``hasattr`` guard below mirrors the later definition so that both
+        copies accept the same inputs.
+        """
+        # Plain strings and other objects without ``get`` have no attributes.
+        if not hasattr(element, 'get'):
+            return ''
+
+        style = element.get('style', '')
+
+        # Inline CSS wins over the presentational attribute.
+        match = re.search(r'background-color:\s*([^;]+)', style)
+        if match:
+            return self._normalize_color(match.group(1))
+
+        # Legacy bgcolor attribute.
+        bgcolor = element.get('bgcolor', '')
+        if bgcolor:
+            return self._normalize_color(bgcolor)
+
+        return ''
+    
+    def _process_figure(self, element: Tag):
+        """Append FIGURE / FIGURE_CAPTION elements for an image or <figure>.
+
+        ``element`` is either an <img> itself or a <figure> wrapping one. An
+        image without ``src`` is ignored. For a <figure>, a non-empty
+        <figcaption> that passes ``_should_skip_text`` is appended as well.
+        """
+        is_figure = element.name == 'figure'
+        img = element.find('img') if is_figure else element
+
+        if img and img.name == 'img':
+            src = img.get('src', '')
+            alt = img.get('alt', '')
+
+            # Only images that actually point somewhere are recorded.
+            if src:
+                self.elements.append(StyledElement(
+                    role="FIGURE",
+                    text=alt or "이미지",
+                    tag="img",
+                    html=str(img)[:100],
+                    section=self.current_section.value,
+                    attributes={"src": src, "alt": alt}
+                ))
+
+        # Captions exist only on <figure> wrappers.
+        if not is_figure:
+            return
+
+        figcaption = element.find('figcaption')
+        if not figcaption:
+            return
+
+        caption_text = figcaption.get_text(strip=True)
+        if caption_text and not self._should_skip_text(caption_text):
+            self.elements.append(StyledElement(
+                role="FIGURE_CAPTION",
+                text=caption_text,
+                tag="figcaption",
+                html=str(figcaption)[:100],
+                section=self.current_section.value,
+                attributes={}
+            ))
+    
+    def _postprocess(self):
+        """Drop unwanted entries from ``self.elements`` and log the counts.
+
+        An element is removed when its text is empty or whitespace only, when
+        ``_is_header_footer_text`` flags it, or when its stripped text
+        matches any pattern in ``self.REMOVE_PATTERNS``.
+        """
+        count_before = len(self.elements)
+        print(f"  🧹 후처리 중... (처리 전: {count_before}개)")
+
+        def should_keep(elem) -> bool:
+            # Blank text carries no content.
+            if not elem.text or not elem.text.strip():
+                return False
+            # Running header / footer text is page furniture.
+            if self._is_header_footer_text(elem.text):
+                return False
+            stripped = elem.text.strip()
+            return not any(pattern.match(stripped) for pattern in self.REMOVE_PATTERNS)
+
+        self.elements = [elem for elem in self.elements if should_keep(elem)]
+        print(f"     - 처리 후: {len(self.elements)}개")
+    
+    def get_role_summary(self) -> Dict[str, int]:
+        """Count the elements per role, returned with roles in sorted order."""
+        counts = {}
+        for item in self.elements:
+            role = item.role
+            counts[role] = counts[role] + 1 if role in counts else 1
+        return {role: counts[role] for role in sorted(counts)}
+
+
+    def extract_css_styles(self, html: str) -> Dict[str, Dict]:
+        """Collect a style dict per role from the document's CSS.
+
+        Every <style> tag in ``html`` is parsed in document order (previously
+        only the first one was read); when two tags define the same role the
+        later tag's entry wins. Roles that no stylesheet rule covers fall
+        back to the inline ``style`` attribute of the first element in
+        ``self.elements`` carrying that role.
+
+        Returns:
+            ``{role: {font_size, color, bold, ...}}``
+        """
+        soup = BeautifulSoup(html, 'html.parser')
+        role_styles = {}
+
+        # Stylesheet rules from all <style> tags.
+        for style_tag in soup.find_all('style'):
+            css_text = style_tag.string or ''
+            if css_text:
+                role_styles.update(self._parse_css_rules(css_text))
+
+        # Inline styles fill in the roles the stylesheets did not cover.
+        for elem in self.elements:
+            if elem.role not in role_styles:
+                role_styles[elem.role] = self._extract_inline_style(elem.html)
+
+        return role_styles
+
+    def _parse_css_rules(self, css_text: str) -> Dict[str, Dict]:
+        """Parse stylesheet text into ``{role: style_dict}``.
+
+        Only ``font-size``, ``color``, ``font-weight`` and ``text-align`` are
+        extracted. Selectors are mapped to roles by ``_selector_to_role``;
+        rules whose selector maps to no role are ignored.
+
+        Differences from a naive "last rule wins" parse:
+        - several rules for the same role are merged property by property
+          (later declarations override earlier ones) instead of the last
+          rule wiping out everything before it;
+        - ``/* ... */`` comments are removed first so they cannot end up
+          inside selectors or property names;
+        - rule bodies may not contain ``{``, so an at-rule header such as
+          ``@media print`` is never mistaken for a selector and the rules
+          nested inside it are read like top-level rules.
+        """
+        import re
+        rules = {}
+
+        css_text = re.sub(r'/\*.*?\*/', '', css_text, flags=re.DOTALL)
+
+        # "selector { declarations }" with no braces inside either part.
+        for match in re.finditer(r'([^{}]+)\{([^{}]+)\}', css_text):
+            selector = match.group(1).strip()
+            declarations = match.group(2)
+
+            style = {}
+            for declaration in declarations.split(';'):
+                if ':' not in declaration:
+                    continue
+                key, value = declaration.split(':', 1)
+                key = key.strip().lower()
+                value = value.strip()
+
+                if key == 'font-size':
+                    style['font_size'] = self._parse_font_size(value)
+                elif key == 'color':
+                    style['color'] = self._normalize_color(value)
+                elif key == 'font-weight':
+                    # CSS keywords are case-insensitive.
+                    style['bold'] = value.lower() in ['bold', '700', '800', '900']
+                elif key == 'text-align':
+                    style['align'] = value
+
+            # Selector -> role; merge into what earlier rules already set.
+            role = self._selector_to_role(selector)
+            if role:
+                rules.setdefault(role, {}).update(style)
+
+        return rules
+
+    def _selector_to_role(self, selector: str) -> Optional[str]:
+        """Map a CSS selector to a role name, or ``None`` if nothing applies.
+
+        The selector is split into tag names and ``.class`` names, and those
+        are compared for equality with the known keys. Plain substring
+        matching is deliberately avoided: it made ``.page-header`` and
+        ``span`` look like ``p`` and ``.depth`` look like ``th``. Ids
+        (``#x``) and pseudo-classes (``:hover``) are not considered.
+
+        When several keys occur in one selector, the first key in the
+        mapping's order wins (e.g. ``table td p`` -> ``TD``).
+        """
+        import re
+
+        mapping = {
+            'h1': 'H1', 'h2': 'H2', 'h3': 'H3', 'h4': 'H4',
+            '.cover-title': 'COVER_TITLE',
+            '.section-title': 'H1',
+            'th': 'TH', 'td': 'TD',
+            'p': 'BODY',
+        }
+
+        # ".class" tokens, or bare identifiers that are not the tail of an
+        # id, class, pseudo-class or longer identifier.
+        names = set(re.findall(
+            r'\.[a-z_][\w-]*|(?<![#.:\w-])[a-z_][\w-]*',
+            selector.lower().strip(),
+        ))
+
+        for key, role in mapping.items():
+            if key in names:
+                return role
+        return None
+
+    def _parse_font_size(self, value: str) -> float:
+        """Convert a CSS font-size value to points.
+
+        Supported units: ``pt`` (default when no unit is given), ``px``
+        (x 0.75), ``em``/``rem`` and ``%`` (both relative to an 11pt base).
+        Units are matched case-insensitively. Anything without a usable
+        number (keywords such as ``large``, empty text, a lone ``.``)
+        returns the 11pt default instead of raising.
+        """
+        import re
+
+        base_pt = 11.0
+
+        # A well-formed decimal only, so float() cannot fail on "." / "1.2.3".
+        match = re.search(r'(\d+(?:\.\d+)?|\.\d+)(pt|px|em|rem|%)?', value.strip().lower())
+        if not match:
+            return base_pt
+
+        size = float(match.group(1))
+        unit = match.group(2) or 'pt'
+        if unit == 'px':
+            size = size * 0.75  # px -> pt
+        elif unit in ('em', 'rem'):
+            size = size * base_pt
+        elif unit == '%':
+            size = size / 100 * base_pt
+        return size
+
+    def _normalize_color(self, value: str) -> str:
+        """Normalize a CSS colour to upper-case ``#RRGGBB``.
+
+        Accepts ``#rrggbb``, ``#rgb``, ``rgb(r, g, b)``, ``rgba(r, g, b, a)``
+        (alpha is dropped) and a small set of colour names. Channel values
+        above 255 are clamped so the result always has six hex digits. A
+        trailing ``!important`` is ignored. Anything unrecognised falls back
+        to ``#000000``.
+        """
+        import re
+        value = value.strip().lower()
+        value = re.sub(r'\s*!important$', '', value)
+
+        # Already #rrggbb
+        if re.match(r'^#[0-9a-f]{6}$', value):
+            return value.upper()
+
+        # #rgb -> #rrggbb
+        if re.match(r'^#[0-9a-f]{3}$', value):
+            return ('#' + ''.join(digit * 2 for digit in value[1:])).upper()
+
+        # rgb(r, g, b) / rgba(r, g, b, a)
+        match = re.search(r'rgba?\s*\(\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)', value)
+        if match:
+            r, g, b = (min(int(channel), 255) for channel in match.groups())
+            return f'#{r:02X}{g:02X}{b:02X}'
+
+        # Colour names ('navy' maps to the document theme's navy).
+        color_names = {
+            'black': '#000000', 'white': '#FFFFFF',
+            'red': '#FF0000', 'green': '#008000', 'blue': '#0000FF',
+            'navy': '#1A365D',
+        }
+        return color_names.get(value, '#000000')
+
+    def _extract_inline_style(self, html: str) -> Dict:
+        """Extract supported properties from the first ``style="..."`` in ``html``.
+
+        Recognised properties: ``font-size``, ``color``, ``font-weight``,
+        ``text-align`` and ``background-color``. Returns an empty dict when
+        there is no style attribute.
+
+        The attribute value is read up to the quote character that opened it,
+        so a value such as ``font-family: 'Malgun Gothic'; color: red`` inside
+        double quotes is no longer cut off at the inner quote. Attributes that
+        merely end in "style" (e.g. ``data-style``) are not matched.
+        """
+        style = {}
+
+        # Locate the style attribute; \1 closes on the opening quote.
+        match = re.search(
+            r'(?<![\w-])style\s*=\s*(["\'])(.*?)\1',
+            html,
+            re.IGNORECASE | re.DOTALL,
+        )
+        if not match:
+            return style
+
+        for declaration in match.group(2).split(';'):
+            if ':' not in declaration:
+                continue
+            key, value = declaration.split(':', 1)
+            key = key.strip().lower()
+            value = value.strip()
+
+            if key == 'font-size':
+                style['font_size'] = self._parse_font_size(value)
+            elif key == 'color':
+                style['color'] = self._normalize_color(value)
+            elif key == 'font-weight':
+                style['bold'] = value in ['bold', '700', '800', '900']
+            elif key == 'text-align':
+                style['align'] = value
+            elif key == 'background-color':
+                style['bg_color'] = self._normalize_color(value)
+
+        return style
+
+    def _extract_bg_color(self, element) -> str:
+        """Return the element's background colour as '#RRGGBB', or ''.
+
+        The inline ``style`` attribute (``background-color``, matched
+        case-insensitively, optional space before the colon) is consulted
+        first, then the legacy ``bgcolor`` attribute. Objects without a
+        ``get`` method yield ''.
+
+        Values that name no actual colour (``transparent``, ``inherit``,
+        ``initial``, ``unset``, ``none``) are skipped. Passing them to
+        ``_normalize_color`` would hit its ``#000000`` fallback and paint the
+        cell black.
+        """
+        if not hasattr(element, 'get'):
+            return ''
+
+        non_colors = {'transparent', 'inherit', 'initial', 'unset', 'none'}
+        candidates = []
+
+        # Inline CSS first ...
+        style = element.get('style', '') or ''
+        match = re.search(r'background-color\s*:\s*([^;]+)', style, re.IGNORECASE)
+        if match:
+            candidates.append(match.group(1))
+
+        # ... then the presentational bgcolor attribute.
+        bgcolor = element.get('bgcolor', '')
+        if bgcolor:
+            candidates.append(bgcolor)
+
+        for raw in candidates:
+            value = re.sub(r'\s*!important\s*$', '', raw.strip(), flags=re.IGNORECASE)
+            if value and value.lower() not in non_colors:
+                return self._normalize_color(value)
+
+        return ''
+
+
+    def export_for_hwp(self) -> List[Dict]:
+        """Return the analyzed elements as plain dicts for the HWP converter.
+
+        Each dict carries ``role``, ``text``, ``tag``, ``section`` and
+        ``attributes``; the raw ``html`` snippet is left out.
+        """
+        exported = []
+        for item in self.elements:
+            exported.append({
+                "role": item.role,
+                "text": item.text,
+                "tag": item.tag,
+                "section": item.section,
+                "attributes": item.attributes,
+            })
+        return exported
+
+
+if __name__ == "__main__":
+    # Smoke test: analyze a small two-sheet sample document and print the
+    # classified elements followed by a per-role count.
+    sample_html = """
+    <html>
+    <head>
+        <script>var x = 1;</script>
+        <style>.test { color: red; }</style>
+    </head>
+    <body>
+        <div class="sheet">
+            <div class="page-header">건설·토목 측량 DX 실무지침</div>
+            <div class="body-content">
+                <h1>1 DX 개요와 기본 개념</h1>
+                <h2>1.1 측량 DX 프레임</h2>
+                <h3>1.1.1 측량 DX 발전 단계</h3>
+                <p>1) <strong>Digitization 정의</strong></p>
+                <p>본문 내용입니다. 이것은 충분히 긴 텍스트로 본문으로 인식되어야 합니다.</p>
+                <p>(1) 단계별 정의 및 진화</p>
+                <p>측량 기술의 발전은 장비의 변화와 성과물의 차원에 따라 구분된다.</p>
+            </div>
+            <div class="page-footer">건설·토목 측량 DX 실무지침- 1 -</div>
+        </div>
+        
+        <div class="sheet">
+            <div class="page-header">건설·토목 측량 DX 실무지침</div>
+            <div class="body-content">
+                <p>① 첫 번째 항목</p>
+                <table>
+                    <caption>표 1. 데이터 비교</caption>
+                    <tr><th>구분</th><th>내용</th></tr>
+                    <tr><td>항목1</td><td>설명1</td></tr>
+                </table>
+            </div>
+            <div class="page-footer">건설·토목 측량 DX 실무지침- 2 -</div>
+        </div>
+    </body>
+    </html>
+    """
+
+    divider = "=" * 60
+    style_analyzer = StyleAnalyzer()
+    parsed = style_analyzer.analyze(sample_html)
+
+    # One line per element: role, section, first 50 characters of text.
+    print("\n" + divider)
+    print("분석 결과")
+    print(divider)
+    for item in parsed:
+        print(f"  {item.role:18} | {item.section:7} | {item.text[:50]}")
+
+    # Per-role totals.
+    print("\n" + divider)
+    print("역할 요약")
+    print(divider)
+    for role, count in style_analyzer.get_role_summary().items():
+        print(f"  {role}: {count}")
				`@@ -0,0 +1 @@`
				`from .router import process_document, is_long_document`