# -*- coding: utf-8 -*- """ 9_md_to_html_publisher.py 기능: - report_draft.md + report_sections.json → report.html 변환 - A4 규격 페이지네이션 템플릿 적용 - 마크다운 테이블 → HTML 테이블 변환 - 이미지 플레이스홀더 {{IMG:xxx}} →
변환 - 목차(TOC) 자동 생성 사용법: python 9_md_to_html_publisher.py python 9_md_to_html_publisher.py --md report_draft.md --json report_sections.json --output report.html python 9_md_to_html_publisher.py --no-toc --no-summary """ import os import re import json import argparse from pathlib import Path from datetime import datetime from typing import List, Dict, Any, Tuple, Optional from dataclasses import dataclass, field # ===== 경로 설정 ===== OUTPUT_ROOT = Path(r"D:\for python\geulbeot-light\geulbeot-light\00.test\hwpx\out\out") # 출력 위치 GEN_DIR = OUTPUT_ROOT / "generated" ASSETS_DIR = GEN_DIR / "assets" LOG_DIR = OUTPUT_ROOT / "logs" # 기본 입출력 파일 DEFAULT_MD_PATH = GEN_DIR / "report_draft.md" DEFAULT_JSON_PATH = GEN_DIR / "report_sections.json" DEFAULT_OUTPUT_PATH = GEN_DIR / "report.html" for d in [GEN_DIR, ASSETS_DIR, LOG_DIR]: d.mkdir(parents=True, exist_ok=True) def log(msg: str): """로깅 함수""" line = f"[{datetime.now().strftime('%H:%M:%S')}] {msg}" print(line, flush=True) with (LOG_DIR / "step9_html_publish_log.txt").open("a", encoding="utf-8") as f: f.write(line + "\n") # ===== 데이터 클래스 ===== @dataclass class ImageAsset: """이미지 자산 정보""" image_id: str filename: str caption: str placeholder: str source_path: str = "" page: Optional[int] = None asset_path: Optional[str] = None @dataclass class Section: """섹션 정보""" section_id: str section_title: str generated_text: str assets: List[ImageAsset] = field(default_factory=list) @dataclass class TocItem: """목차 항목""" number: str title: str level: int # 1, 2, 3 # ===== 파일 로더 ===== def load_json_meta(json_path: Path) -> Tuple[str, List[Section]]: """JSON 파일에서 메타정보와 섹션 로드""" if not json_path.exists(): raise FileNotFoundError(f"JSON 파일 없음: {json_path}") data = json.loads(json_path.read_text(encoding="utf-8")) report_title = data.get("report_title", "보고서") sections = [] for sec in data.get("sections", []): assets = [] for asset in sec.get("assets", []): assets.append(ImageAsset( image_id=asset.get("image_id", ""), filename=asset.get("filename", ""), caption=asset.get("caption", ""), placeholder=asset.get("placeholder", ""), source_path=asset.get("source_path", ""), page=asset.get("page"), asset_path=asset.get("asset_path") )) sections.append(Section( section_id=sec.get("section_id", ""), section_title=sec.get("section_title", ""), generated_text=sec.get("generated_text", ""), assets=assets )) return report_title, sections def load_markdown(md_path: Path) -> str: """마크다운 파일 로드""" if not md_path.exists(): raise FileNotFoundError(f"MD 파일 없음: {md_path}") return md_path.read_text(encoding="utf-8") # ===== 이미지 맵 생성 ===== def build_image_map(sections: List[Section]) -> Dict[str, ImageAsset]: """placeholder → ImageAsset 매핑 생성""" img_map = {} for sec in sections: for asset in sec.assets: if asset.placeholder: # {{IMG:xxx}} 형태에서 xxx 추출 img_map[asset.image_id] = asset return img_map # ===== 목차 생성 ===== def extract_toc_from_md(md_content: str) -> List[TocItem]: """마크다운에서 목차 구조 추출""" toc_items = [] # 헤딩 패턴 patterns = [ (re.compile(r'^##\s+(\d+)\s+(.+)$', re.MULTILINE), 1), # ## 1 대목차 (re.compile(r'^###\s+(\d+\.\d+)\s+(.+)$', re.MULTILINE), 2), # ### 1.1 중목차 (re.compile(r'^####\s+(\d+\.\d+\.\d+)\s+(.+)$', re.MULTILINE), 3), # #### 1.1.1 소목차 ] for pattern, level in patterns: for match in pattern.finditer(md_content): number = match.group(1) title = match.group(2).strip() toc_items.append(TocItem(number=number, title=title, level=level)) # 번호순 정렬 def sort_key(item: TocItem) -> tuple: parts = item.number.split('.') return tuple(int(p) for p in parts) toc_items.sort(key=sort_key) return toc_items def generate_toc_html(toc_items: List[TocItem]) -> str: """목차 HTML 생성""" if not toc_items: return "" lines = ['') return '\n'.join(lines) # ===== 마크다운 → HTML 변환 ===== class MarkdownToHtmlConverter: """마크다운을 HTML로 변환하는 클래스""" def __init__(self, image_map: Dict[str, ImageAsset]): self.image_map = image_map self.table_counter = {} # chapter -> count self.figure_counter = {} # chapter -> count def get_chapter(self, context: str = "1") -> str: """현재 챕터 번호 추출""" return context.split('.')[0] if context else "1" def next_table_num(self, chapter: str) -> str: """다음 표 번호""" if chapter not in self.table_counter: self.table_counter[chapter] = 0 self.table_counter[chapter] += 1 return f"{chapter}-{self.table_counter[chapter]}" def next_figure_num(self, chapter: str) -> str: """다음 그림 번호""" if chapter not in self.figure_counter: self.figure_counter[chapter] = 0 self.figure_counter[chapter] += 1 return f"{chapter}-{self.figure_counter[chapter]}" def convert_table(self, md_table: str, caption: str = "", chapter: str = "1") -> str: """마크다운 테이블 → HTML 테이블""" lines = [l.strip() for l in md_table.strip().split('\n') if l.strip()] if len(lines) < 2: return "" # 헤더 행 header_cells = [c.strip() for c in lines[0].split('|') if c.strip()] # 구분선 건너뛰기 (|---|---|) data_start = 1 if len(lines) > 1 and re.match(r'^[\|\s\-:]+$', lines[1]): data_start = 2 # 데이터 행 data_rows = [] for line in lines[data_start:]: cells = [c.strip() for c in line.split('|') if c.strip()] if cells: data_rows.append(cells) # HTML 생성 html_lines = [''] # thead html_lines.append('') for cell in header_cells: # **text** → text cell = re.sub(r'\*\*(.+?)\*\*', r'\1', cell) html_lines.append(f'') html_lines.append('') # tbody html_lines.append('') for row in data_rows: html_lines.append('') for cell in row: # **text** 처리 cell = re.sub(r'\*\*(.+?)\*\*', r'\1', cell) #
처리 cell = cell.replace('
', '
') html_lines.append(f'') html_lines.append('') html_lines.append('') html_lines.append('
{cell}
{cell}
') # 캡션 추가 if caption: html_lines.append(f'
{caption}
') return '\n'.join(html_lines) def convert_image_placeholder(self, placeholder: str, chapter: str = "1") -> str: """{{IMG:xxx}} →
변환""" # {{IMG:1_1_1_img01}} 에서 ID 추출 match = re.match(r'\{\{IMG:(.+?)\}\}', placeholder) if not match: return placeholder image_id = match.group(1) asset = self.image_map.get(image_id) if asset and asset.asset_path: fig_num = self.next_figure_num(chapter) caption = asset.caption if asset.caption and asset.caption != "Photo" else "" caption_text = f"[그림 {fig_num}] {caption}" if caption else f"[그림 {fig_num}]" return f'''
{caption}
{caption_text}
''' else: # 이미지 파일이 없는 경우 플레이스홀더 주석으로 return f'' def convert_list(self, md_list: str) -> str: """마크다운 리스트 → HTML 리스트""" lines = md_list.strip().split('\n') html_lines = [] in_list = False list_type = 'ul' for line in lines: line = line.strip() if not line: continue # 순서 없는 리스트 ul_match = re.match(r'^[\*\-]\s+(.+)$', line) # 순서 있는 리스트 ol_match = re.match(r'^(\d+)\.\s+(.+)$', line) if ul_match: if not in_list: html_lines.append('