diff --git a/03.Code/업로드용/converters/pipeline/step9_html.py b/03.Code/업로드용/converters/pipeline/step9_html.py deleted file mode 100644 index e6b213f..0000000 --- a/03.Code/업로드용/converters/pipeline/step9_html.py +++ /dev/null @@ -1,1250 +0,0 @@ -# -*- coding: utf-8 -*- -""" -9_md_to_html_publisher.py - -기능: -- report_draft.md + report_sections.json → report.html 변환 -- A4 규격 페이지네이션 템플릿 적용 -- 마크다운 테이블 → HTML 테이블 변환 -- 이미지 플레이스홀더 {{IMG:xxx}} →
변환 -- 목차(TOC) 자동 생성 - -사용법: - python 9_md_to_html_publisher.py - python 9_md_to_html_publisher.py --md report_draft.md --json report_sections.json --output report.html - python 9_md_to_html_publisher.py --no-toc --no-summary -""" - -import os -import re -import json -import argparse -from pathlib import Path -from datetime import datetime -from typing import List, Dict, Any, Tuple, Optional -from dataclasses import dataclass, field - -# ===== 경로 설정 ===== -OUTPUT_ROOT = Path(r"D:\for python\geulbeot-light\geulbeot-light\00.test\hwpx\out\out") # 출력 위치 -GEN_DIR = OUTPUT_ROOT / "generated" -ASSETS_DIR = GEN_DIR / "assets" -LOG_DIR = OUTPUT_ROOT / "logs" - -# 기본 입출력 파일 -DEFAULT_MD_PATH = GEN_DIR / "report_draft.md" -DEFAULT_JSON_PATH = GEN_DIR / "report_sections.json" -DEFAULT_OUTPUT_PATH = GEN_DIR / "report.html" - -for d in [GEN_DIR, ASSETS_DIR, LOG_DIR]: - d.mkdir(parents=True, exist_ok=True) - - -def log(msg: str): - """로깅 함수""" - line = f"[{datetime.now().strftime('%H:%M:%S')}] {msg}" - print(line, flush=True) - with (LOG_DIR / "step9_html_publish_log.txt").open("a", encoding="utf-8") as f: - f.write(line + "\n") - - -# ===== 데이터 클래스 ===== -@dataclass -class ImageAsset: - """이미지 자산 정보""" - image_id: str - filename: str - caption: str - placeholder: str - source_path: str = "" - page: Optional[int] = None - asset_path: Optional[str] = None - - -@dataclass -class Section: - """섹션 정보""" - section_id: str - section_title: str - generated_text: str - assets: List[ImageAsset] = field(default_factory=list) - - -@dataclass -class TocItem: - """목차 항목""" - number: str - title: str - level: int # 1, 2, 3 - - -# ===== 파일 로더 ===== -def load_json_meta(json_path: Path) -> Tuple[str, List[Section]]: - """JSON 파일에서 메타정보와 섹션 로드""" - if not json_path.exists(): - raise FileNotFoundError(f"JSON 파일 없음: {json_path}") - - data = json.loads(json_path.read_text(encoding="utf-8")) - report_title = data.get("report_title", "보고서") - - sections = [] - for sec in data.get("sections", []): - assets = [] - for asset in sec.get("assets", []): - assets.append(ImageAsset( - image_id=asset.get("image_id", ""), - filename=asset.get("filename", ""), - caption=asset.get("caption", ""), - placeholder=asset.get("placeholder", ""), - source_path=asset.get("source_path", ""), - page=asset.get("page"), - asset_path=asset.get("asset_path") - )) - - sections.append(Section( - section_id=sec.get("section_id", ""), - section_title=sec.get("section_title", ""), - generated_text=sec.get("generated_text", ""), - assets=assets - )) - - return report_title, sections - - -def load_markdown(md_path: Path) -> str: - """마크다운 파일 로드""" - if not md_path.exists(): - raise FileNotFoundError(f"MD 파일 없음: {md_path}") - return md_path.read_text(encoding="utf-8") - - -# ===== 이미지 맵 생성 ===== -def build_image_map(sections: List[Section]) -> Dict[str, ImageAsset]: - """placeholder → ImageAsset 매핑 생성""" - img_map = {} - for sec in sections: - for asset in sec.assets: - if asset.placeholder: - # {{IMG:xxx}} 형태에서 xxx 추출 - img_map[asset.image_id] = asset - return img_map - - -# ===== 목차 생성 ===== -def extract_toc_from_md(md_content: str) -> List[TocItem]: - """마크다운에서 목차 구조 추출""" - toc_items = [] - - # 헤딩 패턴 - patterns = [ - (re.compile(r'^##\s+(\d+)\s+(.+)$', re.MULTILINE), 1), # ## 1 대목차 - (re.compile(r'^###\s+(\d+\.\d+)\s+(.+)$', re.MULTILINE), 2), # ### 1.1 중목차 - (re.compile(r'^####\s+(\d+\.\d+\.\d+)\s+(.+)$', re.MULTILINE), 3), # #### 1.1.1 소목차 - ] - - for pattern, level in patterns: - for match in pattern.finditer(md_content): - number = match.group(1) - title = match.group(2).strip() - toc_items.append(TocItem(number=number, title=title, level=level)) - - # 번호순 정렬 - def sort_key(item: TocItem) -> tuple: - parts = item.number.split('.') - return tuple(int(p) for p in parts) - - toc_items.sort(key=sort_key) - return toc_items - - -def generate_toc_html(toc_items: List[TocItem]) -> str: - """목차 HTML 생성""" - if not toc_items: - return "" - - lines = ['') - return '\n'.join(lines) - - -# ===== 마크다운 → HTML 변환 ===== -class MarkdownToHtmlConverter: - """마크다운을 HTML로 변환하는 클래스 Teng-style""" - - def __init__(self, image_map: Dict[str, ImageAsset]): - self.image_map = image_map - self.table_counter = {} # chapter -> count - self.figure_counter = {} # chapter -> count - - def get_chapter(self, context: str = "1") -> str: - """현재 챕터 번호 추출""" - return context.split('.')[0] if context else "1" - - def next_table_num(self, chapter: str) -> str: - """다음 표 번호""" - if chapter not in self.table_counter: - self.table_counter[chapter] = 0 - self.table_counter[chapter] += 1 - return f"{chapter}-{self.table_counter[chapter]}" - - def next_figure_num(self, chapter: str) -> str: - """다음 그림 번호""" - if chapter not in self.figure_counter: - self.figure_counter[chapter] = 0 - self.figure_counter[chapter] += 1 - return f"{chapter}-{self.figure_counter[chapter]}" - - def convert_table(self, md_table: str, caption: str = "", chapter: str = "1") -> str: - """마크다운 테이블 → HTML 테이블""" - lines = [l.strip() for l in md_table.strip().split('\n') if l.strip()] - if len(lines) < 2: - return "" - - # 헤더 행 - header_cells = [c.strip() for c in lines[0].split('|') if c.strip()] - - # 구분선 건너뛰기 (|---|---|) - data_start = 1 - if len(lines) > 1 and re.match(r'^[\|\s\-:]+$', lines[1]): - data_start = 2 - - # 데이터 행 - data_rows = [] - for line in lines[data_start:]: - cells = [c.strip() for c in line.split('|') if c.strip()] - if cells: - data_rows.append(cells) - - # HTML 생성 - html_lines = [''] - - # thead - html_lines.append('') - for cell in header_cells: - # **text** → text - cell = re.sub(r'\*\*(.+?)\*\*', r'\1', cell) - html_lines.append(f'') - html_lines.append('') - - # tbody - html_lines.append('') - for row in data_rows: - html_lines.append('') - for cell in row: - # **text** 처리 - cell = re.sub(r'\*\*(.+?)\*\*', r'\1', cell) - #
처리 - cell = cell.replace('
', '
') - html_lines.append(f'') - html_lines.append('') - html_lines.append('') - html_lines.append('
{cell}
{cell}
') - - # 캡션 추가 - if caption: - html_lines.append(f'
{caption}
') - - return '\n'.join(html_lines) - - def convert_image_placeholder(self, placeholder: str, chapter: str = "1") -> str: - """{{IMG:xxx}} →
변환""" - # {{IMG:1_1_1_img01}} 에서 ID 추출 - match = re.match(r'\{\{IMG:(.+?)\}\}', placeholder) - if not match: - return placeholder - - image_id = match.group(1) - asset = self.image_map.get(image_id) - - if asset and asset.asset_path: - fig_num = self.next_figure_num(chapter) - caption = asset.caption if asset.caption and asset.caption != "Photo" else "" - caption_text = f"[그림 {fig_num}] {caption}" if caption else f"[그림 {fig_num}]" - - return f'''
- {caption} -
{caption_text}
-
''' - else: - # 이미지 파일이 없는 경우 플레이스홀더 주석으로 - return f'' - - def convert_list(self, md_list: str) -> str: - """마크다운 리스트 → HTML 리스트""" - lines = md_list.strip().split('\n') - html_lines = [] - in_list = False - list_type = 'ul' - - for line in lines: - line = line.strip() - if not line: - continue - - # 순서 없는 리스트 - ul_match = re.match(r'^[\*\-]\s+(.+)$', line) - # 순서 있는 리스트 - ol_match = re.match(r'^(\d+)\.\s+(.+)$', line) - - if ul_match: - if not in_list: - html_lines.append('