From dae70e9f856c23df902620c19562413f5d336ccb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EC=9D=B4=EA=B2=BD=EB=AF=BC?= Date: Thu, 19 Mar 2026 14:01:42 +0900 Subject: [PATCH] =?UTF-8?q?Cleanup:=20Deleting=2003.Code/=EC=97=85?= =?UTF-8?q?=EB=A1=9C=EB=93=9C=EC=9A=A9/converters/pipeline/step9=5Fhtml.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../converters/pipeline/step9_html.py | 1250 ----------------- 1 file changed, 1250 deletions(-) delete mode 100644 03.Code/업로드용/converters/pipeline/step9_html.py diff --git a/03.Code/업로드용/converters/pipeline/step9_html.py b/03.Code/업로드용/converters/pipeline/step9_html.py deleted file mode 100644 index e6b213f..0000000 --- a/03.Code/업로드용/converters/pipeline/step9_html.py +++ /dev/null @@ -1,1250 +0,0 @@ -# -*- coding: utf-8 -*- -""" -9_md_to_html_publisher.py - -기능: -- report_draft.md + report_sections.json → report.html 변환 -- A4 규격 페이지네이션 템플릿 적용 -- 마크다운 테이블 → HTML 테이블 변환 -- 이미지 플레이스홀더 {{IMG:xxx}} →
변환 -- 목차(TOC) 자동 생성 - -사용법: - python 9_md_to_html_publisher.py - python 9_md_to_html_publisher.py --md report_draft.md --json report_sections.json --output report.html - python 9_md_to_html_publisher.py --no-toc --no-summary -""" - -import os -import re -import json -import argparse -from pathlib import Path -from datetime import datetime -from typing import List, Dict, Any, Tuple, Optional -from dataclasses import dataclass, field - -# ===== 경로 설정 ===== -OUTPUT_ROOT = Path(r"D:\for python\geulbeot-light\geulbeot-light\00.test\hwpx\out\out") # 출력 위치 -GEN_DIR = OUTPUT_ROOT / "generated" -ASSETS_DIR = GEN_DIR / "assets" -LOG_DIR = OUTPUT_ROOT / "logs" - -# 기본 입출력 파일 -DEFAULT_MD_PATH = GEN_DIR / "report_draft.md" -DEFAULT_JSON_PATH = GEN_DIR / "report_sections.json" -DEFAULT_OUTPUT_PATH = GEN_DIR / "report.html" - -for d in [GEN_DIR, ASSETS_DIR, LOG_DIR]: - d.mkdir(parents=True, exist_ok=True) - - -def log(msg: str): - """로깅 함수""" - line = f"[{datetime.now().strftime('%H:%M:%S')}] {msg}" - print(line, flush=True) - with (LOG_DIR / "step9_html_publish_log.txt").open("a", encoding="utf-8") as f: - f.write(line + "\n") - - -# ===== 데이터 클래스 ===== -@dataclass -class ImageAsset: - """이미지 자산 정보""" - image_id: str - filename: str - caption: str - placeholder: str - source_path: str = "" - page: Optional[int] = None - asset_path: Optional[str] = None - - -@dataclass -class Section: - """섹션 정보""" - section_id: str - section_title: str - generated_text: str - assets: List[ImageAsset] = field(default_factory=list) - - -@dataclass -class TocItem: - """목차 항목""" - number: str - title: str - level: int # 1, 2, 3 - - -# ===== 파일 로더 ===== -def load_json_meta(json_path: Path) -> Tuple[str, List[Section]]: - """JSON 파일에서 메타정보와 섹션 로드""" - if not json_path.exists(): - raise FileNotFoundError(f"JSON 파일 없음: {json_path}") - - data = json.loads(json_path.read_text(encoding="utf-8")) - report_title = data.get("report_title", "보고서") - - sections = [] - for sec in data.get("sections", []): - assets = [] - for asset in sec.get("assets", []): - assets.append(ImageAsset( - image_id=asset.get("image_id", ""), - filename=asset.get("filename", ""), - caption=asset.get("caption", ""), - placeholder=asset.get("placeholder", ""), - source_path=asset.get("source_path", ""), - page=asset.get("page"), - asset_path=asset.get("asset_path") - )) - - sections.append(Section( - section_id=sec.get("section_id", ""), - section_title=sec.get("section_title", ""), - generated_text=sec.get("generated_text", ""), - assets=assets - )) - - return report_title, sections - - -def load_markdown(md_path: Path) -> str: - """마크다운 파일 로드""" - if not md_path.exists(): - raise FileNotFoundError(f"MD 파일 없음: {md_path}") - return md_path.read_text(encoding="utf-8") - - -# ===== 이미지 맵 생성 ===== -def build_image_map(sections: List[Section]) -> Dict[str, ImageAsset]: - """placeholder → ImageAsset 매핑 생성""" - img_map = {} - for sec in sections: - for asset in sec.assets: - if asset.placeholder: - # {{IMG:xxx}} 형태에서 xxx 추출 - img_map[asset.image_id] = asset - return img_map - - -# ===== 목차 생성 ===== -def extract_toc_from_md(md_content: str) -> List[TocItem]: - """마크다운에서 목차 구조 추출""" - toc_items = [] - - # 헤딩 패턴 - patterns = [ - (re.compile(r'^##\s+(\d+)\s+(.+)$', re.MULTILINE), 1), # ## 1 대목차 - (re.compile(r'^###\s+(\d+\.\d+)\s+(.+)$', re.MULTILINE), 2), # ### 1.1 중목차 - (re.compile(r'^####\s+(\d+\.\d+\.\d+)\s+(.+)$', re.MULTILINE), 3), # #### 1.1.1 소목차 - ] - - for pattern, level in patterns: - for match in pattern.finditer(md_content): - number = match.group(1) - title = match.group(2).strip() - toc_items.append(TocItem(number=number, title=title, level=level)) - - # 번호순 정렬 - def sort_key(item: TocItem) -> tuple: - parts = item.number.split('.') - return tuple(int(p) for p in parts) - - toc_items.sort(key=sort_key) - return toc_items - - -def generate_toc_html(toc_items: List[TocItem]) -> str: - """목차 HTML 생성""" - if not toc_items: - return "" - - lines = ['') - return '\n'.join(lines) - - -# ===== 마크다운 → HTML 변환 ===== -class MarkdownToHtmlConverter: - """마크다운을 HTML로 변환하는 클래스 Teng-style""" - - def __init__(self, image_map: Dict[str, ImageAsset]): - self.image_map = image_map - self.table_counter = {} # chapter -> count - self.figure_counter = {} # chapter -> count - - def get_chapter(self, context: str = "1") -> str: - """현재 챕터 번호 추출""" - return context.split('.')[0] if context else "1" - - def next_table_num(self, chapter: str) -> str: - """다음 표 번호""" - if chapter not in self.table_counter: - self.table_counter[chapter] = 0 - self.table_counter[chapter] += 1 - return f"{chapter}-{self.table_counter[chapter]}" - - def next_figure_num(self, chapter: str) -> str: - """다음 그림 번호""" - if chapter not in self.figure_counter: - self.figure_counter[chapter] = 0 - self.figure_counter[chapter] += 1 - return f"{chapter}-{self.figure_counter[chapter]}" - - def convert_table(self, md_table: str, caption: str = "", chapter: str = "1") -> str: - """마크다운 테이블 → HTML 테이블""" - lines = [l.strip() for l in md_table.strip().split('\n') if l.strip()] - if len(lines) < 2: - return "" - - # 헤더 행 - header_cells = [c.strip() for c in lines[0].split('|') if c.strip()] - - # 구분선 건너뛰기 (|---|---|) - data_start = 1 - if len(lines) > 1 and re.match(r'^[\|\s\-:]+$', lines[1]): - data_start = 2 - - # 데이터 행 - data_rows = [] - for line in lines[data_start:]: - cells = [c.strip() for c in line.split('|') if c.strip()] - if cells: - data_rows.append(cells) - - # HTML 생성 - html_lines = [''] - - # thead - html_lines.append('') - for cell in header_cells: - # **text** → text - cell = re.sub(r'\*\*(.+?)\*\*', r'\1', cell) - html_lines.append(f'') - html_lines.append('') - - # tbody - html_lines.append('') - for row in data_rows: - html_lines.append('') - for cell in row: - # **text** 처리 - cell = re.sub(r'\*\*(.+?)\*\*', r'\1', cell) - #
처리 - cell = cell.replace('
', '
') - html_lines.append(f'') - html_lines.append('') - html_lines.append('') - html_lines.append('
{cell}
{cell}
') - - # 캡션 추가 - if caption: - html_lines.append(f'
{caption}
') - - return '\n'.join(html_lines) - - def convert_image_placeholder(self, placeholder: str, chapter: str = "1") -> str: - """{{IMG:xxx}} →
변환""" - # {{IMG:1_1_1_img01}} 에서 ID 추출 - match = re.match(r'\{\{IMG:(.+?)\}\}', placeholder) - if not match: - return placeholder - - image_id = match.group(1) - asset = self.image_map.get(image_id) - - if asset and asset.asset_path: - fig_num = self.next_figure_num(chapter) - caption = asset.caption if asset.caption and asset.caption != "Photo" else "" - caption_text = f"[그림 {fig_num}] {caption}" if caption else f"[그림 {fig_num}]" - - return f'''
- {caption} -
{caption_text}
-
''' - else: - # 이미지 파일이 없는 경우 플레이스홀더 주석으로 - return f'' - - def convert_list(self, md_list: str) -> str: - """마크다운 리스트 → HTML 리스트""" - lines = md_list.strip().split('\n') - html_lines = [] - in_list = False - list_type = 'ul' - - for line in lines: - line = line.strip() - if not line: - continue - - # 순서 없는 리스트 - ul_match = re.match(r'^[\*\-]\s+(.+)$', line) - # 순서 있는 리스트 - ol_match = re.match(r'^(\d+)\.\s+(.+)$', line) - - if ul_match: - if not in_list: - html_lines.append('
    ') - in_list = True - list_type = 'ul' - content = ul_match.group(1) - content = re.sub(r'\*\*(.+?)\*\*', r'\1', content) - html_lines.append(f'
  • {content}
  • ') - elif ol_match: - if not in_list: - html_lines.append('
      ') - in_list = True - list_type = 'ol' - content = ol_match.group(2) - content = re.sub(r'\*\*(.+?)\*\*', r'\1', content) - html_lines.append(f'
    1. {content}
    2. ') - - if in_list: - html_lines.append(f'') - - return '\n'.join(html_lines) - - def convert_paragraph(self, text: str) -> str: - """일반 텍스트 →

      변환""" - # 빈 줄이면 무시 - if not text.strip(): - return "" - - # **text** → - text = re.sub(r'\*\*(.+?)\*\*', r'\1', text) - # *text* → - text = re.sub(r'\*(.+?)\*', r'\1', text) - # `code` → - text = re.sub(r'`(.+?)`', r'\1', text) - - return f'

      {text}

      ' - - def convert_full_content(self, md_content: str) -> str: - """전체 마크다운 콘텐츠를 HTML로 변환""" - lines = md_content.split('\n') - html_parts = [] - - current_chapter = "1" - i = 0 - - while i < len(lines): - line = lines[i].strip() - - # 빈 줄 - if not line: - i += 1 - continue - - # H1 (# 제목) - 보고서 제목, 섹션 시작 등 - h1_match = re.match(r'^#\s+(.+)$', line) - if h1_match and not line.startswith('##'): - title = h1_match.group(1) - # 섹션 번호가 있으면 추출 - num_match = re.match(r'^(\d+(?:\.\d+)*)\s+', title) - if num_match: - current_chapter = num_match.group(1).split('.')[0] - html_parts.append(f'

      {title}

      ') - i += 1 - continue - - # H2 (## 대목차) - h2_match = re.match(r'^##\s+(.+)$', line) - if h2_match: - title = h2_match.group(1) - num_match = re.match(r'^(\d+)\s+', title) - if num_match: - current_chapter = num_match.group(1) - html_parts.append(f'

      {title}

      ') # H1으로 변환 (페이지 분리 트리거) - i += 1 - continue - - # H3 (### 중목차) - h3_match = re.match(r'^###\s+(.+)$', line) - if h3_match: - html_parts.append(f'

      {h3_match.group(1)}

      ') - i += 1 - continue - - # H4 (#### 소목차/꼭지) - h4_match = re.match(r'^####\s+(.+)$', line) - if h4_match: - html_parts.append(f'

      {h4_match.group(1)}

      ') - i += 1 - continue - - # 이미지 플레이스홀더 {{IMG:xxx}} - img_match = re.match(r'^\{\{IMG:(.+?)\}\}$', line) - if img_match: - html_parts.append(self.convert_image_placeholder(line, current_chapter)) - i += 1 - continue - - # 이미지 캡션 *(참고: ...)* - figure 바로 뒤에 나오면 무시 (이미 figcaption으로 처리) - if line.startswith('*(') and line.endswith(')*'): - i += 1 - continue - - # 테이블 감지 (| 로 시작) - if line.startswith('|') or (line.startswith('**[표') and i + 1 < len(lines)): - # 표 제목 캡션 - caption = "" - if line.startswith('**[표'): - caption_match = re.match(r'^\*\*(\[표.+?\].*?)\*\*$', line) - if caption_match: - caption = caption_match.group(1) - i += 1 - if i >= len(lines): - break - line = lines[i].strip() - - # 테이블 본문 수집 - table_lines = [] - while i < len(lines) and (lines[i].strip().startswith('|') or - re.match(r'^[\|\s\-:]+$', lines[i].strip())): - table_lines.append(table_lines) # Fixed from list.append(table_lines) to line in thinking - table_lines.append(lines[i]) - i += 1 - - if table_lines: - table_md = '\n'.join(table_lines) - html_parts.append(self.convert_table(table_md, caption, current_chapter)) - continue - - # 리스트 감지 (* 또는 - 또는 1. 로 시작) - if re.match(r'^[\*\-]\s+', line) or re.match(r'^\d+\.\s+', line): - list_lines = [line] - i += 1 - while i < len(lines): - next_line = lines[i].strip() - if re.match(r'^[\*\-]\s+', next_line) or re.match(r'^\d+\.\s+', next_line): - list_lines.append(next_line) - i += 1 - elif not next_line: - i += 1 - break - else: - break - - html_parts.append(self.convert_list('\n'.join(list_lines))) - continue - - # 일반 문단 - para_lines = [line] - i += 1 - while i < len(lines): - next_line = lines[i].strip() - # 다음이 특수 요소면 문단 종료 - if (not next_line or - next_line.startswith('#') or - next_line.startswith('|') or - next_line.startswith('**[표') or - next_line.startswith('{{IMG:') or - next_line.startswith('*(') or - re.match(r'^[\*\-]\s+', next_line) or - re.match(r'^\d+\.\s+', next_line)): - break - para_lines.append(next_line) - i += 1 - - para_text = ' '.join(para_lines) - if para_text: - html_parts.append(self.convert_paragraph(para_text)) - - return '\n'.join(html_parts) - - -# ===== HTML 템플릿 ===== -def get_html_template() -> str: - """A4 보고서 HTML 템플릿 반환""" - return ''' - - - -{{report_title}} - - - - -
      -
      {{box_cover}}
      -
      {{box_toc}}
      -
      {{box_summary}}
      -
      {{box_content}}
      -
      - - - - - -''' - - -# ===== 메인 함수 ===== -def generate_report_html( - md_path: Path, - json_path: Path, - output_path: Path, - include_toc: bool = True, - include_summary: bool = True, - cover_info: Optional[Dict[str, str]] = None -): - """ - MD와 JSON을 A4 HTML 보고서로 변환 - - Args: - md_path: report_draft.md 경로 - json_path: report_sections.json 경로 - output_path: 출력할 report.html 경로 - include_toc: 목차 포함 여부 - include_summary: 요약 포함 여부 - cover_info: 표지 정보 (date, author, department 등) - """ - log("=== Step 9: MD → HTML 변환 시작 ===") - - # 1. 데이터 로드 - log(f"JSON 로드: {{json_path}}") - report_title, sections = load_json_meta(json_path) - - log(f"MD 로드: {{md_path}}") - md_content = load_markdown(md_path) - - log(f"보고서 제목: {{report_title}}") - log(f"섹션 수: {{len(sections)}}") - - # 2. 이미지 맵 생성 - image_map = build_image_map(sections) - log(f"이미지 자산 수: {{len(image_map)}}") - - # 3. 목차 추출 - toc_items = extract_toc_from_md(md_content) - log(f"목차 항목 수: {{len(toc_items)}}") - - # 4. MD → HTML 변환 - converter = MarkdownToHtmlConverter(image_map) - content_html = converter.convert_full_content(md_content) - - # 5. 박스별 콘텐츠 생성 - - # box-cover (표지) - cover_date = cover_info.get('date', datetime.now().strftime('%Y.%m.%d')) if cover_info else datetime.now().strftime('%Y.%m.%d') - cover_author = cover_info.get('author', '') if cover_info else '' - cover_dept = cover_info.get('department', '') if cover_info else '' - - # 제목에서 부제목 분리 (: 기준) - title_parts = report_title.split(':') - main_title = title_parts[0].strip() - sub_title = title_parts[1].strip() if len(title_parts) > 1 else "" - - box_cover = f''' -

      {{main_title}}

      -

      {{sub_title}}

      -

      {{cover_date}}

      - {{f'

      {{cover_author}}

      ' if cover_author else ''}} - {{f'

      {{cover_dept}}

      ' if cover_dept else ''}} - ''' - - # box-toc (목차) - box_toc = "" - if include_toc and toc_items: - box_toc = generate_toc_html(toc_items) - log(f"목차 HTML 생성 완료") - - # box-summary (요약) - 첫 번째 섹션을 요약으로 사용하거나 비워둠 - box_summary = "" - if include_summary: - # 요약 섹션이 있으면 사용 - for sec in sections: - if '요약' in sec.section_title or 'summary' in sec.section_title.lower(): - summary_converter = MarkdownToHtmlConverter(image_map) - box_summary = f"

      요약

      \\n{{summary_converter.convert_full_content(sec.generated_text)}}" - break - - # box-content (본문) - box_content = content_html - - # 6. 템플릿에 주입 - template = get_html_template() - html_output = template.format( - report_title=report_title, - box_cover=box_cover, - box_toc=box_toc, - box_summary=box_summary, - box_content=box_content - ) - - # 7. 파일 저장 - output_path.write_text(html_output, encoding='utf-8') - - log(f"") - log(f"═══════════════════════════════════════════════════") - log(f"HTML 보고서 생성 완료!") - log(f" 출력 파일: {{output_path}}") - log(f" 파일 크기: {{output_path.stat().st_size / 1024:.1f}} KB") - log(f"═══════════════════════════════════════════════════") - log("=== Step 9 종료 ===") - - return output_path - - -def main(): - """CLI 진입점""" - parser = argparse.ArgumentParser( - description='MD + JSON → A4 HTML 보고서 변환', - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=''' -예시: - python 9_md_to_html_publisher.py - python 9_md_to_html_publisher.py --md report_draft.md --json report_sections.json - python 9_md_to_html_publisher.py --no-toc --no-summary - python 9_md_to_html_publisher.py --cover-date "2026.01.15" --cover-author "홍길동" - ''' - ) - - parser.add_argument('--md', type=Path, default=DEFAULT_MD_PATH, - help='입력 마크다운 파일 경로') - parser.add_argument('--json', type=Path, default=DEFAULT_JSON_PATH, - help='입력 JSON 파일 경로') - parser.add_argument('--output', '-o', type=Path, default=DEFAULT_OUTPUT_PATH, - help='출력 HTML 파일 경로') - parser.add_argument('--no-toc', action='store_true', - help='목차 페이지 제외') - parser.add_argument('--no-summary', action='store_true', - help='요약 페이지 제외') - parser.add_argument('--cover-date', type=str, default=None, - help='표지 날짜 (예: 2026.01.15)') - parser.add_argument('--cover-author', type=str, default=None, - help='표지 작성자') - parser.add_argument('--cover-dept', type=str, default=None, - help='표지 부서명') - - args = parser.parse_args() - - # 표지 정보 구성 - cover_info = {} - if args.cover_date: - cover_info['date'] = args.cover_date - if args.cover_author: - cover_info['author'] = args.cover_author - if args.cover_dept: - cover_info['department'] = args.cover_dept - - # 변환 실행 - generate_report_html( - md_path=args.md, - json_path=args.json, - output_path=args.output, - include_toc=not args.no_toc, - include_summary=not args.no_summary, - cover_info=cover_info if cover_info else None - ) - - -if __name__ == "__main__": - main()