# -*- coding: utf-8 -*-
"""
9_md_to_html_publisher.py

Purpose:
- Convert report_draft.md + report_sections.json into report.html.
- Apply the A4 paginated page template.
- Convert Markdown tables into HTML tables.
- Convert image placeholders {{IMG:xxx}} into <figure> elements.
- Build the table of contents (TOC) automatically.

Usage:
    python 9_md_to_html_publisher.py
    python 9_md_to_html_publisher.py --md report_draft.md --json report_sections.json --output report.html
    python 9_md_to_html_publisher.py --no-toc --no-summary
"""

import os
import re
import json
import argparse
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Any, Tuple, Optional
from dataclasses import dataclass, field

# ===== Path configuration =====
OUTPUT_ROOT = Path(r"D:\for python\geulbeot-light\geulbeot-light\00.test\hwpx\out\out")  # output location
GEN_DIR = OUTPUT_ROOT / "generated"
ASSETS_DIR = GEN_DIR / "assets"
LOG_DIR = OUTPUT_ROOT / "logs"

# Default input / output files
DEFAULT_MD_PATH = GEN_DIR / "report_draft.md"
DEFAULT_JSON_PATH = GEN_DIR / "report_sections.json"
DEFAULT_OUTPUT_PATH = GEN_DIR / "report.html"

# The working directories are created at import time; log() relies on LOG_DIR existing.
for d in [GEN_DIR, ASSETS_DIR, LOG_DIR]:
    d.mkdir(parents=True, exist_ok=True)

# Numbered headings that feed the TOC: (pattern, TOC level).
# [ \t]+ instead of \s+ so a match can never run across a line break.
_TOC_HEADING_PATTERNS = (
    (re.compile(r'^##[ \t]+(\d+)[ \t]+(.+)$', re.MULTILINE), 1),                  # ## 1 chapter
    (re.compile(r'^###[ \t]+(\d+\.\d+)[ \t]+(.+)$', re.MULTILINE), 2),            # ### 1.1 section
    (re.compile(r'^####[ \t]+(\d+\.\d+\.\d+)[ \t]+(.+)$', re.MULTILINE), 3),      # #### 1.1.1 subsection
)

# Extracts xxx from a {{IMG:xxx}} placeholder.
_PLACEHOLDER_ID_RE = re.compile(r'\{\{IMG:(.+?)\}\}')


def log(msg: str):
    """Print a timestamped message and append it to the step-9 log file.

    Args:
        msg: Text to log; it is prefixed with the current HH:MM:SS time.
    """
    line = f"[{datetime.now().strftime('%H:%M:%S')}] {msg}"
    print(line, flush=True)
    with (LOG_DIR / "step9_html_publish_log.txt").open("a", encoding="utf-8") as f:
        f.write(line + "\n")


# ===== Data classes =====
@dataclass
class ImageAsset:
    """Image asset record taken from an entry of a section's "assets" list."""
    image_id: str
    filename: str
    caption: str
    placeholder: str
    source_path: str = ""
    page: Optional[int] = None
    asset_path: Optional[str] = None


@dataclass
class Section:
    """One report section together with its image assets."""
    section_id: str
    section_title: str
    generated_text: str
    assets: List[ImageAsset] = field(default_factory=list)


@dataclass
class TocItem:
    """One table-of-contents entry."""
    number: str
    title: str
    level: int  # 1, 2, 3


# ===== File loaders =====
def load_json_meta(json_path: Path) -> Tuple[str, List[Section]]:
    """Load the report title and the section list from the sections JSON file.

    Missing keys fall back to empty strings (or "보고서" for the title).
    A null "sections" or "assets" value is treated as an empty list.

    Args:
        json_path: Path of the sections JSON file.

    Returns:
        A ``(report_title, sections)`` tuple.

    Raises:
        FileNotFoundError: If ``json_path`` does not exist.
    """
    if not json_path.exists():
        raise FileNotFoundError(f"JSON 파일 없음: {json_path}")

    data = json.loads(json_path.read_text(encoding="utf-8"))
    report_title = data.get("report_title", "보고서")

    sections = []
    for sec in data.get("sections") or []:
        assets = [
            ImageAsset(
                image_id=asset.get("image_id", ""),
                filename=asset.get("filename", ""),
                caption=asset.get("caption", ""),
                placeholder=asset.get("placeholder", ""),
                source_path=asset.get("source_path", ""),
                page=asset.get("page"),
                asset_path=asset.get("asset_path"),
            )
            for asset in sec.get("assets") or []
        ]
        sections.append(Section(
            section_id=sec.get("section_id", ""),
            section_title=sec.get("section_title", ""),
            generated_text=sec.get("generated_text", ""),
            assets=assets,
        ))

    return report_title, sections


def load_markdown(md_path: Path) -> str:
    """Read the Markdown draft as UTF-8 text.

    Raises:
        FileNotFoundError: If ``md_path`` does not exist.
    """
    if not md_path.exists():
        raise FileNotFoundError(f"MD 파일 없음: {md_path}")
    return md_path.read_text(encoding="utf-8")


# ===== Image map =====
def build_image_map(sections: List[Section]) -> Dict[str, ImageAsset]:
    """Build the lookup used to resolve ``{{IMG:xxx}}`` placeholders.

    Only assets with a non-empty ``placeholder`` are registered. Each one is
    stored under its ``image_id`` and, additionally, under the id embedded
    in its placeholder text, so a placeholder still resolves when the two
    ids differ. An existing ``image_id`` entry is never overwritten by a
    placeholder-derived key.

    Args:
        sections: Sections whose assets should be indexed.

    Returns:
        Mapping of image id to ``ImageAsset``.
    """
    img_map = {}
    for sec in sections:
        for asset in sec.assets:
            if not asset.placeholder:
                continue
            img_map[asset.image_id] = asset
            embedded = _PLACEHOLDER_ID_RE.search(asset.placeholder)
            if embedded:
                img_map.setdefault(embedded.group(1), asset)
    return img_map


# ===== Table of contents =====
def extract_toc_from_md(md_content: str) -> List[TocItem]:
    """Collect numbered headings from the Markdown and return them in order.

    Recognised forms are ``## 1 Title`` (level 1), ``### 1.1 Title``
    (level 2) and ``#### 1.1.1 Title`` (level 3). Headings without a
    leading number are ignored.

    Args:
        md_content: Full Markdown text.

    Returns:
        TOC items sorted numerically by their dotted number, so "1.10"
        sorts after "1.9" and a parent sorts before its children.
    """
    toc_items = [
        TocItem(number=match.group(1), title=match.group(2).strip(), level=level)
        for pattern, level in _TOC_HEADING_PATTERNS
        for match in pattern.finditer(md_content)
    ]
    toc_items.sort(key=lambda item: tuple(int(part) for part in item.number.split('.')))
    return toc_items


def generate_toc_html(toc_items: List[TocItem]) -> str:
    """Render the TOC items as an HTML list.

    NOTE(review): the markup literal of this function is not visible in the
    patch text, so the element and class names below are a reconstruction.
    Confirm them against the stylesheet in the page template.

    Args:
        toc_items: Items as returned by ``extract_toc_from_md``.

    Returns:
        The TOC markup, or an empty string when there are no items.
    """
    if not toc_items:
        return ""

    lines = ['<ul class="toc">']
    for item in toc_items:
        lines.append(
            f'<li class="toc-lvl-{item.level}">'
            f'<span class="toc-num">{item.number}</span> '
            f'<span class="toc-title">{item.title}</span></li>'
        )
    lines.append('</ul>')
    return '\n'.join(lines)
생성""" + if not toc_items: + return "" + + lines = ['') + return '\n'.join(lines) + + +# ===== 마크다운 → HTML 변환 ===== class MarkdownToHtmlConverter: - def __init__(self): - pass + """마크다운을 HTML로 변환하는 클래스 Teng-style""" + + def __init__(self, image_map: Dict[str, ImageAsset]): + self.image_map = image_map + self.table_counter = {} # chapter -> count + self.figure_counter = {} # chapter -> count + + def get_chapter(self, context: str = "1") -> str: + """현재 챕터 번호 추출""" + return context.split('.')[0] if context else "1" + + def next_table_num(self, chapter: str) -> str: + """다음 표 번호""" + if chapter not in self.table_counter: + self.table_counter[chapter] = 0 + self.table_counter[chapter] += 1 + return f"{chapter}-{self.table_counter[chapter]}" + + def next_figure_num(self, chapter: str) -> str: + """다음 그림 번호""" + if chapter not in self.figure_counter: + self.figure_counter[chapter] = 0 + self.figure_counter[chapter] += 1 + return f"{chapter}-{self.figure_counter[chapter]}" + + def convert_table(self, md_table: str, caption: str = "", chapter: str = "1") -> str: + """마크다운 테이블 → HTML 테이블""" + lines = [l.strip() for l in md_table.strip().split('\n') if l.strip()] + if len(lines) < 2: + return "" + + # 헤더 행 + header_cells = [c.strip() for c in lines[0].split('|') if c.strip()] + + # 구분선 건너뛰기 (|---|---|) + data_start = 1 + if len(lines) > 1 and re.match(r'^[\|\s\-:]+$', lines[1]): + data_start = 2 + + # 데이터 행 + data_rows = [] + for line in lines[data_start:]: + cells = [c.strip() for c in line.split('|') if c.strip()] + if cells: + data_rows.append(cells) + + # HTML 생성 + html_lines = [''] + + # thead + html_lines.append('') + for cell in header_cells: + # **text** → text + cell = re.sub(r'\*\*(.+?)\*\*', r'\1', cell) + html_lines.append(f'') + html_lines.append('') + + # tbody + html_lines.append('') + for row in data_rows: + html_lines.append('') + for cell in row: + # **text** 처리 + cell = re.sub(r'\*\*(.+?)\*\*', r'\1', cell) + #
처리 + cell = cell.replace('
', '
') + html_lines.append(f'') + html_lines.append('') + html_lines.append('') + html_lines.append('
{cell}
{cell}
') + + # 캡션 추가 + if caption: + html_lines.append(f'
{caption}
') + + return '\n'.join(html_lines) + + def convert_image_placeholder(self, placeholder: str, chapter: str = "1") -> str: + """{{IMG:xxx}} →
변환""" + # {{IMG:1_1_1_img01}} 에서 ID 추출 + match = re.match(r'\{\{IMG:(.+?)\}\}', placeholder) + if not match: + return placeholder + + image_id = match.group(1) + asset = self.image_map.get(image_id) + + if asset and asset.asset_path: + fig_num = self.next_figure_num(chapter) + caption = asset.caption if asset.caption and asset.caption != "Photo" else "" + caption_text = f"[그림 {fig_num}] {caption}" if caption else f"[그림 {fig_num}]" + + return f'''
+ {caption} +
{caption_text}
+
''' + else: + # 이미지 파일이 없는 경우 플레이스홀더 주석으로 + return f'' + + def convert_list(self, md_list: str) -> str: + """마크다운 리스트 → HTML 리스트""" + lines = md_list.strip().split('\n') + html_lines = [] + in_list = False + list_type = 'ul' + + for line in lines: + line = line.strip() + if not line: + continue + + # 순서 없는 리스트 + ul_match = re.match(r'^[\*\-]\s+(.+)$', line) + # 순서 있는 리스트 + ol_match = re.match(r'^(\d+)\.\s+(.+)$', line) + + if ul_match: + if not in_list: + html_lines.append('
    ') + in_list = True + list_type = 'ul' + content = ul_match.group(1) + content = re.sub(r'\*\*(.+?)\*\*', r'\1', content) + html_lines.append(f'
  • {content}
  • ') + elif ol_match: + if not in_list: + html_lines.append('
      ') + in_list = True + list_type = 'ol' + content = ol_match.group(2) + content = re.sub(r'\*\*(.+?)\*\*', r'\1', content) + html_lines.append(f'
    1. {content}
    2. ') + + if in_list: + html_lines.append(f'') + + return '\n'.join(html_lines) + + def convert_paragraph(self, text: str) -> str: + """일반 텍스트 →

      변환""" + # 빈 줄이면 무시 + if not text.strip(): + return "" + + # **text** → + text = re.sub(r'\*\*(.+?)\*\*', r'\1', text) + # *text* → + text = re.sub(r'\*(.+?)\*', r'\1', text) + # `code` → + text = re.sub(r'`(.+?)`', r'\1', text) + + return f'

      {text}

      ' + + def convert_full_content(self, md_content: str) -> str: + """전체 마크다운 콘텐츠를 HTML로 변환""" + lines = md_content.split('\n') + html_parts = [] + + current_chapter = "1" + i = 0 + + while i < len(lines): + line = lines[i].strip() + + # 빈 줄 + if not line: + i += 1 + continue + + # H1 (# 제목) - 보고서 제목, 섹션 시작 등 + h1_match = re.match(r'^#\s+(.+)$', line) + if h1_match and not line.startswith('##'): + title = h1_match.group(1) + # 섹션 번호가 있으면 추출 + num_match = re.match(r'^(\d+(?:\.\d+)*)\s+', title) + if num_match: + current_chapter = num_match.group(1).split('.')[0] + html_parts.append(f'

      {title}

      ') + i += 1 + continue + + # H2 (## 대목차) + h2_match = re.match(r'^##\s+(.+)$', line) + if h2_match: + title = h2_match.group(1) + num_match = re.match(r'^(\d+)\s+', title) + if num_match: + current_chapter = num_match.group(1) + html_parts.append(f'

      {title}

      ') # H1으로 변환 (페이지 분리 트리거) + i += 1 + continue + + # H3 (### 중목차) + h3_match = re.match(r'^###\s+(.+)$', line) + if h3_match: + html_parts.append(f'

      {h3_match.group(1)}

      ') + i += 1 + continue + + # H4 (#### 소목차/꼭지) + h4_match = re.match(r'^####\s+(.+)$', line) + if h4_match: + html_parts.append(f'

      {h4_match.group(1)}

      ') + i += 1 + continue + + # 이미지 플레이스홀더 {{IMG:xxx}} + img_match = re.match(r'^\{\{IMG:(.+?)\}\}$', line) + if img_match: + html_parts.append(self.convert_image_placeholder(line, current_chapter)) + i += 1 + continue + + # 이미지 캡션 *(참고: ...)* - figure 바로 뒤에 나오면 무시 (이미 figcaption으로 처리) + if line.startswith('*(') and line.endswith(')*'): + i += 1 + continue + + # 테이블 감지 (| 로 시작) + if line.startswith('|') or (line.startswith('**[표') and i + 1 < len(lines)): + # 표 제목 캡션 + caption = "" + if line.startswith('**[표'): + caption_match = re.match(r'^\*\*(\[표.+?\].*?)\*\*$', line) + if caption_match: + caption = caption_match.group(1) + i += 1 + if i >= len(lines): + break + line = lines[i].strip() + + # 테이블 본문 수집 + table_lines = [] + while i < len(lines) and (lines[i].strip().startswith('|') or + re.match(r'^[\|\s\-:]+$', lines[i].strip())): + table_lines.append(table_lines) # Fixed from list.append(table_lines) to line in thinking + table_lines.append(lines[i]) + i += 1 + + if table_lines: + table_md = '\n'.join(table_lines) + html_parts.append(self.convert_table(table_md, caption, current_chapter)) + continue + + # 리스트 감지 (* 또는 - 또는 1. 
로 시작) + if re.match(r'^[\*\-]\s+', line) or re.match(r'^\d+\.\s+', line): + list_lines = [line] + i += 1 + while i < len(lines): + next_line = lines[i].strip() + if re.match(r'^[\*\-]\s+', next_line) or re.match(r'^\d+\.\s+', next_line): + list_lines.append(next_line) + i += 1 + elif not next_line: + i += 1 + break + else: + break + + html_parts.append(self.convert_list('\n'.join(list_lines))) + continue + + # 일반 문단 + para_lines = [line] + i += 1 + while i < len(lines): + next_line = lines[i].strip() + # 다음이 특수 요소면 문단 종료 + if (not next_line or + next_line.startswith('#') or + next_line.startswith('|') or + next_line.startswith('**[표') or + next_line.startswith('{{IMG:') or + next_line.startswith('*(') or + re.match(r'^[\*\-]\s+', next_line) or + re.match(r'^\d+\.\s+', next_line)): + break + para_lines.append(next_line) + i += 1 + + para_text = ' '.join(para_lines) + if para_text: + html_parts.append(self.convert_paragraph(para_text)) + + return '\n'.join(html_parts) - def convert(self, md_content: str) -> str: - """단순 마크다운 -> HTML 변환 (정규식 기반)""" - html = md_content - - # 헤더 - html = re.sub(r'^#### (.*)$', r'

      \1

      ', html, flags=re.MULTILINE) - html = re.sub(r'^### (.*)$', r'

      \1

      ', html, flags=re.MULTILINE) - html = re.sub(r'^## (.*)$', r'

      \1

      ', html, flags=re.MULTILINE) - html = re.sub(r'^# (.*)$', r'

      \1

      ', html, flags=re.MULTILINE) - - # 강조 - html = re.sub(r'\*\*(.*?)\*\*', r'\1', html) - - # 리스트 - html = re.sub(r'^\s*-\s+(.*)$', r'
    3. \1
    4. ', html, flags=re.MULTILINE) - - # 줄바꿈 - html = html.replace('\n', '
      \n') - - return html -def get_html_template(title: str, content: str) -> str: - """최종 HTML 템플릿 적용""" - return f""" +# ===== HTML 템플릿 ===== +def get_html_template() -> str: + """A4 보고서 HTML 템플릿 반환""" + return ''' - - {title} - + +{{report_title}} + -
      - {content} + +
      +
      {{box_cover}}
      +
      {{box_toc}}
      +
      {{box_summary}}
      +
      {{box_content}}
      + + + + -""" +''' -def main(input_dir, output_dir): - global OUTPUT_ROOT, GEN_DIR - OUTPUT_ROOT = Path(output_dir) - GEN_DIR = OUTPUT_ROOT / "generated" - - md_path = GEN_DIR / "report_draft.md" - out_path = GEN_DIR / "report.html" - - if not md_path.exists(): - log(f"대상 파일 없음: {md_path}") - return - log("HTML 변환 작업 시작...") - md_content = md_path.read_text(encoding="utf-8") +# ===== 메인 함수 ===== +def generate_report_html( + md_path: Path, + json_path: Path, + output_path: Path, + include_toc: bool = True, + include_summary: bool = True, + cover_info: Optional[Dict[str, str]] = None +): + """ + MD와 JSON을 A4 HTML 보고서로 변환 - # 제목 추출 - title_match = re.search(r'^# (.*)$', md_content, re.MULTILINE) - title = title_match.group(1) if title_match else "보고서" + Args: + md_path: report_draft.md 경로 + json_path: report_sections.json 경로 + output_path: 출력할 report.html 경로 + include_toc: 목차 포함 여부 + include_summary: 요약 포함 여부 + cover_info: 표지 정보 (date, author, department 등) + """ + log("=== Step 9: MD → HTML 변환 시작 ===") - converter = MarkdownToHtmlConverter() - html_body = converter.convert(md_content) + # 1. 데이터 로드 + log(f"JSON 로드: {{json_path}}") + report_title, sections = load_json_meta(json_path) - final_html = get_html_template(title, html_body) + log(f"MD 로드: {{md_path}}") + md_content = load_markdown(md_path) - out_path.write_text(final_html, encoding="utf-8") - log(f"최종 HTML 생성 완료: {out_path}") + log(f"보고서 제목: {{report_title}}") + log(f"섹션 수: {{len(sections)}}") + + # 2. 이미지 맵 생성 + image_map = build_image_map(sections) + log(f"이미지 자산 수: {{len(image_map)}}") + + # 3. 목차 추출 + toc_items = extract_toc_from_md(md_content) + log(f"목차 항목 수: {{len(toc_items)}}") + + # 4. MD → HTML 변환 + converter = MarkdownToHtmlConverter(image_map) + content_html = converter.convert_full_content(md_content) + + # 5. 
박스별 콘텐츠 생성 + + # box-cover (표지) + cover_date = cover_info.get('date', datetime.now().strftime('%Y.%m.%d')) if cover_info else datetime.now().strftime('%Y.%m.%d') + cover_author = cover_info.get('author', '') if cover_info else '' + cover_dept = cover_info.get('department', '') if cover_info else '' + + # 제목에서 부제목 분리 (: 기준) + title_parts = report_title.split(':') + main_title = title_parts[0].strip() + sub_title = title_parts[1].strip() if len(title_parts) > 1 else "" + + box_cover = f''' +

      {{main_title}}

      +

      {{sub_title}}

      +

      {{cover_date}}

      + {{f'

      {{cover_author}}

      ' if cover_author else ''}} + {{f'

      {{cover_dept}}

      ' if cover_dept else ''}} + ''' + + # box-toc (목차) + box_toc = "" + if include_toc and toc_items: + box_toc = generate_toc_html(toc_items) + log(f"목차 HTML 생성 완료") + + # box-summary (요약) - 첫 번째 섹션을 요약으로 사용하거나 비워둠 + box_summary = "" + if include_summary: + # 요약 섹션이 있으면 사용 + for sec in sections: + if '요약' in sec.section_title or 'summary' in sec.section_title.lower(): + summary_converter = MarkdownToHtmlConverter(image_map) + box_summary = f"

      요약

      \\n{{summary_converter.convert_full_content(sec.generated_text)}}" + break + + # box-content (본문) + box_content = content_html + + # 6. 템플릿에 주입 + template = get_html_template() + html_output = template.format( + report_title=report_title, + box_cover=box_cover, + box_toc=box_toc, + box_summary=box_summary, + box_content=box_content + ) + + # 7. 파일 저장 + output_path.write_text(html_output, encoding='utf-8') + + log(f"") + log(f"═══════════════════════════════════════════════════") + log(f"HTML 보고서 생성 완료!") + log(f" 출력 파일: {{output_path}}") + log(f" 파일 크기: {{output_path.stat().st_size / 1024:.1f}} KB") + log(f"═══════════════════════════════════════════════════") + log("=== Step 9 종료 ===") + + return output_path + + +def main(): + """CLI 진입점""" + parser = argparse.ArgumentParser( + description='MD + JSON → A4 HTML 보고서 변환', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=''' +예시: + python 9_md_to_html_publisher.py + python 9_md_to_html_publisher.py --md report_draft.md --json report_sections.json + python 9_md_to_html_publisher.py --no-toc --no-summary + python 9_md_to_html_publisher.py --cover-date "2026.01.15" --cover-author "홍길동" + ''' + ) + + parser.add_argument('--md', type=Path, default=DEFAULT_MD_PATH, + help='입력 마크다운 파일 경로') + parser.add_argument('--json', type=Path, default=DEFAULT_JSON_PATH, + help='입력 JSON 파일 경로') + parser.add_argument('--output', '-o', type=Path, default=DEFAULT_OUTPUT_PATH, + help='출력 HTML 파일 경로') + parser.add_argument('--no-toc', action='store_true', + help='목차 페이지 제외') + parser.add_argument('--no-summary', action='store_true', + help='요약 페이지 제외') + parser.add_argument('--cover-date', type=str, default=None, + help='표지 날짜 (예: 2026.01.15)') + parser.add_argument('--cover-author', type=str, default=None, + help='표지 작성자') + parser.add_argument('--cover-dept', type=str, default=None, + help='표지 부서명') + + args = parser.parse_args() + + # 표지 정보 구성 + cover_info = {} + if args.cover_date: + cover_info['date'] = 
args.cover_date + if args.cover_author: + cover_info['author'] = args.cover_author + if args.cover_dept: + cover_info['department'] = args.cover_dept + + # 변환 실행 + generate_report_html( + md_path=args.md, + json_path=args.json, + output_path=args.output, + include_toc=not args.no_toc, + include_summary=not args.no_summary, + cover_info=cover_info if cover_info else None + ) + if __name__ == "__main__": main()