# Source file: test/converters/pipeline/step9_html.py (1249 lines, 49 KiB, Python)

# -*- coding: utf-8 -*-
"""
9_md_to_html_publisher.py
기능:
- report_draft.md + report_sections.json → report.html 변환
- A4 규격 페이지네이션 템플릿 적용
- 마크다운 테이블 → HTML 테이블 변환
- 이미지 플레이스홀더 {{IMG:xxx}} → <figure> 변환
- 목차(TOC) 자동 생성
사용법:
python 9_md_to_html_publisher.py
python 9_md_to_html_publisher.py --md report_draft.md --json report_sections.json --output report.html
python 9_md_to_html_publisher.py --no-toc --no-summary
"""
import argparse
import html
import json
import os
import re
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any, Tuple, Optional
# ===== Path configuration =====
# NOTE(review): the output root is a hard-coded absolute Windows path; on any
# other machine the directories below are created relative to wherever this
# string happens to resolve — confirm before running elsewhere.
OUTPUT_ROOT = Path(r"D:\for python\geulbeot-light\geulbeot-light\00.test\hwpx\out\out") # output location
GEN_DIR = OUTPUT_ROOT / "generated"
ASSETS_DIR = GEN_DIR / "assets"
LOG_DIR = OUTPUT_ROOT / "logs"
# Default input/output files
DEFAULT_MD_PATH = GEN_DIR / "report_draft.md"
DEFAULT_JSON_PATH = GEN_DIR / "report_sections.json"
DEFAULT_OUTPUT_PATH = GEN_DIR / "report.html"
# Import-time side effect: the working directories are created as soon as the
# module is loaded, so log() can append to LOG_DIR without further checks.
for d in [GEN_DIR, ASSETS_DIR, LOG_DIR]:
    d.mkdir(parents=True, exist_ok=True)
def log(msg: str):
    """Echo a timestamped message to stdout and append it to the step-9 log.

    The entry is prefixed with the current wall-clock time (HH:MM:SS) and is
    appended, UTF-8 encoded, to ``step9_html_publish_log.txt`` in ``LOG_DIR``.
    """
    stamp = datetime.now().strftime('%H:%M:%S')
    entry = f"[{stamp}] {msg}"
    print(entry, flush=True)
    log_file = LOG_DIR / "step9_html_publish_log.txt"
    with log_file.open("a", encoding="utf-8") as handle:
        handle.write(entry + "\n")
# ===== 데이터 클래스 =====
@dataclass
class ImageAsset:
    """Metadata for one image attached to a report section."""
    image_id: str  # key under which build_image_map registers the asset
    filename: str  # presumably the image file name — not read by the code in this module
    caption: str  # figure caption; ignored by the converter when empty or equal to "Photo"
    placeholder: str  # Markdown token of the {{IMG:...}} form; assets with an empty value are skipped by build_image_map
    source_path: str = ""  # presumably where the image originally came from — TODO confirm
    page: Optional[int] = None  # presumably the source page number — TODO confirm
    asset_path: Optional[str] = None  # used as the <img src>; when falsy an HTML comment is emitted instead of a figure
@dataclass
class Section:
    """One report section as loaded from the sections JSON file."""
    section_id: str  # value of the "section_id" key ("" when absent)
    section_title: str  # searched for '요약' / 'summary' to pick the summary section
    generated_text: str  # Markdown body; converted to HTML when the section is used as the summary
    assets: List[ImageAsset] = field(default_factory=list)  # images attached to this section
@dataclass
class TocItem:
    """One table-of-contents entry extracted from a numbered heading."""
    number: str  # dotted heading number such as "1", "1.1" or "1.1.1"
    title: str  # heading text without the number
    level: int # 1, 2, 3
# ===== 파일 로더 =====
def load_json_meta(json_path: Path) -> Tuple[str, List[Section]]:
    """Read the sections JSON and return ``(report_title, sections)``.

    Missing keys fall back to defaults: the title to "보고서", string fields
    to "", ``page`` / ``asset_path`` to None, and list fields to empty lists.

    Raises:
        FileNotFoundError: if ``json_path`` does not exist.
    """
    if not json_path.exists():
        raise FileNotFoundError(f"JSON 파일 없음: {json_path}")

    payload = json.loads(json_path.read_text(encoding="utf-8"))
    report_title = payload.get("report_title", "보고서")

    def to_asset(raw):
        # One JSON asset entry -> ImageAsset, with per-field defaults.
        return ImageAsset(
            image_id=raw.get("image_id", ""),
            filename=raw.get("filename", ""),
            caption=raw.get("caption", ""),
            placeholder=raw.get("placeholder", ""),
            source_path=raw.get("source_path", ""),
            page=raw.get("page"),
            asset_path=raw.get("asset_path"),
        )

    sections = [
        Section(
            section_id=entry.get("section_id", ""),
            section_title=entry.get("section_title", ""),
            generated_text=entry.get("generated_text", ""),
            assets=[to_asset(raw) for raw in entry.get("assets", [])],
        )
        for entry in payload.get("sections", [])
    ]
    return report_title, sections
def load_markdown(md_path: Path) -> str:
    """Return the UTF-8 text of the Markdown draft at ``md_path``.

    Raises:
        FileNotFoundError: if ``md_path`` does not exist.
    """
    if md_path.exists():
        return md_path.read_text(encoding="utf-8")
    raise FileNotFoundError(f"MD 파일 없음: {md_path}")
# ===== 이미지 맵 생성 =====
def build_image_map(sections: List["Section"]) -> Dict[str, "ImageAsset"]:
    """Build the lookup used to resolve ``{{IMG:id}}`` placeholders.

    Every asset that has a placeholder is registered under its ``image_id``
    and, additionally, under the id embedded in its ``{{IMG:id}}`` placeholder
    text. The converter looks images up by the id found in the Markdown
    placeholder, so registering only ``image_id`` made an asset unreachable
    whenever its ``image_id`` was empty or differed from the placeholder id.

    Args:
        sections: loaded sections; each exposes an ``assets`` list.

    Returns:
        Mapping of id -> asset. An explicit ``image_id`` always wins over a
        placeholder-derived id that happens to collide with it.
    """
    img_map = {}
    for sec in sections:
        for asset in sec.assets:
            if not asset.placeholder:
                continue
            if asset.image_id:
                img_map[asset.image_id] = asset
            # Extract xxx from the {{IMG:xxx}} form; setdefault keeps any
            # asset already registered under that id by its image_id.
            embedded = re.search(r'\{\{IMG:(.+?)\}\}', asset.placeholder)
            if embedded:
                img_map.setdefault(embedded.group(1), asset)
    return img_map
# ===== 목차 생성 =====
def extract_toc_from_md(md_content: str) -> List[TocItem]:
    """Collect numbered headings from the Markdown as table-of-contents items.

    ``## N title`` becomes level 1, ``### N.N title`` level 2 and
    ``#### N.N.N title`` level 3. The result is ordered by the numeric value
    of the dotted heading number.
    """
    # (heading regex, TOC level)
    heading_specs = (
        (r'^##\s+(\d+)\s+(.+)$', 1),
        (r'^###\s+(\d+\.\d+)\s+(.+)$', 2),
        (r'^####\s+(\d+\.\d+\.\d+)\s+(.+)$', 3),
    )
    entries = [
        TocItem(number=found.group(1), title=found.group(2).strip(), level=depth)
        for regex, depth in heading_specs
        for found in re.finditer(regex, md_content, re.MULTILINE)
    ]
    # Compare "1.10" after "1.9": sort on the integer components.
    return sorted(
        entries,
        key=lambda entry: tuple(int(part) for part in entry.number.split('.')),
    )
def generate_toc_html(toc_items: List[TocItem]) -> str:
    """Render TOC items as a ``<ul>`` whose level-1 entries open a group ``<div>``.

    Returns an empty string when there are no items. Level-1 entries are
    written as "N. title"; level-2/3 entries as "N.N title".
    """
    if not toc_items:
        return ""

    sub_classes = {2: 'toc-lvl-2', 3: 'toc-lvl-3'}
    out = ['<ul style="list-style:none; padding:0; margin:0;">']
    open_group = None  # number of the level-1 entry whose group is open
    for entry in toc_items:
        if entry.level == 1:
            if open_group is not None:
                out.append('</div>')  # close the previous group
            out.append('<div class="toc-group atomic-block">')
            out.append(f'<li class="toc-item toc-lvl-1">{entry.number}. {entry.title}</li>')
            open_group = entry.number
            continue
        css = sub_classes.get(entry.level)
        if css is not None:
            out.append(f'<li class="toc-item {css}">{entry.number} {entry.title}</li>')
    if open_group is not None:
        out.append('</div>')  # close the last group
    out.append('</ul>')
    return '\n'.join(out)
# ===== 마크다운 → HTML 변환 =====
class MarkdownToHtmlConverter:
    """Convert the report's Markdown subset into HTML for the A4 template.

    Handled constructs: ``#``-``####`` headings, pipe tables with an optional
    ``**[표 ...]**`` caption line, ``{{IMG:id}}`` placeholders, flat bullet or
    numbered lists, and paragraphs with bold / italic / inline code.

    Element text is inserted without HTML escaping, so inline HTML such as
    ``<br>`` passes through unchanged; only attribute values are escaped.

    Attributes:
        image_map: image id -> asset; ``asset_path`` and ``caption`` are read
            from the asset when a figure is rendered.
        table_counter: chapter -> number of tables numbered so far.
        figure_counter: chapter -> number of figures numbered so far.
    """

    _BOLD_RE = re.compile(r'\*\*(.+?)\*\*')
    _TABLE_RULE_RE = re.compile(r'^[\|\s\-:]+$')
    _TABLE_CAPTION_RE = re.compile(r'^\*\*(\[표.+?\].*?)\*\*$')
    _LIST_ITEM_RE = re.compile(r'^(?:[\*\-]|\d+\.)\s+')

    def __init__(self, image_map: Dict[str, "ImageAsset"]):
        self.image_map = image_map
        self.table_counter = {}  # chapter -> count
        self.figure_counter = {}  # chapter -> count

    def get_chapter(self, context: str = "1") -> str:
        """Return the chapter part (text before the first '.') of ``context``."""
        return context.split('.')[0] if context else "1"

    def next_table_num(self, chapter: str) -> str:
        """Advance the table counter of ``chapter`` and return "chapter-n"."""
        self.table_counter[chapter] = self.table_counter.get(chapter, 0) + 1
        return f"{chapter}-{self.table_counter[chapter]}"

    def next_figure_num(self, chapter: str) -> str:
        """Advance the figure counter of ``chapter`` and return "chapter-n"."""
        self.figure_counter[chapter] = self.figure_counter.get(chapter, 0) + 1
        return f"{chapter}-{self.figure_counter[chapter]}"

    @classmethod
    def _bold(cls, text: str) -> str:
        """Replace ``**text**`` with ``<strong>text</strong>``."""
        return cls._BOLD_RE.sub(r'<strong>\1</strong>', text)

    @staticmethod
    def _split_table_row(row: str) -> List[str]:
        """Split one pipe-table row into stripped cells, keeping empty cells."""
        row = row.strip()
        # Only the outer pipes are delimiters; inner empty cells must keep
        # their position so later columns do not shift left.
        if row.startswith('|'):
            row = row[1:]
        if row.endswith('|'):
            row = row[:-1]
        return [cell.strip() for cell in row.split('|')]

    def convert_table(self, md_table: str, caption: str = "", chapter: str = "1") -> str:
        """Convert a Markdown pipe table to an HTML ``<table>``.

        Args:
            md_table: table source, one row per line.
            caption: optional caption; emitted as a ``<figcaption>`` right
                after the table.
            chapter: accepted for interface compatibility; not used here.

        Returns:
            The HTML, or "" when the source has fewer than two non-blank lines.
        """
        rows = [ln.strip() for ln in md_table.strip().split('\n') if ln.strip()]
        if len(rows) < 2:
            return ""

        header_cells = self._split_table_row(rows[0])

        # Skip the |---|---| rule line when present.
        data_start = 2 if self._TABLE_RULE_RE.match(rows[1]) else 1

        data_rows = []
        for row in rows[data_start:]:
            cells = self._split_table_row(row)
            if any(cells):  # rows without any content are dropped
                data_rows.append(cells)

        html_lines = ['<table class="atomic-block">', '<thead><tr>']
        for cell in header_cells:
            html_lines.append(f'<th>{self._bold(cell)}</th>')
        html_lines.append('</tr></thead>')

        html_lines.append('<tbody>')
        for cells in data_rows:
            html_lines.append('<tr>')
            for cell in cells:
                cell = self._bold(cell).replace('<br>', '<br/>')
                html_lines.append(f'<td>{cell}</td>')
            html_lines.append('</tr>')
        html_lines.append('</tbody>')
        html_lines.append('</table>')

        if caption:
            html_lines.append(f'<figcaption>{caption}</figcaption>')
        return '\n'.join(html_lines)

    def convert_image_placeholder(self, placeholder: str, chapter: str = "1") -> str:
        """Convert ``{{IMG:id}}`` into a numbered ``<figure>``.

        Returns the input unchanged when it is not a placeholder, and an HTML
        comment when the id is unknown or the asset has no ``asset_path``.
        The figure counter advances only when a figure is emitted.
        """
        match = re.match(r'\{\{IMG:(.+?)\}\}', placeholder)
        if not match:
            return placeholder
        image_id = match.group(1)
        asset = self.image_map.get(image_id)
        if not (asset and asset.asset_path):
            # No usable image file: leave a trace in the HTML instead.
            return f'<!-- 이미지 없음: {image_id} -->'

        fig_num = self.next_figure_num(chapter)
        # "Photo" is treated as a non-caption.
        caption = asset.caption if asset.caption and asset.caption != "Photo" else ""
        caption_text = f"[그림 {fig_num}] {caption}" if caption else f"[그림 {fig_num}]"
        # Attribute values must be escaped: a quote in the caption or path
        # would otherwise terminate the attribute and corrupt the tag.
        src_attr = html.escape(str(asset.asset_path), quote=True)
        alt_attr = html.escape(caption, quote=True)
        return '\n'.join([
            '<figure class="atomic-block">',
            f'<img src="{src_attr}" alt="{alt_attr}">',
            f'<figcaption>{caption_text}</figcaption>',
            '</figure>',
        ])

    def convert_list(self, md_list: str) -> str:
        """Convert consecutive Markdown list lines to one ``<ul>`` or ``<ol>``.

        The list type is decided by the first item; later items of the other
        kind are appended to the same list. Lines that are not list items
        are ignored.
        """
        html_lines = []
        in_list = False
        list_type = 'ul'
        for raw in md_list.strip().split('\n'):
            line = raw.strip()
            if not line:
                continue
            ul_match = re.match(r'^[\*\-]\s+(.+)$', line)
            ol_match = re.match(r'^(\d+)\.\s+(.+)$', line)
            if ul_match:
                kind, content = 'ul', ul_match.group(1)
            elif ol_match:
                kind, content = 'ol', ol_match.group(2)
            else:
                continue
            if not in_list:
                html_lines.append(f'<{kind}>')
                in_list = True
                list_type = kind
            html_lines.append(f'<li>{self._bold(content)}</li>')
        if in_list:
            html_lines.append(f'</{list_type}>')
        return '\n'.join(html_lines)

    def convert_paragraph(self, text: str) -> str:
        """Wrap ``text`` in ``<p>`` after applying bold, italic and code marks.

        Returns "" for blank input.
        """
        if not text.strip():
            return ""
        text = self._bold(text)
        text = re.sub(r'\*(.+?)\*', r'<em>\1</em>', text)
        text = re.sub(r'`(.+?)`', r'<code>\1</code>', text)
        return f'<p>{text}</p>'

    def _collect_table(self, lines: List[str], start: int) -> Tuple[str, int]:
        """Gather table lines from ``start``; return (table source, next index)."""
        end = start
        while end < len(lines):
            candidate = lines[end].strip()
            if not (candidate.startswith('|') or self._TABLE_RULE_RE.match(candidate)):
                break
            end += 1
        return '\n'.join(lines[start:end]), end

    def convert_full_content(self, md_content: str) -> str:
        """Convert a whole Markdown document to HTML, one block per element.

        Heading levels are shifted up by one (``##`` -> ``<h1>``,
        ``###`` -> ``<h2>``, ``####`` -> ``<h3>``); ``#`` also maps to
        ``<h1>``. A leading number on a ``#`` / ``##`` heading sets the
        chapter used for figure numbering. Lines of the form ``*( ... )*``
        are skipped.

        A ``**[표 ...]**`` caption is attached to the table that follows it,
        blank lines in between allowed. When no table follows, or the line
        is not a well-formed caption, it is rendered as a normal paragraph
        rather than being discarded.
        """
        lines = md_content.split('\n')
        total = len(lines)
        html_parts = []
        current_chapter = "1"
        i = 0
        while i < total:
            line = lines[i].strip()
            if not line:
                i += 1
                continue

            # H1 (# title): report title or section start
            h1_match = re.match(r'^#\s+(.+)$', line)
            if h1_match:
                title = h1_match.group(1)
                num_match = re.match(r'^(\d+(?:\.\d+)*)\s+', title)
                if num_match:
                    current_chapter = num_match.group(1).split('.')[0]
                html_parts.append(f'<h1>{title}</h1>')
                i += 1
                continue

            # H2 (## chapter) is emitted as <h1>, which triggers a page break
            h2_match = re.match(r'^##\s+(.+)$', line)
            if h2_match:
                title = h2_match.group(1)
                num_match = re.match(r'^(\d+)\s+', title)
                if num_match:
                    current_chapter = num_match.group(1)
                html_parts.append(f'<h1>{title}</h1>')
                i += 1
                continue

            # H3 (### section)
            h3_match = re.match(r'^###\s+(.+)$', line)
            if h3_match:
                html_parts.append(f'<h2>{h3_match.group(1)}</h2>')
                i += 1
                continue

            # H4 (#### subsection)
            h4_match = re.match(r'^####\s+(.+)$', line)
            if h4_match:
                html_parts.append(f'<h3>{h4_match.group(1)}</h3>')
                i += 1
                continue

            # Image placeholder {{IMG:xxx}}
            if re.match(r'^\{\{IMG:(.+?)\}\}$', line):
                html_parts.append(self.convert_image_placeholder(line, current_chapter))
                i += 1
                continue

            # Image note *(...)*: skipped, the figure already has a figcaption
            if line.startswith('*(') and line.endswith(')*'):
                i += 1
                continue

            # Table without caption
            if line.startswith('|'):
                table_md, i = self._collect_table(lines, i)
                html_parts.append(self.convert_table(table_md, "", current_chapter))
                continue

            # Table caption followed (possibly after blank lines) by a table
            caption_match = self._TABLE_CAPTION_RE.match(line)
            if caption_match:
                j = i + 1
                while j < total and not lines[j].strip():
                    j += 1
                if j < total and lines[j].strip().startswith('|'):
                    table_md, i = self._collect_table(lines, j)
                    html_parts.append(
                        self.convert_table(table_md, caption_match.group(1), current_chapter)
                    )
                    continue
                # No table follows: fall through so the text is kept as a paragraph.

            # List (*, - or "1." markers)
            if self._LIST_ITEM_RE.match(line):
                list_lines = [line]
                i += 1
                while i < total:
                    next_line = lines[i].strip()
                    if self._LIST_ITEM_RE.match(next_line):
                        list_lines.append(next_line)
                        i += 1
                    elif not next_line:
                        i += 1
                        break
                    else:
                        break
                html_parts.append(self.convert_list('\n'.join(list_lines)))
                continue

            # Plain paragraph: join lines until a blank line or another element
            para_lines = [line]
            i += 1
            while i < total:
                next_line = lines[i].strip()
                if (not next_line
                        or next_line.startswith(('#', '|', '**[표', '{{IMG:', '*('))
                        or self._LIST_ITEM_RE.match(next_line)):
                    break
                para_lines.append(next_line)
                i += 1
            para_text = ' '.join(para_lines)
            if para_text:
                html_parts.append(self.convert_paragraph(para_text))
        return '\n'.join(html_parts)
# ===== HTML 템플릿 =====
def get_html_template() -> str:
    """Return the A4 report HTML template as a ``str.format`` template.

    Placeholders filled by the caller: ``{report_title}``, ``{box_cover}``,
    ``{box_toc}``, ``{box_summary}`` and ``{box_content}``. Every literal
    brace of the embedded CSS and JavaScript is doubled (``{{`` / ``}}``) so
    that ``str.format`` leaves it intact.

    The four boxes are placed in a hidden ``#raw-container``. On window load
    the embedded script creates the cover sheet, flows the TOC, summary and
    body nodes into ``.sheet`` pages cloned from ``#page-template``, and
    finally removes the raw container.
    """
    return '''<!DOCTYPE html>
<html lang="ko">
<head>
<meta charset="UTF-8">
<title>{report_title}</title>
<style>
@import url('https://fonts.googleapis.com/css2?family=Noto+Sans+KR:wght@300;400;500;700;900&display=swap');
:root {{
--primary: #006400;
--accent: #228B22;
--light-green: #E8F5E9;
--bg: #525659;
}}
body {{ margin: 0; background: var(--bg); font-family: 'Noto Sans KR', sans-serif; }}
/* [A4 용지 규격] */
.sheet {{
width: 210mm; height: 297mm;
background: white; margin: 20px auto;
position: relative; overflow: hidden; box-sizing: border-box;
box-shadow: 0 0 15px rgba(0,0,0,0.1);
}}
@media print {{
.sheet {{ margin: 0; break-after: page; box-shadow: none; }}
body {{ background: white; }}
}}
/* [헤더/푸터] */
.page-header {{
position: absolute; top: 10mm; left: 20mm; right: 20mm;
font-size: 9pt; color: #000000; font-weight: bold;
text-align: right; border-bottom: none !important; padding-bottom: 5px;
}}
.page-footer {{
position: absolute; bottom: 10mm; left: 20mm; right: 20mm;
display: flex; justify-content: space-between; align-items: flex-end;
font-size: 9pt; color: #555; border-top: 1px solid #eee; padding-top: 5px;
}}
/* [본문 영역] */
.body-content {{
position: absolute;
top: 20mm; left: 20mm; right: 20mm;
bottom: auto;
}}
/* [타이포그래피] */
h1, h2, h3 {{
white-space: nowrap; overflow: hidden; word-break: keep-all; color: var(--primary);
margin: 0; padding: 0;
}}
h1 {{
font-size: 20pt;
font-weight: 900;
color: var(--primary);
border-bottom: 2px solid var(--primary);
margin-bottom: 20px;
margin-top: 0;
}}
h2 {{
font-size: 18pt;
border-left: 5px solid var(--accent);
padding-left: 10px;
margin-top: 30px;
margin-bottom: 10px;
color: #03581dff;
}}
h3 {{ font-size: 14pt; margin-top: 20px; margin-bottom: 5px; color: var(--accent); font-weight: 700; }}
p, li {{ font-size: 12pt !important; line-height: 1.6 !important; text-align: justify; word-break: keep-all; margin-bottom: 5px; }}
/* [목차 스타일] */
.toc-item {{ line-height: 1.8; list-style: none; border-bottom: 1px dotted #eee; }}
.toc-lvl-1 {{
color: #006400;
font-weight: 900;
font-size: 13.5pt;
margin-top: 15px;
margin-bottom: 5px;
border-bottom: 2px solid #ccc;
list-style: none !important;
}}
.toc-lvl-2 {{ font-size: 10.5pt; color: #333; margin-left: 20px; font-weight: normal; list-style: none !important; }}
.toc-lvl-3 {{ font-size: 10.5pt; color: #666; margin-left: 40px; list-style: none !important; }}
.toc-group {{
margin-bottom: 12px;
break-inside: avoid;
page-break-inside: avoid;
}}
/* [표/이미지 스타일] */
table {{
width: 100%;
border-collapse: collapse;
margin: 15px 0;
font-size: 9.5pt;
table-layout: auto;
border-top: 2px solid var(--primary);
}}
th, td {{
border: 1px solid #ddd;
padding: 6px 5px;
text-align: center;
vertical-align: middle;
word-break: keep-all;
word-wrap: break-word;
}}
th {{
background: var(--light-green);
color: var(--primary);
font-weight: 900;
white-space: nowrap;
letter-spacing: -0.05em;
font-size: 9pt;
}}
/* [캡션 및 그림 스타일] */
figure {{ display: block; margin: 20px auto; text-align: center; width: 100%; }}
img, svg {{ max-width: 95% !important; height: auto !important; display: block; margin: 0 auto; border: 1px solid #eee; }}
figcaption {{
display: block; text-align: center; margin-top: 10px;
font-size: 9.5pt; color: #666; font-weight: 600;
}}
.atomic-block {{ break-inside: avoid; page-break-inside: avoid; }}
#raw-container {{ display: none; }}
/* [하이라이트 박스] */
.highlight-box {{
background-color: rgb(226, 236, 226);
border: 1px solid #2a2c2aff;
padding: 5px; margin: 1.5px 1.5px 2px 0px; border-radius: 3px;
color: #333;
}}
.highlight-box li, .highlight-box p {{
font-size: 11pt !important;
line-height: 1.2;
letter-spacing: -0.6px;
margin-bottom: 3px;
color: #1a1919ff;
}}
.highlight-box h3, .highlight-box strong, .highlight-box b {{
font-size: 12pt !important; color: rgba(2, 37, 2, 1) !important;
font-weight: bold; margin: 0; display: block; margin-bottom: 5px;
}}
/* [요약 페이지 스타일] */
.squeeze {{
line-height: 1.35 !important;
letter-spacing: -0.5px !important;
margin-bottom: 2px !important;
}}
.squeeze-title {{
margin-bottom: 5px !important;
padding-bottom: 2px !important;
}}
#box-summary p, #box-summary li {{
font-size: 10pt !important;
line-height: 1.45 !important;
letter-spacing: -0.04em !important;
margin-bottom: 3px !important;
text-align: justify;
}}
#box-summary h1 {{
margin-bottom: 10px !important;
padding-bottom: 5px !important;
}}
</style>
</head>
<body>
<div id="raw-container">
<div id="box-cover">{box_cover}</div>
<div id="box-toc">{box_toc}</div>
<div id="box-summary">{box_summary}</div>
<div id="box-content">{box_content}</div>
</div>
<template id="page-template">
<div class="sheet">
<div class="page-header"></div>
<div class="body-content"></div>
<div class="page-footer">
<span class="rpt-title"></span>
<span class="pg-num"></span>
</div>
</div>
</template>
<script>
window.addEventListener("load", async () => {{
await document.fonts.ready;
const CONFIG = {{ maxHeight: 970 }};
const rawContainer = document.getElementById('raw-container');
if (rawContainer) {{
rawContainer.innerHTML = rawContainer.innerHTML.replace(
/(<rect[^>]*?)\\s+y="[^"]*"\\s+([^>]*?y="[^"]*")/gi,
"$1 $2"
);
}}
const raw = {{
cover: document.getElementById('box-cover'),
toc: document.getElementById('box-toc'),
summary: document.getElementById('box-summary'),
content: document.getElementById('box-content')
}};
let globalPage = 1;
let reportTitle = raw.cover.querySelector('h1')?.innerText || "Report";
function cleanH1Text(text) {{
if (!text) return "";
const parts = text.split("-");
return parts[0].trim();
}}
function detox(node) {{
if (node.nodeType !== 1) return;
if (node.closest('svg')) return;
let cls = "";
if (node.hasAttribute('class')) {{
cls = node.getAttribute('class');
}}
if ( (cls.includes('bg-') || cls.includes('border-') || cls.includes('box')) &&
!cls.includes('title-box') &&
!cls.includes('toc-') &&
!cls.includes('cover-') &&
!cls.includes('highlight-box') ) {{
node.setAttribute('class', 'highlight-box atomic-block');
const internalHeads = node.querySelectorAll('h3, h4, strong, b');
internalHeads.forEach(head => {{
head.removeAttribute('style');
head.removeAttribute('class');
}});
node.removeAttribute('style');
cls = 'highlight-box atomic-block';
}}
if (node.hasAttribute('class')) {{
if (!cls.includes('toc-') &&
!cls.includes('cover-') &&
!cls.includes('highlight-') &&
!cls.includes('title-box') &&
!cls.includes('atomic-block')) {{
node.removeAttribute('class');
}}
}}
node.removeAttribute('style');
if (node.tagName === 'TABLE') node.border = "1";
if (node.tagName === 'FIGURE') {{
const internalTitles = node.querySelectorAll('h3, h4, .chart-title');
internalTitles.forEach(t => t.style.display = 'none');
}}
}}
function formatTOC(element) {{
const items = element.querySelectorAll('li');
items.forEach(li => {{
const text = li.innerText.trim();
const m1 = text.match(/^(\\d+)\\.\\s+(.+)$/);
const m2 = text.match(/^(\\d+\\.\\d+)\\s+(.+)$/);
const m3 = text.match(/^(\\d+\\.\\d+\\.\\d+)\\s+(.+)$/);
if (m3) li.classList.add('toc-lvl-3');
else if (m2) li.classList.add('toc-lvl-2');
else if (m1) li.classList.add('toc-lvl-1');
}});
}}
function getFlatNodes(element) {{
if(element.id === 'box-toc') {{
element.querySelectorAll('*').forEach(el => detox(el));
formatTOC(element);
const tocNodes = [];
let title = element.querySelector('h1');
if (!title) {{
title = document.createElement('h1');
title.innerText = "목차";
}}
tocNodes.push(title.cloneNode(true));
const allLis = element.querySelectorAll('li');
let currentGroup = null;
allLis.forEach(li => {{
const isLevel1 = li.classList.contains('toc-lvl-1');
if (isLevel1) {{
if (currentGroup) tocNodes.push(currentGroup);
currentGroup = document.createElement('div');
currentGroup.className = 'toc-group atomic-block';
const ulWrapper = document.createElement('ul');
ulWrapper.style.margin = "0";
ulWrapper.style.padding = "0";
currentGroup.appendChild(ulWrapper);
}}
if (!currentGroup) {{
currentGroup = document.createElement('div');
currentGroup.className = 'toc-group atomic-block';
const ulWrapper = document.createElement('ul');
ulWrapper.style.margin = "0";
ulWrapper.style.padding = "0";
currentGroup.appendChild(ulWrapper);
}}
currentGroup.querySelector('ul').appendChild(li.cloneNode(true));
}});
if (currentGroup) tocNodes.push(currentGroup);
return tocNodes;
}}
let nodes = [];
Array.from(element.children).forEach(child => {{
detox(child);
if (child.classList.contains('highlight-box')) {{
child.querySelectorAll('h3, h4, strong, b').forEach(head => {{
head.removeAttribute('style');
head.removeAttribute('class');
}});
nodes.push(child.cloneNode(true));
}}
else if(['DIV','SECTION','ARTICLE','MAIN'].includes(child.tagName)) {{
nodes = nodes.concat(getFlatNodes(child));
}}
else if (['UL','OL'].includes(child.tagName)) {{
Array.from(child.children).forEach((li, idx) => {{
detox(li);
const w = document.createElement(child.tagName);
w.style.margin="0"; w.style.paddingLeft="20px";
if(child.tagName==='OL') w.start=idx+1;
const cloneLi = li.cloneNode(true);
cloneLi.querySelectorAll('*').forEach(el => detox(el));
w.appendChild(cloneLi);
nodes.push(w);
}});
}} else {{
const clone = child.cloneNode(true);
detox(clone);
clone.querySelectorAll('*').forEach(el => detox(el));
nodes.push(clone);
}}
}});
return nodes;
}}
function renderFlow(sectionType, sourceNodes) {{
if (!sourceNodes.length) return;
let currentHeaderTitle = sectionType === 'toc' ? "목차" : (sectionType === 'summary' ? "요약" : reportTitle);
let page = createPage(sectionType, currentHeaderTitle);
let body = page.querySelector('.body-content');
let queue = [...sourceNodes];
while (queue.length > 0) {{
let node = queue.shift();
let clone = node.cloneNode(true);
let isH1 = clone.tagName === 'H1';
let isHeading = ['H2', 'H3'].includes(clone.tagName);
let isText = ['P', 'LI'].includes(clone.tagName) && !clone.classList.contains('atomic-block');
let isAtomic = ['TABLE', 'FIGURE', 'IMG', 'SVG'].includes(clone.tagName) ||
clone.querySelector('table, img, svg') ||
clone.classList.contains('atomic-block');
if (isH1 && clone.innerText.includes('-')) {{
clone.innerText = clone.innerText.split('-')[0].trim();
}}
if (isH1 && (sectionType === 'body' || sectionType === 'summary')) {{
currentHeaderTitle = clone.innerText;
if (body.children.length > 0) {{
page = createPage(sectionType, currentHeaderTitle);
body = page.querySelector('.body-content');
}} else {{
page.querySelector('.page-header').innerText = currentHeaderTitle;
}}
}}
if (isHeading) {{
const spaceLeft = CONFIG.maxHeight - body.scrollHeight;
if (spaceLeft < 90) {{
page = createPage(sectionType, currentHeaderTitle);
body = page.querySelector('.body-content');
}}
}}
body.appendChild(clone);
if (isText && clone.innerText.length > 10) {{
const originalHeight = clone.offsetHeight;
clone.style.letterSpacing = "-1.0px";
if (clone.offsetHeight < originalHeight) {{
clone.style.letterSpacing = "-0.8px";
}} else {{
clone.style.letterSpacing = "";
}}
}}
if (body.scrollHeight > CONFIG.maxHeight) {{
if (isText) {{
body.removeChild(clone);
let textContent = node.innerText;
let tempP = node.cloneNode(false);
tempP.innerText = "";
if (clone.style.letterSpacing) tempP.style.letterSpacing = clone.style.letterSpacing;
body.appendChild(tempP);
const words = textContent.split(' ');
let currentText = "";
for (let i = 0; i < words.length; i++) {{
let word = words[i];
let prevText = currentText;
currentText += (currentText ? " " : "") + word;
tempP.innerText = currentText;
if (body.scrollHeight > CONFIG.maxHeight) {{
tempP.innerText = prevText;
tempP.style.textAlign = "justify";
tempP.style.textAlignLast = "justify";
let remainingText = words.slice(i).join(' ');
let remainingNode = node.cloneNode(false);
remainingNode.innerText = remainingText;
queue.unshift(remainingNode);
page = createPage(sectionType, currentHeaderTitle);
body = page.querySelector('.body-content');
body.style.lineHeight = "";
body.style.letterSpacing = "";
break;
}}
}}
}}
else {{
body.removeChild(clone);
let spaceLeft = CONFIG.maxHeight - body.scrollHeight;
if (body.children.length > 0 && spaceLeft > 50 && queue.length > 0) {{
while(queue.length > 0) {{
let candidate = queue[0];
if (['H1','H2','H3'].includes(candidate.tagName) ||
candidate.classList.contains('atomic-block') ||
candidate.querySelector('img, table')) break;
let filler = candidate.cloneNode(true);
if(['P','LI'].includes(filler.tagName) && filler.innerText.length > 10) {{
filler.style.letterSpacing = "-1.0px";
}}
body.appendChild(filler);
if (body.scrollHeight <= CONFIG.maxHeight) {{
if(filler.style.letterSpacing === "-1.0px") filler.style.letterSpacing = "-0.8px";
queue.shift();
}} else {{
body.removeChild(filler);
break;
}}
}}
}}
if (body.children.length > 0) {{
page = createPage(sectionType, currentHeaderTitle);
body = page.querySelector('.body-content');
}}
body.appendChild(clone);
if (isAtomic && body.scrollHeight > CONFIG.maxHeight) {{
const currentH = clone.offsetHeight;
const overflow = body.scrollHeight - CONFIG.maxHeight;
body.removeChild(clone);
if (overflow > 0 && overflow < (currentH * 0.15)) {{
clone.style.transform = "scale(0.85)";
clone.style.transformOrigin = "top center";
clone.style.marginBottom = `-${{currentH * 0.15}}px`;
body.appendChild(clone);
}} else {{
body.appendChild(clone);
}}
}}
}}
}}
}}
}}
function createPage(type, headerTitle) {{
const tpl = document.getElementById('page-template');
const clone = tpl.content.cloneNode(true);
const sheet = clone.querySelector('.sheet');
if (type === 'cover') {{
sheet.innerHTML = "";
const title = raw.cover.querySelector('h1')?.innerText || "Report";
const sub = raw.cover.querySelector('h2')?.innerText || "";
const pTags = raw.cover.querySelectorAll('p');
const infos = pTags.length > 0 ? Array.from(pTags).map(p => p.innerText).join(" / ") : "";
sheet.innerHTML = `
<div style="position:absolute; top:20mm; right:20mm; text-align:right; font-size:11pt; color:#666;">${{infos}}</div>
<div style="display:flex; flex-direction:column; justify-content:center; align-items:center; height:100%; text-align:center; width:100%;">
<div style="width:85%;">
<div style="font-size:32pt; font-weight:900; color:var(--primary); line-height:1.2; margin-bottom:30px; word-break:keep-all;">${{title}}</div>
<div style="font-size:20pt; font-weight:300; color:#444; word-break:keep-all;">${{sub}}</div>
</div>
</div>`;
}} else {{
clone.querySelector('.page-header').innerText = headerTitle;
clone.querySelector('.rpt-title').innerText = reportTitle;
if (type !== 'toc') clone.querySelector('.pg-num').innerText = `- ${{globalPage++}} -`;
else clone.querySelector('.pg-num').innerText = "";
}}
document.body.appendChild(sheet);
return sheet;
}}
createPage('cover');
if(raw.toc && raw.toc.innerHTML.trim()) renderFlow('toc', getFlatNodes(raw.toc));
const summaryNodes = getFlatNodes(raw.summary);
const tempBox = document.createElement('div');
tempBox.style.width = "210mm";
tempBox.style.position = "absolute";
tempBox.style.visibility = "hidden";
tempBox.id = 'box-summary';
document.body.appendChild(tempBox);
summaryNodes.forEach(node => tempBox.appendChild(node.cloneNode(true)));
const totalHeight = tempBox.scrollHeight;
const pageHeight = CONFIG.maxHeight;
const lastPart = totalHeight % pageHeight;
if (totalHeight > pageHeight && lastPart > 0 && lastPart < 180) {{
summaryNodes.forEach(node => {{
if(node.nodeType === 1) {{
node.classList.add('squeeze');
if(node.tagName === 'H1') node.classList.add('squeeze-title');
if(node.tagName === 'P' || node.tagName === 'LI') {{
node.style.fontSize = "9.5pt";
node.style.lineHeight = "1.4";
node.style.letterSpacing = "-0.8px";
}}
}}
}});
}}
document.body.removeChild(tempBox);
if(summaryNodes.length > 0) renderFlow('summary', summaryNodes);
renderFlow('body', getFlatNodes(raw.content));
document.querySelectorAll('.sheet h1, .sheet h2').forEach(el => {{
let fs = 100;
while(el.scrollWidth > el.clientWidth && fs > 50) {{ el.style.fontSize = (--fs)+"%"; }}
}});
const allTextNodes = document.querySelectorAll('.sheet .body-content p, .sheet .body-content li');
allTextNodes.forEach(el => {{
if (el.closest('table') || el.closest('figure') || el.closest('.chart')) return;
if (el.innerText.trim().length < 10) return;
const originH = el.offsetHeight;
const originSpacing = el.style.letterSpacing;
el.style.fontSize = "12pt";
el.style.letterSpacing = "-1.4px";
const newH = el.offsetHeight;
if (newH < originH) {{
el.style.letterSpacing = "-1.0px";
}} else {{
el.style.letterSpacing = originSpacing;
}}
}});
document.querySelectorAll('.sheet h1, .sheet h2').forEach(el => {{
let fs = 100;
while(el.scrollWidth > el.clientWidth && fs > 50) {{ el.style.fontSize = (--fs)+"%"; }}
}});
const pages = document.querySelectorAll('.sheet');
if (pages.length >= 2) {{
const lastSheet = pages[pages.length - 1];
const prevSheet = pages[pages.length - 2];
if(lastSheet.querySelector('.rpt-title')) {{
const lastBody = lastSheet.querySelector('.body-content');
const prevBody = prevSheet.querySelector('.body-content');
if (lastBody.scrollHeight < 150 && lastBody.innerText.trim().length > 0) {{
prevBody.style.lineHeight = "1.3";
prevBody.style.paddingBottom = "0px";
const contentToMove = Array.from(lastBody.children);
contentToMove.forEach(child => prevBody.appendChild(child.cloneNode(true)));
if (prevBody.scrollHeight <= CONFIG.maxHeight + 5) {{
lastSheet.remove();
}} else {{
for(let i=0; i<contentToMove.length; i++) prevBody.lastElementChild.remove();
prevBody.style.lineHeight = "";
}}
}}
}}
}}
const rawContainerFinal = document.getElementById('raw-container');
if(rawContainerFinal) rawContainerFinal.remove();
}});
</script>
</body>
</html>'''
# ===== 메인 함수 =====
def generate_report_html(
    md_path: Path,
    json_path: Path,
    output_path: Path,
    include_toc: bool = True,
    include_summary: bool = True,
    cover_info: Optional[Dict[str, str]] = None
):
    """Convert the Markdown draft and sections JSON into the A4 HTML report.

    Args:
        md_path: path of report_draft.md.
        json_path: path of report_sections.json.
        output_path: path of the report.html to write; missing parent
            directories are created.
        include_toc: include the table-of-contents pages.
        include_summary: include a summary page, taken from the first section
            whose title contains '요약' or 'summary'.
        cover_info: optional cover values under the keys ``date``,
            ``author`` and ``department``; the date defaults to today.

    Returns:
        ``output_path``.

    Raises:
        FileNotFoundError: if the Markdown or JSON input does not exist.
    """
    log("=== Step 9: MD → HTML 변환 시작 ===")

    # 1. Load inputs
    log(f"JSON 로드: {json_path}")
    report_title, sections = load_json_meta(json_path)
    log(f"MD 로드: {md_path}")
    md_content = load_markdown(md_path)
    log(f"보고서 제목: {report_title}")
    log(f"섹션 수: {len(sections)}")

    # 2. Image lookup
    image_map = build_image_map(sections)
    log(f"이미지 자산 수: {len(image_map)}")

    # 3. Table of contents
    toc_items = extract_toc_from_md(md_content)
    log(f"목차 항목 수: {len(toc_items)}")

    # 4. Markdown -> HTML
    converter = MarkdownToHtmlConverter(image_map)
    content_html = converter.convert_full_content(md_content)

    # 5. Per-box content
    # box-cover
    cover_info = cover_info or {}
    cover_date = cover_info.get('date', datetime.now().strftime('%Y.%m.%d'))
    cover_author = cover_info.get('author', '')
    cover_dept = cover_info.get('department', '')

    # Split only at the first ':' so a subtitle that itself contains a colon
    # is kept whole.
    main_title, _, sub_title = report_title.partition(':')
    main_title = main_title.strip()
    sub_title = sub_title.strip()

    def esc(text: str) -> str:
        # Title and cover values are plain text, so '&' and '<' must not be
        # interpreted as markup.
        return html.escape(text, quote=False)

    box_cover = '\n'.join([
        '',
        f'<h1>{esc(main_title)}</h1>',
        f'<h2>{esc(sub_title)}</h2>',
        f'<p>{esc(cover_date)}</p>',
        f'<p>{esc(cover_author)}</p>' if cover_author else '',
        f'<p>{esc(cover_dept)}</p>' if cover_dept else '',
        '',
    ])

    # box-toc
    box_toc = ""
    if include_toc and toc_items:
        box_toc = generate_toc_html(toc_items)
        log("목차 HTML 생성 완료")

    # box-summary: first section that looks like a summary, otherwise empty
    box_summary = ""
    if include_summary:
        for sec in sections:
            if '요약' in sec.section_title or 'summary' in sec.section_title.lower():
                summary_converter = MarkdownToHtmlConverter(image_map)
                box_summary = f"<h1>요약</h1>\n{summary_converter.convert_full_content(sec.generated_text)}"
                break

    # box-content
    box_content = content_html

    # 6. Fill the template
    template = get_html_template()
    html_output = template.format(
        report_title=esc(report_title),
        box_cover=box_cover,
        box_toc=box_toc,
        box_summary=box_summary,
        box_content=box_content
    )

    # 7. Write the file; a custom output location may not exist yet
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(html_output, encoding='utf-8')

    log("")
    log("═══════════════════════════════════════════════════")
    log("HTML 보고서 생성 완료!")
    log(f" 출력 파일: {output_path}")
    log(f" 파일 크기: {output_path.stat().st_size / 1024:.1f} KB")
    log("═══════════════════════════════════════════════════")
    log("=== Step 9 종료 ===")
    return output_path
def main():
    """Parse command-line options and run the HTML report conversion."""
    cli = argparse.ArgumentParser(
        description='MD + JSON → A4 HTML 보고서 변환',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='''
예시:
python 9_md_to_html_publisher.py
python 9_md_to_html_publisher.py --md report_draft.md --json report_sections.json
python 9_md_to_html_publisher.py --no-toc --no-summary
python 9_md_to_html_publisher.py --cover-date "2026.01.15" --cover-author "홍길동"
'''
    )
    # Input / output locations
    cli.add_argument('--md', type=Path, default=DEFAULT_MD_PATH,
                     help='입력 마크다운 파일 경로')
    cli.add_argument('--json', type=Path, default=DEFAULT_JSON_PATH,
                     help='입력 JSON 파일 경로')
    cli.add_argument('--output', '-o', type=Path, default=DEFAULT_OUTPUT_PATH,
                     help='출력 HTML 파일 경로')
    # Optional pages
    cli.add_argument('--no-toc', action='store_true',
                     help='목차 페이지 제외')
    cli.add_argument('--no-summary', action='store_true',
                     help='요약 페이지 제외')
    # Cover fields
    cli.add_argument('--cover-date', type=str, default=None,
                     help='표지 날짜 (예: 2026.01.15)')
    cli.add_argument('--cover-author', type=str, default=None,
                     help='표지 작성자')
    cli.add_argument('--cover-dept', type=str, default=None,
                     help='표지 부서명')
    options = cli.parse_args()

    # Only the cover options that were actually supplied are forwarded.
    supplied = (
        ('date', options.cover_date),
        ('author', options.cover_author),
        ('department', options.cover_dept),
    )
    cover_info = {key: value for key, value in supplied if value}

    generate_report_html(
        md_path=options.md,
        json_path=options.json,
        output_path=options.output,
        include_toc=not options.no_toc,
        include_summary=not options.no_summary,
        cover_info=cover_info or None
    )
# Run the CLI only when the file is executed as a script, not when imported.
if __name__ == "__main__":
    main()