"""Rebuild pipeline run artefacts (topics, structure, plan) from a raw MDX file.

NOTE(review): the copy of this module that was reviewed had lost its line
structure and every markup-like fragment inside string literals (for example
a pattern that now reads ``]*>``). Statement nesting and the HTML/MDX tag
patterns below were reconstructed from what survived; each reconstructed
pattern carries its own NOTE(review) and must be confirmed against the
repository version before merging.
"""
from __future__ import annotations

import json
import re
from pathlib import Path
from typing import Any

# Ordered keyword rules used by _classify: the first rule with any token
# contained in the whitespace-free, lower-cased title wins.
_TITLE_RULES: tuple[tuple[tuple[str, ...], tuple[str, str, str]], ...] = (
    (('혼용', '실태', '현실'), ('problem', 'flow', 'intro')),
    (('정의', '개념', '용어'), ('definition', 'flow', 'core')),
    (('상호관계', '관계', '위치'), ('hierarchy', 'flow', 'core')),
    (('구분', '비교'), ('comparison', 'reference', 'supporting')),
    (('사례', '근거', '대표'), ('evidence', 'reference', 'supporting')),
    (('궁극적목표', '시행목표', '목표'), ('goal', 'flow', 'core')),
    (('기대효과', '주체별', '효과'), ('stakeholder_effect', 'flow', 'core')),
    (('필수요건', '요건'), ('requirements', 'flow', 'core')),
)

# Content family -> layout template code written into stage-1a.
_LAYOUT_TEMPLATES: dict[str, str] = {
    'type-a-compare-define-relate': 'A',
    'type-b-goal-effect': 'B_GOAL',
    'type-b-requirements-process-product': 'B_RPP',
}


def _read_text(path: Path) -> str:
    """Read *path* as UTF-8, dropping a leading byte-order mark if present."""
    return path.read_text(encoding='utf-8-sig')


def _write_json(path: Path, data: dict[str, Any] | list[Any]) -> None:
    """Write *data* to *path* as indented UTF-8 JSON without ASCII escaping."""
    path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding='utf-8')


def _write_text(path: Path, text: str) -> None:
    """Write *text* to *path* as UTF-8."""
    path.write_text(text, encoding='utf-8')


def _normalize_space(text: str) -> str:
    """Collapse every whitespace run to one space and trim; falsy input gives ''."""
    return re.sub(r'\s+', ' ', text or '').strip()


def _compact(text: str, max_len: int) -> str:
    """Return whitespace-normalized *text*, shortened to about *max_len* characters.

    Text that already fits is returned unchanged. Otherwise it is cut at the
    last space inside the first *max_len* characters (or hard-cut when that
    leaves nothing), trailing ``' ,.;:'`` characters are removed and ``'...'``
    is appended, so the result can be up to three characters over *max_len*.
    """
    normalized = _normalize_space(text)
    if len(normalized) <= max_len:
        return normalized
    head = normalized[:max_len]
    cut = head.rsplit(' ', 1)[0].strip()
    return (cut or head).rstrip(' ,.;:') + '...'


def _preserve_len(text: str, ratio: float = 0.85, floor: int = 180, ceiling: int = 900) -> int:
    """Return a summary length budget: ``ratio`` of the normalized length,
    clamped to ``[floor, ceiling]``; empty text yields ``floor``."""
    normalized = _normalize_space(text)
    if not normalized:
        return floor
    return max(floor, min(ceiling, int(len(normalized) * ratio)))


def _normalize_title_key(text: str) -> str:
    """Return *text* lower-cased with all whitespace removed, for keyword matching."""
    return re.sub(r'\s+', '', text or '').lower()


def _strip_frontmatter_and_imports(raw: str) -> str:
    """Remove a leading ``---`` front matter block and ``import ...`` lines.

    CRLF line endings are converted to LF first. The result is stripped.
    """
    text = raw.replace('\r\n', '\n')
    if text.startswith('---\n'):
        # Search from offset 3 (the newline that ends the opening fence) so an
        # empty front matter block ("---\n---") is recognised as well.
        end = text.find('\n---', 3)
        if end != -1:
            text = text[end + 4:]
    text = re.sub(r'^import\s+.+?$', '', text, flags=re.M)
    return text.strip()


def _dx_effect_lines(repo_root: Path) -> list[str]:
    """Return up to 24 distinct text lines taken from ``components/dx.astro``.

    Returns an empty list when the component file does not exist. Lines that
    are empty, shorter than six characters, or start with ``/*`` or ``[`` are
    dropped; the first occurrence of each remaining line is kept, in order.
    """
    path = repo_root / 'components' / 'dx.astro'
    if not path.exists():
        return []
    text = _read_text(path)
    # NOTE(review): the five markup patterns below were unreadable in the
    # reviewed copy (they appeared as r'', a bare line break, r']*>', r']*>'
    # and r''). They are reconstructed as: drop comment/style/script spans,
    # turn <br> into a space, break lines around block tags, and render <li>
    # as a "- " bullet. Confirm each against the repository version.
    text = re.sub(r'<!--.*?-->|<(style|script)\b.*?</\1>', '', text, flags=re.S)
    text = re.sub(r'<br\s*/?>', ' ', text)
    text = re.sub(r'<(?:h[1-6]|p|div)[^>]*>', '\n', text)
    text = re.sub(r'<li[^>]*>', '- ', text)
    text = re.sub(r'</(?:h[1-6]|p|div|li)>', '\n', text)
    # Any tag still present is replaced by a space.
    text = re.sub(r'<[^>]+>', ' ', text)

    candidates = (_normalize_space(raw_line) for raw_line in text.splitlines())
    kept = [
        line
        for line in candidates
        if line and not line.startswith(('/*', '[')) and len(line) >= 6
    ]
    # dict.fromkeys keeps first-seen order, giving an order-preserving
    # de-duplication without a quadratic "not in list" scan.
    return list(dict.fromkeys(kept))[:24]


def _normalize_block_for_storage(text: str, repo_root: Path) -> str:
    """Flatten an MDX block into plain markdown-like text for storage.

    Converts ``:::note[Title]`` openers to bold titles, removes remaining
    ``:::`` fences, rewrites images ``![alt](url)`` as ``[image] alt``,
    collapses three or more newlines into two, and strips the result.
    """
    # NOTE(review): the component-substitution statement was unparseable in
    # the reviewed copy (only "if '" ... "', replacement, text)" survived).
    # Reconstructed as: when the block embeds the dx component, replace the
    # tag with the component's text lines as bullets. The tag name, its case
    # and the bullet format are assumptions - confirm before merging.
    if re.search(r'<dx\b', text, flags=re.I):
        replacement = '\n'.join(f'- {line}' for line in _dx_effect_lines(repo_root))
        # A callable replacement keeps backslashes in the component text from
        # being interpreted as regex replacement escapes.
        text = re.sub(r'<dx\b[^>]*/?>', lambda _m: replacement, text, flags=re.I)

    def _bold(match: re.Match[str]) -> str:
        inner = re.sub(r'<[^>]+>', ' ', match.group(1)).strip()
        return f'**{inner}**'

    # NOTE(review): tag names in the next four statements are reconstructed
    # (reviewed copy showed r']*>(.*?)', two bare line breaks, r'' and
    # r']*>'); assumed to be <summary>, <details>, <br> and <details ...>.
    text = re.sub(r'<summary[^>]*>(.*?)</summary>', _bold, text, flags=re.S)
    text = text.replace('<details>', '').replace('</details>', '')
    text = re.sub(r'<br\s*/?>', '\n', text, flags=re.I)
    text = re.sub(r'<details[^>]*>', '', text)

    text = re.sub(r':::\s*note\[(.*?)\]', r'**\1**', text)
    text = text.replace(':::', '')
    text = re.sub(r'!\[([^\]]+)\]\(([^\)]+)\)', r'[image] \1', text)
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text.strip()


def _extract_title_from_intro(block: str) -> str:
    """Return the first ``* **bold**`` bullet title in *block*, else ``'서론'``."""
    match = re.search(r'\*\s+\*\*(.+?)\*\*', block)
    if match:
        return match.group(1).strip()
    return '서론'


def _heading_chunks(text: str, marker: str) -> list[tuple[str, str]]:
    """Split *text* at lines starting with *marker* plus whitespace.

    Returns ``(title, body)`` pairs; each body runs to the next matching
    heading or the end of *text* and is stripped.
    """
    pattern = rf'^{re.escape(marker)}\s+(.+)$'
    matches = list(re.finditer(pattern, text, flags=re.M))
    chunks: list[tuple[str, str]] = []
    for idx, match in enumerate(matches):
        end = matches[idx + 1].start() if idx + 1 < len(matches) else len(text)
        chunks.append((match.group(1).strip(), text[match.end():end].strip()))
    return chunks


def _section_chunks(text: str) -> list[tuple[str, str]]:
    """Return ``(title, body)`` pairs for each ``## `` heading in *text*."""
    return _heading_chunks(text, '##')


def _subsection_chunks(text: str) -> list[tuple[str, str]]:
    """Return ``(title, body)`` pairs for each ``### `` heading in *text*."""
    return _heading_chunks(text, '###')


def _classify(title: str, layer_hint: str = 'core') -> tuple[str, str, str]:
    """Map a heading title to ``(relation_type, role, layer)`` by keyword.

    Rules are tried in a fixed order and the first match wins. When nothing
    matches, ``layer_hint == 'supporting'`` yields a supporting reference
    topic and anything else yields a generic core section.
    """
    clean = title.strip()
    key = _normalize_title_key(clean)
    for tokens, result in _TITLE_RULES:
        if any(token in key for token in tokens):
            return result
    # These two rules test the Korean keyword against the trimmed title (with
    # its internal whitespace) rather than the whitespace-free key.
    if 'process' in key or '과정' in clean:
        return 'process', 'flow', 'core'
    if 'product' in key or '결과' in clean:
        return 'product', 'flow', 'core'
    if any(token in key for token in ('핵심요약', '요약', '결론')):
        return 'conclusion', 'flow', 'conclusion'
    # NOTE(review): callers pass layer_hint='intro', but only 'supporting' is
    # honoured here, so an unmatched intro title ends up in the 'core' layer.
    # Confirm whether that is intended.
    if layer_hint == 'supporting':
        return 'support', 'reference', 'supporting'
    return 'section', 'flow', 'core'


def _make_topic(
    topic_id: int,
    title: str,
    purpose: str,
    role: str,
    layer: str,
    relation_type: str,
    source: str,
    *,
    floor: int,
    ceiling: int,
    popup_candidate: bool = False,
) -> dict[str, Any]:
    """Build one topic record; key order is fixed because it is written to JSON."""
    return {
        'id': topic_id,
        'title': title,
        'purpose': purpose,
        'role': role,
        'layer': layer,
        'relation_type': relation_type,
        'source_hint': title,
        'summary': _compact(source, _preserve_len(source, floor=floor, ceiling=ceiling)),
        'source_data': source,
        'structured_text': source,
        'popup_candidate': popup_candidate,
    }


def _extract_detail_topics(
    block: str, start_id: int, repo_root: Path
) -> tuple[list[dict[str, Any]], str, int]:
    """Lift collapsible detail blocks out of *block* into supporting topics.

    Returns ``(topics, stripped_block, next_id)``. Each detail block is
    replaced in the text by a ``* **summary**`` bullet; a topic is created
    (and an id consumed, starting at *start_id*) only when the detail body is
    non-empty after normalization.
    """
    topics: list[dict[str, Any]] = []
    next_id = start_id

    def repl(match: re.Match[str]) -> str:
        nonlocal next_id
        inner = match.group(1)
        # NOTE(review): the <summary>/<details> tag names are reconstructed;
        # the reviewed copy only showed r']*>(.*?)' and r']*>.*?'.
        summary_match = re.search(r'<summary[^>]*>(.*?)</summary>', inner, flags=re.S)
        if summary_match:
            summary = re.sub(r'<[^>]+>', ' ', summary_match.group(1)).strip()
        else:
            summary = '상세 내용'
        detail_body = re.sub(r'<summary[^>]*>.*?</summary>', '', inner, flags=re.S)
        detail_source = _normalize_block_for_storage(detail_body, repo_root)
        if detail_source:
            topics.append(
                _make_topic(
                    next_id,
                    summary,
                    '상세 근거 또는 부연 설명',
                    'reference',
                    'supporting',
                    'evidence',
                    detail_source,
                    floor=220,
                    ceiling=560,
                    popup_candidate=True,
                )
            )
            next_id += 1
        return f'\n* **{summary}**\n'

    stripped = re.sub(r'<details>(.*?)</details>', repl, block, flags=re.S)
    return topics, stripped, next_id


def _extract_conclusion(text: str, repo_root: Path) -> tuple[str, str]:
    """Pull the first ``:::note[Title] ... :::`` block out of *text*.

    Returns ``(text_without_note, note_source)``; when no note exists the
    text is returned unchanged together with an empty string.
    """
    match = re.search(r':::\s*note\[(.*?)\](.*?):::', text, flags=re.S)
    if not match:
        return text, ''
    note_title = _normalize_space(match.group(1)) or '핵심 요약'
    note_body = _normalize_block_for_storage(match.group(2), repo_root)
    note_source = f'**{note_title}**\n{note_body}'.strip()
    stripped = text[: match.start()] + text[match.end():]
    return stripped.strip(), note_source


def _content_family(topics: list[dict[str, Any]]) -> str:
    """Pick the content family name from the relation types present in *topics*."""
    relation_types = {str(t.get('relation_type', '') or '') for t in topics}
    if relation_types & {'comparison', 'definition', 'hierarchy'} and 'goal' not in relation_types:
        return 'type-a-compare-define-relate'
    if relation_types & {'goal', 'stakeholder_effect'}:
        return 'type-b-goal-effect'
    if relation_types & {'requirements', 'product', 'process'}:
        return 'type-b-requirements-process-product'
    return 'type-b-section-stack'


def _popup_candidate(topic: dict[str, Any]) -> bool:
    """Return True for comparison/evidence topics or sources over 520 characters."""
    relation = str(topic.get('relation_type', '') or '')
    source = _normalize_space(str(topic.get('source_data', '') or ''))
    return relation in {'comparison', 'evidence'} or len(source) > 520


def extract_topics_from_raw(raw: str, repo_root: Path) -> tuple[str, list[dict[str, Any]], str]:
    """Parse raw MDX into ``(document_title, topics, content_family)``.

    The title comes from the first ``title:`` line (default ``'Document'``).
    Topics are produced for the text before the first ``##`` heading, for each
    section lead and ``###`` subsection, for extracted detail blocks, and for
    the first note block (as the conclusion). ``popup_candidate`` is
    recomputed for every topic at the end.
    """
    title_match = re.search(r'^title:\s*(.+)$', raw, flags=re.M)
    doc_title = title_match.group(1).strip() if title_match else 'Document'

    clean = _strip_frontmatter_and_imports(raw)
    clean, conclusion_source = _extract_conclusion(clean, repo_root)

    topics: list[dict[str, Any]] = []
    next_id = 1

    first_section = re.search(r'^##\s+', clean, flags=re.M)
    intro_block = clean[: first_section.start()].strip() if first_section else clean.strip()
    if intro_block:
        # Detail ids start one past next_id so the intro topic keeps next_id.
        detail_topics, intro_stripped, _ = _extract_detail_topics(
            intro_block, next_id + 1, repo_root
        )
        intro_source = _normalize_block_for_storage(intro_stripped, repo_root)
        if intro_source:
            title = _extract_title_from_intro(intro_source)
            relation, role, layer = _classify(title, 'intro')
            topics.append(
                _make_topic(
                    next_id, title, '문서 도입 또는 문제 제기', role, layer, relation,
                    intro_source, floor=260, ceiling=760,
                )
            )
            next_id += 1
        topics.extend(detail_topics)
    next_id = max([t['id'] for t in topics], default=0) + 1

    for section_title, section_body in _section_chunks(clean):
        # Detail topics take their ids first but are appended after the
        # section's own topics, so ids inside a section are not in list order.
        detail_topics, section_stripped, next_id = _extract_detail_topics(
            section_body, next_id, repo_root
        )
        subsections = _subsection_chunks(section_stripped)
        if subsections:
            lead = re.split(r'^###\s+.+$', section_stripped, maxsplit=1, flags=re.M)[0].strip()
        else:
            lead = section_stripped
        if lead:
            source = _normalize_block_for_storage(lead, repo_root)
            if source:
                relation, role, layer = _classify(section_title)
                topics.append(
                    _make_topic(
                        next_id, section_title, f'{section_title}의 핵심 내용', role, layer,
                        relation, source, floor=240, ceiling=780,
                    )
                )
                next_id += 1
        for sub_title, sub_body in subsections:
            source = _normalize_block_for_storage(sub_body, repo_root)
            if source:
                relation, role, layer = _classify(sub_title)
                topics.append(
                    _make_topic(
                        next_id, sub_title, f'{sub_title}의 세부 내용', role, layer,
                        relation, source, floor=220, ceiling=760,
                    )
                )
                next_id += 1
        topics.extend(detail_topics)
        next_id = max([t['id'] for t in topics], default=0) + 1

    if conclusion_source:
        topics.append(
            _make_topic(
                next_id, '핵심 요약', '결론 또는 핵심 메시지', 'flow', 'conclusion',
                'conclusion', conclusion_source, floor=140, ceiling=360,
            )
        )

    for topic in topics:
        topic['popup_candidate'] = _popup_candidate(topic)
    return doc_title, topics, _content_family(topics)


def _page_structure(topics: list[dict[str, Any]], family: str) -> dict[str, Any]:
    """Group topic ids into weighted page regions according to *family*.

    Regions are only emitted when they have at least one topic id.
    """

    def ids_for(layer: str) -> list[Any]:
        return [t['id'] for t in topics if t['layer'] == layer]

    intro_ids = ids_for('intro')
    core_ids = ids_for('core')
    support_ids = ids_for('supporting')
    conclusion_ids = ids_for('conclusion')

    structure: dict[str, Any] = {}
    if family == 'type-a-compare-define-relate':
        if intro_ids:
            structure['background'] = {'topic_ids': intro_ids, 'weight': 0.22}
        if core_ids:
            structure['body'] = {'topic_ids': core_ids, 'weight': 0.50}
        if support_ids:
            structure['support'] = {'topic_ids': support_ids, 'weight': 0.18}
    else:
        # Intro topics followed by every core topic exactly once. The earlier
        # "top + rest" construction listed the core id twice whenever there
        # was exactly one core topic.
        body_ids = intro_ids + core_ids
        if body_ids:
            structure['body'] = {
                'topic_ids': body_ids,
                'weight': 0.58 if support_ids else 0.64,
            }
        if support_ids:
            structure['support'] = {'topic_ids': list(support_ids), 'weight': 0.18}
    if conclusion_ids:
        structure['key_message'] = {'topic_ids': conclusion_ids, 'weight': 0.10}
    return structure


def rebuild_run_from_raw(repo_root: Path, run_dir: Path, input_file: Path) -> dict[str, Any]:
    """Regenerate the run directory artefacts from *input_file*.

    Creates ``01-input``, ``02-kei-interpretation``, ``03-structure`` and
    ``04-plan`` under *run_dir* (parents included) and writes the stage-1a /
    stage-1b JSON files, ``source-blocks.json`` and four markdown reports.

    Returns a dict with ``title``, ``topics`` and ``content_family``.
    """
    raw = _read_text(input_file)
    doc_title, topics, family = extract_topics_from_raw(raw, repo_root)

    # The core message is the conclusion topic when there is one, otherwise
    # the last topic, otherwise empty.
    fallback = topics[-1] if topics else {'source_data': ''}
    core_topic = next((t for t in topics if t['layer'] == 'conclusion'), fallback)

    stage1a = {
        'analysis': {
            'title': doc_title,
            'core_message': _normalize_space(str(core_topic.get('source_data', ''))),
            'total_pages': 1,
            'layout_template': _LAYOUT_TEMPLATES.get(family, 'B_STACK'),
            'content_family': family,
        },
        'page_structure': _page_structure(topics, family),
        'topics': topics,
    }

    popup_hint = (
        '원문 제목과 원문 bullet을 우선 유지한다. 긴 세부 설명이나 큰 표는 popup으로 이동하되, '
        '본문에는 핵심 bullet과 진입 요약을 남긴다.'
    )
    visible_hint = '원문 제목과 원문 bullet을 visible block으로 유지하고, 임의 재서술을 최소화한다.'
    stage1b = {
        'concepts': [
            {
                'topic_id': t['id'],
                'relation_type': t['relation_type'],
                'expression_hint': popup_hint if t.get('popup_candidate') else visible_hint,
                'summary': t['summary'],
            }
            for t in topics
        ]
    }

    input_dir = run_dir / '01-input'
    interp_dir = run_dir / '02-kei-interpretation'
    structure_dir = run_dir / '03-structure'
    plan_dir = run_dir / '04-plan'
    for directory in (input_dir, interp_dir, structure_dir, plan_dir):
        directory.mkdir(parents=True, exist_ok=True)

    _write_json(plan_dir / 'stage-1a-topics.json', stage1a)
    _write_json(plan_dir / 'stage-1b-refined-concepts.json', stage1b)
    _write_json(
        structure_dir / 'source-blocks.json',
        {
            'title': doc_title,
            'content_family': family,
            'blocks': [
                {
                    'id': t['id'],
                    'title': t['title'],
                    'layer': t['layer'],
                    'relation_type': t['relation_type'],
                    'popup_candidate': bool(t.get('popup_candidate')),
                    'source_data': t['source_data'],
                }
                for t in topics
            ],
        },
    )

    input_lines = [
        '# Input Review',
        '',
        f'- 입력 파일: {input_file.name}',
        f'- 문서 제목: {doc_title}',
        f'- content family 후보: {family}',
        '- 우선 목표: 원문 block과 원문 순서를 최대한 보존한다.',
        '- popup 전략: 큰 표, 긴 사례, 긴 근거는 popup 후보로 분리하고 본문에는 제목과 핵심 bullet을 남긴다.',
        '',
        '## 원문 블록 식별',
    ]
    for topic in topics:
        popup_mark = ' [popup]' if topic.get('popup_candidate') else ''
        preview = _compact(_normalize_space(topic['source_data']), 180)
        input_lines.append(
            f"- {topic['title']} ({topic['relation_type']}/{topic['layer']}){popup_mark}: {preview}"
        )
    _write_text(input_dir / 'input-review.md', '\n'.join(input_lines) + '\n')

    interp_lines = [
        '# Interpretation',
        '',
        f'- content family: {family}',
        '- 해석 원칙: 원문 제목/순서/표현을 우선 보존하고, 임의 재서술은 최소화한다.',
        '- grouping 원칙: 관계가 같은 block만 묶고, 내용이 길다고 해서 본문에서 제거하지 않는다.',
        '- popup 원칙: 상세는 popup으로 보내되 본문에는 핵심 bullet과 진입 문장을 남긴다.',
        '',
        '## Topic Classification',
    ]
    for topic in topics:
        popup_flag = str(bool(topic.get('popup_candidate'))).lower()
        interp_lines.append(
            f"- {topic['title']}: relation={topic['relation_type']} / "
            f"layer={topic['layer']} / popup_candidate={popup_flag}"
        )
    _write_text(interp_dir / 'kei-interpretation.md', '\n'.join(interp_lines) + '\n')

    structure_lines = [
        '# Content Structure',
        '',
        f'- content family: {family}',
        '- visible block 원칙: 각 섹션 제목과 핵심 bullet은 본문에 남긴다.',
        '- popup block 원칙: 큰 표, 긴 사례, 긴 상세 설명만 popup으로 보낸다.',
        '- 결론 원칙: note/결론 문장은 footer 또는 결론 배너에 직접 노출한다.',
        '',
        '## Ordered Blocks',
    ]
    for idx, topic in enumerate(topics, start=1):
        popup_mark = ' popup' if topic.get('popup_candidate') else ' visible'
        structure_lines.append(
            f"{idx}. {topic['title']} ({topic['relation_type']} / {topic['layer']} /{popup_mark})"
        )
    _write_text(structure_dir / 'content-structure.md', '\n'.join(structure_lines) + '\n')

    plan_lines = [
        '# Execution Plan',
        '',
        f'- content family: {family}',
        '- stage-1a/stage-1b는 raw MDX 기반 block 추출 결과를 그대로 사용한다.',
        '- Type A는 비교/정의/관계형으로, Type B는 본문 중심형으로 렌더한다.',
        '- popup 후보 block은 삭제하지 않고 popup overlay로 이동한다.',
        '- visible 영역에는 섹션 제목과 핵심 bullet을 남겨 원문 85% 보존 목표를 유지한다.',
    ]
    _write_text(plan_dir / 'execution-plan.md', '\n'.join(plan_lines) + '\n')

    return {'title': doc_title, 'topics': topics, 'content_family': family}