Upload converters/pipeline/step7_index.py

2026-03-19 09:13:25 +09:00
parent 5da3c7cae5
commit 1b182262bb
1 changed files with 312 additions and 0 deletions
--- a/03.Code/업로드용/converters/pipeline/step7_index.py
+++ b/03.Code/업로드용/converters/pipeline/step7_index.py
@@ -0,0 +1,312 @@
+# -*- coding: utf-8 -*-
+from dotenv import load_dotenv
+load_dotenv()
+
+"""
+make_outline.py
+
+기능:
+- output_context/context/domain_prompt.txt
+- output_context/context/corpus.txt
+를 바탕으로 보고서 목차를 생성합니다.
+1) outline_issue_report.txt 생성
+2) outline_issue_report.html 생성 (미리보기용)
+"""
+import os
+import sys
+import re
+from pathlib import Path
+from datetime import datetime
+from typing import List, Dict, Any, Tuple
+from openai import OpenAI
+
+# ===== OpenAI settings (structure kept as-is) =====
+# The API key is read from the process environment; load_dotenv() has already
+# been called above, presumably so a local .env file can supply it.
+OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
+GPT_MODEL      = "gpt-5-2025-08-07"
+# NOTE: the client object is created at import time, not lazily inside main().
+client = OpenAI(api_key=OPENAI_API_KEY)
+
+# ===== Regular expressions used to build the outline =====
+# "#keyword" tokens (a '#' followed by non-whitespace characters).
+RE_KEYWORDS = re.compile(r"(#\S+)")
+# Level-1 heading such as "1. Title": digits, a dot, whitespace, then the title.
+RE_L1 = re.compile(r"^\s*(\d+)\.\s+(.+?)\s*$")
+# Level-2 heading such as "1.1 Title".
+RE_L2 = re.compile(r"^\s*(\d+\.\d+)\s+(.+?)\s*$")
+# Level-3 heading such as "1.1.1 Title". Not referenced by the code visible in
+# this file; parse_outline() uses RE_L3_HEAD instead.
+RE_L3 = re.compile(r"^\s*(\d+\.\d+\.\d+)\s+(.+?)\s*$")
+
+def log(msg: str):
+    print(msg, flush=True)
+    with (LOG_DIR / "make_outline_log.txt").open("a", encoding="utf-8") as f:
+        f.write(msg + "\n")
+
+def load_domain_prompt() -> str:
+    p = CONTEXT_DIR / "domain_prompt.txt"
+    if not p.exists():
+        log("domain_prompt.txt가 없습니다. 먼저 domain_prompt.py를 실행하십시오.")
+        sys.exit(1)
+    return p.read_text(encoding="utf-8", errors="ignore").strip()
+
+def load_corpus() -> str:
+    p = CONTEXT_DIR / "corpus.txt"
+    if not p.exists():
+        log("corpus.txt가 없습니다. 먼저 make_corpus.py를 실행하십시오.")
+        sys.exit(1)
+    return p.read_text(encoding="utf-8", errors="ignore").strip()
+
+
+# Additional patterns for the guide lines that RE_L1 / RE_L2 do not match.
+# RE_L3_HEAD: a level-3 heading such as "1.1.1 Title" (number, whitespace, rest of line).
+RE_L3_HEAD = re.compile(r"^\s*(\d+\.\d+\.\d+)\s+(.+)$")
+# RE_L3_TOPIC: a bullet row "- topic | keywords | [format] | guide" ('-' or '*' bullet),
+# captured as four groups in that order.
+RE_L3_TOPIC = re.compile(r"^\s*[\-\*]\s+(.+?)\s*\|\s*(.+?)\s*\|\s*(\[.+?\])\s*\|\s*(.+)$")
+
+def generate_outline(domain_prompt: str, corpus: str, rag_chunks: str = "", doc_type: str = 'report', attach_pages: int = 1) -> str:
+    """
+    GPT를 호출하여 전체 보고서의 목차를 구성합니다.
+    - doc_type: 'report' (일반 보고서) 또는 'briefing' (브리핑 자료)
+    - attach_pages: 브리핑 자료 시 첨부 페이지 수
+    """
+    if doc_type == 'briefing':
+        sys_msg = {
+            "role": "system",
+            "content": (
+                domain_prompt + "\n\n"
+                "당신은 측량 및 지리정보 분야의 보고서 기획 전문가입니다. "
+                "제공된 정보를 분석하여, A4 1~2매 분량의 '핵심 브리핑 자료' 목차를 작성하세요. "
+                "본문은 '1. 개요 - 2. 현황 - 3. 문제점 및 대책 - 4. 결론' 형식을 기본으로 하되 내용에 맞춰 조정 가능합니다. "
+                "각 세부 주제별로 데이터 근거와 시각화 방안을 포함한 기획안을 도출하세요."
+            ),
+        }
+        attach_str = ""
+        for i in range(1, attach_pages + 1):
+            attach_str += f"""
+[첨부 {i}] 
+- 제목: 본문 내용을 보완하는 상세 데이터/참고 자료 (본문과 제목은 다르게)
+- 리드문: 본문에서 언급된 핵심 수치나 근거를 요약하여 제시
+- 세부 항목: [- 주제 | #키워드 | [시각화방안] | 내용 가이드] (3~4개 구성)
+"""
+        user_msg = {
+            "role": "user",
+            "content": f"""
+다음 정보를 바탕으로 브리핑 보고서 목차를 생성하세요.
+
+[정보원(Corpus)]
+{corpus[:8000]}
+
+[작성 규칙]
+1. 최상단에 [보고서 제목]을 작성 (전문적이고 명확하게)
+2. 본문(1페이지 분량)과 첨부({attach_pages}페이지 분량)로 구분
+3. 각 페이지별로 리드문(전체 내용을 관통하는 핵심 메시지) 포함
+4. 세부 주제(Topic)는 다음 형식을 준수:
+   - 주제명 | #키워드 | [시각화방안] | 내용 가이드
+   - 시각화방안: 표, 그래프, 비교표, 다이어그램 등 구체적으로 명시
+5. (중요) 코퍼스 내의 핵심 수치, 기준, 측량 기법 등을 세부 항목 가이드에 포함할 것.
+
+{attach_str}
+""",
+        }
+    else:
+        # 일반 보고서 모드
+        sys_msg = {
+            "role": "system",
+            "content": (
+                domain_prompt + "\n\n"
+                "당신은 건설/측량 DX 기술 전문가이자 보고서 기획자입니다. "
+                "제시된 코퍼스를 분석하여, 실무에 즉시 활용 가능한 고품질 기술 보고서 목차를 생성하세요. "
+                "목차는 대분류(1.), 중분류(1.1), 소분류(1.1.1)의 3단계 계층 구조를 따릅니다. "
+                "각 소분류(1.1.1) 하위에는 반드시 구체적인 집필 가이드를 포함해야 합니다."
+            ),
+        }
+        user_msg = {
+            "role": "user",
+            "content": f"""
+다음 정보를 바탕으로 기술 보고서 목차를 생성하세요.
+
+[정보원(Corpus)]
+{corpus[:10000]}
+
+[작성 규칙]
+1. 최상단에 [보고서 제목] 1개를 작성
+2. 목차는 1. / 1.1 / 1.1.1 형식의 3단계 구조
+3. 소분류(1.1.1) 하단에는 해당 섹션에서 다룰 상세 주제들을 다음 형식으로 나열:
+   - 주제명 | #핵심키워드 | [구성형식] | 집필 가이드(수치나 핵심 기법 포함)
+   - 구성형식 예시: [비교표], [기술설명], [절차도], [성과분석] 등
+4. 도메인 지식(측량 정확도 기준, 사용 장비 등)이 각 세부 항목 가이드에 녹아있어야 함.
+
+출력은 목차 텍스트만 깔끔하게 출력하세요.
+""",
+        }
+
+    resp = client.chat.completions.create(
+        model=GPT_MODEL,
+        messages=[sys_msg, user_msg],
+        temperature=0.3,
+    )
+    return (resp.choices[0].message.content or "").strip()
+
+def parse_outline(outline_text: str) -> Tuple[str, List[Dict[str, Any]]]:
+    """
+    생성된 목차 텍스트를 구조화된 데이터로 파싱합니다.
+    """
+    lines = [l.strip() for l in outline_text.splitlines() if l.strip()]
+    if not lines:
+        return "제목 없음", []
+
+    # 1. 제목 추출
+    title_line = lines[0]
+    title = re.sub(r'^\[?보고서 제목\]?[:\s]*', '', title_line).strip()
+
+    # 2. 계층 구조 파싱
+    rows = []
+    current_section = None 
+
+    for ln in lines[1:]:
+        raw = ln.strip()
+
+        # 소분류 (1.1.1)
+        m3_head = RE_L3_HEAD.match(raw)
+        if m3_head:
+            num, s_title = m3_head.groups()
+            current_section = {
+                "depth": 3,
+                "num": num,
+                "title": s_title,
+                "sub_topics": []
+            }
+            rows.append(current_section)
+            continue
+
+        # 상세 주제 (- 주제 | #키워드 | [형식] | 가이드)
+        m_topic = RE_L3_TOPIC.match(raw)
+        if m_topic and current_section:
+            t_title, kws_raw, t_type, guide = m_topic.groups()
+            kws = [k.lstrip("#").strip() for k in RE_KEYWORDS.findall(kws_raw)]
+            current_section["sub_topics"].append({
+                "topic_title": t_title,
+                "keywords": kws,
+                "type": t_type,
+                "guide": guide
+            })
+            continue
+
+        # 대분류 (1.)
+        m1 = RE_L1.match(raw)
+        if m1:
+            rows.append({"depth": 1, "num": m1.group(1).strip(), "title": m1.group(2).strip()})
+            current_section = None
+            continue
+
+        # 중분류 (1.1)
+        m2 = RE_L2.match(raw)
+        if m2:
+            rows.append({"depth": 2, "num": m2.group(1).strip(), "title": m2.group(2).strip()})
+            current_section = None
+            continue
+
+    return title, rows
+
+def html_escape(s: str) -> str:
+    s = s or ""
+    return (s.replace("&", "&amp;")
+             .replace("<", "&lt;")
+             .replace(">", "&gt;")
+             .replace('"', "&quot;")
+             .replace("'", "&#39;"))
+
+def build_outline_table_html(rows: List[Dict[str, Any]]) -> str:
+    """목차 구조를 HTML 테이블로 변환"""
+    head = """
+    <table border="1" style="width:100%; border-collapse: collapse;">
+        <thead>
+            <tr style="background-color: #f2f2f2;">
+                <th>분류</th>
+                <th>번호</th>
+                <th>항목명</th>
+                <th>상세 가이드 / 키워드</th>
+            </tr>
+        </thead>
+        <tbody>
+    """
+
+    body_parts = []
+    for r in rows:
+        depth = r["depth"]
+        num = html_escape(r["num"])
+        title = html_escape(r["title"])
+        
+        if depth == 3:
+            kw_list = []
+            for st in r.get("sub_topics", []):
+                kw_list.append(f"<b>{html_escape(st['topic_title'])}</b>: {html_escape(st['guide'])}")
+            detail = "<br>".join(kw_list)
+            cls_name = "소분류"
+        elif depth == 2:
+            detail = ""
+            cls_name = "중분류"
+        else:
+            detail = ""
+            cls_name = "대분류"
+
+        body_parts.append(
+            f"""
+            <tr>
+                <td style="padding: 8px; text-align: center;">{cls_name}</td>
+                <td style="padding: 8px; text-align: center;">{num}</td>
+                <td style="padding: 8px; font-weight: {'bold' if depth < 3 else 'normal'};">{title}</td>
+                <td style="padding: 8px; font-size: 0.9em;">{detail}</td>
+            </tr>
+            """
+        )
+
+    tail = "</tbody></table>"
+    return head + "\n".join(body_parts) + tail
+
+def build_outline_html(report_title: str, rows: List[Dict[str, Any]]) -> str:
+    table_html = build_outline_table_html(rows)
+    return f"""<!DOCTYPE html>
+<html lang="ko">
+<head>
+  <meta charset="UTF-8">
+  <title>보고서 목차 구성안</title>
+  <style>
+    body {{ font-family: 'Malgun Gothic', sans-serif; padding: 40px; line-height: 1.6; color: #333; }}
+    h1 {{ color: #1a365d; border-bottom: 2px solid #1a365d; padding-bottom: 10px; }}
+    .info {{ background: #f7fafc; padding: 15px; border-left: 5px solid #1a365d; margin-bottom: 20px; }}
+  </style>
+</head>
+<body>
+  <h1>{html_escape(report_title)}</h1>
+  <div class="info">
+    본 목차는 제공된 코퍼스를 기반으로 AI가 설계한 보고서 구성안입니다.
+  </div>
+  {table_html}
+</body>
+</html>
+"""
+
+def main(input_dir, output_dir, doc_type='report', attach_pages=1):
+    global DATA_ROOT, OUTPUT_ROOT, CONTEXT_DIR, LOG_DIR
+    DATA_ROOT   = Path(input_dir)
+    OUTPUT_ROOT = Path(output_dir)
+    CONTEXT_DIR = OUTPUT_ROOT / "context"
+    LOG_DIR     = OUTPUT_ROOT / "logs"
+    for d in [CONTEXT_DIR, LOG_DIR]:
+        d.mkdir(parents=True, exist_ok=True)
+    log("=== 목차 생성 시작 ===")
+    domain_prompt = load_domain_prompt()
+    corpus        = load_corpus()
+
+    # RAG에서 일부 정보를 가져올 수 있으면 가져옴 (선택 사항)
+    rag_chunks = ""
+    outline = generate_outline(domain_prompt, corpus, rag_chunks, doc_type, attach_pages)
+
+    # TXT 저장
+    out_txt = CONTEXT_DIR / "outline_issue_report.txt"
+    out_txt.write_text(outline, encoding="utf-8")
+    log(f"목차 TXT 저장 완료: {out_txt}")
+
+    # HTML 미리보기용 저장
+    title, rows = parse_outline(outline)
+    out_html = CONTEXT_DIR / "outline_issue_report.html"
+    out_html.write_text(build_outline_html(title, rows), encoding="utf-8")
+    log(f"목차 HTML 미리보기 저장 완료: {out_html}")
+
+    log("=== 목차 생성 완료 ===")
+
+if __name__ == "__main__":
+    main()