v3:추출 파이프라인_260122

2026-02-13 14:06:57 +09:00
parent 3c5b9e29fe
commit 05d2d8cc9a
28 changed files with 6910 additions and 47 deletions
--- a/converters/pipeline/step3_domain.py
+++ b/converters/pipeline/step3_domain.py
@@ -0,0 +1,265 @@
+# -*- coding: utf-8 -*-
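+"""
+domain_prompt.py
+
+What it does:
+- Samples the file names and part of the contents of the
+  pdf/xlsx/xls/png/jpg/jpeg/txt/md files found under DATA_ROOT.
+- Based on those samples, identifies the field and business context of the
+  document set and automatically generates a domain-specific system prompt
+  of the form "너는 ~~ 분야의 전문가이다. 나는 ~~를 하고 싶다..."
+  ("You are an expert in ~~. I want to ~~ ...").
+- The result is saved to output/context/domain_prompt.txt.
+
+The contents of domain_prompt.txt can afterwards be attached, as a shared
+system-role message, to every subsequent GPT call.
+"""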
+
+import os
+import sys
+import json
+from pathlib import Path
+
+import pdfplumber
+import fitz  # PyMuPDF
+from PIL import Image
+import pytesseract
+import pandas as pd
+from openai import OpenAI
+import pytesseract
+from api_config import API_KEYS
+# Point pytesseract at the locally installed Tesseract executable.
+pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
+
+# ===== Path settings =====
+# DATA_ROOT is the tree that is walked for sample files; the generated prompt
+# and the log file are written under OUTPUT_ROOT.
+DATA_ROOT   = Path(r"D:\for python\survey_test\extract")
+OUTPUT_ROOT = Path(r"D:\for python\survey_test\output")
+CONTEXT_DIR = OUTPUT_ROOT / "context"
+LOG_DIR     = OUTPUT_ROOT / "logs"
+
+# Create the output directories at import time so that later writes can
+# assume they exist.
+for d in [OUTPUT_ROOT, CONTEXT_DIR, LOG_DIR]:
+    d.mkdir(parents=True, exist_ok=True)
+
+# ===== OpenAI settings (structure only; the key itself is supplied by the owner) =====
+# The key is looked up in api_config.API_KEYS and defaults to an empty string
+# when 'GPT_API_KEY' is missing.
+OPENAI_API_KEY = API_KEYS.get('GPT_API_KEY', '')
+GPT_MODEL      = "gpt-5-2025-08-07"
+
+# Shared client used for the chat-completion call in build_domain_prompt().
+client = OpenAI(api_key=OPENAI_API_KEY)
+
+# ===== OCR settings =====
+# Language argument handed to pytesseract.image_to_string().
+OCR_LANG = "kor+eng"
+
+# Directory names that are pruned from the os.walk() traversal.
+SKIP_DIR_NAMES = {"System Volume Information", "$RECYCLE.BIN", ".git", "__pycache__"}
+
+
+def log(msg: str):
+    """Echo *msg* to stdout and append it as one line to the run log.
+
+    The log file is LOG_DIR / "domain_prompt_log.txt"; it is opened in append
+    mode on every call.
+    """
+    print(msg, flush=True)
+    log_file = LOG_DIR / "domain_prompt_log.txt"
+    with log_file.open("a", encoding="utf-8") as fh:
+        fh.write(msg + "\n")
+
+
+def safe_rel(p: Path) -> str:
+    """Return *p* as a string relative to DATA_ROOT.
+
+    When the relative path cannot be computed, the path is returned
+    unchanged (as a string) instead of raising.
+    """
+    try:
+        relative = p.relative_to(DATA_ROOT)
+    except Exception:
+        return str(p)
+    return str(relative)
+
+
+def ocr_image(img_path: Path) -> str:
+    """Run Tesseract OCR on an image file and return the recognised text.
+
+    Args:
+        img_path: Path of the image to read.
+
+    Returns:
+        The OCR output with surrounding whitespace stripped, or "" when the
+        image cannot be opened or OCR fails (the failure is logged).
+    """
+    try:
+        # Open through a context manager so the underlying file handle is
+        # released immediately instead of whenever the Image object happens
+        # to be garbage-collected.
+        with Image.open(img_path) as img:
+            return pytesseract.image_to_string(img, lang=OCR_LANG).strip()
+    except Exception as e:
+        log(f"[WARN] OCR 실패: {safe_rel(img_path)} | {e}")
+        return ""
+
+
+def sample_from_pdf(p: Path, max_chars: int = 1000) -> str:
+    """Return up to *max_chars* characters of text from the start of a PDF.
+
+    Only the first three pages are inspected, and reading stops early once
+    enough text has been collected. Extraction errors are logged and whatever
+    was gathered before the error is returned.
+    """
+    page_texts = []
+    collected_len = 0
+    try:
+        with pdfplumber.open(str(p)) as pdf:
+            # Sample only the first few pages.
+            for page in pdf.pages[:3]:
+                page_text = page.extract_text() or ""
+                if page_text:
+                    page_texts.append(page_text)
+                    collected_len += len(page_text)
+                if collected_len >= max_chars:
+                    break
+    except Exception as e:
+        log(f"[WARN] PDF 샘플 추출 실패: {safe_rel(p)} | {e}")
+    return "\n".join(page_texts)[:max_chars]
+
+
+def sample_from_xlsx(p: Path, max_chars: int = 1000) -> str:
+    """Return a short textual preview of an Excel workbook.
+
+    The preview starts with the file name and then, for each of the first
+    three sheets, lists the sheet name, its column names and its first five
+    rows. Reading stops early once *max_chars* characters were collected.
+
+    Args:
+        p: Path of the workbook.
+        max_chars: Maximum length of the returned string.
+
+    Returns:
+        The preview truncated to *max_chars* characters. Errors are logged;
+        a sheet that fails to load is skipped, and a workbook that fails to
+        open yields just the file-name line.
+    """
+    texts = [f"[파일명] {p.name}"]
+    try:
+        # ExcelFile keeps the workbook open until it is closed; the context
+        # manager releases the handle even when parsing raises.
+        with pd.ExcelFile(str(p)) as xls:
+            for sheet_name in xls.sheet_names[:3]:
+                try:
+                    df = xls.parse(sheet_name)
+                except Exception as e:
+                    log(f"[WARN] 시트 로딩 실패: {safe_rel(p)} | {sheet_name} | {e}")
+                    continue
+                texts.append(f"\n[시트] {sheet_name}")
+                texts.append("컬럼: " + ", ".join(map(str, df.columns)))
+                head = df.head(5)
+                texts.append(head.to_string(index=False))
+                if sum(len(x) for x in texts) >= max_chars:
+                    break
+    except Exception as e:
+        log(f"[WARN] XLSX 샘플 추출 실패: {safe_rel(p)} | {e}")
+    joined = "\n".join(texts)
+    return joined[:max_chars]
+
+
+def sample_from_text_file(p: Path, max_chars: int = 1000) -> str:
+    """Return the first *max_chars* characters of a text file.
+
+    The file is decoded strictly as UTF-8 first (a leading BOM is dropped),
+    then strictly as cp949. Only if both fail is it decoded as UTF-8 with
+    undecodable bytes ignored.
+
+    Args:
+        p: Path of the text file.
+        max_chars: Maximum length of the returned string.
+
+    Returns:
+        The decoded text truncated to *max_chars* characters.
+
+    Raises:
+        OSError: If the file cannot be read.
+    """
+    # Decoding must be strict here: with errors="ignore" the UTF-8 attempt
+    # never fails, so a cp949 file would silently lose every non-ASCII
+    # character and the cp949 fallback would never run.
+    for encoding in ("utf-8-sig", "cp949"):
+        try:
+            return p.read_text(encoding=encoding)[:max_chars]
+        except UnicodeDecodeError:
+            continue
+    # Last resort for files that are neither valid UTF-8 nor valid cp949.
+    return p.read_text(encoding="utf-8", errors="ignore")[:max_chars]
+
+
+def gather_file_samples(
+    max_files_per_type: int = 100,
+    max_total_samples: int = 300,
+    max_chars_per_sample: int = 1000,
+):
+    """Walk DATA_ROOT and collect file names plus content samples.
+
+    Every file name encountered is recorded. Content samples are taken only
+    from PDF, Excel, image and text/markdown files, and only while both the
+    per-type and the overall limits have not been reached.
+
+    Args:
+        max_files_per_type: Maximum number of samples per file category.
+        max_total_samples: Maximum number of samples overall.
+        max_chars_per_sample: Maximum number of characters kept per sample.
+
+    Returns:
+        A ``(file_names, samples)`` tuple: the DATA_ROOT-relative names of all
+        visited files, and the labelled sample strings.
+    """
+    # File suffix -> label used in the sample header.
+    label_by_ext = {
+        ".pdf": "PDF",
+        ".xlsx": "XLSX",
+        ".xls": "XLSX",
+        ".png": "IMG",
+        ".jpg": "IMG",
+        ".jpeg": "IMG",
+        ".txt": "TEXT",
+        ".md": "TEXT",
+    }
+    # Label -> callable that extracts the sample text from a path.
+    extractors = {
+        "PDF": lambda path: sample_from_pdf(path, max_chars=max_chars_per_sample),
+        "XLSX": lambda path: sample_from_xlsx(path, max_chars=max_chars_per_sample),
+        "IMG": ocr_image,
+        "TEXT": lambda path: sample_from_text_file(path, max_chars=max_chars_per_sample),
+    }
+    taken = dict.fromkeys(extractors, 0)
+
+    file_names = []
+    samples = []
+
+    for root, dirs, files in os.walk(DATA_ROOT):
+        # Prune skipped and hidden directories in place so os.walk does not
+        # descend into them.
+        dirs[:] = [d for d in dirs if not (d in SKIP_DIR_NAMES or d.startswith("."))]
+        base_dir = Path(root)
+
+        for fname in files:
+            fpath = base_dir / fname
+            rel_name = safe_rel(fpath)
+
+            # All file names are collected; only sample extraction is limited.
+            file_names.append(rel_name)
+
+            if len(samples) >= max_total_samples:
+                continue
+
+            label = label_by_ext.get(fpath.suffix.lower())
+            if label is None:
+                continue
+
+            try:
+                if taken[label] >= max_files_per_type:
+                    continue
+                text = extractors[label](fpath)
+                if not text.strip():
+                    continue
+                if label == "IMG":
+                    # OCR output is not length-limited by its extractor.
+                    text = text[:max_chars_per_sample]
+                samples.append(f"[{label}] {rel_name}\n{text}")
+                taken[label] += 1
+            except Exception as e:
+                log(f"[WARN] 샘플 추출 실패: {rel_name} | {e}")
+                continue
+
+    return file_names, samples
+
+
+def build_domain_prompt():
+    """Generate the domain system prompt and save it to domain_prompt.txt.
+
+    File names and content samples are collected with gather_file_samples()
+    and sent to the chat model, which is asked to describe the document set
+    and to write a "너는 ~~ 분야의 전문가이다..." style prompt preamble.
+
+    Returns:
+        The stripped model reply. When the reply is empty, nothing is written
+        to disk and "" is returned.
+
+    Raises:
+        SystemExit: With exit code 1 when no files were found at all.
+    """
+    max_names_in_prompt = 80
+    max_samples_in_prompt = 30
+
+    log("도메인 프롬프트 생성을 위한 샘플 수집 중...")
+    # Only the first `max_samples_in_prompt` samples are ever used, so stop
+    # extracting (PDF parsing, OCR, ...) once that many have been collected.
+    file_names, samples = gather_file_samples(max_total_samples=max_samples_in_prompt)
+
+    if not file_names and not samples:
+        log("파일 샘플이 없어 도메인 프롬프트를 생성할 수 없습니다.")
+        sys.exit(1)
+
+    file_names_text = "\n".join(file_names[:max_names_in_prompt])
+    sample_text = "\n\n".join(samples[:max_samples_in_prompt])
+
+    prompt = f"""
+다음은 한 기업의 '이슈 리포트 및 시스템 관련 자료'로 추정되는 파일들의 목록과,
+각 파일에서 일부 추출한 내용 샘플이다.
+
+[파일명 목록]
+{file_names_text}
+
+[내용 샘플]
+{sample_text}
+
+위 자료를 바탕으로 다음을 수행하라.
+
+1) 이 문서 묶음이 어떤 산업, 업무, 분야에 대한 것인지,
+   핵심 키워드를 포함해 2~3줄 정도로 설명하라.
+
+2) 이후, 이 문서들을 다루는 AI에게 사용할 "프롬프트 머리말"을 작성하라.
+   이 머리말은 모든 후속 프롬프트 앞에 항상 붙일 예정이며,
+   다음 조건을 만족해야 한다.
+
+   - 첫 문단: "너는 ~~ 분야의 전문가이다." 형식으로, 이 문서 묶음의 분야와 역할을 정의한다.
+   - 두 번째 문단 이후: "나는 ~~을 하고 싶다.", "우리는 ~~ 의 문제를 분석하고 개선방안을 찾고자 한다." 등
+     사용자가 AI에게 요구하는 전반적 목적과 관점을 정리한다.
+   - 총 5~7줄 정도의 한국어 문장으로 작성한다.
+   - 이후에 붙을 프롬프트(청킹, 요약, RAG, 보고서 작성 등)와 자연스럽게 연결될 수 있도록,
+     역할(role), 목적, 기준(추측 금지, 사실 기반, 근거 명시 등)을 모두 포함한다.
+
+출력 형식:
+- 설명과 머리말을 한 번에 출력하되,
+  별도의 마크다운 없이 순수 텍스트로만 작성하라.
+- 이 출력 전체를 domain_prompt.txt에 그대로 저장할 것이다.
+"""
+
+    resp = client.chat.completions.create(
+        model=GPT_MODEL,
+        messages=[
+            {
+                "role": "system",
+                "content": "너는 문서 묶음의 분야를 식별하고, 그에 맞는 AI 시스템 프롬프트와 컨텍스트를 설계하는 컨설턴트이다."
+            },
+            {
+                "role": "user",
+                "content": prompt
+            }
+        ],
+    )
+
+    content = (resp.choices[0].message.content or "").strip()
+    out_path = CONTEXT_DIR / "domain_prompt.txt"
+    if not content:
+        # Do not persist an empty reply: main() treats an existing
+        # domain_prompt.txt as "already generated" and would never retry.
+        log(f"[WARN] GPT 응답이 비어 있어 저장하지 않습니다: {out_path}")
+        return content
+
+    out_path.write_text(content, encoding="utf-8")
+
+    log(f"도메인 프롬프트 생성 완료: {out_path}")
+    return content
+
+
+def main():
+    """Create domain_prompt.txt unless it is already present.
+
+    An existing file is never overwritten; the user is told to delete it
+    and re-run when a fresh prompt is wanted.
+    """
+    log("=== 도메인 프롬프트 생성 시작 ===")
+    out_path = CONTEXT_DIR / "domain_prompt.txt"
+    if not out_path.exists():
+        build_domain_prompt()
+    else:
+        log(f"이미 domain_prompt.txt가 존재합니다: {out_path}")
+        log("기존 파일을 사용하려면 종료하고, 재생성이 필요하면 파일을 삭제한 뒤 다시 실행하십시오.")
+    log("=== 도메인 프롬프트 작업 종료 ===")
+
+
+# Run the generator only when this file is executed as a script, not when it
+# is imported as a module.
+if __name__ == "__main__":
+    main()