Cleanup: Deleting 03.Code/업로드용/converters/pipeline/step3_domain.py

2026-03-19 14:01:38 +09:00
parent 305453c1f4
commit 8b66037c24
1 changed files with 0 additions and 241 deletions
--- a/03.Code/업로드용/converters/pipeline/step3_domain.py
+++ b/03.Code/업로드용/converters/pipeline/step3_domain.py
@@ -1,241 +0,0 @@
-# -*- coding: utf-8 -*-
-from dotenv import load_dotenv
-load_dotenv()
-"""
-domain_prompt.py
-
-기능:
- D:\\test\\report 안의 모든 pdf/xlsx/png/txt/md 파일들을
-  파일명과 내용 일부를 샘플링한다.
- 이 샘플을 기반으로, 문서 성격의 분야/직무 영역을 파악하고
-  "너는 ~~ 분야의 전문가이다. 너는 ~~를 하고 있다..." 형식의
-  도메인 전용 시스템 프롬프트(persona)를 자동 생성한다.
- 결과물은 output/context/domain_prompt.txt 로 저장된다.
-
-이 domain_prompt.txt 내용은 이후 모든 GPT 호출(system role)에 공통적으로 붙여 사용하게 된다.
-"""
-
-import os
-import sys
-import json
-from pathlib import Path
-
-import pdfplumber
-import fitz  # PyMuPDF
-import pandas as pd
-from openai import OpenAI
-
-# ===== OpenAI 설정 (구조화된 키, 모델 설정 등 직접 입력) =====
-OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
-GPT_MODEL      = "gpt-4o"
-
-client = OpenAI(api_key=OPENAI_API_KEY)
-
-SKIP_DIR_NAMES = {"System Volume Information", "$RECYCLE.BIN", ".git", "__pycache__"}
-
-
-def log(msg: str):
-    print(msg, flush=True)
-    with (LOG_DIR / "domain_prompt_log.txt").open("a", encoding="utf-8") as f:
-        f.write(msg + "\n")
-
-
-def safe_rel(p: Path) -> str:
-    try:
-        return str(p.relative_to(DATA_ROOT))
-    except Exception:
-        return str(p)
-
-
-def sample_from_pdf(p: Path, max_chars: int = 1000) -> str:
-    texts = []
-    try:
-        with pdfplumber.open(str(p)) as pdf:
-            # 상위 3페이지만 샘플링
-            for page in pdf.pages[:3]:
-                t = page.extract_text() or ""
-                if t:
-                    texts.append(t)
-                if sum(len(x) for x in texts) >= max_chars:
-                    break
-    except Exception as e:
-        log(f"[WARN] PDF 샘플 추출 실패: {safe_rel(p)} | {e}")
-    joined = "\n".join(texts)
-    return joined[:max_chars]
-
-
-def sample_from_xlsx(p: Path, max_chars: int = 1000) -> str:
-    texts = [f"[파일명] {p.name}"]
-    try:
-        xls = pd.ExcelFile(str(p))
-        for sheet_name in xls.sheet_names[:3]:
-            try:
-                df = xls.parse(sheet_name)
-            except Exception as e:
-                log(f"[WARN] 시트 로딩 실패: {safe_rel(p)} | {sheet_name} | {e}")
-                continue
-            texts.append(f"\n[시트] {sheet_name}")
-            texts.append("컬럼명: " + ", ".join(map(str, df.columns)))
-            head = df.head(5)
-            texts.append(head.to_string(index=False))
-            if sum(len(x) for x in texts) >= max_chars:
-                break
-    except Exception as e:
-        log(f"[WARN] XLSX 샘플 추출 실패: {safe_rel(p)} | {e}")
-    joined = "\n".join(texts)
-    return joined[:max_chars]
-
-
-def sample_from_text_file(p: Path, max_chars: int = 1000) -> str:
-    try:
-        t = p.read_text(encoding="utf-8", errors="ignore")
-    except Exception:
-        t = p.read_text(encoding="cp949", errors="ignore")
-    return t[:max_chars]
-
-
-def gather_file_samples(
-    max_files_per_type: int = 100,
-    max_total_samples: int = 300,
-    max_chars_per_sample: int = 1000,
-):
-
-    file_names = []
-    samples = []
-
-    count_pdf = 0
-    count_xlsx = 0
-    count_img = 0
-    count_txt = 0
-
-    for root, dirs, files in os.walk(DATA_ROOT):
-        dirs[:] = [d for d in dirs if d not in SKIP_DIR_NAMES and not d.startswith(".")]
-        cur_dir = Path(root)
-
-        for fname in files:
-            fpath = cur_dir / fname
-            ext = fpath.suffix.lower()
-
-            # 파일명은 전체 다 확보하고, 샘플 추출은 제한
-            file_names.append(safe_rel(fpath))
-
-            if len(samples) >= max_total_samples:
-                continue
-
-            try:
-                if ext == ".pdf" and count_pdf < max_files_per_type:
-                    s = sample_from_pdf(fpath, max_chars=max_chars_per_sample)
-                    if s.strip():
-                        samples.append(f"[PDF] {safe_rel(fpath)}\n{s}")
-                        count_pdf += 1
-                    continue
-
-                if ext in {".xlsx", ".xls"} and count_xlsx < max_files_per_type:
-                    s = sample_from_xlsx(fpath, max_chars=max_chars_per_sample)
-                    if s.strip():
-                        samples.append(f"[XLSX] {safe_rel(fpath)}\n{s}")
-                        count_xlsx += 1
-                    continue
-
-                if ext in {".txt", ".md"} and count_txt < max_files_per_type:
-                    s = sample_from_text_file(fpath, max_chars=max_chars_per_sample)
-                    if s.strip():
-                        samples.append(f"[TEXT] {safe_rel(fpath)}\n{s}")
-                        count_txt += 1
-                    continue
-
-            except Exception as e:
-                log(f"[WARN] 샘플 추출 실패: {safe_rel(fpath)} | {e}")
-                continue
-
-    return file_names, samples
-
-
-def build_domain_prompt():
-    """
-    파일명 + 내용 샘플을 GPT에게 넘겨서
-    '너는 ~~ 분야의 전문가다...' 형식의 시스템 프롬프트를 생성한다.
-    """
-    log("도메인 프롬프트 생성을 위한 샘플링 중...")
-    file_names, samples = gather_file_samples()
-
-    if not file_names and not samples:
-        log("파일 샘플이 없어 도메인 프롬프트를 생성할 수 없습니다.")
-        sys.exit(1)
-
-    file_names_text = "\n".join(file_names[:80])
-    sample_text = "\n\n".join(samples[:30])
-
-    prompt = f"""
-다음은 한 업무 폴더의 파일 목록과 그 파일에서 추출한 샘플 내용이다.
-
-[파일명 목록]
-{file_names_text}
-
-[내용 샘플]
-{sample_text}
-
-위 자료를 바탕으로 다음을 수행하라.
-
-1) 이 문서 뭉치들이 어떤 분야, 업무, 도메인에 관한 것인지, 
-   핵심 키워드를 포함하여 2~3문장 정도로 요약하라.
-
-2) 이후, 이 문서들을 다루는 AI에게 쓰일 "프롬프트 가이드라인"을 작성하라.
-   이 가이드라인은 반드시 "너는 ~~ 분야의 전문 AI이다."로 시작하며,
-   다음 조건을 만족해야 한다.
-
-   - 페르소나: "너는 ~~ 분야의 전문가다." 형식으로, 이 문서 뭉치의 분야와 역할을 정의한다.
-   - 역할과 역량: 너는 ~~를 하고 있다.", "우리는 ~~의 문제를 해결하고 개선안을 찾고 있다." 등의 
-     사용자와 AI에게 부여되는 전문지식 수준과 관점을 정의한다.
-   - 주의사항 5~7문장 정도로 작성하라.
-   - 이 분야에서 쓰는 프롬프트(작업자, 역할, RAG, 보고서 작성 등)의 전체 스택에 연결될 수 있도록,
-     역할(role), 목적, 기준(출처 기반), 용어 사용, 규격 준수 등을 모두 포괄하여 작성하라.
-
-출력 형식:
- 요약과 가이드라인을 한 번에 출력하되, 
-  별도의 마크다운 없이 순수 텍스트로만 작성하라.
- 이 출력 전체를 domain_prompt.txt에 그대로 저장할 것이다.
-"""
-
-    resp = client.chat.completions.create(
-        model=GPT_MODEL,
-        messages=[
-            {
-                "role": "system",
-                "content": "너는 문서 뭉치의 도메인을 분석하고, 그에 맞는 AI 시스템 프롬프트와 가이드라인을 작성하는 지침 설계자이다."
-            },
-            {
-                "role": "user",
-                "content": prompt
-            }
-        ],
-    )
-
-    content = (resp.choices[0].message.content or "").strip()
-    out_path = CONTEXT_DIR / "domain_prompt.txt"
-    out_path.write_text(content, encoding="utf-8")
-
-    log(f"도메인 프롬프트 생성 완료: {out_path}")
-    return content
-
-
-def main(input_dir, output_dir):
-    global DATA_ROOT, OUTPUT_ROOT, CONTEXT_DIR, LOG_DIR
-    DATA_ROOT   = Path(input_dir)
-    OUTPUT_ROOT = Path(output_dir)
-    CONTEXT_DIR = OUTPUT_ROOT / "context"
-    LOG_DIR     = OUTPUT_ROOT / "logs"
-    for d in [OUTPUT_ROOT, CONTEXT_DIR, LOG_DIR]:
-        d.mkdir(parents=True, exist_ok=True)
-    log("=== 도메인 프롬프트 생성 시작 ===")
-    out_path = CONTEXT_DIR / "domain_prompt.txt"
-    if out_path.exists():
-        log(f"이미 domain_prompt.txt가 존재합니다: {out_path}")
-        log("기존 파일을 사용하려면 종료하고, 재생성이 필요하면 파일을 삭제 후 다시 실행하십시오.")
-    else:
-        build_domain_prompt()
-    log("=== 도메인 프롬프트 작업 종료 ===")
-
-
-if __name__ == "__main__":
-    main()