규칙 적용 재추적

2025-10-30 10:32:31 +09:00
parent bea690d3f1
commit 4062a50c80
43 changed files with 2105 additions and 0 deletions
--- a/workspace/utils/text_processor.py
+++ b/workspace/utils/text_processor.py
@@ -0,0 +1,90 @@
+import datetime
+import json
+import logging
+import re
+import unicodedata
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+
def safe_filename(filename: str) -> str:
    """Derive a filesystem-safe ``.html`` file name from *filename*.

    Only the final path component is used and its last extension is dropped.
    The remaining stem is NFKC-normalized, spaces and every character that is
    not a word character, ``-`` or ``.`` are replaced by ``_``, runs of
    underscores are collapsed, and leading/trailing ``.``, ``_`` and ``-``
    are stripped.

    Args:
        filename: Original file name or path.

    Returns:
        The sanitized stem with ``.html`` appended. When nothing survives
        sanitizing, a ``result_YYYYmmdd_HHMMSS.html`` name built from the
        current local time.
    """
    # Same logger object as the module-level one (both are keyed by __name__);
    # replaces a stray debug print().
    logging.getLogger(__name__).debug("[FILE NAME] %s", filename)

    # Drop the directory part and the last extension.
    base = Path(filename).stem
    base = unicodedata.normalize("NFKC", base)
    base = base.replace(" ", "_")
    base = re.sub(r"[^\w\-\.가-힣]", "_", base, flags=re.UNICODE)
    base = re.sub(r"_+", "_", base).strip("._-")

    # Fall back to a timestamped default when the name sanitizes to nothing.
    if not base:
        # `datetime` is imported as a module in this file, so the class has
        # to be qualified; a bare `datetime.now()` raises AttributeError.
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        base = f"result_{timestamp}"

    return f"{base}.html"
+
+
def _clean_key(raw_key: str) -> str:
    """Strip everything except Hangul (ㄱ-ㅎ, 가-힣) and ASCII letters."""
    return re.sub(r"[^ㄱ-ㅎ가-힣a-zA-Z]", "", raw_key)


def post_process(input_json, generated_text, llm_model):
    """Parse LLM output into a field dict and attach it to *input_json*.

    Three output shapes are recognised, checked in this order:

    1. A fenced ``json`` code block containing one JSON object. Its keys are
       cleaned with ``_clean_key``; values are kept as parsed.
    2. The "input too long, model call skipped" notice, which is replaced by
       a fixed message/note pair.
    3. Otherwise, numbered ``"1. key: value"`` lines, split into one entry
       per numbered item.

    Args:
        input_json: Mutable mapping that receives the ``"result"`` and
            ``"llm_model"`` entries. It is modified in place.
        generated_text: Raw text produced by the model. ``None`` is treated
            as empty text.
        llm_model: Value stored under ``"llm_model"``.

    Returns:
        The same *input_json* object, after modification. ``"result"`` is an
        empty dict when nothing could be parsed.
    """
    generated_text = generated_text or ""
    result_dict = {}

    # Case 1: fenced JSON code block.
    if "```json" in generated_text:
        logger.debug("[PROCESS-JSON] JSON 코드블럭 형식 후처리 진행합니다.")
        json_block = re.search(
            r"```json\s*(\{.*?\})\s*```", generated_text, re.DOTALL
        )
        if json_block:
            try:
                parsed_json = json.loads(json_block.group(1))
            except json.JSONDecodeError as e:
                # The placeholder is required: an extra positional argument
                # without one makes the log record fail to format.
                logger.error("[PROCESS-ERROR] JSON 코드블럭 파싱 실패: %s", e)
            else:
                result_dict = {
                    _clean_key(k): v for k, v in parsed_json.items()
                }

    # Case 2: the "input too long, model call skipped" notice.
    elif "입력 텍스트가" in generated_text and "모델 호출 생략" in generated_text:
        result_dict = {
            "message": "⚠️ 입력 텍스트가 너무 깁니다. LLM 모델 호출을 생략했습니다.",
            "note": "OCR로 추출된 원본 텍스트(parsed)를 참고해 주세요.",
        }

    # Case 3: "1.title:" or "1. title:" style numbered lines.
    else:
        logger.debug("[PROCESS-STRING] JSON 코드블럭 형식이 아닙니다.")
        # Split only at newlines that are followed by a new numbered key, so
        # multi-line values stay attached to their key.
        blocks = re.split(r"\n(?=\d+\.\s*[^:\n]+:)", generated_text.strip())

        for block in blocks:
            key_part, separator, value_part = block.partition(":")
            if not separator:
                continue

            key = re.sub(r"^\d+\.\s*", "", key_part).strip()
            # Drop leading punctuation/markup in front of the value.
            value = re.sub(r"^[^\w가-힣a-zA-Z]+", "", value_part.strip()).strip()

            result_dict[_clean_key(key)] = value

    input_json["result"] = result_dict
    input_json["llm_model"] = llm_model

    # Log the final parsed result.
    logger.info(json.dumps(input_json["result"], indent=2, ensure_ascii=False))

    return input_json
+
+
def ocr_process(filename, ocr_model, coord, text, start_time, end_time):
    """Assemble the OCR result record for one file.

    Args:
        filename: Stored under ``"filename"``.
        ocr_model: Stored under ``"model" -> "ocr_model"``.
        coord: Stored unchanged under ``"fields"``.
        text: Stored unchanged under ``"parsed"``.
        start_time: Stored under ``"time" -> "started_at"``.
        end_time: Stored under ``"time" -> "ended_at"``.

    Returns:
        A dict with the keys ``filename``, ``model``, ``time``, ``fields``
        and ``parsed``. ``time["duration_sec"]`` is ``end_time - start_time``
        rendered as a string with two decimal places, so the two time
        arguments must support subtraction and ``.2f`` formatting.
    """
    elapsed = end_time - start_time
    timing = {
        "duration_sec": format(elapsed, ".2f"),
        "started_at": start_time,
        "ended_at": end_time,
    }
    model_info = {"ocr_model": ocr_model}

    return {
        "filename": filename,
        "model": model_info,
        "time": timing,
        "fields": coord,
        "parsed": text,
    }