kec ocr 추가

2025-06-12 12:33:07 +09:00
parent 5510529a36
commit 921a275e93
5 changed files with 168 additions and 51 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,5 @@
 .venv
-/workspace/data
+/workspace/data
 workspace/routers/data
 workspace/routers/kec_data
 drawingpdfocr-461103-2441e0b34216.json
--- a/workspace/api.py
+++ b/workspace/api.py
@@ -3,7 +3,7 @@
 import asyncio
 import json  # JSON 파싱을 위해 추가
-from fastapi import APIRouter, FastAPI, File, HTTPException, UploadFile
+from fastapi import APIRouter, FastAPI, HTTPException, UploadFile
 from fastapi.middleware.cors import CORSMiddleware
 from routers import google_docai
 from utils.config import (
@@ -11,6 +11,8 @@ from utils.config import (
    CORS_ALLOW_HEADERS,
    CORS_ALLOW_METHODS,
    CORS_ALLOW_ORIGINS,
    DOCAI_LOCATION,
    DOCAI_PROJECT_ID,
    UPLOAD_DOCS_DIR,
 )
@@ -47,10 +49,10 @@ doc_ai_router = APIRouter(
    tags=["DocumentAI"],
 )
 # Document AI 관련 설정값 (프로덕션에서는 환경 변수나 설정 파일에서 로드 권장)
-DOCAI_PROJECT_ID = "drawingpdfocr-461103"
+# DOCAI_PROCESSOR_ID = "b838676d4e3b4758"  # 실제 사용자의 프로세서 ID
-DOCAI_LOCATION = "us"
+# KEC_DOCAI_PROCESSOR_ID = "94de4322c20d276f"
 DOCAI_PROCESSOR_ID = "b838676d4e3b4758"  # 실제 사용자의 프로세서 ID
 async def run_sync_in_threadpool(func, *args, **kwargs):
@@ -62,71 +64,57 @@ async def run_sync_in_threadpool(func, *args, **kwargs):
        return await loop.run_in_executor(None, lambda: func(*args, **kwargs))
-@doc_ai_router.post("/process-document/")
+@doc_ai_router.post("/available-processors", summary="도면 OCR API")
-async def process_uploaded_document(file: UploadFile = File(...)):
+async def handle_docai_upload(file: UploadFile, processor_id: str):
    """
-    업로드된 파일을 Document AI로 처리하고, 추출된 엔티티 정보를 JSON으로 반환합니다.
+    국토교통부 = "b838676d4e3b4758"\n
    도로공사 = "94de4322c20d276f"
    """
    if not file.content_type:
        raise HTTPException(status_code=400, detail="File content type is missing.")
    # 지원되는 MIME 타입 (예시, 필요에 따라 확장)
    allowed_mime_types = ["application/pdf", "image/jpeg", "image/png", "image/tiff"]
    if file.content_type not in allowed_mime_types:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported file type: '{file.content_type}'. Supported: {', '.join(allowed_mime_types)}",
        )
-    print(f"Received audio file for async processing: {file.filename}")
+
    print(f"Received file: {file.filename}")
    file_id = str(create_key())
    # 파일 저장 (유틸리티 함수 사용)
    try:
        file_path, file_content = save_uploaded_file(file, UPLOAD_DOCS_DIR, file_id)
    except HTTPException as e:
        raise e
    except Exception as e:
-        raise HTTPException(
-            status_code=500, detail=f"파일 저장 준비 중 오류 발생: {str(e)}"
-        )
+        raise HTTPException(status_code=500, detail=f"파일 저장 실패: {str(e)}")
    try:
        # Document AI 처리 (동기 함수를 비동기적으로 호출)
        document_result = await run_sync_in_threadpool(
-            google_docai.process_document_from_content,  # 수정된 함수 사용
+            google_docai.process_document_from_content,
            project_id=DOCAI_PROJECT_ID,
            location=DOCAI_LOCATION,
-            processor_id=DOCAI_PROCESSOR_ID,
+            processor_id=processor_id,
            file_content=file_content,
            mime_type=file.content_type,
-            field_mask="text,entities",  # 필요한 필드 마스크
+            field_mask="text,entities",
        )
-        print(document_result)
+
        if not document_result:
-            # 이 경우는 process_document_from_content 함수 내부에서 예외가 발생하지 않고
-            # None이나 빈 Document 객체를 반환했을 때를 대비 (일반적으론 예외 발생)
-            raise HTTPException(
-                status_code=500,
-                detail="Failed to process document: No result from Document AI.",
-            )
+            raise HTTPException(status_code=500, detail="Document AI 처리 결과 없음.")
        json_output_string = google_docai.extract_and_convert_to_json(document_result)
        return json.loads(json_output_string)
    except HTTPException as http_exc:
        # 이미 HTTPException으로 처리된 예외는 그대로 다시 발생시킴
        raise http_exc
    except Exception as e:
        # 기타 예외 처리 (로깅 권장)
        # import traceback
        # print(f"Error processing file: {e}\n{traceback.format_exc()}")
        raise HTTPException(
            status_code=500,
-            detail=f"An error occurred during document processing: {str(e)}",
+            detail=f"Document AI 처리 중 오류 발생: {str(e)}",
        )
    finally:
-        await file.close()  # 업로드된 파일 객체를 닫아 리소스 정리
+        await file.close()
 # app에 라우터 등록
--- a/workspace/routers/batch_docai_to_excel.py
+++ b/workspace/routers/batch_docai_to_excel.py
@@ -0,0 +1,73 @@
 # Batch script: send every supported file in FOLDER_PATH through a Document AI
 # processor, collect the extracted entity texts per file, and write them to one
 # Excel sheet (rows = entity/field names, columns = filenames).
 # NOTE: everything below runs at import time; there is no __main__ guard.
 import json
 import mimetypes
 import os
 import google_docai
 import pandas as pd
 from tqdm import tqdm
 # Settings (hard-coded project, processor, input folder and output file)
 DOCAI_PROJECT_ID = "drawingpdfocr-461103"
 DOCAI_LOCATION = "us"
 DOCAI_PROCESSOR_ID = "94de4322c20d276f"
 FIELD_MASK = "text,entities"
 FOLDER_PATH = "/home/jackjack/test/doc_ai/workspace/routers/kec_data"
 OUTPUT_EXCEL = "KEC_results_transposed.xlsx"
 ALLOWED_EXT = [".pdf", ".png", ".jpg", ".jpeg", ".tiff"]
 # Accumulated results: key = entity type, value = dict {filename: mention_text}
 all_results = {}
 for filename in tqdm(os.listdir(FOLDER_PATH)):
    # Skip anything whose (lower-cased) extension is not in the allow-list.
    ext = os.path.splitext(filename)[-1].lower()
    if ext not in ALLOWED_EXT:
        continue
    file_path = os.path.join(FOLDER_PATH, filename)
    # guess_type() yields None for the type when it cannot guess; that value is
    # passed to the processor call unchanged.
    mime_type, _ = mimetypes.guess_type(file_path)
    try:
        with open(file_path, "rb") as f:
            file_content = f.read()
        doc_result = google_docai.process_document_from_content(
            project_id=DOCAI_PROJECT_ID,
            location=DOCAI_LOCATION,
            processor_id=DOCAI_PROCESSOR_ID,
            file_content=file_content,
            mime_type=mime_type,
            field_mask=FIELD_MASK,
        )
        json_output = google_docai.extract_and_convert_to_json(doc_result)
        parsed = json.loads(json_output)
        for entity in parsed:
            entity_type = entity.get("type")
            mention_text = entity.get("mention_text", "")
            # Also handle nested structures: when an entity carries
            # "properties", only the properties are recorded (keyed by their
            # own type); the parent's mention_text is not stored.
            if entity_type and "properties" in entity:
                for prop in entity["properties"]:
                    prop_type = prop.get("type")
                    prop_text = prop.get("mention_text", "")
                    if prop_type:
                        # A later value of the same type in the same file
                        # overwrites the earlier one.
                        all_results.setdefault(prop_type, {})[filename] = prop_text
            elif entity_type:
                all_results.setdefault(entity_type, {})[filename] = mention_text
    except Exception as e:
        # Best-effort batch: any failure for this file is recorded under the
        # "ERROR" key and the loop moves on to the next file.
        all_results.setdefault("ERROR", {})[filename] = str(e)
 # Final layout after the transpose below: rows = field names, columns = filenames
 # Build the DataFrame (filenames as rows, field names as columns)
 df = pd.DataFrame.from_dict(all_results)
 # Apply the transpose here
 df = df.T  # swap rows and columns
 # Save to Excel
 df.to_excel(OUTPUT_EXCEL, index=True, engine="openpyxl")
 print(f"✅ 엑셀이 '{OUTPUT_EXCEL}'로 저장되었어여~ (행: 항목명, 열: 파일명)")
--- a/workspace/routers/google_docai.py
+++ b/workspace/routers/google_docai.py
@@ -1,5 +1,3 @@
 # google_docai.py
 import json
 import os
 from typing import Optional
@@ -7,21 +5,37 @@ from typing import Optional
 from google.api_core.client_options import ClientOptions
 from google.cloud import documentai
-if not os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):  # 이미 설정되어 있지 않다면
+# Google Cloud 인증 정보 설정 (환경 변수가 설정되어 있지 않은 경우)
 if not os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = (
-        "/home/jackjack/test/doc_ai/workspace/drawingpdfocr-461103-2441e0b34216.json"  # 이 경로가 API 서버 실행 시점에서 유효해야 함
+        "/home/jackjack/test/doc_ai/workspace/drawingpdfocr-461103-2441e0b34216.json"
    )
-def process_document_from_content(  # 함수 이름 및 파라미터 변경
+def process_document_from_content(
    project_id: str,
    location: str,
    processor_id: str,
-    file_content: bytes,  # file_path 대신 file_content (bytes)
+    file_content: bytes,
    mime_type: str,
    field_mask: Optional[str] = None,
    processor_version_id: Optional[str] = None,
 ) -> documentai.Document:
    """
    주어진 파일 콘텐츠를 사용하여 Document AI 프로세서를 통해 문서를 처리합니다.
    Args:
        project_id (str): Google Cloud 프로젝트 ID.
        location (str): 프로세서 위치 (예: "us", "asia-east1").
        processor_id (str): 프로세서 ID.
        file_content (bytes): 처리할 파일의 바이너리 콘텐츠.
        mime_type (str): 파일의 MIME 타입 (예: "application/pdf").
        field_mask (Optional[str]): 추출할 필드를 지정하는 필드 마스크.
        processor_version_id (Optional[str]): 특정 프로세서 버전 ID.
    Returns:
        documentai.Document: 처리된 문서 객체.
    """
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    client = documentai.DocumentProcessorServiceClient(client_options=opts)
@@ -32,10 +46,9 @@ def process_document_from_content(  # 함수 이름 및 파라미터 변경
    else:
        name = client.processor_path(project_id, location, processor_id)
    # 파일 읽기 부분이 사라지고, file_content를 직접 사용
    raw_document = documentai.RawDocument(content=file_content, mime_type=mime_type)
-    # 예시: 첫 페이지만 처리 (필요에 따라 수정)
+    # 예시: 첫 페이지만 처리 (필요에 따라 수정 가능)
    process_options = documentai.ProcessOptions(
        individual_page_selector=documentai.ProcessOptions.IndividualPageSelector(
            pages=[1]
@@ -55,16 +68,51 @@ def process_document_from_content(  # 함수 이름 및 파라미터 변경
 def extract_and_convert_to_json(
    document: documentai.Document,
 ) -> str:
    """
    Extract entity information from a Document AI document object and convert it to a JSON string.
    Nested entities exposed through 'properties' are handled as well (one level deep).
    Each entity becomes a dict with these optional keys:
        "type" / "mention_text": included only when the value is non-empty.
        "confidence": included whenever the attribute exists.
        "properties": list of dicts with the same three keys, included only when non-empty.
    Args:
        document (documentai.Document): Document object processed by Document AI.
    Returns:
        str: JSON string (a list of entity dicts, indent=2, non-ASCII kept as-is);
            "[]" when the document is falsy or has no entities.
    """
    extracted_entities = []
    if document and document.entities:
        for entity in document.entities:
-            if (
+            entity_data = {}
-                hasattr(entity, "type_")
+
-                and hasattr(entity, "mention_text")
+            # Extract the basic entity attributes
-                and entity.type_
+            if hasattr(entity, "type_") and entity.type_:
-                and entity.mention_text
+                entity_data["type"] = entity.type_
-            ):
+            if hasattr(entity, "mention_text") and entity.mention_text:
-                extracted_entities.append(
+                entity_data["mention_text"] = entity.mention_text
-                    {"type": entity.type_, "mention_text": entity.mention_text}
+            if hasattr(entity, "confidence"):
-                )
+                entity_data["confidence"] = entity.confidence
            # Handle nested entities found in the 'properties' field.
            # Depending on the Document AI entity model, properties may be present.
            if hasattr(entity, "properties") and entity.properties:
                # properties may be a list, so iterate over it.
                nested_properties = []
                for prop in entity.properties:
                    prop_data = {}
                    if hasattr(prop, "type_") and prop.type_:
                        prop_data["type"] = prop.type_
                    if hasattr(prop, "mention_text") and prop.mention_text:
                        prop_data["mention_text"] = prop.mention_text
                    if hasattr(prop, "confidence"):
                        prop_data["confidence"] = prop.confidence
                    if prop_data:  # append only when something was extracted
                        nested_properties.append(prop_data)
                if nested_properties:  # attach only when at least one nested property exists
                    entity_data["properties"] = nested_properties
            # After all data has been gathered, add this entity's dict to the list
            if entity_data:
                extracted_entities.append(entity_data)
    # Return as JSON
    return json.dumps(extracted_entities, ensure_ascii=False, indent=2)
--- a/workspace/utils/config.py
+++ b/workspace/utils/config.py
@@ -14,3 +14,8 @@ CORS_ALLOW_ORIGINS = os.getenv("CORS_ALLOW_ORIGINS", "*").split(",")
 CORS_ALLOW_CREDENTIALS = os.getenv("CORS_ALLOW_CREDENTIALS", "true").lower() == "true"
 CORS_ALLOW_METHODS = os.getenv("CORS_ALLOW_METHODS", "*").split(",")
 CORS_ALLOW_HEADERS = os.getenv("CORS_ALLOW_HEADERS", "*").split(",")
 # api
 DOCAI_PROJECT_ID = "drawingpdfocr-461103"
 DOCAI_LOCATION = "us"