api 구축

2025-06-04 15:25:36 +09:00
parent 04536eabd6
commit 5510529a36
7 changed files with 698 additions and 0 deletions
--- a/workspace/routers/google_docai.py
+++ b/workspace/routers/google_docai.py
@@ -0,0 +1,70 @@
+# google_docai.py
+
+import json
+import os
+from typing import Optional, Sequence
+
+from google.api_core.client_options import ClientOptions
+from google.cloud import documentai
+
+# Fall back to a hard-coded credentials file only when the environment does not
+# already name one; an unset variable and an empty string both count as missing.
+# NOTE: the fallback path must be valid at the time the API server runs.
+_FALLBACK_CREDENTIALS_FILE = (
+    "/home/jackjack/test/doc_ai/workspace/drawingpdfocr-461103-2441e0b34216.json"
+)
+if not os.environ.get("GOOGLE_APPLICATION_CREDENTIALS"):
+    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = _FALLBACK_CREDENTIALS_FILE
+
+
+def process_document_from_content(
+    project_id: str,
+    location: str,
+    processor_id: str,
+    file_content: bytes,
+    mime_type: str,
+    field_mask: Optional[str] = None,
+    processor_version_id: Optional[str] = None,
+    *,
+    pages: Optional[Sequence[int]] = (1,),
+) -> documentai.Document:
+    """Send in-memory document bytes to a Document AI processor.
+
+    Args:
+        project_id: Project ID used to build the processor resource name.
+        location: Processor location; also used to build the regional API
+            endpoint ``{location}-documentai.googleapis.com``.
+        processor_id: Processor ID used to build the resource name.
+        file_content: Raw bytes of the document. Nothing is read from disk
+            here; the caller supplies the content directly.
+        mime_type: MIME type describing ``file_content``.
+        field_mask: Optional field mask forwarded on the request as-is.
+        processor_version_id: When given, the request targets this specific
+            processor version instead of the processor itself.
+        pages: Page numbers (starting from 1) to process. The default
+            ``(1,)`` processes only the first page, which is what this
+            function did when the selection was hard-coded. Pass ``None`` or
+            an empty sequence to send no page selector at all, leaving page
+            handling to the service defaults.
+
+    Returns:
+        The ``document`` field of the service response.
+
+    Raises:
+        Any error raised by the client call propagates to the caller
+        unchanged; nothing is caught here.
+    """
+    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
+    client = documentai.DocumentProcessorServiceClient(client_options=opts)
+
+    if processor_version_id:
+        name = client.processor_version_path(
+            project_id, location, processor_id, processor_version_id
+        )
+    else:
+        name = client.processor_path(project_id, location, processor_id)
+
+    # The content arrives as bytes, so it is wrapped directly in a RawDocument.
+    raw_document = documentai.RawDocument(content=file_content, mime_type=mime_type)
+
+    request_kwargs = {
+        "name": name,
+        "raw_document": raw_document,
+        "field_mask": field_mask,
+    }
+    # Attach a page selector only when the caller asked for specific pages.
+    if pages:
+        request_kwargs["process_options"] = documentai.ProcessOptions(
+            individual_page_selector=documentai.ProcessOptions.IndividualPageSelector(
+                pages=list(pages)
+            )
+        )
+
+    request = documentai.ProcessRequest(**request_kwargs)
+    result = client.process_document(request=request)
+    return result.document
+
+
+def extract_and_convert_to_json(
+    document: documentai.Document,
+) -> str:
+    """Serialize a document's top-level entities to a JSON array string.
+
+    Each entity that exposes non-empty ``type_`` and ``mention_text``
+    attributes becomes a ``{"type": ..., "mention_text": ...}`` object.
+    Entities that lack either attribute, or hold an empty value for one, are
+    skipped. A falsy ``document`` or one without entities yields ``[]``.
+
+    Args:
+        document: Document whose ``entities`` are exported.
+
+    Returns:
+        JSON text indented by two spaces, with non-ASCII characters written
+        as-is rather than escaped.
+    """
+    entities = (document.entities if document else None) or ()
+    # Read both attributes once per entity; a missing attribute reads as None.
+    attr_pairs = (
+        (getattr(item, "type_", None), getattr(item, "mention_text", None))
+        for item in entities
+    )
+    rows = [
+        {"type": kind, "mention_text": text}
+        for kind, text in attr_pairs
+        if kind and text
+    ]
+    return json.dumps(rows, ensure_ascii=False, indent=2)