api 구축

This commit is contained in:
2025-06-04 15:25:36 +09:00
parent 04536eabd6
commit 5510529a36
7 changed files with 698 additions and 0 deletions

View File

@@ -0,0 +1,70 @@
# google_docai.py
import json
import os
from typing import Optional, Sequence

from google.api_core.client_options import ClientOptions
from google.cloud import documentai
# Fall back to a fixed service-account key file when the environment does not
# already provide one. An empty value is treated the same as an unset one.
# The path has to be valid on the host at the moment the API server starts.
# NOTE(review): this is an absolute path on a developer machine; consider
# supplying the variable through deployment configuration instead.
if not os.environ.get("GOOGLE_APPLICATION_CREDENTIALS"):
    os.environ[
        "GOOGLE_APPLICATION_CREDENTIALS"
    ] = "/home/jackjack/test/doc_ai/workspace/drawingpdfocr-461103-2441e0b34216.json"
def process_document_from_content(
    project_id: str,
    location: str,
    processor_id: str,
    file_content: bytes,
    mime_type: str,
    field_mask: Optional[str] = None,
    processor_version_id: Optional[str] = None,
    pages: Optional[Sequence[int]] = (1,),
) -> documentai.Document:
    """Send in-memory document bytes to a Document AI processor.

    The document is passed as raw bytes, so nothing is read from disk here.

    Args:
        project_id: Project used to build the processor resource name.
        location: Processor location; also used to build the regional
            endpoint ``{location}-documentai.googleapis.com``.
        processor_id: Identifier of the processor to call.
        file_content: Raw bytes of the document to process.
        mime_type: MIME type sent alongside ``file_content``.
        field_mask: Optional field mask forwarded unchanged on the request.
        processor_version_id: When given, the request addresses that
            processor version; otherwise the processor itself.
        pages: Page numbers handed to the individual page selector. The
            default ``(1,)`` keeps the previous hard-coded behaviour of
            processing only the first page. Pass ``None`` or an empty
            sequence to send the request without any page selector
            (presumably the service then handles every page; confirm
            against the Document AI documentation and page limits).

    Returns:
        The ``document`` attribute of the ``process_document`` response.

    Exceptions raised by the client call are not caught and propagate to
    the caller.
    """
    endpoint = f"{location}-documentai.googleapis.com"
    client = documentai.DocumentProcessorServiceClient(
        client_options=ClientOptions(api_endpoint=endpoint)
    )

    # A specific version gets a version resource name; otherwise address
    # the processor directly.
    if processor_version_id:
        name = client.processor_version_path(
            project_id, location, processor_id, processor_version_id
        )
    else:
        name = client.processor_path(project_id, location, processor_id)

    request_kwargs = {
        "name": name,
        "raw_document": documentai.RawDocument(
            content=file_content, mime_type=mime_type
        ),
        "field_mask": field_mask,
    }
    # Only attach process options when the caller actually restricts pages,
    # so "no restriction" is expressed by leaving the field unset.
    if pages:
        request_kwargs["process_options"] = documentai.ProcessOptions(
            individual_page_selector=documentai.ProcessOptions.IndividualPageSelector(
                pages=list(pages)
            )
        )

    response = client.process_document(
        request=documentai.ProcessRequest(**request_kwargs)
    )
    return response.document
def extract_and_convert_to_json(
    document: documentai.Document,
) -> str:
    """Serialize a document's entities to a pretty-printed JSON array.

    Every entity that has both a non-empty ``type_`` and a non-empty
    ``mention_text`` becomes ``{"type": ..., "mention_text": ...}``.
    Entities missing either attribute, or with an empty value for either,
    are skipped. A falsy ``document`` or one without entities yields
    ``"[]"``.

    The JSON keeps non-ASCII characters unescaped and uses a two-space
    indent.
    """
    rows = []
    entities = document.entities if document else None
    for item in entities or ():
        # getattr with a default covers both "attribute absent" and
        # "attribute empty" in a single truthiness check.
        kind = getattr(item, "type_", None)
        text = getattr(item, "mention_text", None)
        if kind and text:
            rows.append({"type": kind, "mention_text": text})
    return json.dumps(rows, ensure_ascii=False, indent=2)