# doc_ai/workspace/routers/google_docai.py

# google_docai.py

import json
import os
from typing import Optional, Sequence

from google.api_core.client_options import ClientOptions
from google.cloud import documentai

# Fall back to a fixed service-account key file when the environment does not
# already provide a non-empty GOOGLE_APPLICATION_CREDENTIALS value.
# NOTE(review): the fallback is an absolute path on one machine; it must be
# valid at the moment the API server starts.
_CREDENTIALS_ENV_VAR = "GOOGLE_APPLICATION_CREDENTIALS"
_FALLBACK_CREDENTIALS_FILE = (
    "/home/jackjack/test/doc_ai/workspace/drawingpdfocr-461103-2441e0b34216.json"
)
if not os.environ.get(_CREDENTIALS_ENV_VAR):
    os.environ[_CREDENTIALS_ENV_VAR] = _FALLBACK_CREDENTIALS_FILE


def process_document_from_content(
    project_id: str,
    location: str,
    processor_id: str,
    file_content: bytes,
    mime_type: str,
    field_mask: Optional[str] = None,
    processor_version_id: Optional[str] = None,
    pages: Optional[Sequence[int]] = (1,),
) -> documentai.Document:
    """Run a Document AI processor on in-memory file bytes.

    The file is not read from disk; the caller supplies its raw bytes.

    Args:
        project_id: Google Cloud project that owns the processor.
        location: Processor location; also used to build the regional API
            endpoint ``{location}-documentai.googleapis.com``.
        processor_id: Identifier of the processor to call.
        file_content: Raw bytes of the document to process.
        mime_type: MIME type describing ``file_content``.
        field_mask: Forwarded unchanged as ``ProcessRequest.field_mask``.
        processor_version_id: When truthy, the request targets this specific
            processor version instead of the processor itself.
        pages: Page numbers to process, forwarded to
            ``ProcessOptions.IndividualPageSelector``. Defaults to ``(1,)``
            (first page only), which matches the previous hard-coded
            behaviour. Pass ``None`` or an empty sequence to send the
            request without any page selector.

    Returns:
        The ``document`` attribute of the ``process_document`` response.
    """
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    if processor_version_id:
        name = client.processor_version_path(
            project_id, location, processor_id, processor_version_id
        )
    else:
        name = client.processor_path(project_id, location, processor_id)

    # The bytes supplied by the caller are sent as-is.
    raw_document = documentai.RawDocument(content=file_content, mime_type=mime_type)

    request_kwargs = {
        "name": name,
        "raw_document": raw_document,
        "field_mask": field_mask,
    }
    # Only attach process options when specific pages were requested, so that
    # pages=None / () produces a request with no page restriction at all.
    if pages:
        request_kwargs["process_options"] = documentai.ProcessOptions(
            individual_page_selector=documentai.ProcessOptions.IndividualPageSelector(
                pages=list(pages)
            )
        )

    request = documentai.ProcessRequest(**request_kwargs)
    result = client.process_document(request=request)
    return result.document


def extract_and_convert_to_json(
    document: documentai.Document,
) -> str:
    """Serialize the usable entities of *document* to a JSON array string.

    An entity is kept only when both its ``type_`` and ``mention_text``
    attributes exist and are truthy. Each kept entity becomes an object with
    the keys ``"type"`` and ``"mention_text"``. A falsy document, or one with
    no entities, yields an empty JSON array.

    Args:
        document: Object exposing an ``entities`` iterable.

    Returns:
        A JSON string, indented by two spaces, with non-ASCII characters
        written verbatim rather than escaped.
    """
    source_entities = (document.entities if document else None) or ()
    records = [
        {"type": item.type_, "mention_text": item.mention_text}
        for item in source_entities
        # getattr with a default covers both "missing" and "empty" in one check.
        if getattr(item, "type_", None) and getattr(item, "mention_text", None)
    ]
    return json.dumps(records, ensure_ascii=False, indent=2)