import json
import os
from typing import Optional, Sequence

from google.api_core.client_options import ClientOptions
from google.cloud import documentai

# Fall back to a local service-account key file when the environment does not
# already provide Google Cloud credentials (variable unset or empty).
# NOTE(review): the fallback is a developer-specific absolute path; on other
# machines the variable presumably has to be set externally -- confirm.
if os.environ.get("GOOGLE_APPLICATION_CREDENTIALS", "") == "":
    os.environ[
        "GOOGLE_APPLICATION_CREDENTIALS"
    ] = "/home/jackjack/test/doc_ai/workspace/drawingpdfocr-461103-2441e0b34216.json"


def process_document_from_content(
    project_id: str,
    location: str,
    processor_id: str,
    file_content: bytes,
    mime_type: str,
    field_mask: Optional[str] = None,
    processor_version_id: Optional[str] = None,
    pages: Optional[Sequence[int]] = (1,),
) -> documentai.Document:
    """Process in-memory file content with a Document AI processor.

    Args:
        project_id: Google Cloud project ID.
        location: Processor location (e.g. "us", "asia-east1"); also used to
            build the regional API endpoint host name.
        processor_id: Processor ID.
        file_content: Binary content of the file to process.
        mime_type: MIME type of the file (e.g. "application/pdf").
        field_mask: Optional field mask forwarded to the process request to
            select which document fields are returned.
        processor_version_id: Optional processor version ID. When given, the
            request targets that processor version instead of the processor.
        pages: Page numbers handed to the request's individual page
            selector. Defaults to ``(1,)``, i.e. only the first page, which
            is what this function always did before the parameter existed.
            Pass ``None`` or an empty sequence to send the request without
            any page selector.

    Returns:
        The ``document`` attribute of the process response.
    """
    # The endpoint host is derived from the processor location.
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    # Build the resource name: a specific processor version when requested,
    # otherwise the processor itself.
    if processor_version_id:
        name = client.processor_version_path(
            project_id, location, processor_id, processor_version_id
        )
    else:
        name = client.processor_path(project_id, location, processor_id)

    request_fields = {
        "name": name,
        "raw_document": documentai.RawDocument(
            content=file_content, mime_type=mime_type
        ),
        "field_mask": field_mask,
    }

    # Only attach process options when a page restriction was requested, so
    # that "no restriction" results in a request without a page selector.
    if pages:
        request_fields["process_options"] = documentai.ProcessOptions(
            individual_page_selector=documentai.ProcessOptions.IndividualPageSelector(
                pages=list(pages)
            )
        )

    request = documentai.ProcessRequest(**request_fields)
    return client.process_document(request=request).document


def _entity_to_dict(entity) -> dict:
    """Convert one entity (or nested property) into a plain dict.

    Only populated fields are emitted: ``type`` and ``mention_text`` are
    skipped when missing or empty, ``confidence`` is copied whenever the
    attribute exists, and ``properties`` is added only when at least one
    nested property produced data. Nested properties are converted with the
    same rules, recursively, so every nesting level is preserved.
    """
    data = {}

    entity_type = getattr(entity, "type_", None)
    if entity_type:
        data["type"] = entity_type

    mention_text = getattr(entity, "mention_text", None)
    if mention_text:
        data["mention_text"] = mention_text

    # Confidence is kept even when it is 0.0, hence the attribute-existence
    # check instead of a truthiness check.
    if hasattr(entity, "confidence"):
        data["confidence"] = entity.confidence

    nested = [
        prop_data
        for prop_data in map(
            _entity_to_dict, getattr(entity, "properties", None) or ()
        )
        if prop_data
    ]
    if nested:
        data["properties"] = nested

    return data


def extract_and_convert_to_json(
    document: "documentai.Document",
) -> str:
    """Extract entity information from a processed document as JSON text.

    Each entity becomes an object with the optional keys ``type``,
    ``mention_text``, ``confidence`` and ``properties``. ``properties`` holds
    the entity's nested properties in the same format, at any nesting depth.

    Args:
        document: Document object produced by Document AI. A falsy document
            or one without entities yields an empty JSON array.

    Returns:
        A JSON array string (indented by 2 spaces, non-ASCII characters kept
        as-is) describing the extracted entities.
    """
    entities = document.entities if document else ()
    extracted_entities = [
        entity_data
        for entity_data in map(_entity_to_dict, entities or ())
        if entity_data
    ]
    return json.dumps(extracted_entities, ensure_ascii=False, indent=2)