import json
import os
from typing import Optional, Sequence

from google.api_core.client_options import ClientOptions
from google.cloud import documentai  # type: ignore

# Point the Google client libraries at the service-account key file.
# NOTE: this unconditionally replaces any GOOGLE_APPLICATION_CREDENTIALS value
# already present in the environment, and the path is relative, so it depends
# on the directory the script is started from. Double-check that it is correct.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = (
    "../drawingpdfocr-461103-2441e0b34216.json"
)


def process_document_sample(
    project_id: str,
    location: str,
    processor_id: str,
    file_path: str,
    mime_type: str,
    field_mask: Optional[str] = None,
    processor_version_id: Optional[str] = None,
    pages: Optional[Sequence[int]] = None,
) -> documentai.Document:
    """Send a local file to a Document AI processor and return the result.

    Args:
        project_id: Google Cloud project that owns the processor.
        location: Processor location; also used to build the API endpoint
            ``{location}-documentai.googleapis.com``.
        processor_id: Identifier of the processor to call.
        file_path: Path of the file to upload; it is read fully into memory.
        mime_type: MIME type sent along with the raw file content.
        field_mask: Optional field mask forwarded unchanged to the request.
        processor_version_id: When given, the request targets this specific
            processor version instead of the processor itself.
        pages: Page indices handed to ``IndividualPageSelector``. Defaults to
            ``[1]``, which matches the previous hard-coded behaviour.

    Returns:
        The ``document`` attribute of the ``process_document`` response.

    Raises:
        OSError: If ``file_path`` cannot be opened or read (for example
            ``FileNotFoundError``). Errors raised by the client library
            propagate to the caller unchanged.
    """
    # The endpoint is derived from the location passed in by the caller.
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    # Resolve the full resource name of either a pinned version or the processor.
    if processor_version_id:
        name = client.processor_version_path(
            project_id, location, processor_id, processor_version_id
        )
    else:
        name = client.processor_path(project_id, location, processor_id)

    with open(file_path, "rb") as source_file:
        content = source_file.read()

    raw_document = documentai.RawDocument(content=content, mime_type=mime_type)

    # Only the selected pages are processed; keep the historical default of [1].
    selected_pages = [1] if pages is None else list(pages)
    process_options = documentai.ProcessOptions(
        individual_page_selector=documentai.ProcessOptions.IndividualPageSelector(
            pages=selected_pages
        )
    )

    request = documentai.ProcessRequest(
        name=name,
        raw_document=raw_document,
        field_mask=field_mask,
        process_options=process_options,
    )

    result = client.process_document(request=request)
    return result.document


def extract_and_convert_to_json(
    document: documentai.Document,
) -> str:
    """Serialize the entities of a Document AI document to a JSON string.

    Each entity contributes one object with the keys ``"type"`` (taken from
    ``entity.type_``) and ``"mention_text"``. Entities for which either
    attribute is missing or empty are skipped. A falsy ``document`` or one
    without entities yields ``"[]"``.

    Args:
        document: The document whose ``entities`` are read.

    Returns:
        A JSON array as a string, indented by two spaces, with non-ASCII
        characters written as-is rather than escaped.
    """
    entities = document.entities if document else ()
    extracted_entities = [
        {"type": entity.type_, "mention_text": entity.mention_text}
        for entity in entities
        # getattr with a default tolerates objects lacking either attribute,
        # exactly like the hasattr checks this replaces.
        if getattr(entity, "type_", None) and getattr(entity, "mention_text", None)
    ]
    return json.dumps(extracted_entities, ensure_ascii=False, indent=2)


def main() -> None:
    """Process the sample PDF and print its entities as JSON."""
    project_id = "drawingpdfocr-461103"
    location = "us"
    processor_id = "b838676d4e3b4758"
    file_path = "../data/UPLOAD_DOCS/3공구-설계도1-004.pdf"
    mime_type = "application/pdf"

    try:
        document_result = process_document_sample(
            project_id=project_id,
            location=location,
            processor_id=processor_id,
            file_path=file_path,
            mime_type=mime_type,
            # The field mask is set so that entity information is returned.
            field_mask="text,entities",
        )
        if document_result:
            json_output_string = extract_and_convert_to_json(document_result)
            print(json_output_string)
    except FileNotFoundError:
        print(
            f"오류: 파일 경로 '{file_path}'에서 파일을 찾을 수 없습니다. 파일 경로를 확인해주세요."
        )
    except Exception as e:
        # Top-level boundary of the script: report the failure instead of a traceback.
        print(f"함수 실행 중 오류 발생: {e}")


if __name__ == "__main__":
    main()