"""Batch-run documents through a Document AI processor and export to Excel.

Every file in FOLDER_PATH whose extension is listed in ALLOWED_EXT is read,
sent through ``google_docai.process_document_from_content`` and the returned
entities are collected into ``all_results``.  The collected values are then
written to OUTPUT_EXCEL with one row per entity type and one column per file.
"""

import json
import mimetypes
import os

import google_docai
import pandas as pd
from tqdm import tqdm

# Configuration
DOCAI_PROJECT_ID = "drawingpdfocr-461103"
DOCAI_LOCATION = "us"
DOCAI_PROCESSOR_ID = "94de4322c20d276f"
FIELD_MASK = "text,entities"

FOLDER_PATH = "/home/jackjack/test/doc_ai/workspace/routers/kec_data"
OUTPUT_EXCEL = "KEC_results_transposed.xlsx"
ALLOWED_EXT = [".pdf", ".png", ".jpg", ".jpeg", ".tiff"]


def _record_entities(parsed, filename, results):
    """Store the text of each entity in ``results[type][filename]``.

    Args:
        parsed: Iterable of entity dicts. Each is read via the keys ``type``,
            ``mention_text`` and (optionally) ``properties``, where
            ``properties`` is a list of dicts with the same ``type`` /
            ``mention_text`` keys.
        filename: Name of the file the entities came from; used as inner key.
        results: Mapping ``{type: {filename: mention_text}}``, updated in
            place.

    An entity with a non-empty ``properties`` list contributes only its
    properties; otherwise its own ``mention_text`` is stored.  Entities and
    properties without a ``type`` are skipped.
    """
    for entity in parsed:
        entity_type = entity.get("type")
        if not entity_type:
            continue

        # Test for a non-empty list rather than mere key presence: an entity
        # carrying ``"properties": []`` (or None) must still contribute its
        # own text instead of being dropped.
        properties = entity.get("properties")
        if properties:
            for prop in properties:
                prop_type = prop.get("type")
                if prop_type:
                    results.setdefault(prop_type, {})[filename] = prop.get(
                        "mention_text", ""
                    )
        else:
            # NOTE(review): a later entity of the same type in the same file
            # overwrites the earlier one -- confirm that is intended.
            results.setdefault(entity_type, {})[filename] = entity.get(
                "mention_text", ""
            )


# All results: key = entity type, value = {filename: mention_text}
all_results = {}

# os.listdir() returns entries in arbitrary order; sort so that the
# processing order is the same on every run.
for filename in tqdm(sorted(os.listdir(FOLDER_PATH))):
    ext = os.path.splitext(filename)[-1].lower()
    if ext not in ALLOWED_EXT:
        continue

    file_path = os.path.join(FOLDER_PATH, filename)
    mime_type, _ = mimetypes.guess_type(file_path)

    # Best effort per file: any failure (read, service call, JSON parsing)
    # is recorded under the "ERROR" key and the batch continues.
    try:
        with open(file_path, "rb") as f:
            file_content = f.read()

        doc_result = google_docai.process_document_from_content(
            project_id=DOCAI_PROJECT_ID,
            location=DOCAI_LOCATION,
            processor_id=DOCAI_PROCESSOR_ID,
            file_content=file_content,
            mime_type=mime_type,
            field_mask=FIELD_MASK,
        )

        json_output = google_docai.extract_and_convert_to_json(doc_result)
        parsed = json.loads(json_output)

        # Flatten top-level entities and their nested properties.
        _record_entities(parsed, filename, all_results)

    except Exception as e:
        all_results.setdefault("ERROR", {})[filename] = str(e)

# from_dict() on {type: {filename: text}} yields rows = filenames and
# columns = entity types.
df = pd.DataFrame.from_dict(all_results)

# Transpose so that rows = entity types and columns = filenames.
df = df.T

# Save to Excel
df.to_excel(OUTPUT_EXCEL, index=True, engine="openpyxl")
print(f"✅ 엑셀이 '{OUTPUT_EXCEL}'로 저장되었어여~ (행: 항목명, 열: 파일명)")