Files
doc_ai/workspace/routers/batch_docai_to_excel.py
2025-06-12 12:33:07 +09:00

74 lines
2.3 KiB
Python

import json
import mimetypes
import os
import google_docai
import pandas as pd
from tqdm import tqdm
# Settings
# Google Document AI target: these three values are passed to
# google_docai.process_document_from_content as project_id / location /
# processor_id.
DOCAI_PROJECT_ID = "drawingpdfocr-461103"
DOCAI_LOCATION = "us"
DOCAI_PROCESSOR_ID = "94de4322c20d276f"
# Forwarded to the same call as its field_mask argument.
FIELD_MASK = "text,entities"
# Directory whose entries are listed (non-recursively, via os.listdir) and processed.
FOLDER_PATH = "/home/jackjack/test/doc_ai/workspace/routers/kec_data"
# Path of the Excel workbook written at the end of the script.
OUTPUT_EXCEL = "KEC_results_transposed.xlsx"
# File extensions that are processed; compared against the lower-cased
# extension of each directory entry, everything else is skipped.
ALLOWED_EXT = [".pdf", ".png", ".jpg", ".jpeg", ".tiff"]
# Aggregated results: key = entity type, value = {filename: mention_text}.
all_results = {}


def _record_entity(results, entity, filename):
    """Store one parsed entity, or its nested properties, under ``filename``.

    Args:
        results: Mapping of entity type -> {filename: mention_text}; mutated
            in place.
        entity: One entity dict from the parsed JSON. Only the keys "type",
            "mention_text" and "properties" are read.
        filename: Name of the source file the entity was extracted from.

    An entity without a "type" is ignored. If the entity carries a non-empty
    "properties" list, each typed property is recorded instead of the parent;
    otherwise the entity's own mention text is recorded. A later value for the
    same (type, filename) pair overwrites the earlier one.
    """
    entity_type = entity.get("type")
    if not entity_type:
        return

    # ``or []`` treats a missing, None, or empty "properties" value as
    # "not nested", so the entity's own text is kept instead of being dropped
    # (empty list) or raising a TypeError while iterating (None).
    properties = entity.get("properties") or []
    if not properties:
        results.setdefault(entity_type, {})[filename] = entity.get("mention_text", "")
        return

    for prop in properties:
        prop_type = prop.get("type")
        if prop_type:
            results.setdefault(prop_type, {})[filename] = prop.get("mention_text", "")


# os.listdir returns entries in arbitrary order; sorting makes the processing
# order (and therefore the layout of the resulting sheet) reproducible.
for filename in tqdm(sorted(os.listdir(FOLDER_PATH))):
    ext = os.path.splitext(filename)[-1].lower()
    if ext not in ALLOWED_EXT:
        continue

    file_path = os.path.join(FOLDER_PATH, filename)
    mime_type, _ = mimetypes.guess_type(file_path)

    try:
        with open(file_path, "rb") as f:
            file_content = f.read()

        doc_result = google_docai.process_document_from_content(
            project_id=DOCAI_PROJECT_ID,
            location=DOCAI_LOCATION,
            processor_id=DOCAI_PROCESSOR_ID,
            file_content=file_content,
            mime_type=mime_type,
            field_mask=FIELD_MASK,
        )
        # NOTE(review): the helper's output is expected to be a JSON array of
        # entity objects - confirm against google_docai.
        json_output = google_docai.extract_and_convert_to_json(doc_result)
        for entity in json.loads(json_output):
            _record_entity(all_results, entity, filename)
    except Exception as e:
        # Best-effort batch: a failure on one file is recorded in the "ERROR"
        # row for that file and the run continues with the next file.
        all_results.setdefault("ERROR", {})[filename] = str(e)
# Build the table from the nested dict. from_dict() yields one column per
# entity type and one row per filename, so transpose it to get the final
# layout: rows = entity types, columns = filenames.
df = pd.DataFrame.from_dict(all_results).transpose()

# Save to Excel, keeping the index (entity types) as the first column.
df.to_excel(OUTPUT_EXCEL, engine="openpyxl", index=True)
print(f"✅ 엑셀이 '{OUTPUT_EXCEL}'로 저장되었어여~ (행: 항목명, 열: 파일명)")