# Listing header from the original code viewer: 74 lines, 2.3 KiB, Python
import json
import mimetypes
import os

import google_docai
import pandas as pd
from tqdm import tqdm

# Settings
DOCAI_PROJECT_ID = "drawingpdfocr-461103"
DOCAI_LOCATION = "us"
DOCAI_PROCESSOR_ID = "94de4322c20d276f"
FIELD_MASK = "text,entities"
FOLDER_PATH = "/home/jackjack/test/doc_ai/workspace/routers/kec_data"
OUTPUT_EXCEL = "KEC_results_transposed.xlsx"
ALLOWED_EXT = [".pdf", ".png", ".jpg", ".jpeg", ".tiff"]

# Aggregated results: {entity type: {filename: mention_text}}
all_results = {}


def record_entities(parsed: list, filename: str, results: dict) -> None:
    """Store the entities parsed from one file into *results*.

    Args:
        parsed: Iterable of entity dicts. Each may carry ``type``,
            ``mention_text`` and a nested ``properties`` list of dicts with
            the same ``type`` / ``mention_text`` keys.
        filename: Key under which this file's values are stored.
        results: Mapping ``{type: {filename: mention_text}}``, updated in
            place. A later entity of the same type in the same file
            overwrites the earlier one.
    """
    for entity in parsed:
        entity_type = entity.get("type")
        if not entity_type:
            # Entities without a type cannot be placed in any row.
            continue

        properties = entity.get("properties")
        if properties:
            # Nested structure: record each typed property; the parent's own
            # mention_text is intentionally not recorded in this case.
            for prop in properties:
                prop_type = prop.get("type")
                if prop_type:
                    results.setdefault(prop_type, {})[filename] = prop.get(
                        "mention_text", ""
                    )
        else:
            # Truthiness check (not `"properties" in entity`): an empty or
            # None `properties` value must not swallow the entity's own text.
            results.setdefault(entity_type, {})[filename] = entity.get(
                "mention_text", ""
            )


def build_results_frame(results: dict) -> pd.DataFrame:
    """Return a DataFrame with one row per entity type and one column per file.

    ``DataFrame.from_dict`` with its default orientation turns the outer keys
    (entity types) into columns, so the frame is transposed afterwards.
    """
    return pd.DataFrame.from_dict(results).T


def main() -> None:
    """Process every allowed file in FOLDER_PATH and write OUTPUT_EXCEL."""
    # Sorted so the column order of the workbook is reproducible;
    # os.listdir returns entries in arbitrary order.
    for filename in tqdm(sorted(os.listdir(FOLDER_PATH))):
        ext = os.path.splitext(filename)[-1].lower()
        if ext not in ALLOWED_EXT:
            continue

        file_path = os.path.join(FOLDER_PATH, filename)
        mime_type, _ = mimetypes.guess_type(file_path)

        try:
            with open(file_path, "rb") as f:
                file_content = f.read()

            doc_result = google_docai.process_document_from_content(
                project_id=DOCAI_PROJECT_ID,
                location=DOCAI_LOCATION,
                processor_id=DOCAI_PROCESSOR_ID,
                file_content=file_content,
                mime_type=mime_type,
                field_mask=FIELD_MASK,
            )

            json_output = google_docai.extract_and_convert_to_json(doc_result)
            record_entities(json.loads(json_output), filename, all_results)
        except Exception as e:
            # Deliberate best-effort: one failing file must not abort the
            # batch. The message lands in an "ERROR" row for that file; any
            # entities recorded before the failure are kept.
            all_results.setdefault("ERROR", {})[filename] = str(e)

    df = build_results_frame(all_results)

    # Save to Excel
    df.to_excel(OUTPUT_EXCEL, index=True, engine="openpyxl")

    print(f"✅ 엑셀이 '{OUTPUT_EXCEL}'로 저장되었어여~ (행: 항목명, 열: 파일명)")


if __name__ == "__main__":
    main()