kec ocr 추가
This commit is contained in:
73
workspace/routers/batch_docai_to_excel.py
Normal file
73
workspace/routers/batch_docai_to_excel.py
Normal file
@@ -0,0 +1,73 @@
|
||||
import json
|
||||
import mimetypes
|
||||
import os
|
||||
|
||||
import google_docai
|
||||
import pandas as pd
|
||||
from tqdm import tqdm
|
||||
|
||||
# Configuration: Document AI processor coordinates and local input/output paths.
DOCAI_PROJECT_ID = "drawingpdfocr-461103"
DOCAI_LOCATION = "us"
DOCAI_PROCESSOR_ID = "94de4322c20d276f"
# Field mask sent with every processing request.
FIELD_MASK = "text,entities"
FOLDER_PATH = "/home/jackjack/test/doc_ai/workspace/routers/kec_data"
OUTPUT_EXCEL = "KEC_results_transposed.xlsx"
# Lower-cased file extensions that will be processed; anything else in
# FOLDER_PATH is skipped. ".tif" is listed next to ".tiff" so TIFF files
# using the short extension are not silently ignored.
ALLOWED_EXT = [".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".tif"]

# Accumulated results: key = entity type, value = {filename: mention_text}.
all_results = {}
|
||||
|
||||
# Send every supported file in FOLDER_PATH through Document AI and collect the
# returned entities into all_results ({entity type: {filename: mention_text}}).
# Files are visited in sorted order so the resulting layout is reproducible
# (os.listdir returns entries in arbitrary order).
for filename in tqdm(sorted(os.listdir(FOLDER_PATH))):
    ext = os.path.splitext(filename)[-1].lower()
    if ext not in ALLOWED_EXT:
        continue

    file_path = os.path.join(FOLDER_PATH, filename)
    mime_type, _ = mimetypes.guess_type(file_path)

    try:
        with open(file_path, "rb") as f:
            file_content = f.read()

        doc_result = google_docai.process_document_from_content(
            project_id=DOCAI_PROJECT_ID,
            location=DOCAI_LOCATION,
            processor_id=DOCAI_PROCESSOR_ID,
            file_content=file_content,
            mime_type=mime_type,
            field_mask=FIELD_MASK,
        )

        json_output = google_docai.extract_and_convert_to_json(doc_result)
        parsed = json.loads(json_output)

        for entity in parsed:
            entity_type = entity.get("type")
            if not entity_type:
                # Entities without a type cannot be assigned to a result key.
                continue

            # Nested structure: when the entity carries child properties,
            # record each typed property instead of the parent entity.
            # A missing, null, or empty "properties" value falls through to
            # the parent's own mention_text so the entity is never dropped
            # and a null value does not abort the whole file.
            properties = entity.get("properties")
            if properties:
                for prop in properties:
                    prop_type = prop.get("type")
                    prop_text = prop.get("mention_text", "")
                    if prop_type:
                        all_results.setdefault(prop_type, {})[filename] = prop_text
            else:
                mention_text = entity.get("mention_text", "")
                all_results.setdefault(entity_type, {})[filename] = mention_text

    except Exception as e:
        # Best effort: note the failure for this file under the "ERROR" key
        # and carry on with the remaining files.
        all_results.setdefault("ERROR", {})[filename] = str(e)
|
||||
|
||||
# Build the table from all_results and transpose it in one step.
# Before the transpose, filenames are rows and item names are columns;
# afterwards rows are item names and columns are filenames.
df = pd.DataFrame.from_dict(all_results).transpose()

# Save to Excel, keeping the index (item names) as the first column.
df.to_excel(OUTPUT_EXCEL, index=True, engine="openpyxl")

print(f"✅ 엑셀이 '{OUTPUT_EXCEL}'로 저장되었어여~ (행: 항목명, 열: 파일명)")
|
||||
Reference in New Issue
Block a user