Build API
.gitignore (vendored, new file, 2 lines)
@@ -0,0 +1,2 @@
.venv
/workspace/data

requirements.txt (new file, 309 lines)
@@ -0,0 +1,309 @@
aiohappyeyeballs==2.4.4
aiohttp==3.11.11
aiosignal==1.3.2
alembic==1.14.0
altair==5.5.0
annotated-types==0.7.0
anthropic==0.45.2
antlr4-python3-runtime==4.9.3
anyio==4.8.0
asteroid-filterbanks==0.4.0
asttokens==3.0.0
async-timeout==4.0.3
attrs==21.2.0
audioread==3.0.1
Automat==20.2.0
azure-ai-documentintelligence==1.0.0
azure-core==1.32.0
azure-identity==1.20.0
Babel==2.8.0
bcrypt==3.2.0
beautifulsoup4==4.13.3
blinker==1.4
cachetools==5.5.2
certifi==2020.6.20
cffi==1.17.1
chardet==4.0.0
charset-normalizer==3.4.1
click==8.1.8
cloud-init==24.3.1
cobble==0.1.4
colorama==0.4.4
colorlog==6.9.0
comm==0.2.2
command-not-found==0.3
configobj==5.0.6
constantly==15.1.0
contourpy==1.3.1
cryptography==44.0.1
cycler==0.12.1
Cython==3.0.11
dbus-python==1.2.18
debugpy==1.8.12
decorator==5.1.1
deepl==1.21.0
defusedxml==0.7.1
distro==1.7.0
distro-info==1.1+ubuntu0.2
dnspython==2.7.0
docker==5.0.3
dockerpty==0.4.1
docopt==0.6.2
dtw-python==1.5.3
einops==0.8.0
elastic-transport==8.17.1
elasticsearch==8.17.2
email_validator==2.2.0
et_xmlfile==2.0.0
exceptiongroup==1.2.2
executing==2.1.0
fastapi==0.115.8
fastapi-cli==0.0.7
feedparser==6.0.11
filelock==3.16.1
fonttools==4.55.3
frozenlist==1.5.0
fsspec==2024.12.0
gitdb==4.0.12
GitPython==3.1.44
google-api-core==2.25.0
google-auth==2.40.2
google-cloud-documentai==3.5.0
googleapis-common-protos==1.66.0
greenlet==3.1.1
grpcio==1.72.1
grpcio-status==1.71.0
h11==0.14.0
httpcore==1.0.7
httplib2==0.20.2
httptools==0.6.4
httpx==0.28.1
huggingface-hub==0.27.0
hyperlink==21.0.0
HyperPyYAML==1.2.2
idna==3.3
importlib-metadata==4.6.4
incremental==21.3.0
ipykernel==6.29.5
ipython==8.31.0
isodate==0.7.2
itsdangerous==2.2.0
jedi==0.19.2
jeepney==0.7.1
Jinja2==3.1.5
jiter==0.8.2
joblib==1.4.2
jsonpatch==1.33
jsonpointer==2.0
jsonschema==3.2.0
julius==0.2.7
jupyter_client==8.6.3
jupyter_core==5.7.2
keyring==23.5.0
kiwipiepy==0.20.3
kiwipiepy-model==0.20.0
kiwisolver==1.4.8
langchain==0.3.18
langchain-core==0.3.34
langchain-teddynote==0.3.42
langchain-text-splitters==0.3.6
langgraph==0.2.71
langgraph-checkpoint==2.0.12
langgraph-sdk==0.1.51
langsmith==0.3.8
launchpadlib==1.10.16
lazr.restfulclient==0.14.4
lazr.uri==1.0.6
lazy_loader==0.4
librosa==0.10.2.post1
lightning==2.5.0.post0
lightning-utilities==0.11.9
llvmlite==0.43.0
lxml==5.3.1
lz4==4.4.3
Mako==1.3.8
mammoth==1.9.0
markdown-it-py==3.0.0
markdownify==0.14.1
markitdown==0.0.1a4
MarkupSafe==2.0.1
matplotlib==3.10.0
matplotlib-inline==0.1.7
mdurl==0.1.2
mmh3==4.1.0
more-itertools==8.10.0
mpmath==1.3.0
msal==1.31.1
msal-extensions==1.2.0
msgpack==1.1.0
multidict==6.1.0
narwhals==1.33.0
nest-asyncio==1.6.0
netifaces==0.11.0
networkx==3.4.2
nltk==3.9.1
numba==0.60.0
numpy==1.26.4
nvidia-cublas-cu12==12.4.5.8
nvidia-cuda-cupti-cu12==12.4.127
nvidia-cuda-nvrtc-cu12==12.4.127
nvidia-cuda-runtime-cu12==12.4.127
nvidia-cudnn-cu12==9.1.0.70
nvidia-cufft-cu12==11.2.1.3
nvidia-curand-cu12==10.3.5.147
nvidia-cusolver-cu12==11.6.1.9
nvidia-cusparse-cu12==12.3.1.170
nvidia-nccl-cu12==2.21.5
nvidia-nvjitlink-cu12==12.4.127
nvidia-nvtx-cu12==12.4.127
oauthlib==3.2.0
olefile==0.47
ollama==0.4.7
omegaconf==2.3.0
openai==1.61.1
openai-whisper==20240930
openpyxl==3.1.5
optuna==4.1.0
orjson==3.10.15
packaging==24.2
pandas==2.2.3
parso==0.8.4
pathvalidate==3.2.3
pdf2image==1.17.0
pdfminer.six==20240706
pexpect==4.9.0
pillow==11.1.0
pinecone-client==5.0.1
pinecone-plugin-inference==1.1.0
pinecone-plugin-interface==0.0.7
pinecone-text==0.9.0
platformdirs==4.3.6
pooch==1.8.2
portalocker==2.10.1
primePy==1.3
prompt_toolkit==3.0.48
propcache==0.2.1
proto-plus==1.26.1
protobuf==5.29.5
protoc-gen-openapiv2==0.0.1
psutil==6.1.1
ptyprocess==0.7.0
pure_eval==0.2.3
puremagic==1.28
pyannote.audio==3.3.2
pyannote.core==5.0.0
pyannote.database==5.1.0
pyannote.metrics==3.2.1
pyannote.pipeline==3.0.1
pyarrow==19.0.1
pyasn1==0.4.8
pyasn1-modules==0.2.1
pycparser==2.22
pycurl==7.44.1
pydantic==2.10.6
pydantic-extra-types==2.10.2
pydantic-settings==2.7.1
pydantic_core==2.27.2
pydeck==0.9.1
pydub==0.25.1
Pygments==2.18.0
PyGObject==3.42.1
PyHamcrest==2.0.2
PyJWT==2.3.0
pyOpenSSL==21.0.0
pyparsing==2.4.7
pyrsistent==0.18.1
pyserial==3.5
python-apt==2.4.0+ubuntu4
python-dateutil==2.9.0.post0
python-dotenv==1.0.1
python-multipart==0.0.20
python-pptx==1.0.2
pytorch-lightning==2.5.0.post0
pytorch-metric-learning==2.8.1
pytz==2022.1
PyYAML==5.4.1
pyzmq==26.2.0
rank-bm25==0.2.2
redis==5.2.1
regex==2024.11.6
requests==2.32.3
requests-toolbelt==1.0.0
rich==13.9.4
rich-toolkit==0.13.2
rsa==4.9.1
ruamel.yaml==0.18.8
ruamel.yaml.clib==0.2.12
ruff==0.8.4
scikit-learn==1.6.0
scipy==1.14.1
screen-resolution-extra==0.0.0
SecretStorage==3.3.1
semver==3.0.2
sentencepiece==0.2.0
service-identity==18.1.0
sgmllib3k==1.0.0
shellingham==1.5.4
six==1.16.0
smmap==5.0.2
sniffio==1.3.1
snowflake-id==1.0.2
sortedcontainers==2.4.0
soundfile==0.13.0
soupsieve==2.6
soxr==0.5.0.post1
speechbrain==1.0.2
SpeechRecognition==3.14.1
SQLAlchemy==2.0.36
ssh-import-id==5.11
stack-data==0.6.3
starlette==0.45.3
streamlit==1.44.1
sympy==1.13.1
systemd-python==234
tabulate==0.9.0
tavily-python==0.5.1
tenacity==9.0.0
tensorboardX==2.6.2.2
texttable==1.6.4
threadpoolctl==3.5.0
tiktoken==0.8.0
toml==0.10.2
torch==2.5.1
torch-audiomentations==0.11.1
torch_pitch_shift==1.2.5
torchaudio==2.5.1
torchmetrics==1.6.1
tornado==6.4.2
tqdm==4.67.1
traitlets==5.14.3
triton==3.1.0
Twisted==22.1.0
typer==0.15.1
types-requests==2.32.0.20241016
typing_extensions==4.12.2
tzdata==2024.2
ubuntu-drivers-common==0.0.0
ubuntu-pro-client==8001
ufw==0.36.1
ujson==5.10.0
unattended-upgrades==0.1
urllib3==2.3.0
uvicorn==0.34.0
uvloop==0.21.0
wadllib==1.3.6
watchdog==6.0.0
watchfiles==1.0.4
wcwidth==0.2.13
websocket-client==1.2.3
websockets==14.2
wget==3.2
whisper-timestamped==1.15.8
xkit==0.0.0
xlrd==2.0.1
XlsxWriter==3.2.2
yarl==1.18.3
youtube-transcript-api==0.6.3
zipp==1.0.0
zope.interface==5.4.0
zstandard==0.23.0

workspace/api.py (new file, 139 lines)
@@ -0,0 +1,139 @@
# api.py

import asyncio
import json  # added for JSON parsing

from fastapi import APIRouter, FastAPI, File, HTTPException, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from routers import google_docai
from utils.config import (
    CORS_ALLOW_CREDENTIALS,
    CORS_ALLOW_HEADERS,
    CORS_ALLOW_METHODS,
    CORS_ALLOW_ORIGINS,
    UPLOAD_DOCS_DIR,
)

# Utility function imports (existing code kept as-is)
from utils.file_utils import (
    create_essential_directories,
    create_key,
    save_uploaded_file,
)

app = FastAPI()

# CORS settings (existing code kept as-is)
app.add_middleware(
    CORSMiddleware,
    allow_origins=CORS_ALLOW_ORIGINS,
    allow_credentials=CORS_ALLOW_CREDENTIALS,
    allow_methods=CORS_ALLOW_METHODS,
    allow_headers=CORS_ALLOW_HEADERS,
)


# Runs once on application startup (existing code kept as-is)
@app.on_event("startup")
async def startup_event():
    print("Starting up...")
    create_essential_directories()
    print("Essential directories created.")


# --- Document AI router ---
doc_ai_router = APIRouter(
    prefix="/docai",
    tags=["DocumentAI"],
)

# Document AI settings (in production, load these from environment variables or a config file)
DOCAI_PROJECT_ID = "drawingpdfocr-461103"
DOCAI_LOCATION = "us"
DOCAI_PROCESSOR_ID = "b838676d4e3b4758"  # the user's actual processor ID


async def run_sync_in_threadpool(func, *args, **kwargs):
    """Run a synchronous function in a separate thread and make it awaitable."""
    if hasattr(asyncio, "to_thread"):  # Python 3.9+
        return await asyncio.to_thread(func, *args, **kwargs)
    else:  # Python < 3.9
        loop = asyncio.get_event_loop()
        return await loop.run_in_executor(None, lambda: func(*args, **kwargs))


@doc_ai_router.post("/process-document/")
async def process_uploaded_document(file: UploadFile = File(...)):
    """
    Process the uploaded file with Document AI and return the extracted entity information as JSON.
    """
    if not file.content_type:
        raise HTTPException(status_code=400, detail="File content type is missing.")

    # Supported MIME types (example list; extend as needed)
    allowed_mime_types = ["application/pdf", "image/jpeg", "image/png", "image/tiff"]
    if file.content_type not in allowed_mime_types:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported file type: '{file.content_type}'. Supported: {', '.join(allowed_mime_types)}",
        )
    print(f"Received document file for async processing: {file.filename}")
    file_id = str(create_key())

    # Save the file (using the utility function)
    try:
        file_path, file_content = save_uploaded_file(file, UPLOAD_DOCS_DIR, file_id)
    except HTTPException as e:
        raise e
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Error while preparing to save the file: {str(e)}"
        )

    try:
        # Document AI processing (the synchronous function is called asynchronously)
        document_result = await run_sync_in_threadpool(
            google_docai.process_document_from_content,  # use the refactored function
            project_id=DOCAI_PROJECT_ID,
            location=DOCAI_LOCATION,
            processor_id=DOCAI_PROCESSOR_ID,
            file_content=file_content,
            mime_type=file.content_type,
            field_mask="text,entities",  # required field mask
        )
        print(document_result)
        if not document_result:
            # Guards against process_document_from_content returning None or an empty
            # Document object without raising (normally it raises an exception instead)
            raise HTTPException(
                status_code=500,
                detail="Failed to process document: No result from Document AI.",
            )

        json_output_string = google_docai.extract_and_convert_to_json(document_result)

        return json.loads(json_output_string)

    except HTTPException as http_exc:
        # Re-raise exceptions that are already HTTPException as-is
        raise http_exc
    except Exception as e:
        # Handle any other exception (logging recommended)
        # import traceback
        # print(f"Error processing file: {e}\n{traceback.format_exc()}")
        raise HTTPException(
            status_code=500,
            detail=f"An error occurred during document processing: {str(e)}",
        )
    finally:
        await file.close()  # close the uploaded file object to release resources


# Register the router on the app
app.include_router(doc_ai_router)


@app.get("/health/API")
async def health_check():
    """Application health check."""
    return {"status": "API ok"}
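
For a quick smoke test of the endpoint above, a minimal client sketch follows. It is illustrative only: it assumes the app is started from the workspace directory (for example with uvicorn api:app --port 8000), that a hypothetical sample.pdf exists next to the script, and it uses the httpx package already pinned in requirements.txt.

# client_smoke_test.py - illustrative sketch; host, port, and sample.pdf are assumptions
import httpx

BASE_URL = "http://localhost:8000"  # assumed local uvicorn instance

# Health check first
print(httpx.get(f"{BASE_URL}/health/API").json())  # expected: {"status": "API ok"}

# Send a PDF to the Document AI endpoint
with open("sample.pdf", "rb") as f:  # hypothetical sample file
    response = httpx.post(
        f"{BASE_URL}/docai/process-document/",
        files={"file": ("sample.pdf", f, "application/pdf")},
        timeout=120.0,  # Document AI calls can take a while
    )

response.raise_for_status()
# The endpoint returns the extracted entity list,
# e.g. [{"type": ..., "mention_text": ...}, ...]
print(response.json())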

workspace/routers/google_docai.py (new file, 70 lines)
@@ -0,0 +1,70 @@
# google_docai.py

import json
import os
from typing import Optional

from google.api_core.client_options import ClientOptions
from google.cloud import documentai

if not os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):  # only if not already set
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = (
        "/home/jackjack/test/doc_ai/workspace/drawingpdfocr-461103-2441e0b34216.json"  # this path must be valid when the API server starts
    )


def process_document_from_content(  # function name and parameters changed
    project_id: str,
    location: str,
    processor_id: str,
    file_content: bytes,  # file_content (bytes) instead of file_path
    mime_type: str,
    field_mask: Optional[str] = None,
    processor_version_id: Optional[str] = None,
) -> documentai.Document:
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    if processor_version_id:
        name = client.processor_version_path(
            project_id, location, processor_id, processor_version_id
        )
    else:
        name = client.processor_path(project_id, location, processor_id)

    # The file-read step is gone; file_content is used directly
    raw_document = documentai.RawDocument(content=file_content, mime_type=mime_type)

    # Example: process only the first page (adjust as needed)
    process_options = documentai.ProcessOptions(
        individual_page_selector=documentai.ProcessOptions.IndividualPageSelector(
            pages=[1]
        )
    )
    request = documentai.ProcessRequest(
        name=name,
        raw_document=raw_document,
        field_mask=field_mask,
        process_options=process_options,
    )
    result = client.process_document(request=request)
    document = result.document
    return document


def extract_and_convert_to_json(
    document: documentai.Document,
) -> str:
    extracted_entities = []
    if document and document.entities:
        for entity in document.entities:
            if (
                hasattr(entity, "type_")
                and hasattr(entity, "mention_text")
                and entity.type_
                and entity.mention_text
            ):
                extracted_entities.append(
                    {"type": entity.type_, "mention_text": entity.mention_text}
                )
    return json.dumps(extracted_entities, ensure_ascii=False, indent=2)
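
As a sanity check of the JSON conversion alone (no network call), the sketch below builds a tiny Document by hand and passes it through extract_and_convert_to_json; the entity type and text are made-up values, and the import assumes the script is run from the workspace directory.

# Illustrative check of extract_and_convert_to_json with a hand-built Document
# (entity values are made up; run from the workspace directory).
from google.cloud import documentai

from routers.google_docai import extract_and_convert_to_json

fake_document = documentai.Document(
    text="DRAWING NO. A-101",
    entities=[
        documentai.Document.Entity(type_="drawing_number", mention_text="A-101"),
    ],
)

print(extract_and_convert_to_json(fake_document))
# Expected output (a JSON array string):
# [
#   {
#     "type": "drawing_number",
#     "mention_text": "A-101"
#   }
# ]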

workspace/tests/a.py (new file, 104 lines)
@@ -0,0 +1,104 @@
import json  # import the JSON module
import os
from typing import Optional

from google.api_core.client_options import ClientOptions
from google.cloud import documentai  # type: ignore

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = (
    "../drawingpdfocr-461103-2441e0b34216.json"  # double-check that this path is correct!
)


def process_document_sample(
    project_id: str,
    location: str,
    processor_id: str,
    file_path: str,
    mime_type: str,
    field_mask: Optional[str] = None,
    processor_version_id: Optional[str] = None,
) -> documentai.Document:
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    if processor_version_id:
        name = client.processor_version_path(
            project_id, location, processor_id, processor_version_id
        )
    else:
        name = client.processor_path(project_id, location, processor_id)

    with open(file_path, "rb") as image:
        image_content = image.read()

    raw_document = documentai.RawDocument(content=image_content, mime_type=mime_type)
    process_options = documentai.ProcessOptions(
        individual_page_selector=documentai.ProcessOptions.IndividualPageSelector(
            pages=[1]
        )
    )
    request = documentai.ProcessRequest(
        name=name,
        raw_document=raw_document,
        field_mask=field_mask,
        process_options=process_options,
    )
    result = client.process_document(request=request)
    document = result.document
    return document


def extract_and_convert_to_json(
    document: documentai.Document,
) -> str:  # the return type is explicitly str (a JSON string)
    """
    Extract each entity's type and mention_text from a Document AI Document object
    and return them as a JSON string.
    """
    extracted_entities = []
    if document and document.entities:
        for entity in document.entities:
            if (
                hasattr(entity, "type_")
                and hasattr(entity, "mention_text")  # check that type_ and mention_text exist
                and entity.type_
                and entity.mention_text
            ):
                extracted_entities.append(
                    {"type": entity.type_, "mention_text": entity.mention_text}
                )

    return json.dumps(extracted_entities, ensure_ascii=False, indent=2)


if __name__ == "__main__":
    project_id = "drawingpdfocr-461103"
    location = "us"
    processor_id = "b838676d4e3b4758"
    file_path = "../data/UPLOAD_DOCS/3공구-설계도1-004.pdf"
    mime_type = "application/pdf"

    try:
        document_result = process_document_sample(
            project_id=project_id,
            location=location,
            processor_id=processor_id,
            file_path=file_path,
            mime_type=mime_type,
            field_mask="text,entities",  # set field_mask so that entities are returned
        )

        if document_result:
            json_output_string = extract_and_convert_to_json(document_result)

            print(json_output_string)  # print the converted JSON string

    except FileNotFoundError:
        print(
            f"Error: could not find a file at '{file_path}'. Please check the file path."
        )
    except Exception as e:
        print(f"Error while running the function: {e}")

workspace/utils/config.py (new file, 16 lines)
@@ -0,0 +1,16 @@
# config.py

import os
from pathlib import Path

# Directory settings
UPLOAD_DOCS_DIR = Path(os.getenv("AUDIO_DIR", "./data/UPLOAD_DOCS"))
RESULT_DIR = Path(os.getenv("RESULT_DIR", "./data/results"))

# Allowed file extensions
ALLOWED_EXTENSIONS = {".pdf"}

# CORS settings
CORS_ALLOW_ORIGINS = os.getenv("CORS_ALLOW_ORIGINS", "*").split(",")
CORS_ALLOW_CREDENTIALS = os.getenv("CORS_ALLOW_CREDENTIALS", "true").lower() == "true"
CORS_ALLOW_METHODS = os.getenv("CORS_ALLOW_METHODS", "*").split(",")
CORS_ALLOW_HEADERS = os.getenv("CORS_ALLOW_HEADERS", "*").split(",")
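
These defaults can be overridden per deployment through environment variables. The short sketch below shows how the comma-separated CORS values are parsed when set before the module is imported; the origin values are placeholders, not anything required by the code.

# Illustration of overriding config.py via environment variables (placeholder values).
import importlib
import os

os.environ["CORS_ALLOW_ORIGINS"] = "http://localhost:3000,https://example.com"
os.environ["CORS_ALLOW_CREDENTIALS"] = "false"

from utils import config

importlib.reload(config)  # re-read the env vars if config was already imported
print(config.CORS_ALLOW_ORIGINS)      # ['http://localhost:3000', 'https://example.com']
print(config.CORS_ALLOW_CREDENTIALS)  # False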

workspace/utils/file_utils.py (new file, 58 lines)
@@ -0,0 +1,58 @@
# utils/file_utils.py

from pathlib import Path

from fastapi import HTTPException, UploadFile
from snowflake import SnowflakeGenerator
from utils.config import ALLOWED_EXTENSIONS, RESULT_DIR, UPLOAD_DOCS_DIR


def create_essential_directories():
    """Create the directories the application needs at startup."""
    UPLOAD_DOCS_DIR.mkdir(parents=True, exist_ok=True)
    RESULT_DIR.mkdir(parents=True, exist_ok=True)


def create_key(node=1):
    generator = SnowflakeGenerator(node)
    key_value = next(generator)
    return str(key_value)


def save_uploaded_file(
    upload_file: UploadFile, save_dir: Path, file_prefix: str
) -> tuple[str, bytes]:
    """
    Save the uploaded file to the given directory and return its content as bytes.

    Returns:
        saved file path (str), file content (bytes)
    """
    file_extension = Path(upload_file.filename).suffix.lower()
    if file_extension not in ALLOWED_EXTENSIONS:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported file type. Supported types: {', '.join(ALLOWED_EXTENSIONS)} 😢",
        )

    new_filename = f"{file_prefix}{file_extension}"
    file_path = save_dir / new_filename

    try:
        upload_file.file.seek(0)
        file_content = upload_file.file.read()  # read the content

        with open(file_path, "wb") as buffer:
            buffer.write(file_content)  # write the content to disk

        print(f"File saved: {file_path}")
        return str(file_path), file_content

    except IOError as e:
        print(f"File saving error for {file_prefix}: {e}")
        raise HTTPException(status_code=500, detail=f"Error while saving the file: {str(e)}")
    except Exception as e:
        print(f"Unexpected file saving error for {file_prefix}: {e}")
        raise HTTPException(
            status_code=500, detail=f"Unexpected error while processing the file: {str(e)}"
        )
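
A minimal check of save_uploaded_file without running the API is sketched below. It assumes execution from the workspace directory so the utils imports resolve, and the in-memory bytes stand in for a real PDF upload.

# Quick manual check of create_key/save_uploaded_file (assumed: run from the workspace directory).
import io
from pathlib import Path

from fastapi import UploadFile

from utils.config import UPLOAD_DOCS_DIR
from utils.file_utils import create_essential_directories, create_key, save_uploaded_file

create_essential_directories()

fake_pdf = io.BytesIO(b"%PDF-1.4 placeholder bytes")  # not a real PDF, just exercises the save path
upload = UploadFile(file=fake_pdf, filename="placeholder.pdf")

file_id = create_key()
saved_path, content = save_uploaded_file(upload, UPLOAD_DOCS_DIR, file_id)

print(saved_path)                      # e.g. data/UPLOAD_DOCS/<snowflake id>.pdf
print(len(content), Path(saved_path).exists())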