Remove GT skip logic and always run PaddleOCR text extraction

2025-08-13 09:38:55 +09:00
parent a225767055
commit 8f02dc9a23
1 changed files with 7 additions and 7 deletions
--- a/utils/text_extractor.py
+++ b/utils/text_extractor.py
@@ -19,13 +19,13 @@ async def extract_text_from_file(file_path):
    images = []

    if ext == ".pdf":
-        # ① 먼저 PDF에서 텍스트 추출 시도
-        text_only = await asyncio.to_thread(extract_text_from_pdf_direct, file_path)
-        if text_only.strip():
-            logger.info(
-                "[UTILS-TEXT] PDF는 텍스트 기반입니다. (OCR 없이 텍스트 추출 완료)"
-            )
-            return text_only, [], "OCR not used"
+        # ① Direct PDF text extraction (previously tried first) is disabled: to build GT, every PDF now always goes through OCR.
+        # text_only = await asyncio.to_thread(extract_text_from_pdf_direct, file_path)
+        # if text_only.strip():
+        #     logger.info(
+        #         "[UTILS-TEXT] PDF는 텍스트 기반입니다. (OCR 없이 텍스트 추출 완료)"
+        #     )
+        #     return text_only, [], "OCR not used"

        # ② 텍스트가 없으면 이미지 변환 → OCR 수행
        images = await asyncio.to_thread(convert_from_path, file_path, dpi=400)