def _to_rgb_uint8(img_np: np.ndarray) -> np.ndarray:
    """Normalize an image array to 3-channel RGB, dtype uint8, range [0, 255].

    The result is what the OCR call site hands to PaddleOCR, whose
    normalization step fails on inputs that are not H x W x 3.

    Accepted layouts: H x W, H x W x 1, H x W x 3 and H x W x 4 (the alpha
    channel is dropped). Accepted dtypes: uint8 (used as-is) or any other
    numeric/bool dtype, which is converted as follows:

    * if the largest value is <= 1.0 the data is treated as [0, 1] and
      scaled by 255; otherwise it is treated as already being in [0, 255];
    * NaN becomes 0, values are rounded to the nearest integer and then
      clipped to [0, 255] (so +/-inf saturate, and integer images wider than
      8 bits are clipped, not rescaled).

    Args:
        img_np: Image as a NumPy array (or array-like).

    Returns:
        A C-contiguous ``uint8`` array of shape ``(H, W, 3)``. The input is
        never modified; it may be returned unchanged when it already
        satisfies the contract.

    Raises:
        ValueError: If the input is ``None``, empty, has an unsupported
            number of dimensions, or an unsupported channel count.
    """
    if img_np is None:
        raise ValueError("Input image is None")

    arr = np.asarray(img_np)
    if arr.size == 0:
        # Reject early with a clear message instead of failing inside a
        # reduction or a downstream image routine.
        raise ValueError(f"Input image is empty: shape={arr.shape}")

    # Validate the layout first so invalid inputs are rejected before any
    # per-pixel conversion work is done.
    if arr.ndim == 2:
        channels = 1
    elif arr.ndim == 3:
        channels = arr.shape[2]
        if channels not in (1, 3, 4):
            raise ValueError(f"Unsupported channel count: {channels}")
    else:
        raise ValueError(f"Unsupported ndim: {arr.ndim}")

    # dtype / value-range normalization
    if arr.dtype != np.uint8:
        work = arr.astype(np.float32)  # always a copy; input stays untouched
        # Casting NaN to an integer type is undefined, so map it to black.
        # Infinities are pinned to the range limits they would clip to.
        np.nan_to_num(work, copy=False, nan=0.0, posinf=255.0, neginf=0.0)
        if work.max() <= 1.0:  # looks like [0, 1] data -> scale up
            work *= 255.0
        # Round before casting: plain truncation can lose one level when a
        # uint8 image was previously converted to float via ``/ 255``.
        arr = np.clip(np.rint(work), 0, 255).astype(np.uint8)

    # channel normalization
    if channels == 1:
        gray = arr.reshape(arr.shape[0], arr.shape[1])
        arr = np.stack((gray, gray, gray), axis=-1)  # replicate gray into RGB
    elif channels == 4:
        arr = arr[..., :3]  # drop alpha, keep R, G, B order

    # Slices/views may be non-contiguous; image libraries expect C order.
    return np.ascontiguousarray(arr)
len(img_np.shape) == 2: # grayscale → RGB 변환 - img_np = cv2.cvtColor(img_np, cv2.COLOR_GRAY2RGB) + # ✅ 채널/타입 표준화 (grayscale/rgba/float 등 대응) + try: + img_np = _to_rgb_uint8(img_np) + except Exception as e: + print(f"[PaddleOCR] 페이지 {page_idx + 1} 입력 표준화 실패: {e}") + continue # 문제 페이지 스킵 후 다음 페이지 진행 + + # ✅ 과도한 해상도 안정화 (최대 변 4000px) + h, w = img_np.shape[:2] + max_side = max(h, w) + max_side_limit = 4000 + if max_side > max_side_limit: + scale = max_side_limit / max_side + new_size = (int(w * scale), int(h * scale)) + img_np = cv2.resize(img_np, new_size, interpolation=cv2.INTER_AREA) + print(f"[PaddleOCR] Resized to {img_np.shape[1]}x{img_np.shape[0]}") results = ocr.predict(input=img_np)