diff --git a/utils/text_extractor.py b/utils/text_extractor.py index d7e1a43..d59a6c7 100644 --- a/utils/text_extractor.py +++ b/utils/text_extractor.py @@ -149,8 +149,8 @@ def extract_text_paddle_ocr(images): use_doc_orientation_classify=False, use_doc_unwarping=False, lang="korean" ) - full_response = [] coord_response = [] + all_text_boxes = [] # (y_center, x_center, text, box) 저장용 for page_idx, img in enumerate(images): print(f"[PaddleOCR] 페이지 {page_idx + 1} OCR로 텍스트 추출 중...") @@ -183,13 +183,50 @@ def extract_text_paddle_ocr(images): texts = res_dic.get("rec_texts", []) boxes = res_dic.get("rec_boxes", []) - full_response.extend(texts) + for text, box in zip(texts, boxes): + if isinstance(box, np.ndarray): + box = box.tolist() + # ✅ box 정규화 + if all(isinstance(p, (int, float)) for p in box): + if len(box) % 2 == 0: + box = [[box[i], box[i + 1]] for i in range(0, len(box), 2)] + else: + print(f"[PaddleOCR] 잘못된 box 형식: {box}") + continue - # ndarray → list 변환 - clean_boxes = [ - box.tolist() if isinstance(box, np.ndarray) else box for box in boxes - ] - coord_response.extend(clean_boxes) + coord_response.append(box) + + # 중심 좌표 계산 (y → 줄 순서, x → 단어 순서) + x_coords = [p[0] for p in box] + y_coords = [p[1] for p in box] + x_center = sum(x_coords) / len(x_coords) + y_center = sum(y_coords) / len(y_coords) + + all_text_boxes.append((y_center, x_center, text)) + + # ✅ 위치 기반 정렬 + all_text_boxes.sort(key=lambda x: (x[0], x[1])) # y 먼저, 그 다음 x 정렬 + + # ✅ 줄 단위 그룹핑 + lines = [] + current_line = [] + prev_y = None + line_threshold = 15 # 줄 묶음 y 오차 허용값 + + for y, x, text in all_text_boxes: + if prev_y is None or abs(y - prev_y) < line_threshold: + current_line.append((x, text)) + else: + current_line.sort(key=lambda xx: xx[0]) + lines.append(" ".join(t for _, t in current_line)) + current_line = [(x, text)] + prev_y = y + + if current_line: + current_line.sort(key=lambda xx: xx[0]) + lines.append(" ".join(t for _, t in current_line)) + + parsed_text = "\n".join(lines) print("[PaddleOCR] 전체 페이지 텍스트 및 좌표 추출 완료") - return " ".join(full_response), coord_response + return parsed_text, coord_response