# Normalize an Upstage response: extract the full text and the word-level bounding boxes.
def _normalize_upstage_response(resp_json, return_word_level=False, normalize=False):
    """Extract the full text and word bounding boxes from an Upstage OCR response.

    Only the fields read below are relied upon: a top-level ``text``, and
    ``pages`` entries carrying ``text``, ``width``, ``height`` and ``words``.
    Each word may carry ``text``, ``confidence`` and ``boundingBox.vertices``
    (a list of ``{"x": ..., "y": ...}`` points).

    Args:
        resp_json: Decoded JSON response. Anything that is not a dict yields
            empty results instead of raising.
        return_word_level: When true, the second element of the result is a
            list of per-word dicts with the keys ``page`` (1-based index into
            ``pages``), ``text``, ``confidence`` and ``box``. Otherwise it is
            a list of bare polygons.
        normalize: When true, divide x by the page width and y by the page
            height. Pages missing either dimension keep their raw coordinates.

    Returns:
        A ``(full_text, items)`` tuple. ``full_text`` is the top-level
        ``text`` when it is non-empty, otherwise the page texts joined with
        newlines (trailing newlines stripped). ``items`` holds one entry per
        word that has exactly four vertices; each polygon is a list of four
        ``[x, y]`` float pairs.
    """
    # The response comes from an external service; tolerate a payload that is
    # not a JSON object rather than failing with AttributeError on ``.get``.
    if not isinstance(resp_json, dict):
        return "", []

    pages = resp_json.get("pages") or []

    # 1) Full text: prefer the top-level field, fall back to per-page text.
    full_text = resp_json.get("text")
    if not full_text:
        page_texts = [
            page.get("text") for page in pages if isinstance(page, dict)
        ]
        full_text = "\n".join(t for t in page_texts if t).rstrip("\n")

    # 2) Word polygons. Only one output shape is ever returned, so a single
    #    list is filled with either polygons or word dicts.
    items = []
    for page_no, page in enumerate(pages, start=1):
        # Skip malformed entries inside the loop so page numbers stay aligned
        # with positions in the original ``pages`` list.
        if not isinstance(page, dict):
            continue

        width, height = page.get("width"), page.get("height")
        # Convert the page size once per page instead of once per word.
        scale = None
        if normalize and width and height:
            scale = (float(width), float(height))

        for word in page.get("words") or []:
            if not isinstance(word, dict):
                continue
            vertices = (word.get("boundingBox") or {}).get("vertices") or []
            if len(vertices) != 4:
                continue

            # ``or 0`` also covers keys that are present but null, mirroring
            # how ``confidence`` is handled below.
            box = [
                [float(pt.get("x") or 0), float(pt.get("y") or 0)]
                for pt in vertices
            ]
            if scale is not None:
                box = [[x / scale[0], y / scale[1]] for x, y in box]

            if return_word_level:
                items.append(
                    {
                        "page": page_no,
                        "text": word.get("text", ""),
                        "confidence": float(word.get("confidence") or 0.0),
                        "box": box,  # 4x2
                    }
                )
            else:
                items.append(box)

    return full_text, items


# (Upstage) external OCR API call + post-processing