Files
_Geulbeot/03.Code/업로드용/converters/pipeline/step_format.py
2026-03-19 09:58:01 +09:00

166 lines
5.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# -*- coding: utf-8 -*-
"""
step_format.py — 형식만 변경 모드 파이프라인 v4
문서 유형 자동 감지 후 전용 핸들러로 분기:
fmt_gpd_report.py ← GPD 보고서형 (헤더 2개↑ + 공백 기반 들여쓰기)
fmt_outline.py ← 목차/개요형 (1./a./. 번호 패턴 40%↑)
fmt_table_report.py ← 표 중심형 (표가 페이지 면적 60%↑)
fmt_generic.py ← 범용 fallback (위 유형 해당 없을 때, 페이지별 자동 선택)
판별 기준:
1. 헤더 패턴 2개 이상 → GPD 보고서형
2. 표 면적 비율 60% 이상 → 표 중심형
3. 번호/기호 패턴 비율 40% 이상 → 목차/개요형
4. 해당 없음 → 범용 fallback
"""
import re
import os
import sys
from pathlib import Path
import fitz
from fmt_base import (
log, detect_header_footer_patterns, build_html,
extract_tables_from_page, ZW_PATTERN
)
import fmt_gpd_report
import fmt_outline
import fmt_table_report
import fmt_generic
# ============================================================
# Document type detection
# ============================================================
# Line-start marker patterns; a line matching any of them is counted as a
# "numbered/bulleted" line by _doc_pattern_ratio.

# Decimal number followed by '.', fullwidth '．' (U+FF0E) or ')', optional
# whitespace, then a character that is not a digit, '.' or whitespace.
# Matches "1. Title" / "2)Item"; does not match "1.2 Title".
_NUM_P = re.compile(r'^\d+[\.\uff0e\)]\s*[^\d\.\s]')
# One lowercase ASCII letter followed by '.', '．' or ')', optional whitespace,
# then any non-whitespace character (e.g. "a. item"). Uppercase is not matched.
_ALPHA_P = re.compile(r'^[a-z][\.\uff0e\)]\s*\S')
# One or more Unicode Roman-numeral characters (Ⅰ–Ⅹ, ⅰ–ⅴ) followed by '.' or
# whitespace. Plain ASCII "I", "II", ... are not matched.
_ROMAN_P = re.compile(r'^[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴ]+[\.\s]')
# ASCII hyphen bullet: '-', optional whitespace, then a character that is
# neither whitespace nor '('.
_DASH_P = re.compile(r'^-\s*[^\s\(]')
def _point_in_rects(x, y, rects, pad=5):
    """Return True if (x, y) lies inside any (x0, y0, x1, y1) rect grown by *pad*."""
    return any(
        x0 - pad <= x <= x1 + pad and y0 - pad <= y <= y1 + pad
        for x0, y0, x1, y1 in rects
    )


def _doc_pattern_ratio(doc, header_patterns, footer_patterns):
    """Return the share of body text lines that start with a list/outline marker.

    A line is counted when, after removing ZW_PATTERN matches and stripping,
    it is non-empty, starts above 85% of the page height, is not one of the
    given header/footer strings, and its first non-blank span does not start
    inside (within 5 units of) a detected table bounding box.  A counted line
    is a "hit" when it matches _NUM_P, _ALPHA_P, _ROMAN_P or _DASH_P.

    Args:
        doc: iterable of pages exposing ``rect``, ``find_tables()`` and
            ``get_text("dict")`` (a PyMuPDF document in this file).
        header_patterns: container of exact header line texts to ignore.
        footer_patterns: container of exact footer line texts to ignore.

    Returns:
        hits / counted lines over the whole document, or 0.0 when no line
        was counted.
    """
    markers = (_NUM_P, _ALPHA_P, _ROMAN_P, _DASH_P)
    total_hits, total_lines = 0, 0

    for page in doc:
        # Lines starting in the bottom 15% of the page are ignored.
        body_bottom = page.rect.height * 0.85

        # Collect table bounding boxes; table detection is best-effort.
        table_rects = []
        try:
            for table in page.find_tables().tables:
                table_rects.append(table.bbox)
        except Exception:
            # Keep whatever was collected so far.  Unlike a bare ``except:``
            # this lets KeyboardInterrupt / SystemExit propagate.
            pass

        for block in page.get_text("dict")["blocks"]:
            if block.get("type") != 0:
                continue  # only type-0 blocks are scanned for lines
            for line in block.get("lines", []):
                first_x, parts = None, []
                for span in line.get("spans", []):
                    cleaned = ZW_PATTERN.sub('', span.get("text", ""))
                    # Remember the x-origin of the first span with visible text.
                    if first_x is None and cleaned.strip():
                        first_x = span["bbox"][0]
                    parts.append(cleaned)
                text = ''.join(parts).strip()
                top = line['bbox'][1]

                if not text or not top < body_bottom:
                    continue
                if text in header_patterns or text in footer_patterns:
                    continue
                if first_x is not None and _point_in_rects(first_x, top, table_rects):
                    continue  # line belongs to a table, not to body text

                total_lines += 1
                if any(pattern.match(text) for pattern in markers):
                    total_hits += 1

    return total_hits / total_lines if total_lines else 0.0
def _table_area_ratio(doc):
"""전 페이지 평균 표 면적 비율"""
ratios = []
for page in doc:
page_area = page.rect.width * page.rect.height
try:
table_area = sum((t.bbox[2]-t.bbox[0])*(t.bbox[3]-t.bbox[1])
for t in page.find_tables().tables)
except: table_area = 0
ratios.append(table_area / page_area if page_area else 0)
return sum(ratios)/len(ratios) if ratios else 0.0
def detect_doc_type(doc, header_patterns, footer_patterns):
    """Classify the document and return the key of the handler to use.

    The three measurements are always computed and logged; the first rule
    that holds, in this order, decides the result:

        'gpd_report'   - at least 2 header patterns
        'table_report' - mean table-area ratio >= 0.6
        'outline'      - list/outline marker ratio >= 0.4
        'generic'      - none of the above
    """
    header_count = len(header_patterns)
    area_share = _table_area_ratio(doc)
    marker_share = _doc_pattern_ratio(doc, header_patterns, footer_patterns)
    log(f"유형 판별: 헤더={header_count}개 표면적={area_share:.2f} 패턴={marker_share:.2f}")

    # Ordered (condition, label) pairs; earlier entries take precedence.
    rules = (
        (header_count >= 2, 'gpd_report'),
        (area_share >= 0.6, 'table_report'),
        (marker_share >= 0.4, 'outline'),
    )
    for holds, label in rules:
        if holds:
            return label
    return 'generic'
# ============================================================
# 메인 진입점
# ============================================================
def run_format_only(session_id: str, input_dir: str) -> dict:
    """Convert the first PDF found under *input_dir* to a single HTML file.

    Steps: locate a PDF, detect header/footer patterns, classify the
    document with detect_doc_type, run the matching fmt_* handler, join the
    pages with build_html and write ``/tmp/<session_id>/format_out/result.html``.

    Args:
        session_id: used to build the output directory path and echoed back
            in the result.
        input_dir: directory searched recursively for ``*.pdf`` / ``*.PDF``.
            Only the first match is processed.

    Returns:
        On success: ``{'success': True, 'html', 'page_count', 'doc_type',
        'session_id'}``.  On failure (no PDF found, or the handler produced
        no pages): ``{'success': False, 'error': <message>}``.
    """
    # NOTE(review): session_id is interpolated into a filesystem path as-is;
    # confirm callers validate it (no path separators / "..").
    gen_dir = Path(f'/tmp/{session_id}/format_out')
    gen_dir.mkdir(parents=True, exist_ok=True)

    pdf_files = list(Path(input_dir).rglob('*.pdf')) + \
        list(Path(input_dir).rglob('*.PDF'))
    if not pdf_files:
        return {'success': False, 'error': '처리할 PDF 파일이 없습니다.'}

    pdf_path = pdf_files[0]
    log(f"PDF: {pdf_path.name}")

    handler_map = {
        'gpd_report': fmt_gpd_report,
        'outline': fmt_outline,
        'table_report': fmt_table_report,
        'generic': fmt_generic,
    }

    # The document was previously never closed; the context manager releases
    # the file handle on every exit path, including exceptions and the early
    # "no pages" return.
    with fitz.open(str(pdf_path)) as doc:
        log(f"{len(doc)}페이지")
        header_patterns, footer_patterns, meta = detect_header_footer_patterns(doc)
        log(f"메타: {meta}")
        doc_type = detect_doc_type(doc, header_patterns, footer_patterns)
        log(f"문서 유형: {doc_type}")

        handler = handler_map[doc_type]
        pages_html = handler.process(doc, header_patterns, footer_patterns, meta)
        if not pages_html:
            return {'success': False, 'error': '생성된 페이지가 없습니다.'}
        # Built while the document is still open in case page data refers to it.
        final_html = build_html(pages_html)

    out_path = gen_dir / 'result.html'
    out_path.write_text(final_html, encoding='utf-8')
    log(f"완료: {len(pages_html)}페이지 → {doc_type}")
    return {
        'success': True, 'html': final_html,
        'page_count': len(pages_html),
        'doc_type': doc_type,
        'session_id': session_id
    }