# test/handlers/content_analyzer.py

# -*- coding: utf-8 -*-
"""
Content Analyzer (Phase 3 — Layer A)
- template_info + semantic_map → content_prompt.json
- 각 placeholder의 의미/유형/예시값/작성 패턴 추출
- Phase 5에서 AI가 새 문서 생성 시 "레시피"로 참조

★ 원칙: 모든 분류는 코드 100% (AI 없음)
   purpose_hint / audience_hint / tone_hint는 빈 문자열로 남김
   → 추후 AI enrichment 단계에서 채울 수 있도록 설계
"""

import re


def generate(template_info: dict, semantic_map: dict,
             parsed: dict = None) -> dict:
    """Build the content_prompt.json structure.

    Every placeholder-producing step writes into one shared dict, so the
    order of the calls below fixes the key order of the result.

    Args:
        template_info: Extraction result of doc_template_analyzer.
        semantic_map: Classification result of semantic_mapper.
        parsed: Original HWPX parse result (optional; not read here).

    Returns:
        dict with the keys "version", "document", "placeholders",
        "table_guide" and "writing_guide".
    """
    result = {
        "version": "1.0",
        # (1) document-level information
        "document": _analyze_document(template_info),
        "placeholders": {},
        "table_guide": {},
    }
    found = result["placeholders"]

    # (2) header, (3) footer, (4) title block
    _analyze_header(template_info, found)
    _analyze_footer(template_info, found)
    _analyze_title(template_info, semantic_map, found)

    # (5) sections, then (5-b) paragraphs/images taken from content_order
    _analyze_sections(semantic_map, found, template_info)
    _analyze_content_order(template_info, semantic_map, found)

    # (6) body tables: placeholders plus the per-table guide
    _analyze_tables(template_info, semantic_map,
                    found, result["table_guide"])

    # (7) document-wide writing patterns
    result["writing_guide"] = _analyze_writing_patterns(template_info,
                                                        semantic_map)
    return result


# ================================================================
#  문서 기본 정보
# ================================================================

def _analyze_document(template_info: dict) -> dict:
    """Extract document-level information from template_info["page"].

    Returns:
        dict with "paper" (defaults to "A4"), "layout" ("landscape" when
        the paper's "landscape" value is truthy, otherwise "portrait"),
        "margins", and three hint fields that are always empty strings
        (reserved for a later AI enrichment step).
    """
    page_info = template_info.get("page", {})
    paper_info = page_info.get("paper", {})

    orientation = "portrait"
    if paper_info.get("landscape"):
        orientation = "landscape"

    document = {
        "paper": paper_info.get("name", "A4"),
        "layout": orientation,
        "margins": page_info.get("margins", {}),
    }
    # Reserved for AI enrichment; intentionally left blank here.
    for hint_key in ("purpose_hint", "audience_hint", "tone_hint"):
        document[hint_key] = ""
    return document


# ================================================================
#  텍스트 유형 분류 (코드 100%, AI 없음)
# ================================================================

def _classify_text(text: str) -> dict:
    """Classify a text snippet into a content type using fixed patterns only.

    Rules are tried in order and the first match wins:
    empty -> date -> job title (author) -> document title -> department ->
    team -> page reference -> slogan -> free text.

    Args:
        text: Raw text; surrounding whitespace is ignored. None is treated
            like an empty string.

    Returns:
        dict with two keys: "type" (machine-readable category) and
        "pattern" (human-readable description of the matched rule).
    """
    text = (text or "").strip()
    if not text:
        return {"type": "empty", "pattern": "빈 셀"}

    # Dates: "2025. 1. 30(금)", "2025-01-30", "2025.01.30"
    if re.match(r'\d{4}[\.\-/]\s*\d{1,2}[\.\-/]\s*\d{1,2}', text):
        return {"type": "date", "pattern": "날짜 (YYYY. M. D)"}

    # Job title + name (checked before department on purpose).
    # The compound "...연구원" titles must precede the plain '연구원',
    # otherwise the substring test below always reports the generic title.
    positions = (
        '사원', '대리', '과장', '차장', '부장', '이사', '상무', '전무',
        '선임연구원', '책임연구원', '수석연구원', '연구원',
        '주임', '계장', '팀장', '실장', '부서장', '센터장',
    )
    for pos in positions:
        if pos in text:
            return {"type": "author", "pattern": f"이름 + 직급({pos})"}

    # Document title: ends with 계획(안), 보고서, 제안서, ...
    # Checked before the department rule because '결과' ends in '과', which
    # the department suffix rule would otherwise claim for short titles.
    if re.search(r'(계획|보고서|제안서|기획서|결과|방안|현황|분석)'
                 r'\s*(\(안\))?\s*$', text):
        return {"type": "doc_title", "pattern": "문서 제목"}

    # Department: short text ending with an organisation suffix.
    if re.search(r'(실|부|국|과|원|처|센터|본부)$', text) and len(text) <= 12:
        return {"type": "department", "pattern": "조직명"}

    # Team
    if re.search(r'팀$', text) and len(text) <= 10:
        return {"type": "team", "pattern": "팀명"}

    # Page reference: "1p", "2p"
    if re.match(r'\d+p$', text):
        return {"type": "page_ref", "pattern": "페이지 참조"}

    # Slogan / vision: longer text containing one of the abstract keywords.
    if len(text) > 10 and any(k in text for k in
                              ['함께', '세상', '미래', '가치', '만들어']):
        return {"type": "slogan", "pattern": "회사 슬로건/비전"}

    # Fallback
    return {"type": "text", "pattern": "자유 텍스트"}


# ================================================================
#  헤더 분석
# ================================================================

def _analyze_header(template_info: dict, placeholders: dict):
    """Register placeholders for the header area (in-place on placeholders).

    Does nothing when template_info has no header or the header's "exists"
    value is falsy. A table-type header with a non-empty "table" is
    delegated to _analyze_table_area; otherwise each entry of
    header["texts"] becomes HEADER_TEXT_n.
    """
    header = template_info.get("header", {})
    if not (header and header.get("exists")):
        return

    table = header.get("table")
    if header.get("type") == "table" and table:
        _analyze_table_area(table, "HEADER", "header", placeholders)
        return

    texts = header.get("texts", [])
    # Always emit HEADER_TEXT_1, even when no header text is available.
    samples = texts if len(texts) > 0 else [""]
    for number, sample in enumerate(samples, start=1):
        entry = _classify_text(sample)
        entry["example"] = sample.strip()
        entry["location"] = "header"
        placeholders[f"HEADER_TEXT_{number}"] = entry


# ================================================================
#  푸터 분석
# ================================================================

def _analyze_footer(template_info: dict, placeholders: dict):
    """Register placeholders for the footer area (in-place on placeholders).

    Does nothing when template_info has no footer or the footer's "exists"
    value is falsy. A table-type footer with a non-empty "table" is
    delegated to _analyze_table_area; any other footer yields a single
    PAGE_NUMBER placeholder.
    """
    footer = template_info.get("footer", {})
    if not footer or not footer.get("exists"):
        return

    table = footer.get("table")
    if not (footer.get("type") == "table" and table):
        placeholders["PAGE_NUMBER"] = dict(
            type="page_number",
            pattern="페이지 번호",
            example="1",
            location="footer",
        )
        return

    _analyze_table_area(table, "FOOTER", "footer", placeholders)


# ================================================================
#  헤더/푸터 공통: 표 형태 영역 분석
# ================================================================

def _analyze_table_area(tbl: dict, prefix: str, location: str,
                        placeholders: dict):
    """Map a table-shaped header/footer onto placeholders.

    Placeholder names are {prefix}_R{row}_C{col} (1-based). A cell with more
    than one line gets one placeholder per line with a _LINE_{n} suffix; a
    cell without lines is registered as an "empty" placeholder.

    Args:
        tbl: header["table"] or footer["table"].
        prefix: "HEADER" or "FOOTER".
        location: "header" or "footer".
        placeholders: Result dict (modified in place).
    """
    for row_no, row in enumerate(tbl.get("rows", []), start=1):
        for col_no, cell in enumerate(row, start=1):
            cell_key = f"{prefix}_R{row_no}_C{col_no}"
            lines = cell.get("lines", [])
            line_count = len(lines)

            if line_count == 0:
                placeholders[cell_key] = {
                    "type": "empty",
                    "pattern": "빈 셀 (로고/여백)",
                    "example": "",
                    "location": location,
                }
                continue

            multi_line = line_count > 1
            for line_no, line_text in enumerate(lines, start=1):
                key = f"{cell_key}_LINE_{line_no}" if multi_line else cell_key
                entry = _classify_text(line_text)
                entry["example"] = line_text.strip()
                entry["location"] = location
                placeholders[key] = entry


# ================================================================
#  제목 분석
# ================================================================

def _analyze_title(template_info: dict, semantic_map: dict,
                   placeholders: dict):
    """Register placeholders for the title block (in-place on placeholders).

    The table whose "index" equals semantic_map["title_table"] is looked up
    in template_info["tables"]; every non-empty cell becomes
    TITLE_R{r}_C{c} (1-based). According to the original note, this naming
    mirrors template_manager._build_title_block_html().

    Afterwards the TITLE_R* placeholder with the longest non-empty example
    is re-labelled as the document title (the first one wins on ties).
    """
    title_idx = semantic_map.get("title_table")
    if title_idx is None:
        return

    title_tbl = None
    for candidate in template_info.get("tables", []):
        if candidate["index"] == title_idx:
            title_tbl = candidate
            break
    if not title_tbl:
        return

    # One placeholder per non-empty cell; blank cells are skipped.
    for r_no, row in enumerate(title_tbl.get("rows", []), start=1):
        for c_no, cell in enumerate(row, start=1):
            sample = cell.get("text", "").strip()
            if sample:
                entry = _classify_text(sample)
                entry["example"] = sample
                entry["location"] = "title_block"
                placeholders[f"TITLE_R{r_no}_C{c_no}"] = entry

    # Mark the cell with the longest text as the document title.
    def example_length(key):
        return len(placeholders[key].get("example", ""))

    title_keys = [key for key in placeholders if key.startswith("TITLE_R")]
    winner = max(title_keys, key=example_length, default=None)
    if winner is not None and example_length(winner) > 0:
        placeholders[winner]["type"] = "doc_title"
        placeholders[winner]["pattern"] = "문서 제목"


# ================================================================
#  섹션 분석
# ================================================================

def _analyze_sections(semantic_map: dict, placeholders: dict,
                      template_info: dict = None):
    """Register SECTION_n_TITLE / SECTION_n_CONTENT placeholders.

    One pair is created per entry of semantic_map["sections"]; when there
    are no sections a single untitled SECTION_1 pair is created instead.
    SECTION_n_CONTENT is omitted when template_info["content_order"]
    contains a paragraph item (the individual PARA_n placeholders take over
    the body role in that case).
    """
    sections = semantic_map.get("sections", [])

    order = template_info.get("content_order", []) if template_info else []
    paragraphs_in_order = bool(order) and any(
        entry['type'] == 'paragraph' for entry in order
    )

    # An empty string stands in for the single untitled default section.
    for number, section in enumerate(sections if sections else [""], start=1):
        if isinstance(section, str):
            title = section
        else:
            title = section.get("title", "")

        placeholders[f"SECTION_{number}_TITLE"] = {
            "type": "section_title", "pattern": "섹션 제목",
            "example": title, "location": "body"
        }
        if paragraphs_in_order:
            continue
        placeholders[f"SECTION_{number}_CONTENT"] = {
            "type": "section_content", "pattern": "섹션 본문",
            "example": "", "location": "body"
        }

# ================================================================
#  content_order 기반 문단/이미지 분석 (v5.2+)
# ================================================================

def _analyze_content_order(template_info: dict, semantic_map: dict,
                           placeholders: dict):
    """Create PARA_n / IMAGE_n placeholders from template_info["content_order"].

    Does nothing when content_order is missing/empty or holds no paragraph
    item (legacy compatibility). Table and empty items are skipped here;
    tables are handled by _analyze_tables. semantic_map is not read.

    Paragraphs that look like a section heading are registered as
    SECTION_n_TITLE unless that key already exists; all other paragraphs
    become PARA_n, or PARA_n_RUN_m when the paragraph has several runs.
    Stored examples are cut to 100 characters.
    """
    content_order = template_info.get("content_order")
    if not content_order:
        return
    if all(entry['type'] != 'paragraph' for entry in content_order):
        return

    # Section-heading patterns (noted in the original as matching
    # template_manager).
    heading_patterns = tuple(re.compile(src) for src in (
        r'^\d+\.\s+\S',
        r'^[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩ]\.\s*\S',
        r'^제\s*\d+\s*[장절항]\s*\S',
    ))

    def classified(sample):
        # Shared shape of a body-text placeholder.
        entry = _classify_text(sample)
        entry["example"] = sample[:100]
        entry["location"] = "body"
        return entry

    paragraph_no = image_no = heading_no = 0

    for entry in content_order:
        kind = entry['type']

        if kind == 'image':
            image_no += 1
            placeholders[f"IMAGE_{image_no}"] = {
                "type": "image",
                "pattern": "이미지",
                "example_ref": entry.get("binaryItemIDRef", ""),
                "location": "body"
            }
            caption = entry.get("text", "")
            if caption:
                placeholders[f"IMAGE_{image_no}_CAPTION"] = {
                    "type": "image_caption",
                    "pattern": "이미지 캡션",
                    "example": caption,
                    "location": "body"
                }

        elif kind == 'paragraph':
            text = entry.get('text', '')

            if any(rx.match(text) for rx in heading_patterns):
                # Heading: keep an entry that is already registered.
                heading_no += 1
                placeholders.setdefault(f"SECTION_{heading_no}_TITLE", {
                    "type": "section_title",
                    "pattern": "섹션 제목",
                    "example": text,
                    "location": "body"
                })
                continue

            paragraph_no += 1
            runs = entry.get('runs', [])

            if len(runs) <= 1:
                placeholders[f"PARA_{paragraph_no}"] = classified(text)
                continue

            # Several runs: one placeholder per run.
            for run_no, run in enumerate(runs, start=1):
                info = classified(run.get("text", ""))
                info["run_index"] = run_no
                placeholders[f"PARA_{paragraph_no}_RUN_{run_no}"] = info

        # 'empty', 'table' and any other kind produce nothing here.


# ================================================================
#  표 분석 → placeholder + 표 가이드
# ================================================================

def _analyze_tables(template_info: dict, semantic_map: dict,
                    placeholders: dict, table_guide: dict):
    """Turn body data tables into placeholders and table_guide entries.

    For each index in semantic_map["body_tables"] (numbered from 1 by list
    position) the matching table in template_info["tables"] is looked up;
    indices without a matching table are skipped but still consume a
    number. Role info is read from semantic_map["table_roles"] under the
    index itself or, failing that, under its string form.

    Both placeholders and table_guide are modified in place.
    """
    tables = template_info.get("tables", [])
    table_roles = semantic_map.get("table_roles", {})

    for tbl_num, tbl_idx in enumerate(semantic_map.get("body_tables", []),
                                      start=1):
        tbl = None
        for candidate in tables:
            if candidate["index"] == tbl_idx:
                tbl = candidate
                break
        if not tbl:
            continue

        if tbl_idx in table_roles:
            role_info = table_roles[tbl_idx]
        else:
            role_info = table_roles.get(str(tbl_idx), {})

        col_headers = role_info.get("col_headers", [])
        if col_headers:
            col_cnt = len(col_headers)
        else:
            # No header info: fall back to the table's own column count.
            col_cnt = tbl.get("colCnt", 0)

        location = f"table_{tbl_num}"

        # Header placeholders, one per column header.
        for col_no, header_text in enumerate(col_headers, start=1):
            placeholders[f"TABLE_{tbl_num}_H_C{col_no}"] = {
                "type": "table_header", "pattern": "표 열 제목",
                "example": header_text, "location": location
            }

        # A single placeholder for all data rows.
        placeholders[f"TABLE_{tbl_num}_BODY"] = {
            "type": "table_body",
            "pattern": "표 데이터 행들 (HTML <tr> 반복)",
            "example": "",
            "location": location
        }

        # Table guide.
        guide = {
            "col_headers": col_headers,
            "col_count": col_cnt,
            "row_count": tbl.get("rowCnt", 0),
        }
        guide["merge_pattern"] = _detect_merge_pattern(tbl)
        guide["bullet_chars"] = _detect_bullet_chars(tbl)
        guide["example_rows"] = _extract_example_rows(tbl, role_info)
        guide["col_types"] = _classify_columns(col_headers)
        guide["row_bf_pattern"] = _extract_row_bf_pattern(tbl, role_info)
        table_guide[str(tbl_num)] = guide


def _detect_merge_pattern(tbl: dict) -> dict:
    """Detect merged cells per column.

    Returns:
        dict mapping "col_{colAddr}" to "row_group" (a cell with
        rowSpan > 1) or "col_span" (a cell with colSpan > 1). The first
        finding for a column is kept; within one cell rowSpan is checked
        before colSpan. A missing colAddr counts as column 0.
    """
    span_labels = (("rowSpan", "row_group"), ("colSpan", "col_span"))
    merges = {}
    for row in tbl.get("rows", []):
        for cell in row:
            key = f"col_{cell.get('colAddr', 0)}"
            for span_attr, label in span_labels:
                if cell.get(span_attr, 1) > 1 and key not in merges:
                    merges[key] = label
    return merges


def _detect_bullet_chars(tbl: dict) -> list:
    """Collect the bullet markers that start lines inside table cells.

    A line counts when, after stripping, it begins with one of the known
    bullet characters followed by whitespace.

    Returns:
        Sorted list of the distinct markers found, each as the bullet
        character plus one space (e.g. "- ").
    """
    markers = {
        r'^-\s': '- ', r'^·\s': '· ', r'^•\s': '• ',
        r'^▸\s': '▸ ', r'^▶\s': '▶ ', r'^※\s': '※ ',
        r'^◈\s': '◈ ', r'^○\s': '○ ', r'^●\s': '● ',
    }
    stripped_lines = [
        line.strip()
        for row in tbl.get("rows", [])
        for cell in row
        for line in cell.get("lines", [])
    ]
    found = {
        label
        for candidate in stripped_lines
        for pattern, label in markers.items()
        if re.match(pattern, candidate)
    }
    return sorted(found)


def _extract_example_rows(tbl: dict, role_info: dict) -> list:
    """Return up to three example data rows of a table.

    Rows up to and including role_info["header_row"] are skipped (nothing
    is skipped when it is missing or None). Each cell contributes its
    stripped "text"; text longer than 80 characters is shortened to the
    first 77 characters plus "...".
    """
    header_row = role_info.get("header_row")
    boundary = -1 if header_row is None else header_row

    samples = []
    for idx, row in enumerate(tbl.get("rows", [])):
        if len(samples) == 3:
            break
        if idx > boundary:
            cells = (cell.get("text", "").strip() for cell in row)
            samples.append([
                value if len(value) <= 80 else value[:77] + "..."
                for value in cells
            ])
    return samples


def _classify_columns(col_headers: list) -> list:
    """Infer each column's purpose from its header keyword.

    A stripped header must equal one of the known keywords exactly;
    anything else is typed "text".

    Returns:
        One dict per header: {"col": 0-based index, "type": inferred type,
        "header": stripped header text}.
    """
    keywords_by_type = (
        ("category", ('구분', '분류', '항목', '카테고리')),
        ("content", ('내용', '설명', '상세', '세부내용')),
        ("note", ('비고', '참고', '기타', '메모')),
        ("date", ('날짜', '일자', '일시', '기간')),
        ("person", ('담당', '담당자', '작성자', '책임')),
        ("number", ('수량', '금액', '단가', '합계')),
    )
    # Keywords are unique across types, so a flat reverse lookup is enough.
    type_of = {
        keyword: col_type
        for col_type, keywords in keywords_by_type
        for keyword in keywords
    }

    columns = []
    for position, raw_header in enumerate(col_headers):
        label = raw_header.strip()
        columns.append({
            "col": position,
            "type": type_of.get(label, "text"),
            "header": label,
        })
    return columns

def _extract_row_bf_pattern(tbl: dict, role_info: dict) -> list:
    """Describe the border-fill class of each cell in the first data row.

    The first data row is the first row after role_info["header_row"]
    (row 0 when header_row is missing or None). Per the original note, the
    result tells the AI which class="bf-{id}" to put on generated <td>
    elements.

    Returns:
        One dict per cell with "col" (the cell's colAddr, or its position
        in the row when absent), "bf_class" ("bf-{borderFillIDRef}", or ""
        when the reference is missing/falsy), "colSpan" and "rowSpan"
        (both default to 1). Empty list when there is no data row.
    """
    header_row = role_info.get("header_row")
    boundary = -1 if header_row is None else header_row

    for idx, row in enumerate(tbl.get("rows", [])):
        if idx > boundary:
            # Only the first data row is used as the pattern.
            return [
                {
                    "col": cell.get("colAddr", position),
                    "bf_class": (f"bf-{cell.get('borderFillIDRef')}"
                                 if cell.get("borderFillIDRef") else ""),
                    "colSpan": cell.get("colSpan", 1),
                    "rowSpan": cell.get("rowSpan", 1),
                }
                for position, cell in enumerate(row)
            ]
    return []
# ================================================================
#  작성 패턴 분석
# ================================================================

def _analyze_writing_patterns(template_info: dict,
                              semantic_map: dict) -> dict:
    """Analyse document-wide writing patterns.

    Bullet markers and line lengths are gathered from every table cell line
    and from every paragraph in content_order; bullet characters declared
    under template_info["numbering"]["bullets"] are added as well.
    "bullet_styles" is filled in only after all of these sources have been
    scanned, so bullets that occur only in paragraphs are reported too.

    Args:
        template_info: Template extraction result. Keys read: "tables",
            "content_order", "numbering", "fonts", "char_styles".
        semantic_map: Not read by this function.

    Returns:
        dict with "bullet_styles" (sorted markers, each a bullet character
        plus a space), "numbering_patterns" (up to three level patterns per
        numbering definition), "avg_line_length" (rounded mean length of
        the non-blank lines, 0 when there are none), "font_primary" (face
        of the first HANGUL font, "" when absent) and "font_size_body"
        (e.g. "10pt", "" when there are no char styles).
    """
    bullet_re = re.compile(r'^[-·•▸▶※◈○●]\s')
    bullets = set()
    lengths = []

    def scan(raw_text):
        # Record the length and the leading bullet (if any) of one line.
        stripped = raw_text.strip()
        if not stripped:
            return
        lengths.append(len(stripped))
        if bullet_re.match(stripped):
            bullets.add(stripped[0] + " ")

    # Table cell lines.
    for tbl in template_info.get("tables", []):
        for row in tbl.get("rows", []):
            for cell in row:
                for line in cell.get("lines", []):
                    scan(line)

    # Paragraph text from content_order.
    for item in template_info.get("content_order") or []:
        if item.get('type') == 'paragraph':
            scan(item.get('text', ''))

    # Numbering / bullet definitions delivered with the template.
    numbering = template_info.get("numbering", {})
    numbering_patterns = []
    for num in numbering.get("numberings", []):
        patterns = [lv.get("pattern", "") for lv in num.get("levels", [])[:3]]
        if patterns:
            numbering_patterns.append(patterns)

    for b in numbering.get("bullets", []):
        char = b.get("char", "")
        if char:
            bullets.add(char + " ")

    result = {
        # Built last on purpose: every bullet source has been scanned by now.
        "bullet_styles": sorted(bullets),
        "numbering_patterns": numbering_patterns,
        "avg_line_length": 0,
        "font_primary": "",
        "font_size_body": ""
    }

    if lengths:
        result["avg_line_length"] = round(sum(lengths) / len(lengths))

    # Primary font: first entry of the HANGUL font list.
    hangul = template_info.get("fonts", {}).get("HANGUL", [])
    if isinstance(hangul, list) and hangul:
        result["font_primary"] = hangul[0].get("face", "")

    # Body font size: the first char style is taken as the default.
    char_styles = template_info.get("char_styles", [])
    if char_styles:
        result["font_size_body"] = f"{char_styles[0].get('height_pt', 10)}pt"

    return result