# -*- coding: utf-8 -*-
"""
Content Analyzer (Phase 3 - Layer A)
- template_info + semantic_map -> content_prompt.json
- Extracts the meaning / role / example value / writing pattern of each
  placeholder
- Intended to be used as a "recipe" in Phase 4 when the AI generates a new
  document

Principle: every analysis in this module is plain code (no AI involved).
purpose_hint / audience_hint / tone_hint are left as empty strings so that a
later AI enrichment step can fill them in.
"""

import re
from typing import Optional


# Job-title keywords used to recognise "name + position" text.
# Order is significant: the first keyword found in the text is the one
# reported in the resulting pattern string.
_POSITION_KEYWORDS = (
    '사원', '대리', '과장', '차장', '부장', '이사', '상무', '전무',
    '연구원', '선임연구원', '책임연구원', '수석연구원',
    '주임', '계장', '팀장', '본부장', '센터장',
)

# Keywords that mark a longer text as a slogan / vision statement.
_SLOGAN_KEYWORDS = ('함께', '세계', '미래', '가치', '네트워크')

# Dates such as "2025. 1. 30(목)", "2025-01-30", "2025.01.30".
_DATE_RE = re.compile(r'\d{4}[\.\-\/]\s*\d{1,2}[\.\-\/]\s*\d{1,2}')

# Organisation-unit suffixes (office, department, bureau, team, center, ...).
_DEPARTMENT_RE = re.compile(r'(실|부|국|과|원|처|팀|센터|본부)$')

# Page references such as "1p", "2p".
_PAGE_REF_RE = re.compile(r'\d+p$')

# Document titles ending in a title keyword, optionally followed by "(안)".
_DOC_TITLE_RE = re.compile(
    r'(계획|보고서|제안서|기획서|결과|방안|현황|분석)'
    r'\s*(\(안\))?\s*$'
)


def generate(template_info: dict, semantic_map: dict,
             parsed: Optional[dict] = None) -> dict:
    """
    Build the content_prompt.json structure.

    Args:
        template_info: template description (per the original notes, the
            output of doc_template_analyzer).
        semantic_map: semantic analysis (per the original notes, the output
            of semantic_mapper).
        parsed: raw HWPX parse result (optional). Accepted for interface
            compatibility; it is not read by this function.

    Returns:
        dict with the keys "version", "document", "placeholders",
        "table_guide" and "writing_guide".
    """
    placeholders = {}
    table_guide = {}

    # 1. Basic document information
    document = _analyze_document(template_info)

    # 2. Header placeholders
    _analyze_header(template_info, placeholders)

    # 3. Footer placeholders
    _analyze_footer(template_info, placeholders)

    # 4. Title placeholder
    _analyze_title(template_info, semantic_map, placeholders)

    # 5. Section placeholders
    _analyze_sections(semantic_map, placeholders, template_info)

    # 5-b. Paragraph / image placeholders based on content_order
    _analyze_content_order(template_info, semantic_map, placeholders)

    # 6. Table guide + table cell placeholders
    _analyze_tables(template_info, semantic_map, placeholders, table_guide)

    # 7. Writing patterns
    writing_guide = _analyze_writing_patterns(template_info, semantic_map)

    return {
        "version": "1.0",
        "document": document,
        "placeholders": placeholders,
        "table_guide": table_guide,
        "writing_guide": writing_guide
    }


# ================================================================
# Basic document information
# ================================================================

def _analyze_document(template_info: dict) -> dict:
    """Extract page layout information (paper, orientation, margins)."""
    page = template_info.get("page", {})
    paper = page.get("paper", {})

    return {
        "paper": paper.get("name", "A4"),
        "layout": "landscape" if paper.get("landscape") else "portrait",
        "margins": page.get("margins", {}),
        "purpose_hint": "",   # reserved for AI enrichment
        "audience_hint": "",  # reserved for AI enrichment
        "tone_hint": ""       # reserved for AI enrichment
    }


# ================================================================
# Text type classification (plain code, no AI)
# ================================================================

def _classify_text(text: Optional[str]) -> dict:
    """
    Classify a text snippet by surface pattern.

    The checks run in a fixed priority order and the first match wins:
    empty, date, author (position keyword), team, department, page
    reference, document title, slogan, and finally free text.

    Args:
        text: the text to classify; None is treated like an empty string.

    Returns:
        A new dict with the keys "type" and "pattern".
    """
    text = (text or "").strip()
    if not text:
        return {"type": "empty", "pattern": "빈 소스"}

    # Date: "2025. 1. 30(목)", "2025-01-30", "2025.01.30"
    if _DATE_RE.match(text):
        return {"type": "date", "pattern": "날짜 (YYYY. M. D)"}

    # Position + name. Checked before the organisation checks because a
    # text such as "...팀장" would otherwise never be seen as an author.
    for pos in _POSITION_KEYWORDS:
        if pos in text:
            return {"type": "author", "pattern": f"이름 + 직위({pos})"}

    # Team. Must run before the department check: the department suffix
    # list also contains '팀' with a looser length limit, so with the
    # opposite order this branch could never be reached.
    if text.endswith('팀') and len(text) <= 10:
        return {"type": "team", "pattern": "팀명"}

    # Department / organisation name (only when no position matched)
    if _DEPARTMENT_RE.search(text) and len(text) <= 12:
        return {"type": "department", "pattern": "조직명"}

    # Page reference: "1p", "2p"
    if _PAGE_REF_RE.match(text):
        return {"type": "page_ref", "pattern": "페이지 참조"}

    # Document title: ends with 계획(안), 보고서, 제안서, ...
    if _DOC_TITLE_RE.search(text):
        return {"type": "doc_title", "pattern": "문서 제목"}

    # Slogan / vision: longer text containing one of the slogan keywords
    if len(text) > 10 and any(k in text for k in _SLOGAN_KEYWORDS):
        return {"type": "slogan", "pattern": "회사 슬로건/비전"}

    # Fallback
    return {"type": "text", "pattern": "자유 텍스트"}


# ================================================================
# Header / footer analysis
# ================================================================

def _analyze_page_area(area: dict, prefix: str, location: str,
                       placeholders: dict):
    """
    Register placeholders for one header or footer area.

    Does nothing unless the area has a truthy "exists" flag. A table-typed
    area is delegated to _analyze_table_area; otherwise one
    "<prefix>_TEXT_<n>" placeholder is written per entry in "texts". When
    "texts" is empty a single placeholder with an empty example is still
    written.

    Args:
        area: the "header" or "footer" dict from template_info.
        prefix: placeholder key prefix ("HEADER" or "FOOTER").
        location: value stored under the "location" key.
        placeholders: dict that is updated in place.
    """
    if not area or not area.get("exists"):
        return

    if area.get("type") == "table" and area.get("table"):
        _analyze_table_area(area["table"], prefix, location, placeholders)
        return

    texts = area.get("texts") or []
    for i in range(max(len(texts), 1)):
        example = (texts[i] if i < len(texts) else "") or ""
        info = _classify_text(example)
        info["example"] = example.strip()
        info["location"] = location
        placeholders[f"{prefix}_TEXT_{i+1}"] = info


def _analyze_header(template_info: dict, placeholders: dict):
    """Analyze the header area and add HEADER_* placeholders in place."""
    _analyze_page_area(template_info.get("header", {}), "HEADER", "header",
                       placeholders)


def _analyze_footer(template_info: dict, placeholders: dict):
    """Analyze the footer area and add FOOTER_* placeholders in place."""
    _analyze_page_area(template_info.get("footer", {}), "FOOTER", "footer",
                       placeholders)


# ================================================================
# Title analysis
# ================================================================

def _analyze_title(template_info: dict, semantic_map: dict,
                   placeholders: dict):
    """
    Add the DOCUMENT_TITLE placeholder from template_info["titleBlock"].

    Nothing is added when the title block is missing/empty or its "type"
    is "none". semantic_map is accepted but not read.
    """
    title_block = template_info.get("titleBlock", {})
    if not title_block or title_block.get("type") == "none":
        return

    example = title_block.get("text") or ""
    info = _classify_text(example)
    info["example"] = example
    info["location"] = "title_block"
    placeholders["DOCUMENT_TITLE"] = info


# ================================================================
# Section analysis
# ================================================================

def _analyze_sections(semantic_map: dict, placeholders: dict,
                      template_info: dict):
    """
    Add one SECTION_TITLE_<n> placeholder per entry in
    semantic_map["sections"]. template_info is accepted but not read.
    """
    sections = semantic_map.get("sections", [])
    for number, sec in enumerate(sections, start=1):
        placeholders[f"SECTION_TITLE_{number}"] = {
            "type": "section_title",
            "pattern": "대항목 제목",
            "example": sec.get("name", ""),
            "location": "body",
            "hasBullet": sec.get("hasBulletIcon", False)
        }


# ================================================================
# Content-order based analysis
# ================================================================

def _analyze_content_order(template_info: dict, semantic_map: dict,
                           placeholders: dict):
    """
    Add paragraph (PARA_<n>) and image (IMAGE_<n>) placeholders from
    template_info["content_order"]. semantic_map is accepted but not read.

    Paragraph numbers follow the position in the "paragraphs" list, so
    paragraphs with empty text are skipped and leave gaps in the numbering.
    """
    content_order = template_info.get("content_order", {})

    # 1. Paragraphs
    paragraphs = content_order.get("paragraphs", [])
    for number, para in enumerate(paragraphs, start=1):
        text = (para.get("text") or "").strip()
        if not text:
            continue

        info = _classify_text(text)
        info["example"] = text
        info["location"] = "body"
        placeholders[f"PARA_{number}"] = info

    # 2. Images
    images = content_order.get("images", [])
    for number, img in enumerate(images, start=1):
        placeholders[f"IMAGE_{number}"] = {
            "type": "image",
            "pattern": "삽입 이미지",
            "example": f"Reference: {img.get('ref')}",
            "location": "body"
        }


# ================================================================
# Table analysis
# ================================================================

def _analyze_tables(template_info: dict, semantic_map: dict,
                    placeholders: dict, table_guide: dict):
    """
    Describe body tables and register placeholders for their data cells.

    Only tables whose "location" is "body" are considered. Tables flagged
    "isLayoutTable" or without cells are skipped. Table ids follow the
    position among body tables, so skipped tables leave gaps in the
    numbering. The first row is treated as the header row; non-empty cells
    of the next two rows become "<table_id>_R<r>_C<c>" placeholders.
    semantic_map is accepted but not read.

    Args:
        template_info: template description containing "tables".
        semantic_map: unused.
        placeholders: dict that is updated in place.
        table_guide: dict that is updated in place, keyed by table id.
    """
    tables = template_info.get("tables", [])
    body_tables = [t for t in tables if t.get("location") == "body"]

    for number, tbl in enumerate(body_tables, start=1):
        if tbl.get("isLayoutTable"):
            continue

        cells = tbl.get("cells", [])
        if not cells:
            continue

        table_id = f"TABLE_{number}"

        # Header texts come from the first row
        headers = [c.get("text", "") for c in cells[0]]

        table_guide[table_id] = {
            "rowCount": tbl.get("rowCount"),
            "colCount": tbl.get("colCount"),
            "headers": headers,
            "style_hint": "템플릿 표 스타일 유지"
        }

        # Data cells become placeholders (first two data rows only)
        for r_num, row in enumerate(cells[1:3], start=1):
            for c_idx, cell in enumerate(row):
                example = cell.get("text") or ""
                if not example.strip():
                    continue

                info = _classify_text(example)
                info["example"] = example
                info["header"] = headers[c_idx] if c_idx < len(headers) else ""
                placeholders[f"{table_id}_R{r_num}_C{c_idx+1}"] = info


def _analyze_table_area(tbl_info: dict, prefix: str, location: str,
                        placeholders: dict):
    """
    Register "<prefix>_CELL_<n>" placeholders for a header/footer table.

    Cell numbers follow the position in tbl_info["cellTexts"]; blank cells
    are skipped and leave gaps in the numbering.
    """
    cells = tbl_info.get("cellTexts", [])
    for number, text in enumerate(cells, start=1):
        if not text or not text.strip():
            continue

        info = _classify_text(text)
        info["example"] = text
        info["location"] = location
        placeholders[f"{prefix}_CELL_{number}"] = info


# ================================================================
# Writing pattern analysis
# ================================================================

def _analyze_writing_patterns(template_info: dict, semantic_map: dict) -> dict:
    """
    Summarise writing style from semantic_map["overallStyle"], falling
    back to defaults for missing keys. template_info is accepted but not
    read.
    """
    overall = semantic_map.get("overallStyle", {})

    return {
        "style": overall.get("writingStyle", "혼합형"),
        "bullet_char": overall.get("bulletType", "•"),
        "table_usage": overall.get("tableUsage", "보통"),
        "principles": [
            "템플릿의 문체(~함, ~임)를 유지할 것",
            "불렛 기호와 들여쓰기 수준을 일관되게 적용할 것"
        ]
    }