diff --git a/03.Code/업로드용/domain_api.py b/03.Code/업로드용/domain_api.py new file mode 100644 index 0000000..3356919 --- /dev/null +++ b/03.Code/업로드용/domain_api.py @@ -0,0 +1,962 @@ +# -*- coding: utf-8 -*- +from dotenv import load_dotenv +load_dotenv() + +""" +domain_api.py + + + 硫由 API + + +ъ⑸ (app.py): + from domain_api import register_domain_routes + register_domain_routes(app) +""" + +import os +import json +from pathlib import Path +from flask import request, jsonify +import threading +import hashlib +import psycopg2 +from db import get_conn +from db import get_conn + +# ===== 寃쎈 + ㅼ ===== +BASE_DIR = Path(__file__).parent +DOMAIN_CONFIG_PATH = BASE_DIR / "domain_config.json" +DOMAIN_DIR = BASE_DIR / "domain" +# + 寃쎈 (step3~9媛 ъ⑺ 寃쎈) +PIPELINE_OUTPUT_ROOT = Path(os.getenv("PIPELINE_OUTPUT_ROOT", "/tmp/pipeline_output")) +CONTEXT_DIR = PIPELINE_OUTPUT_ROOT / "context" +pipeline_jobs = {} +def register_domain_routes(app): + """Flask 깆 + 硫 고 깅 + @app.route('/api/domain-config', methods=['GET']) + def get_domain_config(): + """ + 硫 + ㅼ 諛 + 硫 щ 泥댄 + for cat in config.get('categories', []): + if cat.get('file'): + fpath = DOMAIN_DIR / cat['file'] + cat['file_exists'] = fpath.exists() + cat['file_size'] = fpath.stat().st_size if fpath.exists() else 0 + + for child in cat.get('children', []): + if child.get('file'): + fpath = DOMAIN_DIR / child['file'] + child['file_exists'] = fpath.exists() + child['file_size'] = fpath.stat().st_size if fpath.exists() else 0 + + return jsonify(config) + else: + return jsonify({'error': 'domain_config.json not found', 'categories': []}), 404 + except Exception as e: + return jsonify({'error': str(e), 'categories': []}), 500 + + + @app.route('/api/domain-combine', methods=['POST']) + def combine_domains(): + """ + + + 硫㼼 + ⑹ + domain_prompt.txt濡 + + 泥: + { "selected": ["civil_general", "survey", "bim"] } + + : + { "success": true, "combined_length": 3200, "selected_names": [...] 
} + """ + try: + data = request.get_json() + selected_ids = data.get('selected', []) + + if not selected_ids: + return jsonify({ + 'success': True, + 'combined_length': 0, + 'selected_names': [], + 'message': ' + - step3 遺 + + ' + }) + + # config 濡 + config = json.loads(DOMAIN_CONFIG_PATH.read_text(encoding='utf-8')) + + # + ID + + 留ㅽ + domain_parts = [] + guide_parts = [] + selected_names = [] + + for cat in config.get('categories', []): + is_guide = (cat['id'] == 'report_guide') + target = guide_parts if is_guide else domain_parts + + if cat['id'] in selected_ids and cat.get('file'): + fpath = DOMAIN_DIR / cat['file'] + if fpath.exists(): + content = fpath.read_text(encoding='utf-8', errors='ignore').strip() + if content: + target.append(f"[{cat['label']}]\n{content}") + selected_names.append(cat['label']) + + for child in cat.get('children', []): + if child['id'] in selected_ids and child.get('file'): + fpath = DOMAIN_DIR / child['file'] + if fpath.exists(): + content = fpath.read_text(encoding='utf-8', errors='ignore').strip() + if content: + target.append(f"[{child['label']}]\n{content}") + selected_names.append(child['label']) + selected_names.append(child['label']) + + if not domain_parts and not guide_parts: + return jsonify({ + 'success': False, + 'error': ' + + 硫 + 댁듬.' 
@app.route('/api/domain-list', methods=['GET'])
def list_domain_files():
    """List every ``*.txt`` file found under ``DOMAIN_DIR``.

    The directory is searched recursively and the matches are returned in
    sorted path order. Each entry carries the path relative to
    ``DOMAIN_DIR``, the file stem, the size in bytes and the first 200
    characters of the file (decoded as UTF-8, undecodable bytes dropped).

    Returns:
        A JSON response ``{'success': True, 'files': [...],
        'domains_dir': ...}``. When ``DOMAIN_DIR`` does not exist the
        ``files`` list is empty. Any exception is reported as
        ``{'success': False, 'error': ...}`` with HTTP status 500.
    """
    try:
        # A missing directory is not an error: it simply yields no files.
        txt_paths = sorted(DOMAIN_DIR.rglob('*.txt')) if DOMAIN_DIR.exists() else []

        files = [
            {
                'path': str(txt.relative_to(DOMAIN_DIR)),
                'name': txt.stem,
                'size': txt.stat().st_size,
                'preview': txt.read_text(encoding='utf-8', errors='ignore')[:200],
            }
            for txt in txt_paths
        ]

        payload = {
            'success': True,
            'files': files,
            'domains_dir': str(DOMAIN_DIR),
        }
        return jsonify(payload)

    except Exception as exc:
        return jsonify({'success': False, 'error': str(exc)}), 500
@app.route('/api/pipeline/status', methods=['GET'])
def pipeline_status():
    """Report which pipeline step outputs currently exist on disk.

    Every flag is derived from the presence of a file below
    ``PIPELINE_OUTPUT_ROOT`` / ``CONTEXT_DIR``; ``step4_chunks`` is the
    number of ``*_chunks.json`` files in the ``rag`` directory (0 when that
    directory is absent).

    Returns:
        A JSON response ``{'success': True, 'status': {...}}``, or
        ``{'success': False, 'error': ...}`` with HTTP status 500 if any
        filesystem check raises.
    """
    try:
        rag_dir = PIPELINE_OUTPUT_ROOT / 'rag'
        generated_dir = PIPELINE_OUTPUT_ROOT / 'generated'

        # Only glob when the directory is there; otherwise report zero chunks.
        if rag_dir.exists():
            chunk_count = len(list(rag_dir.glob('*_chunks.json')))
        else:
            chunk_count = 0

        status = {
            'step3_domain': (CONTEXT_DIR / 'domain_prompt.txt').exists(),
            'step4_chunks': chunk_count,
            'step5_faiss': (rag_dir / 'faiss.index').exists(),
            'step6_corpus': (CONTEXT_DIR / 'corpus.txt').exists(),
            'step7_outline': (CONTEXT_DIR / 'outline_issue_report.txt').exists(),
            'step8_report': (generated_dir / 'report_draft.md').exists(),
            'step9_html': (generated_dir / 'report.html').exists(),
        }

        return jsonify({'success': True, 'status': status})

    except Exception as exc:
        return jsonify({'success': False, 'error': str(exc)}), 500
pipeline_jobs[session_id]['step'] = 4 + step4_chunk.main(output_dir, output_dir) + pipeline_jobs[session_id]['step'] = 5 + step5_rag.main(output_dir, output_dir) + + # RAG 寃곌낵臾 DB + faiss_path = rag_dir / 'faiss.index' + vectors_path = rag_dir / 'vectors.npy' + meta_path = rag_dir / 'meta.json' + chunks_files = list(rag_dir.glob('*_chunks.json')) + + faiss_bytes = faiss_path.read_bytes() if faiss_path.exists() else b'' + vectors_bytes = vectors_path.read_bytes() if vectors_path.exists() else b'' + meta_text = meta_path.read_text(encoding='utf-8') if meta_path.exists() else '' + chunks_text = chunks_files[0].read_text(encoding='utf-8') if chunks_files else '' + + for f in new_files: + h = hashlib.md5(f.read_bytes()).hexdigest() + try: + with get_conn() as conn: + with conn.cursor() as cur: + cur.execute(""" + INSERT INTO files (file_hash, filename) + VALUES (%s, %s) + ON CONFLICT (file_hash) DO NOTHING + """, (h, f.name)) + cur.execute(""" + INSERT INTO rag_cache (file_hash, chunks_json, faiss_index, vectors, meta_json) + VALUES (%s, %s, %s, %s, %s) + ON CONFLICT (file_hash) DO NOTHING + """, (h, chunks_text, + psycopg2.Binary(faiss_bytes), + psycopg2.Binary(vectors_bytes), + meta_text)) + conn.commit() + print(f"[DB] RAG 罹 : {f.name}", flush=True) + except Exception as de: + print(f"[DB] RAG ㅽ: {de}", flush=True) + else: + print("[DB] 紐⑤ HIT step2/4/5 ㅽ, step3 ㅽ + 罹 蹂듭 + RAG 寃곌낵臾 蹂듭 蹂듭 + 猷", flush=True) + except Exception as de: + print(f"[DB] 罹 蹂듭ㅽ, step4~5 ㅽ: {de}", flush=True) + step4_chunk.main(output_dir, output_dir) + step5_rag.main(output_dir, output_dir) + pipeline_jobs[session_id]['step'] = 5 + + # step6~7 ㅽ + pipeline_jobs[session_id]['step'] = 6 + step6_corpus.main(output_dir, output_dir) + pipeline_jobs[session_id]['step'] = 7 + step7_index.main(output_dir, output_dir, doc_type=doc_type) + + outline_txt = Path(output_dir) / 'context' / 'outline_issue_report.txt' + print("[DEBUG outline]", outline_txt.read_text(encoding='utf-8')[:500], flush=True) + + # 
sessions / outlines DB + outline_text = outline_txt.read_text(encoding='utf-8') if outline_txt.exists() else '' + try: + with get_conn() as conn: + with conn.cursor() as cur: + cur.execute(""" + INSERT INTO sessions (session_id, file_hashes, doc_type) + VALUES (%s, %s, %s) + ON CONFLICT (session_id) DO UPDATE SET doc_type=EXCLUDED.doc_type + """, (session_id, file_hashes, doc_type)) + cur.execute(""" + INSERT INTO outlines (session_id, outline_text) + VALUES (%s, %s) + ON CONFLICT (session_id) DO UPDATE SET outline_text=EXCLUDED.outline_text + """, (session_id, outline_text)) + conn.commit() + print(f"[DB] session/outline + 猷: {session_id}", flush=True) + except Exception as de: + print(f"[DB] session/outline ㅽ: {de}", flush=True) + + pipeline_jobs[session_id] = {'status': 'done', 'doc_type': doc_type} + + except Exception as e: + import traceback + print(f"[PIPELINE ERROR] {e}", flush=True) + print(traceback.format_exc(), flush=True) + pipeline_jobs[session_id] = {'status': 'error', 'error': str(e)} + + + # ===== + ㅽ API ===== + @app.route('/api/generate-toc', methods=['POST']) + def generate_toc(): + """ + 紐⑹감 + API (step3 4 5 6 7) + + + 硫 + + 寃쎌: step3 ㅽ () + + 硫 + 寃쎌: step3 ㅽ + + 泥: + { + "folder_path": "D:\\...", + "domain_selected": true/false, + "selected_domains": ["civil_general", "survey"] + } + + : + { + "success": true, + "title": "蹂닿 + 紐", + "toc_items": [ + { "num": "1.1.1", "title": "...", "guide": "...", "keywords": [...] } + ] + } + """ + try: + data = request.get_json() + session_id = data.get('session_id', '') + domain_selected = data.get('domain_selected', False) + write_mode = data.get('write_mode', 'restructure') + instruction = data.get('instruction', '') + + if not session_id: + return jsonify({'success': False, 'error': 'session_id媛 듬. 
# The rule must declare <session_id>: Flask passes URL variables to the view
# as keyword arguments, and toc_status() has a required `session_id`
# parameter. Without the variable in the rule the view cannot be called.
@app.route('/api/toc-status/<session_id>', methods=['GET'])
def toc_status(session_id):
    """Return the state of the background TOC job for ``session_id``.

    The job record is looked up in the module-level ``pipeline_jobs`` dict.

    * If the job is marked ``'done'`` and
      ``/tmp/<session_id>/output/context/outline_issue_report.txt`` exists,
      the outline is parsed and returned as
      ``{'status': 'done', 'toc_items': [...]}``. Jobs whose ``doc_type`` is
      ``'briefing'`` use ``parse_briefing_plan_for_frontend``; every other
      ``doc_type`` (default ``'report'``) uses ``parse_outline_for_frontend``.
    * In all other cases the stored job dict is returned unchanged. A
      session with no record yields ``{'status': 'unknown'}``.

    Args:
        session_id: Session identifier taken from the URL path.

    Returns:
        A JSON response as described above.
    """
    job = pipeline_jobs.get(session_id, {'status': 'unknown'})

    if job.get('status') == 'done':
        outline_path = Path(f'/tmp/{session_id}/output/context/outline_issue_report.txt')
        # A finished job without an outline file falls through and reports
        # the raw job record instead of an empty TOC.
        if outline_path.exists():
            doc_type = job.get('doc_type', 'report')
            if doc_type == 'briefing':
                toc_items = parse_briefing_plan_for_frontend(outline_path)
            else:
                toc_items = parse_outline_for_frontend(outline_path)
            return jsonify({'status': 'done', 'toc_items': toc_items})

    return jsonify(job)
+ } + """ + try: + data = request.get_json() + session_id = data.get('session_id', '') + toc_items = data.get('toc_items', []) + write_mode = data.get('write_mode', 'restructure') + instruction = data.get('instruction', '') + + if not session_id: + return jsonify({'success': False, 'error': 'session_id媛 듬.'}) + + input_dir = f'/tmp/{session_id}/input' + output_dir = f'/tmp/{session_id}/output' + + from converters.pipeline import step8_content, step9_html + + doc_type = data.get('doc_type', 'report') + + step8_content.main(output_dir, output_dir, doc_type=doc_type) + step9_html.main(output_dir, output_dir, doc_type=doc_type) + + report_html_path = Path(output_dir) / 'generated' / 'report.html' + + # briefing_content 쇰㈃ None) + briefing_json_path = Path(output_dir) / 'generated' / 'briefing_content.json' + briefing_content = None + if briefing_json_path.exists(): + briefing_content = json.loads(briefing_json_path.read_text(encoding='utf-8')) + + # 湲곗〈 html 諛吏 吏 + briefing_content + 留 異媛 + if report_html_path.exists(): + html = report_html_path.read_text(encoding='utf-8') + # briefing 寃곌낵臾 DB + try: + with get_conn() as conn: + with conn.cursor() as cur: + cur.execute(""" + INSERT INTO briefings (session_id, briefing_json, html) + VALUES (%s, %s, %s) + ON CONFLICT (session_id) DO UPDATE + SET briefing_json=EXCLUDED.briefing_json, html=EXCLUDED.html + """, (session_id, + json.dumps(briefing_content, ensure_ascii=False) if briefing_content else '', + html)) + conn.commit() + print(f"[DB] briefing + 猷: {session_id}", flush=True) + except Exception as de: + print(f"[DB] briefing ㅽ: {de}", flush=True) + return jsonify({ + 'success': True, + 'html': html, + 'briefing_content': briefing_content + }) + else: + return jsonify({ + 'success': False, + 'error': '蹂닿 + + 깆ㅽ⑦듬.' 
+ }) + + except Exception as e: + return jsonify({'success': False, 'error': str(e)}), 500 + + + + @app.route('/api/check-folder', methods=['POST']) + def check_folder(): + """대 寃쎈 + 遺 + 瑜대瑜 李얠 + 듬.'}) + + SUPPORTED = {'.hwpx', '.hwp', '.pdf', '.docx', '.xlsx', '.pptx', '.txt', '.csv', 'md', 'json','img', 'png', 'html'} + + all_files = [f for f in folder.rglob('*') if f.is_file()] + ok_files = [f for f in all_files if f.suffix.lower() in SUPPORTED] + unknown_files = [f for f in all_files if f.suffix.lower() not in SUPPORTED] + + return jsonify({ + 'success': True, + 'total': len(all_files), + 'ok': len(ok_files), + 'unknown': len(unknown_files), + 'ok_list': [{'name': f.name, 'size': f.stat().st_size} for f in ok_files], + 'unknown_list': [f.name for f in unknown_files] + }) + except Exception as e: + return jsonify({'success': False, 'error': str(e)}), 500 + + @app.route('/api/analyze-briefing', methods=['POST']) + def analyze_briefing(): + """ + +濡 + 遺 + + 댁 + 湲고 + 援ъ + 怨 + 諛 +濡 + +ㅽ 異異 + source_text = content + if session_id: + input_dir = Path(f'/tmp/{session_id}/input') + output_dir = Path(f'/tmp/{session_id}/output') + if input_dir.exists(): + try: + from converters.pipeline import step2_extract + output_dir.mkdir(parents=True, exist_ok=True) + step2_extract.process_all_pdfs(str(input_dir), str(output_dir)) + except Exception as ex: + print(f"step2 異異 + + 댁⑹듬.'}) + + client = openai.OpenAI(api_key=os.getenv('OPENAI_API_KEY')) + + prompt = f"""ㅼ 臾몄 + 瑜 遺 + + A4 湲고 + 濡 뎄 + 깊 怨 + + JSON쇰 諛 + 댁: +{source_text[:4000]} + +諛, + ㅻ + 紐", + "sections": [ + {{"type": "由щ諛", "content": "듭 硫吏 以 + "}}, + {{"type": " + 뱀 + +뱀 +紐 + 댁 + 댁 "}}, + {{"type": " + 뱀 + +뱀 +紐 + 댁 + 댁 "}}, + {{"type": "⑤", "content": "듭 寃곕 以 + "}} + ] + }}, + {{ + "page": 2, + "title": "[泥⑤] 紐", + "sections": [ + {{"type": " + 뱀 +⑤ 댁 "}}, + {{"type": "⑤", "content": "듭 寃곕"}} + ] + }} + ] +}} + +洹移: +- 蹂몃Ц 1 + + 泥⑤ 1 (댁⑹ + 留) +- 媛 + 뱀 + ㅼ 臾몄 + 댁 湲곕쇰 援ъ껜쇰 + +- JSON留 諛щㅼ 肄釉濡釉濡嫄 + raw = 
raw.replace('```json', '').replace('```', '').strip() + plan = json.loads(raw) + + return jsonify({'success': True, 'plan': plan}) + + except Exception as e: + import traceback + print(traceback.format_exc(), flush=True) + return jsonify({'success': False, 'error': str(e)}), 500 + + + @app.route('/api/generate-briefing', methods=['POST']) + def generate_briefing(): + """ + 援ъ + 怨 + + 肄硫 諛 ㅼ A4 Navy HTML 湲고 + + + """ + try: + import openai + data = request.get_json() + session_id = data.get('session_id', '') + plan = data.get('plan', {}) + comment = data.get('comment', '') + content = data.get('content', '') + doc_type = data.get('doc_type', '') + + # +ㅽ + source_text = content + if session_id: + input_dir = Path(f'/tmp/{session_id}/input') + output_dir = Path(f'/tmp/{session_id}/output') + + if input_dir.exists(): + # + step2濡 PDF +ㅽ 蹂쇱 ㅽ + try: + from converters.pipeline import step2_extract + output_dir.mkdir(parents=True, exist_ok=True) + step2_extract.process_all_pdfs(str(input_dir), str(output_dir)) + except Exception as ex: + print(f"step2 異異 + 異異 .md + ъ⑹硫 (諛 諛):\n{comment}" if comment else "" + + prompt = f"""ㅼ 援ъ + 怨 + 怨 臾몄 + 瑜 諛쇰 A4 HTML 湲고 + 瑜 + 깊 + 怨 + : +{plan_str} + + 臾몄 + : +{source_text[:5000]} + +[異 洹移] +- + +由 援ъ“瑜 곕 寃 +- Navy 而щ ㅽ 吏 (#1a365d, #2c5282, #f7fafc) +- .sheet overflow: hidden 댁⑹쇰 硫 +- 媛 蹂 + +
def parse_outline_for_frontend(outline_path: Path) -> list:
    """Parse ``outline_issue_report.txt`` into TOC items for the frontend.

    Expected layout, as implied by the patterns below:

    * The first line of the file is the report title.
    * A line of the form ``N.N.N <title>`` starts a new TOC item.
    * A bullet line (``-`` or ``*``) with four ``|``-separated fields
      ``<topic> | <#keywords> | [<bracketed>] | <guide>`` adds to the most
      recent item: every ``#word`` in the second field becomes a keyword
      (``#`` removed) and the fourth field is appended to the item's guide,
      with successive guides joined by ``' / '``.
    * Blank lines and lines matching neither pattern are ignored, as are
      bullet lines that appear before the first ``N.N.N`` heading.

    Args:
        outline_path: Path of the outline text file.

    Returns:
        A list of dicts, one per ``N.N.N`` heading in file order, each with
        the keys ``num``, ``title``, ``report_title``, ``guide`` (str) and
        ``keywords`` (list of str; repeated keywords are kept as they
        occur). An empty file yields ``[]``.

    Raises:
        OSError: If the file cannot be read.
    """
    import re

    # 'utf-8-sig' is plain UTF-8 that also drops a leading byte-order mark.
    # With 'utf-8' the BOM would stay at the start of the title, because
    # str.strip() does not treat U+FEFF as whitespace.
    raw = outline_path.read_text(encoding='utf-8-sig', errors='ignore').splitlines()
    if not raw:
        return []

    report_title = raw[0].strip()
    items = []

    re_l3_head = re.compile(r'^\s*(\d+\.\d+\.\d+)\s+(.+)$')
    re_l3_topic = re.compile(r'^\s*[\-\*]\s+(.+?)\s*\|\s*(.+?)\s*\|\s*(\[.+?\])\s*\|\s*(.+)$')
    re_keywords = re.compile(r'(#\S+)')

    current_l3 = None

    for ln in raw[1:]:
        line = ln.strip()
        if not line:
            continue

        m3h = re_l3_head.match(line)
        if m3h:
            current_l3 = {
                'num': m3h.group(1),
                'title': m3h.group(2),
                'report_title': report_title,
                'guide': '',
                'keywords': []
            }
            items.append(current_l3)
            continue

        m3t = re_l3_topic.match(line)
        if m3t and current_l3:
            # Accumulate this topic's keywords onto the current heading.
            kws = [k.lstrip('#').strip() for k in re_keywords.findall(m3t.group(2))]
            current_l3['keywords'].extend(kws)
            # Join the guides of successive topics under one heading.
            if current_l3['guide']:
                current_l3['guide'] += ' / '
            current_l3['guide'] += m3t.group(4)

    return items
+ merged = [] + idx = 0 + while idx < len(raw_lines): + ln = raw_lines[idx].strip() + if ln in [' + ', ''] and idx + 1 < len(raw_lines): + merged.append(ln + ' ' + raw_lines[idx + 1].strip()) + idx += 2 + continue + merged.append(raw_lines[idx]) + idx += 1 + lines = merged + + items = [] + current_page = None + + for line in lines: + line = line.strip() + if not line: + continue + + if ' + ' in line or '' in line: + icon = ' + ' if ' + ' in line else '' + title = line.replace(' + ', '').replace('', '').strip() + # "蹂몃Ц N" "蹂몃Ц", "泥⑤ N" "泥⑤ N" + import re as _re + title = _re.sub(r'蹂몃Ц\s*\d*?', '蹂몃Ц', title).strip() + title = _re.sub(r'泥⑤\s*(\d+)?', r'泥⑤ \1', title).strip() + if '紐:' in title: + title = title.split('紐:')[-1].strip() + if not title: + title = '蹂몃Ц' if icon == ' + ' else '泥⑤' + current_page = { + 'num': icon, + 'title': title, + 'guide': '', + 'keywords': [], + 'sections': [] + } + items.append(current_page) + + elif current_page is not None and line.strip(): + content = line.lstrip('-').strip() + + if content.startswith(':'): + continue + + if content.startswith('紐:'): + current_page['title'] = content.replace('紐:', '').strip() + continue + + # 由щ諛/⑤ (": " щ諛/⑤ ы⑤ 蹂 + + 泥) + import re as _re + if _re.match(r'^由щ諛\s*:', content): + lead_text = content.split(':', 1)[-1].strip() + current_page['sections'].append({'label': '由щ諛', 'text': lead_text}) + continue + if _re.match(r'^⑤\s*:', content): + bottom_text = content.split(':', 1)[-1].strip() + current_page['sections'].append({'label': '⑤', 'text': bottom_text}) + continue + + if '|' in content: + parts = [p.strip() for p in content.split('|')] + section_name = parts[0].split(':')[-1].strip() + comment = parts[1] if len(parts) > 1 else '' + fmt = parts[3] if len(parts) > 3 else (parts[2] if len(parts) > 2 else '') + current_page['sections'].append({ + 'label': section_name, + 'text': comment, + 'fmt': fmt + }) + else: + current_page['sections'].append({'label': content, 'text': ''}) + + # guide 嫄 - 
sections 由ъㅽ 洹몃濡 + 濡몄 + + for item in items: + item['guide'] = '' # + 濡몄 + sections 吏 ъ + + return items