diff --git a/03.Code/업로드용/domain_api.py b/03.Code/업로드용/domain_api.py deleted file mode 100644 index 3356919..0000000 --- a/03.Code/업로드용/domain_api.py +++ /dev/null @@ -1,962 +0,0 @@ -# -*- coding: utf-8 -*- -from dotenv import load_dotenv -load_dotenv() - -""" -domain_api.py - - - 硫由 API + - -ъ⑸ (app.py): - from domain_api import register_domain_routes - register_domain_routes(app) -""" - -import os -import json -from pathlib import Path -from flask import request, jsonify -import threading -import hashlib -import psycopg2 -from db import get_conn -from db import get_conn - -# ===== 寃쎈 - ㅼ ===== -BASE_DIR = Path(__file__).parent -DOMAIN_CONFIG_PATH = BASE_DIR / "domain_config.json" -DOMAIN_DIR = BASE_DIR / "domain" -# - 寃쎈 (step3~9媛 ъ⑺ 寃쎈) -PIPELINE_OUTPUT_ROOT = Path(os.getenv("PIPELINE_OUTPUT_ROOT", "/tmp/pipeline_output")) -CONTEXT_DIR = PIPELINE_OUTPUT_ROOT / "context" -pipeline_jobs = {} -def register_domain_routes(app): - """Flask 깆 - 硫 고 깅 - @app.route('/api/domain-config', methods=['GET']) - def get_domain_config(): - """ - 硫 - ㅼ 諛 - 硫 щ 泥댄 - for cat in config.get('categories', []): - if cat.get('file'): - fpath = DOMAIN_DIR / cat['file'] - cat['file_exists'] = fpath.exists() - cat['file_size'] = fpath.stat().st_size if fpath.exists() else 0 - - for child in cat.get('children', []): - if child.get('file'): - fpath = DOMAIN_DIR / child['file'] - child['file_exists'] = fpath.exists() - child['file_size'] = fpath.stat().st_size if fpath.exists() else 0 - - return jsonify(config) - else: - return jsonify({'error': 'domain_config.json not found', 'categories': []}), 404 - except Exception as e: - return jsonify({'error': str(e), 'categories': []}), 500 - - - @app.route('/api/domain-combine', methods=['POST']) - def combine_domains(): - """ - - - 硫㼼 - ⑹ - domain_prompt.txt濡 - - 泥: - { "selected": ["civil_general", "survey", "bim"] } - - : - { "success": true, "combined_length": 3200, "selected_names": [...] } - """ - try: - data = request.get_json() - selected_ids = data.get('selected', []) - - if not selected_ids: - return jsonify({ - 'success': True, - 'combined_length': 0, - 'selected_names': [], - 'message': ' - - step3 遺 - - ' - }) - - # config 濡 - config = json.loads(DOMAIN_CONFIG_PATH.read_text(encoding='utf-8')) - - # - ID + - 留ㅽ - domain_parts = [] - guide_parts = [] - selected_names = [] - - for cat in config.get('categories', []): - is_guide = (cat['id'] == 'report_guide') - target = guide_parts if is_guide else domain_parts - - if cat['id'] in selected_ids and cat.get('file'): - fpath = DOMAIN_DIR / cat['file'] - if fpath.exists(): - content = fpath.read_text(encoding='utf-8', errors='ignore').strip() - if content: - target.append(f"[{cat['label']}]\n{content}") - selected_names.append(cat['label']) - - for child in cat.get('children', []): - if child['id'] in selected_ids and child.get('file'): - fpath = DOMAIN_DIR / child['file'] - if fpath.exists(): - content = fpath.read_text(encoding='utf-8', errors='ignore').strip() - if content: - target.append(f"[{child['label']}]\n{content}") - selected_names.append(child['label']) - selected_names.append(child['label']) - - if not domain_parts and not guide_parts: - return jsonify({ - 'success': False, - 'error': ' - - 硫 - 댁듬.' - }) - - sep = "\n\n" + "=" * 50 + "\n\n" - sections = [] - - if domain_parts: - domain_names = [n for n in selected_names if n not in ['紐⑹감 援ъ - 媛', '蹂닿 - 臾몄껜 媛']] - sections.append( - f" - ㅼ 遺 - 쇱 - 臾멸: {', '.join(domain_names)}.\n" - f"ㅼ - 硫 - 湲곕쇰, ъㅼ쇨굅 - 臾몄 댁⑹ - - 깊痢≪쎌 湲吏, 怨듬 洹쇨굅 몄 - 理 蹂댁〈 - 硫 - 臾 吏 - - 媛]\n" - f"ㅼ 媛瑜 李멸 蹂닿 - ⑹감 援ъ - 깃낵 臾몄껜瑜 寃곗 二쇱 - (ъ⑹ - - 닿 - - 媛]\n" - "ㅼ 媛瑜 李멸 蹂닿 - ⑹감 援ъ - 깃낵 臾몄껜瑜 寃곗 - CONTEXT_DIR.mkdir(parents=True, exist_ok=True) - output_path = CONTEXT_DIR / "domain_prompt.txt" - output_path.write_text(final_text, encoding='utf-8') - - return jsonify({ - 'success': True, - 'combined_length': len(final_text), - 'selected_names': selected_names, - 'selected_ids': selected_ids, - 'output_path': str(output_path) - }) - - except Exception as e: - return jsonify({'success': False, 'error': str(e)}), 500 - - - @app.route('/api/domain-list', methods=['GET']) - def list_domain_files(): - """ - domains/ 대 - 硫由ъ - """ - try: - files = [] - - if DOMAIN_DIR.exists(): - for f in sorted(DOMAIN_DIR.rglob('*.txt')): - rel = f.relative_to(DOMAIN_DIR) - files.append({ - 'path': str(rel), - 'name': f.stem, - 'size': f.stat().st_size, - 'preview': f.read_text(encoding='utf-8', errors='ignore')[:200] - }) - - return jsonify({ - 'success': True, - 'files': files, - 'domains_dir': str(DOMAIN_DIR) - }) - - except Exception as e: - return jsonify({'success': False, 'error': str(e)}), 500 - - - @app.route('/api/domain-save', methods=['POST']) - def save_domain_file(): - """ - - 硫/ - - 泥: - { "id": "survey", "content": "痢〓 遺 - 쇱 - 臾 吏 content媛 - ⑸.'}) - - # config - 李얘린 - config = json.loads(DOMAIN_CONFIG_PATH.read_text(encoding='utf-8')) - file_path = None - - for cat in config.get('categories', []): - if cat['id'] == domain_id: - file_path = cat.get('file') - break - for child in cat.get('children', []): - if child['id'] == domain_id: - file_path = child.get('file') - break - if file_path: - break - - if not file_path: - return jsonify({'success': False, 'error': f' - 硫 - 듬: {domain_id}'}) - - # - full_path = BASE_DIR / file_path - full_path.parent.mkdir(parents=True, exist_ok=True) - full_path.write_text(content, encoding='utf-8') - - return jsonify({ - 'success': True, - 'path': str(full_path), - 'size': len(content) - }) - - except Exception as e: - return jsonify({'success': False, 'error': str(e)}), 500 - - - @app.route('/api/pipeline/status', methods=['GET']) - def pipeline_status(): - """ - step щ""" - try: - status = { - 'step3_domain': (CONTEXT_DIR / 'domain_prompt.txt').exists(), - 'step4_chunks': len(list((PIPELINE_OUTPUT_ROOT / 'rag').glob('*_chunks.json'))) if (PIPELINE_OUTPUT_ROOT / 'rag').exists() else 0, - 'step5_faiss': (PIPELINE_OUTPUT_ROOT / 'rag' / 'faiss.index').exists(), - 'step6_corpus': (CONTEXT_DIR / 'corpus.txt').exists(), - 'step7_outline': (CONTEXT_DIR / 'outline_issue_report.txt').exists(), - 'step8_report': (PIPELINE_OUTPUT_ROOT / 'generated' / 'report_draft.md').exists(), - 'step9_html': (PIPELINE_OUTPUT_ROOT / 'generated' / 'report.html').exists(), - } - - return jsonify({'success': True, 'status': status}) - - except Exception as e: - return jsonify({'success': False, 'error': str(e)}), 500 - - def run_toc_pipeline(session_id, input_dir, output_dir, doc_type='report', attach_pages=1): - try: - pipeline_jobs[session_id] = {'status': 'running', 'step': 2} - from converters.pipeline import step2_extract, step3_domain, step4_chunk, step5_rag, step6_corpus, step7_index - - # 댁 怨 - - input_files = sorted(Path(input_dir).glob('*')) if Path(input_dir).exists() else [] - file_hashes = [] - new_files = [] # RAG 罹 HIT: {f.name} ({h})", flush=True) - else: - new_files.append(f) - print(f"[DB] RAG 罹 MISS: {f.name} ({h})", flush=True) - except Exception as de: - print(f"[DB] 罹 議고 ㅽ, 洹 泥: {de}", flush=True) - new_files.append(f) - - # 洹 step2~5 ㅽ - if new_files: - step2_extract.process_all_pdfs(input_dir, output_dir) - pipeline_jobs[session_id]['step'] = 3 - step3_domain.main(input_dir, output_dir) - pipeline_jobs[session_id]['step'] = 4 - step4_chunk.main(output_dir, output_dir) - pipeline_jobs[session_id]['step'] = 5 - step5_rag.main(output_dir, output_dir) - - # RAG 寃곌낵臾 DB - faiss_path = rag_dir / 'faiss.index' - vectors_path = rag_dir / 'vectors.npy' - meta_path = rag_dir / 'meta.json' - chunks_files = list(rag_dir.glob('*_chunks.json')) - - faiss_bytes = faiss_path.read_bytes() if faiss_path.exists() else b'' - vectors_bytes = vectors_path.read_bytes() if vectors_path.exists() else b'' - meta_text = meta_path.read_text(encoding='utf-8') if meta_path.exists() else '' - chunks_text = chunks_files[0].read_text(encoding='utf-8') if chunks_files else '' - - for f in new_files: - h = hashlib.md5(f.read_bytes()).hexdigest() - try: - with get_conn() as conn: - with conn.cursor() as cur: - cur.execute(""" - INSERT INTO files (file_hash, filename) - VALUES (%s, %s) - ON CONFLICT (file_hash) DO NOTHING - """, (h, f.name)) - cur.execute(""" - INSERT INTO rag_cache (file_hash, chunks_json, faiss_index, vectors, meta_json) - VALUES (%s, %s, %s, %s, %s) - ON CONFLICT (file_hash) DO NOTHING - """, (h, chunks_text, - psycopg2.Binary(faiss_bytes), - psycopg2.Binary(vectors_bytes), - meta_text)) - conn.commit() - print(f"[DB] RAG 罹 : {f.name}", flush=True) - except Exception as de: - print(f"[DB] RAG ㅽ: {de}", flush=True) - else: - print("[DB] 紐⑤ HIT step2/4/5 ㅽ, step3 ㅽ + 罹 蹂듭 - RAG 寃곌낵臾 蹂듭 蹂듭 - 猷", flush=True) - except Exception as de: - print(f"[DB] 罹 蹂듭ㅽ, step4~5 ㅽ: {de}", flush=True) - step4_chunk.main(output_dir, output_dir) - step5_rag.main(output_dir, output_dir) - pipeline_jobs[session_id]['step'] = 5 - - # step6~7 ㅽ - pipeline_jobs[session_id]['step'] = 6 - step6_corpus.main(output_dir, output_dir) - pipeline_jobs[session_id]['step'] = 7 - step7_index.main(output_dir, output_dir, doc_type=doc_type) - - outline_txt = Path(output_dir) / 'context' / 'outline_issue_report.txt' - print("[DEBUG outline]", outline_txt.read_text(encoding='utf-8')[:500], flush=True) - - # sessions / outlines DB - outline_text = outline_txt.read_text(encoding='utf-8') if outline_txt.exists() else '' - try: - with get_conn() as conn: - with conn.cursor() as cur: - cur.execute(""" - INSERT INTO sessions (session_id, file_hashes, doc_type) - VALUES (%s, %s, %s) - ON CONFLICT (session_id) DO UPDATE SET doc_type=EXCLUDED.doc_type - """, (session_id, file_hashes, doc_type)) - cur.execute(""" - INSERT INTO outlines (session_id, outline_text) - VALUES (%s, %s) - ON CONFLICT (session_id) DO UPDATE SET outline_text=EXCLUDED.outline_text - """, (session_id, outline_text)) - conn.commit() - print(f"[DB] session/outline - 猷: {session_id}", flush=True) - except Exception as de: - print(f"[DB] session/outline ㅽ: {de}", flush=True) - - pipeline_jobs[session_id] = {'status': 'done', 'doc_type': doc_type} - - except Exception as e: - import traceback - print(f"[PIPELINE ERROR] {e}", flush=True) - print(traceback.format_exc(), flush=True) - pipeline_jobs[session_id] = {'status': 'error', 'error': str(e)} - - - # ===== - ㅽ API ===== - @app.route('/api/generate-toc', methods=['POST']) - def generate_toc(): - """ - 紐⑹감 - API (step3 4 5 6 7) - - - 硫 - - 寃쎌: step3 ㅽ () - - 硫 - 寃쎌: step3 ㅽ - - 泥: - { - "folder_path": "D:\\...", - "domain_selected": true/false, - "selected_domains": ["civil_general", "survey"] - } - - : - { - "success": true, - "title": "蹂닿 - 紐", - "toc_items": [ - { "num": "1.1.1", "title": "...", "guide": "...", "keywords": [...] } - ] - } - """ - try: - data = request.get_json() - session_id = data.get('session_id', '') - domain_selected = data.get('domain_selected', False) - write_mode = data.get('write_mode', 'restructure') - instruction = data.get('instruction', '') - - if not session_id: - return jsonify({'success': False, 'error': 'session_id媛 듬. - 癒쇱 -濡 - 몄.'}) - - input_dir = f'/tmp/{session_id}/input' - output_dir = f'/tmp/{session_id}/output' - os.makedirs(output_dir, exist_ok=True) - - doc_type = data.get('doc_type', 'report') - attach_pages = int(data.get('attach_pages', 1)) - t = threading.Thread(target=run_toc_pipeline, args=(session_id, input_dir, output_dir, doc_type, attach_pages)) - t.daemon = True - t.start() - return jsonify({'success': True, 'status': 'processing', 'session_id': session_id}) - - except Exception as e: - return jsonify({'success': False, 'error': str(e)}), 500 - - @app.route('/api/toc-status/', methods=['GET']) - def toc_status(session_id): - job = pipeline_jobs.get(session_id, {'status': 'unknown'}) - if job.get('status') == 'done': - outline_path = Path(f'/tmp/{session_id}/output/context/outline_issue_report.txt') - if outline_path.exists(): - doc_type = job.get('doc_type', 'report') - if doc_type == 'briefing': - toc_items = parse_briefing_plan_for_frontend(outline_path) - else: - toc_items = parse_outline_for_frontend(outline_path) - return jsonify({'status': 'done', 'toc_items': toc_items}) - return jsonify(job) - - - @app.route('/api/generate-report-from-toc', methods=['POST']) - def generate_report_from_toc(): - """ - 몄 紐⑹감濡 蹂닿 - - (step8 step9) - - 泥: - { - "toc_items": [...], # 몄 紐⑹감 - "write_mode": "restructure", - "instruction": "..." - } - """ - try: - data = request.get_json() - session_id = data.get('session_id', '') - toc_items = data.get('toc_items', []) - write_mode = data.get('write_mode', 'restructure') - instruction = data.get('instruction', '') - - if not session_id: - return jsonify({'success': False, 'error': 'session_id媛 듬.'}) - - input_dir = f'/tmp/{session_id}/input' - output_dir = f'/tmp/{session_id}/output' - - from converters.pipeline import step8_content, step9_html - - doc_type = data.get('doc_type', 'report') - - step8_content.main(output_dir, output_dir, doc_type=doc_type) - step9_html.main(output_dir, output_dir, doc_type=doc_type) - - report_html_path = Path(output_dir) / 'generated' / 'report.html' - - # briefing_content 쇰㈃ None) - briefing_json_path = Path(output_dir) / 'generated' / 'briefing_content.json' - briefing_content = None - if briefing_json_path.exists(): - briefing_content = json.loads(briefing_json_path.read_text(encoding='utf-8')) - - # 湲곗〈 html 諛吏 吏 + briefing_content - 留 異媛 - if report_html_path.exists(): - html = report_html_path.read_text(encoding='utf-8') - # briefing 寃곌낵臾 DB - try: - with get_conn() as conn: - with conn.cursor() as cur: - cur.execute(""" - INSERT INTO briefings (session_id, briefing_json, html) - VALUES (%s, %s, %s) - ON CONFLICT (session_id) DO UPDATE - SET briefing_json=EXCLUDED.briefing_json, html=EXCLUDED.html - """, (session_id, - json.dumps(briefing_content, ensure_ascii=False) if briefing_content else '', - html)) - conn.commit() - print(f"[DB] briefing - 猷: {session_id}", flush=True) - except Exception as de: - print(f"[DB] briefing ㅽ: {de}", flush=True) - return jsonify({ - 'success': True, - 'html': html, - 'briefing_content': briefing_content - }) - else: - return jsonify({ - 'success': False, - 'error': '蹂닿 - - 깆ㅽ⑦듬.' - }) - - except Exception as e: - return jsonify({'success': False, 'error': str(e)}), 500 - - - - @app.route('/api/check-folder', methods=['POST']) - def check_folder(): - """대 寃쎈 - 遺 - 瑜대瑜 李얠 - 듬.'}) - - SUPPORTED = {'.hwpx', '.hwp', '.pdf', '.docx', '.xlsx', '.pptx', '.txt', '.csv', 'md', 'json','img', 'png', 'html'} - - all_files = [f for f in folder.rglob('*') if f.is_file()] - ok_files = [f for f in all_files if f.suffix.lower() in SUPPORTED] - unknown_files = [f for f in all_files if f.suffix.lower() not in SUPPORTED] - - return jsonify({ - 'success': True, - 'total': len(all_files), - 'ok': len(ok_files), - 'unknown': len(unknown_files), - 'ok_list': [{'name': f.name, 'size': f.stat().st_size} for f in ok_files], - 'unknown_list': [f.name for f in unknown_files] - }) - except Exception as e: - return jsonify({'success': False, 'error': str(e)}), 500 - - @app.route('/api/analyze-briefing', methods=['POST']) - def analyze_briefing(): - """ - -濡 - 遺 - - 댁 - 湲고 - 援ъ - 怨 - 諛 -濡 - -ㅽ 異異 - source_text = content - if session_id: - input_dir = Path(f'/tmp/{session_id}/input') - output_dir = Path(f'/tmp/{session_id}/output') - if input_dir.exists(): - try: - from converters.pipeline import step2_extract - output_dir.mkdir(parents=True, exist_ok=True) - step2_extract.process_all_pdfs(str(input_dir), str(output_dir)) - except Exception as ex: - print(f"step2 異異 - - 댁⑹듬.'}) - - client = openai.OpenAI(api_key=os.getenv('OPENAI_API_KEY')) - - prompt = f"""ㅼ 臾몄 - 瑜 遺 - - A4 湲고 - 濡 뎄 - 깊 怨 - - JSON쇰 諛 - 댁: -{source_text[:4000]} - -諛, - ㅻ - 紐", - "sections": [ - {{"type": "由щ諛", "content": "듭 硫吏 以 - "}}, - {{"type": " - 뱀 - -뱀 -紐 + 댁 - 댁 "}}, - {{"type": " - 뱀 - -뱀 -紐 + 댁 - 댁 "}}, - {{"type": "⑤", "content": "듭 寃곕 以 - "}} - ] - }}, - {{ - "page": 2, - "title": "[泥⑤] 紐", - "sections": [ - {{"type": " - 뱀 -⑤ 댁 "}}, - {{"type": "⑤", "content": "듭 寃곕"}} - ] - }} - ] -}} - -洹移: -- 蹂몃Ц 1 + - 泥⑤ 1 (댁⑹ - 留) -- 媛 - 뱀 - ㅼ 臾몄 - 댁 湲곕쇰 援ъ껜쇰 - -- JSON留 諛щㅼ 肄釉濡釉濡嫄 - raw = raw.replace('```json', '').replace('```', '').strip() - plan = json.loads(raw) - - return jsonify({'success': True, 'plan': plan}) - - except Exception as e: - import traceback - print(traceback.format_exc(), flush=True) - return jsonify({'success': False, 'error': str(e)}), 500 - - - @app.route('/api/generate-briefing', methods=['POST']) - def generate_briefing(): - """ - 援ъ - 怨 - + 肄硫 諛 ㅼ A4 Navy HTML 湲고 - - - """ - try: - import openai - data = request.get_json() - session_id = data.get('session_id', '') - plan = data.get('plan', {}) - comment = data.get('comment', '') - content = data.get('content', '') - doc_type = data.get('doc_type', '') - - # -ㅽ - source_text = content - if session_id: - input_dir = Path(f'/tmp/{session_id}/input') - output_dir = Path(f'/tmp/{session_id}/output') - - if input_dir.exists(): - # - step2濡 PDF -ㅽ 蹂쇱 ㅽ - try: - from converters.pipeline import step2_extract - output_dir.mkdir(parents=True, exist_ok=True) - step2_extract.process_all_pdfs(str(input_dir), str(output_dir)) - except Exception as ex: - print(f"step2 異異 - 異異 .md - ъ⑹硫 (諛 諛):\n{comment}" if comment else "" - - prompt = f"""ㅼ 援ъ - 怨 - 怨 臾몄 - 瑜 諛쇰 A4 HTML 湲고 - 瑜 - 깊 - 怨 - : -{plan_str} - - 臾몄 - : -{source_text[:5000]} - -[異 洹移] -- - -由 援ъ“瑜 곕 寃 -- Navy 而щ ㅽ 吏 (#1a365d, #2c5282, #f7fafc) -- .sheet overflow: hidden 댁⑹쇰 硫 -- 媛 蹂 - -
濡 援щ - -- HTML - 泥 肄留 諛 - ㅻ - -由] - - - - - - - - - -""" - - resp = client.chat.completions.create( - model=os.getenv('OPENAI_MODEL', 'gpt-4o'), - messages=[{'role': 'user', 'content': prompt}], - temperature=0.4, - max_tokens=4000 - ) - - html = resp.choices[0].message.content.strip() - # 肄釉濡嫄 - if html.startswith('```'): - html = html.split('\n', 1)[1] - html = html.rsplit('```', 1)[0] - - return jsonify({'success': True, 'html': html}) - - except Exception as e: - import traceback - print(traceback.format_exc(), flush=True) - return jsonify({'success': False, 'error': str(e)}), 500 - - - -def parse_outline_for_frontend(outline_path: Path) -> list: - """ - outline_issue_report.txt瑜 깊 - displayTocWithAnimation() 쇰 蹂紐⑹감 紐", - "guide": "吏 - 媛", - "keywords": ["ㅼ1", "ㅼ2"] - } - ] - """ - import re - - raw = outline_path.read_text(encoding='utf-8', errors='ignore').splitlines() - if not raw: - return [] - - report_title = raw[0].strip() - items = [] - - re_l3_head = re.compile(r'^\s*(\d+\.\d+\.\d+)\s+(.+)$') - re_l3_topic = re.compile(r'^\s*[\-\*]\s+(.+?)\s*\|\s*(.+?)\s*\|\s*(\[.+?\])\s*\|\s*(.+)$') - re_keywords = re.compile(r'(#\S+)') - - current_l3 = None - - for ln in raw[1:]: - line = ln.strip() - if not line: - continue - - m3h = re_l3_head.match(line) - if m3h: - current_l3 = { - 'num': m3h.group(1), - 'title': m3h.group(2), - 'report_title': report_title, - 'guide': '', - 'keywords': [] - } - items.append(current_l3) - continue - - m3t = re_l3_topic.match(line) - if m3t and current_l3: - kws = [k.lstrip('#').strip() for k in re_keywords.findall(m3t.group(2))] - # 湲곗〈 ㅼ媛 - current_l3['keywords'].extend(kws) - # 媛 - - if current_l3['guide']: - current_l3['guide'] += ' / ' - current_l3['guide'] += m3t.group(4) - - return items - -def parse_briefing_plan_for_frontend(outline_path: Path) -> list: - raw = outline_path.read_text(encoding='utf-8', errors='ignore').strip() - raw_lines = raw.splitlines() - if not raw_lines: - return [] - - # - / 媛 ⑤ - 以 - 濡 寃쎌 ㅼ 以 - 怨 ⑹ - merged = [] - idx = 0 - while idx < len(raw_lines): - ln = raw_lines[idx].strip() - if ln in [' - ', ''] and idx + 1 < len(raw_lines): - merged.append(ln + ' ' + raw_lines[idx + 1].strip()) - idx += 2 - continue - merged.append(raw_lines[idx]) - idx += 1 - lines = merged - - items = [] - current_page = None - - for line in lines: - line = line.strip() - if not line: - continue - - if ' - ' in line or '' in line: - icon = ' - ' if ' - ' in line else '' - title = line.replace(' - ', '').replace('', '').strip() - # "蹂몃Ц N" "蹂몃Ц", "泥⑤ N" "泥⑤ N" - import re as _re - title = _re.sub(r'蹂몃Ц\s*\d*?', '蹂몃Ц', title).strip() - title = _re.sub(r'泥⑤\s*(\d+)?', r'泥⑤ \1', title).strip() - if '紐:' in title: - title = title.split('紐:')[-1].strip() - if not title: - title = '蹂몃Ц' if icon == ' - ' else '泥⑤' - current_page = { - 'num': icon, - 'title': title, - 'guide': '', - 'keywords': [], - 'sections': [] - } - items.append(current_page) - - elif current_page is not None and line.strip(): - content = line.lstrip('-').strip() - - if content.startswith(':'): - continue - - if content.startswith('紐:'): - current_page['title'] = content.replace('紐:', '').strip() - continue - - # 由щ諛/⑤ (": " щ諛/⑤ ы⑤ 蹂 - - 泥) - import re as _re - if _re.match(r'^由щ諛\s*:', content): - lead_text = content.split(':', 1)[-1].strip() - current_page['sections'].append({'label': '由щ諛', 'text': lead_text}) - continue - if _re.match(r'^⑤\s*:', content): - bottom_text = content.split(':', 1)[-1].strip() - current_page['sections'].append({'label': '⑤', 'text': bottom_text}) - continue - - if '|' in content: - parts = [p.strip() for p in content.split('|')] - section_name = parts[0].split(':')[-1].strip() - comment = parts[1] if len(parts) > 1 else '' - fmt = parts[3] if len(parts) > 3 else (parts[2] if len(parts) > 2 else '') - current_page['sections'].append({ - 'label': section_name, - 'text': comment, - 'fmt': fmt - }) - else: - current_page['sections'].append({'label': content, 'text': ''}) - - # guide 嫄 - sections 由ъㅽ 洹몃濡 - 濡몄 - - for item in items: - item['guide'] = '' # - 濡몄 - sections 吏 ъ - - return items