From f22fd99fec783d9239bc025f5a93aeb978fcc797 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=EC=9D=B4=EA=B2=BD=EB=AF=BC?= <b24009@hanmaceng.co.kr>
Date: Thu, 19 Mar 2026 09:39:13 +0900
Subject: [PATCH] Upload converter pipeline: router.py

---
 .../업로드용/converters/pipeline/router.py    | 116 +++++++++---------
 1 file changed, 56 insertions(+), 60 deletions(-)

diff --git a/03.Code/업로드용/converters/pipeline/router.py b/03.Code/업로드용/converters/pipeline/router.py
index 6897910..e30f755 100644
--- a/03.Code/업로드용/converters/pipeline/router.py
+++ b/03.Code/업로드용/converters/pipeline/router.py
@@ -6,25 +6,24 @@ load_dotenv()
 router.py
 
 기능:
-- HTML 파일 처리 로직의 메인 라우터
-- 문서 길이에 따라 Short Pipeline 또는 Long Pipeline으로 분기
-- Short Pipeline: 단순 HTML 변환 (step7, 8, 9 생략 가능)
-- Long Pipeline: RAG 기반의 문서 재구성 (step3~9 전체 과정)
+- HTML 입력의 분량을 판단하여 적절한 파이프라인으로 분기
+- 긴 문서 (5000자 이상): RAG 파이프라인 (step3→4→5→6→7→8→9)
+- 짧은 문서 (5000자 미만): 직접 생성 (step7→8→9)
 """
 
 import re
 import os
 from typing import Dict, Any
 
-# 분기 기준 문서 길이
-LONG_DOC_THRESHOLD = 5000  # 5000자 이상일 경우 Long Pipeline으로 분기
+# 분량 판단 기준
+LONG_DOC_THRESHOLD = 5000  # 5000자 이상이면 긴 문서
 
-# 앱 내 assets 경로 (개발 및 배포용) - r prefix 사용 안함!
+# 이미지 assets 경로 (환경변수 ASSETS_BASE_PATH로 지정, 기본값 /tmp/assets)
 ASSETS_BASE_PATH = os.environ.get("ASSETS_BASE_PATH", "/tmp/assets")
 
 
 def count_characters(html_content: str) -> int:
-    """HTML 태그를 제외한 실제 텍스트 글자 수 계산"""
+    """HTML 태그 제외한 순수 텍스트 글자 수 계산"""
     # HTML 태그 제거
     text_only = re.sub(r'<[^>]+>', '', html_content)
     # 공백 정리
@@ -33,45 +32,43 @@ def count_characters(html_content: str) -> int:
 
 
 def is_long_document(html_content: str) -> bool:
-    """긴 문서 여부 판별"""
+    """긴 문서 여부 판단"""
     char_count = count_characters(html_content)
     return char_count >= LONG_DOC_THRESHOLD
 
-
 def convert_image_paths(html_content: str) -> str:
     """
-    HTML 내의 이미지 경로를 상대 경로로 변경
-    - assets/xxx.png -> /assets/xxx.png (Flask 정적 파일 대응)
-    - 외부 경로는 그대로 유지
+    HTML 내 이미지 경로를 서버 경로로 변환
+    - assets/xxx.png → /assets/xxx.png (Flask 서빙용)
+    - 절대 경로나 URL은 그대로 유지
     """
-
+    
     def replace_src(match):
         original_path = match.group(1)
-
-        # 절대 경로 또는 URL인 경우 그대로 유지
+        
+        # 이미 절대 경로이거나 URL이면 그대로
         if original_path.startswith(('http://', 'https://', 'file://', 'D:', 'C:', '/')):
             return match.group(0)
-
-        # assets/로 시작하면 /assets/로 변경 (Flask 대응)
+        
+        # assets/로 시작하면 /assets/로 변환 (Flask 서빙)
         if original_path.startswith('assets/'):
             return f'src="/{original_path}"'
-
+        
         return match.group(0)
-
-    # src="..." 패턴을 찾아서 변경
+    
+    # src="..." 패턴 찾아서 변환
     result = re.sub(r'src="([^"]+)"', replace_src, html_content)
     return result
 
-
 def run_short_pipeline(html_content: str, options: dict) -> Dict[str, Any]:
     """
-    단기 파이프라인 (5000자 미만)
+    짧은 문서 파이프라인 (5000자 미만)
     """
     try:
-        # 이미지 경로 변환 로직
+        # 이미지 경로 변환
         processed_html = convert_image_paths(html_content)
-
-        # TODO: step7, step8, step9 과정 최적화
+        
+        # TODO: step7, step8, step9 연동
         return {
             'success': True,
             'pipeline': 'short',
@@ -85,100 +82,99 @@ def run_short_pipeline(html_content: str, options: dict) -> Dict[str, Any]:
             'pipeline': 'short'
         }
 
-
 def inject_template_css(html_content: str, template_css: str) -> str:
     """
-    HTML문서에 템플릿 CSS 주입
-    - <style> 태그가 있으면 그 뒤에 추가
-    - 없으면 <head>내에 추가
+    HTML에 템플릿 CSS 주입
+    - <style> 태그가 있으면 그 안에 추가
+    - 없으면 <head>에 새로 생성
     """
     if not template_css:
         return html_content
-
-    css_block = f"\n/* ===== 템플릿 스타일 추가 ===== */\n{template_css}\n"
-
-    # 기존에 </style> 태그가 있는 경우
+    
+    css_block = f"\n/* ===== 템플릿 스타일 ===== */\n{template_css}\n"
+    
+    # 기존 </style> 태그 앞에 추가
     if '</style>' in html_content:
         return html_content.replace('</style>', f'{css_block}</style>', 1)
-
-    # <head> 태그 뒤에 추가
+    
+    # <head> 태그 뒤에 새로 추가
     elif '<head>' in html_content:
         return html_content.replace('<head>', f'<head>\n<style>{css_block}</style>', 1)
-
-    # head가 없는 경우 맨 앞에 추가
+    
+    # head도 없으면 맨 앞에 추가
     else:
         return f'<style>{css_block}</style>\n{html_content}'
 
 
 def run_long_pipeline(html_content: str, options: dict) -> Dict[str, Any]:
     """
-    장기 파이프라인 (5000자 이상)
-    단계별 step 실행을 위한 준비
+    긴 문서 파이프라인 (5000자 이상)
+    step을 직접 호출하지 않고 이미지 경로 변환 후 HTML만 통과시킴
     """
     try:
         processed_html = convert_image_paths(html_content)
-
+        
         folder_path = options.get('folder_path', '')
         write_mode = options.get('write_mode', 'restructure')
-
+        
         if not folder_path:
-            # 폴더가 없으면 HTML만 처리 (기존 로직)
+            # 폴더 없으면 HTML만으로 처리 (기존 로직)
             return {
                 'success': True,
                 'pipeline': 'long',
                 'char_count': count_characters(html_content),
                 'html': processed_html
             }
-
-        # 이 단계 이후 /api/generate-toc 와 /api/generate-report-from-toc 에서 처리
-        # router는 우선 HTML 통과만 담당함
+        
+        # ★ 파이프라인 실행은 /api/generate-toc → /api/generate-report-from-toc 에서 처리
+        # router는 여전히 HTML 통과 역할 유지
         return {
             'success': True,
             'pipeline': 'long',
             'char_count': count_characters(html_content),
             'html': processed_html,
-            'needs_pipeline': True  # 프론트엔드 분기 처리용
+            'needs_pipeline': True  # ← 프론트에서 분기 판단용
         }
-
+        
     except Exception as e:
         return {'success': False, 'error': str(e), 'pipeline': 'long'}
 
 
 def process_document(content: str, options: dict = None) -> Dict[str, Any]:
     """
-    문서 처리를 위한 메인 진입점
-    - 분기 로직: 문서 길이에 따라 다른 파이프라인 실행
-
+    메인 라우터 함수
+    - 분량에 따라 적절한 파이프라인으로 분기
+    
     Args:
-        content: HTML 내용
+        content: HTML 문자열
         options: 추가 옵션 (page_option, instruction 등)
-
+    
     Returns:
         {'success': bool, 'html': str, 'pipeline': str, ...}
     """
     if options is None:
         options = {}
-
+    
     if not content or not content.strip():
         return {
             'success': False,
             'error': '내용이 비어있습니다.'
         }
-
+    
     char_count = count_characters(content)
-
+    
     if is_long_document(content):
         result = run_long_pipeline(content, options)
     else:
         result = run_short_pipeline(content, options)
-
+    
     # 공통 정보 추가
     result['char_count'] = char_count
     result['threshold'] = LONG_DOC_THRESHOLD
-
-    # 템플릿 CSS 주입
+    
+    # ⭐ 템플릿 CSS 주입
     template_css = options.get('template_css')
     if template_css and result.get('success') and result.get('html'):
         result['html'] = inject_template_css(result['html'], template_css)
-
+    
     return result