feat: 프로젝트 활성도 분석 시스템 및 크롤링 인증/중단 기능 구현 - DB 연결 최적화, 활성도 위젯 및 내비게이션, 관리자 인증 모달, 중단 기능, UI 레이아웃 최적화, 코드 리팩토링 및 파일 정리

2026-03-11 14:03:26 +09:00
parent 4a995c11f4
commit 9f06857bea
26 changed files with 587 additions and 1323 deletions
--- a/crawler_service.py
+++ b/crawler_service.py
@@ -11,15 +11,18 @@ from datetime import datetime
 from playwright.async_api import async_playwright
 from dotenv import load_dotenv

-load_dotenv()
+load_dotenv(override=True)
+
+# 글로벌 중단 제어용 이벤트
+crawl_stop_event = threading.Event()

 def get_db_connection():
-    """MySQL 데이터베이스 연결을 반환합니다."""
+    """MySQL 데이터베이스 연결을 반환 (환경변수 기반)"""
    return pymysql.connect(
-        host='localhost',
-        user='root',
-        password='45278434',
-        database='crawling',
+        host=os.getenv('DB_HOST', 'localhost'),
+        user=os.getenv('DB_USER', 'root'),
+        password=os.getenv('DB_PASSWORD', '45278434'),
+        database=os.getenv('DB_NAME', 'PM_proto'),
        charset='utf8mb4',
        cursorclass=pymysql.cursors.DictCursor
    )
@@ -27,12 +30,10 @@ def get_db_connection():
 def clean_date_string(date_str):
    if not date_str: return ""
    match = re.search(r'(\d{2})[./-](\d{2})[./-](\d{2})', date_str)
-    if match:
-        return f"20{match.group(1)}.{match.group(2)}.{match.group(3)}"
+    if match: return f"20{match.group(1)}.{match.group(2)}.{match.group(3)}"
    return date_str[:10].replace("-", ".")

 def parse_log_id(log_id):
-    """ID 구조: 로그고유번호_시간_활동한 사람_활동내용_활동대상"""
    if not log_id or "_" not in log_id: return log_id
    try:
        parts = log_id.split('_')
@@ -45,6 +46,7 @@ def parse_log_id(log_id):
    return log_id

 def crawler_thread_worker(msg_queue, user_id, password):
+    crawl_stop_event.clear()
    if sys.platform == 'win32':
        asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
    
@@ -55,11 +57,18 @@ def crawler_thread_worker(msg_queue, user_id, password):
        async with async_playwright() as p:
            browser = None
            try:
-                msg_queue.put(json.dumps({'type': 'log', 'message': '브라우저 엔진 가동 (전 기능 완벽 복구 모드)...'}))
-                browser = await p.chromium.launch(headless=False, args=["--no-sandbox"])
-                context = await browser.new_context(viewport={'width': 1600, 'height': 900})
+                msg_queue.put(json.dumps({'type': 'log', 'message': '브라우저 엔진 가동 (전 기능 복구 모드)...'}))
+                browser = await p.chromium.launch(headless=False, args=[
+                    "--no-sandbox", 
+                    "--disable-dev-shm-usage",
+                    "--disable-blink-features=AutomationControlled"
+                ])
+                context = await browser.new_context(
+                    viewport={'width': 1600, 'height': 900},
+                    user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
+                )
                
-                captured_data = {"tree": None, "_is_root_archive": False, "_tree_url": "", "project_list": []}
+                captured_data = {"tree": None, "_is_root_archive": False, "project_list": []}

                async def global_interceptor(response):
                    url = response.url
@@ -68,17 +77,13 @@ def crawler_thread_worker(msg_queue, user_id, password):
                            data = await response.json()
                            captured_data["project_list"] = data.get("data", [])
                        elif "getTreeObject" in url:
-                            # [핵심 복원] 정확한 루트 경로 판별 로직
                            is_root = False
                            if "params[resourcePath]=" in url:
                                path_val = url.split("params[resourcePath]=")[1].split("&")[0]
                                if path_val in ["%2F", "/"]: is_root = True
-                            
                            if is_root:
-                                data = await response.json()
-                                captured_data["tree"] = data
-                                captured_data["_is_root_archive"] = "archive" in url
-                                captured_data["_tree_url"] = url
+                                captured_data["tree"] = await response.json()
+                                captured_data["_is_root_archive"] = True
                    except: pass

                context.on("response", global_interceptor)
@@ -86,54 +91,52 @@ def crawler_thread_worker(msg_queue, user_id, password):
                await page.goto("https://overseas.projectmastercloud.com/dashboard", wait_until="domcontentloaded")
                
                # 로그인
-                if await page.locator("#login-by-id").is_visible(timeout=5000):
+                if await page.locator("#login-by-id").is_visible(timeout=10000):
                    await page.click("#login-by-id")
                    await page.fill("#user_id", user_id)
                    await page.fill("#user_pw", password)
                    await page.click("#login-btn")

-                # 리스트 로딩 대기
                await page.wait_for_selector("h4.list__contents_aria_group_body_list_item_label", timeout=60000)
                await asyncio.sleep(3)

-                # [Phase 1] DB 기초 정보 동기화 (마스터 테이블)
+                # [Phase 1] DB 마스터 정보 동기화
                if captured_data["project_list"]:
                    conn = get_db_connection()
                    try:
                        with conn.cursor() as cursor:
                            for p_info in captured_data["project_list"]:
-                                try:
-                                    sql = """
-                                    INSERT INTO projects_master (project_id, project_nm, short_nm, master, continent, country)
-                                    VALUES (%s, %s, %s, %s, %s, %s)
-                                    ON DUPLICATE KEY UPDATE 
-                                        project_nm = VALUES(project_nm), short_nm = VALUES(short_nm),
-                                        master = VALUES(master), continent = VALUES(continent), country = VALUES(country)
-                                    """
-                                    cursor.execute(sql, (p_info.get("project_id"), p_info.get("project_nm"), 
-                                                       p_info.get("short_nm", "").strip(), p_info.get("master"), 
-                                                       p_info.get("large_class"), p_info.get("mid_class")))
-                                except: continue
-                        conn.commit()
-                        msg_queue.put(json.dumps({'type': 'log', 'message': f'DB 마스터 정보 동기화 완료.'}))
+                                sql = """
+                                INSERT INTO projects_master (project_id, project_nm, short_nm, master, continent, country)
+                                VALUES (%s, %s, %s, %s, %s, %s)
+                                ON DUPLICATE KEY UPDATE 
+                                    project_nm = VALUES(project_nm), short_nm = VALUES(short_nm),
+                                    master = VALUES(master), continent = VALUES(continent), country = VALUES(country)
+                                """
+                                cursor.execute(sql, (p_info.get("project_id"), p_info.get("project_nm"), 
+                                                   p_info.get("short_nm", "").strip(), p_info.get("master"), 
+                                                   p_info.get("large_class"), p_info.get("mid_class")))
+                            conn.commit()
+                        msg_queue.put(json.dumps({'type': 'log', 'message': 'DB 마스터 정보 동기화 완료.'}))
                    finally: conn.close()

-                # [Phase 2] h4 태그 기반 수집 루프
+                # [Phase 2] 수집 루프
                names = await page.locator("h4.list__contents_aria_group_body_list_item_label").all_inner_texts()
                project_names = list(dict.fromkeys([n.strip() for n in names if n.strip()]))
                count = len(project_names)

                for i, project_name in enumerate(project_names):
-                    # 현재 프로젝트의 고유 ID 매칭 (저장용)
+                    if crawl_stop_event.is_set():
+                        msg_queue.put(json.dumps({'type': 'log', 'message': '>>> 중단 신호 감지: 종료합니다.'}))
+                        break
+
+                    msg_queue.put(json.dumps({'type': 'log', 'message': f'[{i+1}/{count}] {project_name} 수집 시작'}))
                    p_match = next((p for p in captured_data["project_list"] if p.get('project_nm') == project_name or p.get('short_nm', '').strip() == project_name), None)
                    current_p_id = p_match.get('project_id') if p_match else None
+                    captured_data["tree"] = None; captured_data["_is_root_archive"] = False

-                    captured_data["tree"] = None
-                    captured_data["_is_root_archive"] = False
-                    msg_queue.put(json.dumps({'type': 'log', 'message': f'[{i+1}/{count}] {project_name} 수집 시작'}))
-                    
                    try:
-                        # 1. 프로젝트 진입 ([완전 복원] 좌표 클릭)
+                        # 1. 프로젝트 진입 (좌표 클릭)
                        target_el = page.locator(f"h4.list__contents_aria_group_body_list_item_label:has-text('{project_name}')").first
                        await target_el.scroll_into_view_if_needed()
                        box = await target_el.bounding_box()
@@ -143,44 +146,34 @@ def crawler_thread_worker(msg_queue, user_id, password):
                        await page.wait_for_selector("text=활동로그", timeout=30000)
                        await asyncio.sleep(2)

-                        recent_log = "데이터 없음"
-                        file_count = 0
+                        recent_log = "데이터 없음"; file_count = 0
                        
-                        # 2. 활동로그 ([완전 복원] 3회 재시도 + 좌표 클릭 + 날짜 필터)
+                        # 2. 활동로그 (날짜 필터 적용 버전)
                        modal_opened = False
                        for _ in range(3):
-                            log_btn = page.get_by_text("활동로그").first
-                            btn_box = await log_btn.bounding_box()
-                            if btn_box: await page.mouse.click(btn_box['x'] + 5, btn_box['y'] + 5)
-                            else: await page.evaluate("(el) => el.click()", await log_btn.element_handle())
-                            
+                            await page.get_by_text("활동로그").first.click()
                            try:
                                await page.wait_for_selector("article.archive-modal", timeout=5000)
-                                modal_opened = True
-                                break
+                                modal_opened = True; break
                            except: await asyncio.sleep(1)

                        if modal_opened:
-                            # 날짜 필터 입력
+                            # 날짜 필터 2020-01-01 적용
                            inputs = await page.locator("article.archive-modal input").all()
                            for inp in inputs:
                                if (await inp.get_attribute("type")) == "date":
-                                    await inp.fill("2020-01-01")
-                                    break
+                                    await inp.fill("2020-01-01"); break
                            
                            apply_btn = page.locator("article.archive-modal").get_by_text("적용").first
                            if await apply_btn.is_visible():
                                await apply_btn.click()
-                                await asyncio.sleep(5) # 렌더링 보장
+                                await asyncio.sleep(5)
                                log_elements = await page.locator("article.archive-modal div[id*='_']").all()
                                if log_elements:
-                                    raw_id = await log_elements[0].get_attribute("id")
-                                    recent_log = parse_log_id(raw_id)
-                                    msg_queue.put(json.dumps({'type': 'log', 'message': f'   - [분석] 최신 로그 ID 추출 성공: {recent_log}'}))
-                            msg_queue.put(json.dumps({'type': 'log', 'message': f'   - [최종 결과] {recent_log}'}))
+                                    recent_log = parse_log_id(await log_elements[0].get_attribute("id"))
                            await page.keyboard.press("Escape")

-                        # 3. 구성 수집 ([완전 복원] BaseURL fetch + 정밀 합산)
+                        # 3. 구성 수집 (API Fetch 방식 - 팝업 없음)
                        await page.evaluate("""() => { 
                            const baseUrl = window.location.origin + window.location.pathname.split('/').slice(0, 2).join('/');
                            fetch(`${baseUrl}/archive/getTreeObject?params[storageType]=CLOUD&params[resourcePath]=/`); 
@@ -190,43 +183,26 @@ def crawler_thread_worker(msg_queue, user_id, password):
                            await asyncio.sleep(0.5)

                        if captured_data["tree"]:
-                            data_root = captured_data["tree"]
-                            tree = data_root.get('currentTreeObject', data_root) if isinstance(data_root, dict) else {}
-                            total = 0
-                            # 루트 파일 합산
-                            rf = tree.get("file", {})
-                            total += len(rf) if isinstance(rf, (dict, list)) else 0
-                            # 폴더별 filesCount 합산
+                            tree = captured_data["tree"].get('currentTreeObject', captured_data["tree"])
+                            total = len(tree.get("file", {}))
                            folders = tree.get("folder", {})
                            if isinstance(folders, dict):
-                                for f in folders.values():
-                                    c = f.get("filesCount", "0")
-                                    total += int(c) if str(c).isdigit() else 0
+                                for f in folders.values(): total += int(f.get("filesCount", 0))
                            file_count = total
-                            msg_queue.put(json.dumps({'type': 'log', 'message': f'   - [구성] 데이터 채택 성공: ...{captured_data.get("_tree_url", "")[-40:]}'}))
-                            msg_queue.put(json.dumps({'type': 'log', 'message': f'   - [구성] 최종 정밀 합산 성공 ({file_count}개)'}))

-                        # 4. DB 실시간 저장 (히스토리 테이블)
+                        # 4. DB 실시간 저장
                        if current_p_id:
-                            conn = get_db_connection()
-                            try:
+                            with get_db_connection() as conn:
                                with conn.cursor() as cursor:
-                                    # 오늘 날짜 히스토리 데이터 삽입 또는 업데이트
-                                    sql = """
-                                        INSERT INTO projects_history (project_id, crawl_date, recent_log, file_count)
-                                        VALUES (%s, CURRENT_DATE(), %s, %s)
-                                        ON DUPLICATE KEY UPDATE 
-                                            recent_log = VALUES(recent_log), file_count = VALUES(file_count)
-                                    """
+                                    sql = "INSERT INTO projects_history (project_id, crawl_date, recent_log, file_count) VALUES (%s, CURRENT_DATE(), %s, %s) ON DUPLICATE KEY UPDATE recent_log=VALUES(recent_log), file_count=VALUES(file_count)"
                                    cursor.execute(sql, (current_p_id, recent_log, file_count))
                                conn.commit()
-                                msg_queue.put(json.dumps({'type': 'log', 'message': f'   - [DB] 히스토리 업데이트 완료 (ID: {current_p_id})'}))
-                            finally: conn.close()
+                            msg_queue.put(json.dumps({'type': 'log', 'message': f'   - [성공] 로그: {recent_log[:20]}... / 파일: {file_count}개'}))

                        await page.goto("https://overseas.projectmastercloud.com/dashboard", wait_until="domcontentloaded")
                        
                    except Exception as e:
-                        msg_queue.put(json.dumps({'type': 'log', 'message': f'   - [{project_name}] 건너뜀: {str(e)}'}))
+                        msg_queue.put(json.dumps({'type': 'log', 'message': f'   - {project_name} 실패: {str(e)}'}))
                        await page.goto("https://overseas.projectmastercloud.com/dashboard")

                msg_queue.put(json.dumps({'type': 'done', 'data': []}))
@@ -241,10 +217,8 @@ def crawler_thread_worker(msg_queue, user_id, password):
    loop.close()

 async def run_crawler_service():
-    user_id = os.getenv("PM_USER_ID")
-    password = os.getenv("PM_PASSWORD")
    msg_queue = queue.Queue()
-    thread = threading.Thread(target=crawler_thread_worker, args=(msg_queue, user_id, password))
+    thread = threading.Thread(target=crawler_thread_worker, args=(msg_queue, os.getenv("PM_USER_ID"), os.getenv("PM_PASSWORD")))
    thread.start()
    while True:
        try: