Files
test-mcp/crawler_service.py
Taehoon 9369e18eb8 feat: 대시보드 데이터 파싱 로직 고도화 및 크롤링 서비스 개선
- 대시보드: 8컬럼 형식의 sheet.csv를 안정적으로 지원하도록 파싱 로직 개선

- 크롤러: Playwright 기반 크롤링 엔진 고도화 및 실시간 로그 전송 기능 강화

- UI/UX: 대시보드 동기화 버튼 및 헤더 레이아웃 최적화
2026-03-06 18:10:19 +09:00

200 lines
11 KiB
Python

import os
import re
import asyncio
import json
import csv
import traceback
from datetime import datetime
from playwright.async_api import async_playwright
from dotenv import load_dotenv
load_dotenv()
def clean_date_string(date_str):
    """Return the first YY.MM.DD date found in *date_str*.

    Accepts '.', '/' or '-' as separators and normalizes them to dots.
    Falls back to the first 8 characters when no date pattern is present;
    an empty or None input yields "".
    """
    if not date_str:
        return ""
    found = re.search(r'(\d{2})[./-](\d{2})[./-](\d{2})', date_str)
    if found is None:
        # No recognizable date — return a truncated prefix as a best effort.
        return date_str[:8]
    return ".".join(found.groups())
async def run_crawler_service():
    """Crawl every project on the dashboard and stream progress as SSE frames.

    Async generator: each yield is a Server-Sent-Events frame
    ("data: {json}" terminated by a blank line) carrying either a 'log'
    message or, at the very end, a 'done' event with the collected results.
    Detail file counts are gathered by forcing the detail API request
    directly (no popup), while all previously working logic is preserved.

    Reads PM_USER_ID / PM_PASSWORD from the environment (.env).
    Side effects: drives a visible Chromium browser and writes a
    "crawling_result YYYY.MM.DD.csv" file in the working directory.
    """
    user_id = os.getenv("PM_USER_ID")
    password = os.getenv("PM_PASSWORD")
    # Abort early when credentials are missing; the caller gets one log frame.
    if not user_id or not password:
        yield f"data: {json.dumps({'type': 'log', 'message': '오류: .env 파일에 계정 정보가 없습니다.'})}\n\n"
        return
    results = []
    async with async_playwright() as p:
        browser = None
        try:
            yield f"data: {json.dumps({'type': 'log', 'message': '브라우저 엔진 (데이터 강제 유도 모드) 가동...'})}\n\n"
            # headless=False: presumably the site behaves differently headless — TODO confirm.
            browser = await p.chromium.launch(headless=False, args=["--no-sandbox", "--disable-dev-shm-usage"])
            context = await browser.new_context(viewport={'width': 1600, 'height': 900})
            # Shared mailbox: the response interceptor fills it, the main flow polls it.
            captured_data = {"log": None, "tree": None}

            async def global_interceptor(response):
                # Watch every response on the context; keep the latest tree/log payloads.
                url = response.url
                try:
                    # Detail packet watch (detail responses containing
                    # params[resourcePath]=/ take priority).
                    if "getTreeObject" in url:
                        data = await response.json()
                        # Only keep tree responses that actually contain folder data.
                        if data.get('currentTreeObject', {}).get('folder'):
                            captured_data["tree"] = data
                    elif "Log" in url:
                        captured_data["log"] = await response.json()
                except: pass  # NOTE(review): bare except silently swallows JSON/decode errors — consider narrowing.

            context.on("response", global_interceptor)
            page = await context.new_page()
            # --- 1. Login (stable logic) ---
            await page.goto("https://overseas.projectmastercloud.com/dashboard", wait_until="domcontentloaded")
            # Switch to ID-based login when that button is shown.
            if await page.locator("#login-by-id").is_visible(timeout=5000):
                await page.click("#login-by-id")
            await page.fill("#user_id", user_id)
            await page.fill("#user_pw", password)
            await page.click("#login-btn")
            # Wait for the project list to render, then let the page settle.
            await page.wait_for_selector("h4.list__contents_aria_group_body_list_item_label", timeout=60000)
            await asyncio.sleep(5)
            names = await page.locator("h4.list__contents_aria_group_body_list_item_label").all_inner_texts()
            project_names = [n.strip() for n in names if n.strip()]
            count = len(project_names)
            yield f"data: {json.dumps({'type': 'log', 'message': f'{count}개의 프로젝트 수집 시작.'})}\n\n"
            for i, project_name in enumerate(project_names):
                # Reset the mailbox so stale packets from the previous project are not reused.
                captured_data["log"] = None
                captured_data["tree"] = None
                yield f"data: {json.dumps({'type': 'log', 'message': f'[{i+1}/{count}] {project_name} 수집 시작'})}\n\n"
                try:
                    # Enter the detail page (stable logic): prefer a coordinate click.
                    target_el = page.get_by_text(project_name).first
                    await target_el.scroll_into_view_if_needed()
                    box = await target_el.bounding_box()
                    if box: await page.mouse.click(box['x'] + 5, box['y'] + 5)
                    else: await target_el.click(force=True)
                    await page.wait_for_selector("text=활동로그", timeout=30000)
                    await asyncio.sleep(3)
                    recent_log = "데이터 없음"  # default when no activity log is found
                    file_count = 0
                    # --- 2. Activity-log collection (fully restored logic) ---
                    try:
                        modal_opened = False
                        # Up to 3 attempts to open the activity-log modal via a JS click.
                        for _ in range(3):
                            log_btn = page.get_by_text("활동로그").first
                            await page.evaluate("(el) => el.click()", await log_btn.element_handle())
                            try:
                                await page.wait_for_selector("article.archive-modal", timeout=5000)
                                modal_opened = True
                                break
                            except: await asyncio.sleep(1)  # NOTE(review): bare except; presumably a Playwright timeout — confirm.
                        if modal_opened:
                            # Widen the date filter: find the start-date input and backdate it.
                            inputs = await page.locator("article.archive-modal input").all()
                            for inp in inputs:
                                itype = await inp.get_attribute("type")
                                iname = (await inp.get_attribute("name") or "").lower()
                                iclass = (await inp.get_attribute("class") or "").lower()
                                if itype == "date" or "start" in iname or "start" in iclass:
                                    await inp.fill("2020-01-01")
                                    break
                            # Clear before clicking "apply" so only the fresh response counts.
                            captured_data["log"] = None
                            apply_btn = page.locator("article.archive-modal").get_by_text("적용").first
                            if await apply_btn.is_visible():
                                await apply_btn.click()
                                await asyncio.sleep(4)
                            # Prefer the intercepted API payload over scraping the DOM.
                            if captured_data["log"]:
                                data = captured_data["log"]
                                logs = data.get('logData', []) or data.get('result', [])
                                if logs and isinstance(logs, list) and len(logs) > 0:
                                    top = logs[0]  # assumes newest-first ordering — TODO confirm
                                    rd = top.get('log_date') or top.get('date') or ""
                                    u = top.get('user_name') or top.get('user') or ""
                                    c = top.get('activity_content') or top.get('activity') or ""
                                    recent_log = f"{clean_date_string(rd)}, {u}, {c}"
                            # DOM fallback when no usable API packet was captured.
                            if recent_log == "데이터 없음":
                                modal = page.locator("article.archive-modal")
                                try:
                                    d_v = (await modal.locator(".log-body .date .text").first.inner_text()).strip()
                                    u_v = (await modal.locator(".log-body .user .text").first.inner_text()).strip()
                                    a_v = (await modal.locator(".log-body .activity .text").first.inner_text()).strip()
                                    if d_v: recent_log = f"{clean_date_string(d_v)}, {u_v}, {a_v}"
                                except: pass  # NOTE(review): bare except hides selector failures.
                            yield f"data: {json.dumps({'type': 'log', 'message': f' - [로그 결과] {recent_log}'})}\n\n"
                            await page.keyboard.press("Escape")
                        else:
                            yield f"data: {json.dumps({'type': 'log', 'message': ' - [로그] 모달 진입 실패'})}\n\n"
                    except Exception as le:
                        yield f"data: {json.dumps({'type': 'log', 'message': f' - [로그] 오류: {str(le)}'})}\n\n"
                    # --- 3. Composition collection (force the detail packet) ---
                    try:
                        # [Smart trigger] Make the browser itself fire the API request that
                        # carries the detail data; the interceptor above captures the response.
                        await page.evaluate("""() => {
fetch('/api/getTreeObject?params[storageType]=CLOUD&params[resourcePath]=/');
}""")
                        # Wait for the packet (up to 10 seconds, polled at 0.5s).
                        for _ in range(20):
                            if captured_data["tree"]: break
                            await asyncio.sleep(0.5)
                        if captured_data["tree"]:
                            data = captured_data["tree"]
                            # Sum file counts across the analysed dict structure.
                            folders = data.get('currentTreeObject', {}).get('folder', {})
                            total_files = 0
                            if isinstance(folders, dict):
                                for folder_name, folder_info in folders.items():
                                    total_files += int(folder_info.get('filesCount', 0))
                            file_count = total_files
                            yield f"data: {json.dumps({'type': 'log', 'message': f' - [구성] 상세 합산 성공 ({file_count}개)'})}\n\n"
                        else:
                            yield f"data: {json.dumps({'type': 'log', 'message': ' - [구성] 상세 데이터 응답 없음'})}\n\n"
                    except: pass  # NOTE(review): bare except — evaluate/network errors vanish silently.
                    results.append({"projectName": project_name, "recentLog": recent_log, "fileCount": file_count})
                    # Return to the dashboard for the next project.
                    await page.goto("https://overseas.projectmastercloud.com/dashboard", wait_until="domcontentloaded")
                    await page.wait_for_selector("h4.list__contents_aria_group_body_list_item_label", timeout=20000)
                except Exception as e:
                    # Skip this project but keep crawling the rest.
                    yield f"data: {json.dumps({'type': 'log', 'message': f' - [{project_name}] 건너뜀 (사유: {str(e)})'})}\n\n"
                    await page.goto("https://overseas.projectmastercloud.com/dashboard")
            # --- 4. Save results as CSV ---
            try:
                today_str = datetime.now().strftime("%Y.%m.%d")
                csv_path = f"crawling_result {today_str}.csv"
                # utf-8-sig (BOM) so Excel detects the encoding of Korean text.
                with open(csv_path, 'w', newline='', encoding='utf-8-sig') as f:
                    writer = csv.DictWriter(f, fieldnames=["projectName", "recentLog", "fileCount"])
                    writer.writeheader()
                    writer.writerows(results)
                yield f"data: {json.dumps({'type': 'log', 'message': f'✅ 모든 데이터가 {csv_path}에 저장되었습니다.'})}\n\n"
            except Exception as fe:
                yield f"data: {json.dumps({'type': 'log', 'message': f'❌ CSV 저장 실패: {str(fe)}'})}\n\n"
            # Final frame: the full result set for the client.
            yield f"data: {json.dumps({'type': 'done', 'data': results})}\n\n"
        except Exception as e:
            yield f"data: {json.dumps({'type': 'log', 'message': f'치명적 오류: {str(e)}'})}\n\n"
        finally:
            # Always release the browser, even after a fatal error.
            if browser: await browser.close()