# Source listing: test-mcp/crawler_api.py (166 lines, 9.1 KiB, Python)

import os
import re
import asyncio
import json
import traceback
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse
from playwright.async_api import async_playwright
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment; the
# /sync handler reads PM_USER_ID and PM_PASSWORD from there.
load_dotenv()

app = FastAPI()

# CORS: accept any origin, method and header, with credentialed requests
# disabled.
app.add_middleware(
    CORSMiddleware,
    allow_headers=["*"],
    allow_methods=["*"],
    allow_credentials=False,
    allow_origins=["*"],
)
def _sse_event(payload: dict) -> str:
    """Serialize *payload* as one Server-Sent Events ``data:`` frame."""
    return f"data: {json.dumps(payload)}\n\n"


def _log_event(message: str) -> str:
    """Build the SSE frame for a progress/log message."""
    return _sse_event({'type': 'log', 'message': message})


def _parse_item_count(text: str) -> int:
    """Return the count found on the last line of a composition item, or 0.

    Only the first run of digits on the last line is used. Values of 10000
    or more are ignored (the original note says this is meant to filter out
    year-like numbers).
    """
    last_line = text.strip().split('\n')[-1]
    numbers = re.findall(r'\d+', last_line)
    if not numbers:
        return 0
    value = int(numbers[0])
    return value if value < 10000 else 0


async def _read_recent_log(page) -> str:
    """Open the activity-log modal and return its first meaningful line.

    Best effort: any Playwright/UI error leaves the value collected so far
    (default "없음"). Only ``Exception`` is caught so task cancellation
    (``asyncio.CancelledError``) still propagates.
    """
    recent_log = "없음"
    try:
        log_btn = page.locator("div.wrap.log-wrap > div.title.text").first
        # NOTE(review): Playwright documents the `timeout` argument of
        # Locator.is_visible() as deprecated/ignored (the check returns
        # immediately). Kept as-is; confirm whether a real wait was intended.
        if await log_btn.is_visible(timeout=5000):
            await log_btn.click()
            await page.wait_for_timeout(2000)  # give the log time to load
            log_content = page.locator("div.wrap.log-wrap .content-area").first
            if await log_content.is_visible(timeout=5000):
                content = await log_content.inner_text()
                # Skip blank / near-empty lines (2 characters or fewer).
                lines = [l.strip() for l in content.split("\n") if len(l.strip()) > 2]
                if lines:
                    recent_log = lines[0]
            # NOTE(review): the modal is closed whenever it was opened, even
            # if no content was visible — nesting reconstructed from a paste
            # that lost its indentation; confirm.
            await page.locator("body > article.archive-modal div.close").first.click()
            await page.wait_for_timeout(1000)
    except Exception:
        # Deliberate best effort: the activity log is optional data.
        pass
    return recent_log


async def _find_composition_popup(context):
    """Poll ``context.pages`` (20 tries, 0.5 s apart) for the composition popup.

    Returns the first page whose URL contains "composition-tab.html", or
    ``None`` if none appears.
    """
    for _ in range(20):
        for candidate in context.pages:
            if "composition-tab.html" in candidate.url:
                return candidate
        await asyncio.sleep(0.5)
    return None


@app.get("/sync")
async def sync_data():
    """Crawl every dashboard project and stream progress as Server-Sent Events.

    Logs in with the PM_USER_ID / PM_PASSWORD environment variables, visits
    each project listed on the dashboard, and collects its most recent
    activity-log line and a summed file count from the composition popup.

    Returns:
        A ``StreamingResponse`` emitting ``data: <json>`` frames. Progress
        frames are ``{"type": "log", "message": ...}``; on success the final
        frame is ``{"type": "done", "data": [...]}`` where each entry has
        ``projectName``, ``recentLog`` and ``fileCount``.
    """

    async def event_generator():
        user_id = os.getenv("PM_USER_ID")
        password = os.getenv("PM_PASSWORD")
        if not user_id or not password:
            yield _log_event('오류: .env 파일에 계정 정보가 없습니다.')
            return

        project_label = "h4.list__contents_aria_group_body_list_item_label"
        results = []
        async with async_playwright() as p:
            yield _log_event('브라우저 실행 중...')
            browser = await p.chromium.launch(headless=True, args=["--no-sandbox", "--disable-dev-shm-usage"])
            # Everything after launch sits inside the try so the browser is
            # closed even if context/page creation fails.
            try:
                context = await browser.new_context(viewport={'width': 1920, 'height': 1080})
                context.set_default_timeout(60000)
                page = await context.new_page()

                yield _log_event('사이트 접속 및 로그인 중...')
                await page.goto("https://overseas.projectmastercloud.com/", wait_until="domcontentloaded")
                await page.click("#login-by-id", timeout=10000)
                await page.fill("#user_id", user_id)
                await page.fill("#user_pw", password)
                await page.click("#login-btn")

                yield _log_event('대시보드 목록 대기 중...')
                await page.wait_for_selector(project_label, timeout=60000)
                count = await page.locator(project_label).count()
                yield _log_event(f'{count}개의 프로젝트 발견. 수집 시작.')

                for i in range(count):
                    try:
                        # Re-resolve the locator each round: the list is
                        # re-rendered after returning to the dashboard.
                        proj = page.locator(project_label).nth(i)
                        project_name = (await proj.inner_text()).strip()
                        yield _log_event(f'[{i+1}/{count}] {project_name} - 수집 시작...')
                        await proj.scroll_into_view_if_needed()
                        await proj.click(force=True)
                        await page.wait_for_selector("div.footer", timeout=20000)

                        file_count = 0

                        # Step 1: collect the activity log.
                        recent_log = await _read_recent_log(page)

                        # Step 2: collect the composition (file count).
                        try:
                            sitemap_btn = page.locator("div.wrap.site-map-wrap > div").first
                            if await sitemap_btn.is_visible(timeout=5000):
                                await sitemap_btn.click()
                                popup_page = await _find_composition_popup(context)
                                if popup_page:
                                    try:
                                        yield _log_event(f'[{i+1}/{count}] 구성 데이터 로딩 대기 중 (여유있게)...')
                                        await popup_page.wait_for_load_state("domcontentloaded")
                                        items = popup_page.locator("#composition-list li h6:nth-child(3)")
                                        # Wait up to 7 s for the list to appear.
                                        for _ in range(7):
                                            if await items.count() > 0:
                                                break
                                            await asyncio.sleep(1)
                                        # Extra 2 s so rendering can finish
                                        # before reading the final data.
                                        await asyncio.sleep(2)
                                        h6_count = await items.count()
                                        yield _log_event(f'[{i+1}/{count}] 총 {h6_count}개의 항목 로드됨. 합산 중...')
                                        current_total = 0
                                        for j in range(h6_count):
                                            current_total += _parse_item_count(await items.nth(j).inner_text())
                                        file_count = current_total
                                    finally:
                                        # Always close the popup: a leftover
                                        # one would match the URL scan for
                                        # the next project and yield stale
                                        # counts.
                                        if not popup_page.is_closed():
                                            await popup_page.close()
                                        await page.bring_to_front()
                        except Exception as e:
                            yield _log_event(f'!!! 구성 수집 지연: {str(e)[:30]}')

                        summary_msg = f"[{i+1}/{count}] 수집 완료 - 파일: {file_count}개, 최근로그: {recent_log[:40]}..."
                        yield _log_event(summary_msg)
                        results.append({"projectName": project_name, "recentLog": recent_log, "fileCount": file_count})

                        # Step 3: return to the dashboard list.
                        home_btn = page.locator("div.header div.title div").first
                        try:
                            await home_btn.click(force=True, timeout=10000)
                            await page.wait_for_selector(project_label, timeout=20000)
                        except Exception:
                            # Fall back to direct navigation when the home
                            # button does not work.
                            await page.goto("https://overseas.projectmastercloud.com/dashboard", wait_until="domcontentloaded")
                            await page.wait_for_selector(project_label, timeout=20000)
                        await page.wait_for_timeout(1500)
                    except Exception:
                        # Skip this project and reset to the landing page.
                        yield _log_event(f'!!! {i+1}번째 프로젝트 실패 (건너뜀)')
                        await page.goto("https://overseas.projectmastercloud.com/", wait_until="domcontentloaded")
                        await page.wait_for_timeout(3000)

                yield _sse_event({'type': 'done', 'data': results})
            except Exception as e:
                yield _log_event(f'치명적 오류: {str(e)}')
            finally:
                await browser.close()

    # "text/event-stream" is the MIME type EventSource clients require.
    return StreamingResponse(event_generator(), media_type="text/event-stream")