142 lines
7.3 KiB
Python
142 lines
7.3 KiB
Python
import os
|
|
import re
|
|
import asyncio
|
|
import json
|
|
from playwright.async_api import async_playwright
|
|
from dotenv import load_dotenv
|
|
|
|
# Runs once at import time. Presumably this loads a local .env file into the
# process environment so the os.getenv("PM_USER_ID") / os.getenv("PM_PASSWORD")
# lookups in the crawler can find the credentials — confirm against the
# python-dotenv version in use.
load_dotenv()
|
|
|
|
# Selector matching every project title listed on the dashboard.
_PROJECT_LABEL_SELECTOR = "h4.list__contents_aria_group_body_list_item_label"


def _sse_event(payload):
    """Serialize *payload* as a single Server-Sent Events ``data:`` frame."""
    return f"data: {json.dumps(payload)}\n\n"


def _sse_log(message):
    """Build an SSE frame carrying a ``log`` message for the client."""
    return _sse_event({'type': 'log', 'message': message})


async def _read_recent_log(page):
    """Read the first entry of the project's log modal.

    Clicks the log button in the footer, reads the first date / user /
    activity cells of the modal and then dismisses the modal.

    Returns:
        ``"<date>, <user>, <activity>"`` where ``<date>`` is the first ten
        characters of the date cell with ``-`` and ``/`` replaced by ``.``;
        ``None`` when the log button is not visible or the modal has no
        date cell.

    Raises:
        Exception: any Playwright error raised while opening or reading the
            modal. The caller treats log collection as best-effort.
    """
    log_btn = page.locator(
        "body > div.footer > div.left > div.wrap.log-wrap > div.title.text"
    ).first
    # NOTE(review): Playwright documents the ``timeout`` of ``is_visible`` as
    # ignored (the check returns immediately) — confirm whether an explicit
    # wait is wanted here.
    if not await log_btn.is_visible(timeout=5000):
        return None

    await log_btn.click(force=True)
    await asyncio.sleep(5)  # give the modal time to render its rows

    date_sel = "article.archive-modal .log-body .date .text"
    user_sel = "article.archive-modal .log-body .user .text"
    act_sel = "article.archive-modal .log-body .activity .text"

    entry = None
    try:
        if await page.locator(date_sel).count() > 0:
            raw_date = (await page.locator(date_sel).first.inner_text()).strip()
            user_name = (await page.locator(user_sel).first.inner_text()).strip()
            activity = (await page.locator(act_sel).first.inner_text()).strip()
            formatted_date = re.sub(r'[-/]', '.', raw_date)[:10]
            entry = f"{formatted_date}, {user_name}, {activity}"
    finally:
        # Dismiss the modal even when reading failed so it does not stay on
        # top of the page. Closing is best-effort: a failure here must not
        # discard an entry that was already read.
        try:
            await page.click("article.archive-modal div.close", timeout=3000)
            await asyncio.sleep(1.5)
        except Exception:
            pass
    return entry


async def _count_composition_files(context, page):
    """Sum the counts shown in the project's "composition" popup.

    Clicks the site-map button in the footer, waits for a page whose URL
    contains ``"composition"`` to appear in *context*, and adds up the first
    integer found on the last line of every
    ``#composition-list h6:nth-child(3)`` element.

    Returns:
        The summed count, or ``None`` when the button is not visible or no
        popup appeared within the polling window.

    Raises:
        Exception: any Playwright error raised while reading the popup. The
            caller treats composition collection as best-effort.
    """
    sitemap_btn = page.locator(
        "body > div.footer > div.left > div.wrap.site-map-wrap"
    ).first
    if not await sitemap_btn.is_visible(timeout=5000):
        return None

    await sitemap_btn.click(force=True)

    # Poll for the popup: 20 attempts, 0.5 s apart.
    popup_page = None
    for _ in range(20):
        popup_page = next(
            (pg for pg in context.pages if "composition" in pg.url), None
        )
        if popup_page is not None:
            break
        await asyncio.sleep(0.5)
    if popup_page is None:
        return None

    try:
        await asyncio.sleep(5)  # wait for the popup content to load
        headings = popup_page.locator("#composition-list h6:nth-child(3)")
        total = 0
        for j in range(await headings.count()):
            text = (await headings.nth(j).inner_text()).strip()
            nums = re.findall(r'\d+', text.split('\n')[-1])
            if nums:
                total += int(nums[0])
        return total
    finally:
        # Always close the popup: a leftover "composition" tab would be
        # picked up again by the URL search for the next project.
        await popup_page.close()


async def run_crawler_service():
    """Crawl every dashboard project with Playwright and stream SSE frames.

    This is an async generator. Each yielded string is one Server-Sent
    Events frame (``"data: <json>\\n\\n"``):

    * ``{"type": "log", "message": ...}`` — progress and error messages.
    * ``{"type": "done", "data": [...]}`` — emitted once after all projects
      were visited; each item has ``projectName``, ``recentLog`` and
      ``fileCount``.

    Credentials are read from the ``PM_USER_ID`` / ``PM_PASSWORD``
    environment variables; when either is missing a single log frame is
    yielded and the generator ends.

    Log and composition collection are best-effort per project: a failure is
    reported as a log frame and the defaults (``"기존데이터유지"`` / ``0``)
    are kept. Only ``Exception`` is handled, so ``GeneratorExit`` (client
    disconnect) and ``asyncio.CancelledError`` propagate and the browser is
    closed by the ``finally`` clause.
    """
    user_id = os.getenv("PM_USER_ID")
    password = os.getenv("PM_PASSWORD")

    if not user_id or not password:
        yield _sse_log('오류: .env 파일에 계정 정보가 없습니다.')
        return

    results = []

    async with async_playwright() as p:
        browser = None
        try:
            yield _sse_log('브라우저 실행 중...')
            browser = await p.chromium.launch(headless=True, args=[
                "--no-sandbox",
                "--disable-dev-shm-usage",
                "--disable-blink-features=AutomationControlled",
            ])
            context = await browser.new_context(
                viewport={'width': 1920, 'height': 1080},
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
            )
            page = await context.new_page()

            yield _sse_log('사이트 접속 및 로그인 중...')
            await page.goto("https://overseas.projectmastercloud.com/", wait_until="domcontentloaded")

            await page.click("#login-by-id", timeout=10000)
            await page.fill("#user_id", user_id)
            await page.fill("#user_pw", password)
            await page.click("#login-btn")

            yield _sse_log('대시보드 목록 대기 중...')
            await page.wait_for_selector(_PROJECT_LABEL_SELECTOR, timeout=60000)

            count = await page.locator(_PROJECT_LABEL_SELECTOR).count()
            yield _sse_log(f'총 {count}개의 프로젝트 발견. 수집 시작.')

            for i in range(count):
                try:
                    # Re-resolve by index every round: the list is rendered
                    # again after returning to the dashboard.
                    proj = page.locator(_PROJECT_LABEL_SELECTOR).nth(i)
                    project_name = (await proj.inner_text()).strip()

                    yield _sse_log(f'[{i+1}/{count}] {project_name} - 시작')
                    await proj.scroll_into_view_if_needed()
                    await proj.click(force=True)

                    await asyncio.sleep(5)
                    await page.wait_for_selector("div.footer", state="visible", timeout=20000)

                    recent_log = "기존데이터유지"
                    file_count = 0

                    # Collect the most recent log entry (best-effort).
                    try:
                        log_entry = await _read_recent_log(page)
                    except Exception as exc:
                        log_entry = None
                        yield _sse_log(f' - [로그] 수집 실패: {exc}')
                    if log_entry is not None:
                        recent_log = log_entry
                        yield _sse_log(' - [로그] 수집 완료')

                    # Collect the composition file count (best-effort).
                    try:
                        counted = await _count_composition_files(context, page)
                    except Exception as exc:
                        counted = None
                        yield _sse_log(f' - [구성] 수집 실패: {exc}')
                    if counted is not None:
                        file_count = counted
                        yield _sse_log(f' - [구성] {file_count}개 확인')

                    results.append({"projectName": project_name, "recentLog": recent_log, "fileCount": file_count})

                    # Return to the project list.
                    await page.locator("div.header div.title div").first.click(force=True)
                    await page.wait_for_selector(_PROJECT_LABEL_SELECTOR, timeout=20000)
                    await asyncio.sleep(2)

                except Exception as exc:
                    # Skip this project, tell the client why, and recover by
                    # loading the dashboard directly.
                    yield _sse_log(f'[{i+1}/{count}] 수집 실패, 대시보드로 복귀: {exc}')
                    await page.goto("https://overseas.projectmastercloud.com/dashboard", wait_until="domcontentloaded")

            yield _sse_event({'type': 'done', 'data': results})

        except Exception as e:
            yield _sse_log(f'치명적 오류: {str(e)}')
        finally:
            # Runs on normal completion, on errors, and when the SSE client
            # disconnects first (GeneratorExit), so the browser is closed
            # exactly once.
            if browser is not None:
                await browser.close()
|