Improve crawler reliability and update project management rules

This commit is contained in:
2026-02-25 17:53:36 +09:00
parent 0cead18c80
commit 93a67f4cfa
4 changed files with 123 additions and 80 deletions

View File

@@ -5,7 +5,8 @@ import json
import traceback
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse
from fastapi.responses import StreamingResponse, FileResponse
from fastapi.staticfiles import StaticFiles
from playwright.async_api import async_playwright
from dotenv import load_dotenv
@@ -13,6 +14,9 @@ load_dotenv()
app = FastAPI()
# Mount static files (css, images etc)
app.mount("/style", StaticFiles(directory="style"), name="style")
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
@@ -21,6 +25,10 @@ app.add_middleware(
allow_headers=["*"],
)
@app.get("/")
async def get_dashboard():
    """Serve the dashboard page at the site root.

    Returns:
        A ``FileResponse`` built from the relative path ``dashboard.html``.
    """
    dashboard_page = "dashboard.html"
    return FileResponse(dashboard_page)
@app.get("/sync")
async def sync_data():
async def event_generator():
@@ -31,14 +39,19 @@ async def sync_data():
yield f"data: {json.dumps({'type': 'log', 'message': '오류: .env 파일에 계정 정보가 없습니다.'})}\n\n"
return
TIMEOUT_MS = 600000
results = []
async with async_playwright() as p:
yield f"data: {json.dumps({'type': 'log', 'message': '브라우저 실행 중...'})}\n\n"
browser = await p.chromium.launch(headless=True, args=["--no-sandbox", "--disable-dev-shm-usage"])
context = await browser.new_context(viewport={'width': 1920, 'height': 1080})
context.set_default_timeout(60000)
browser = await p.chromium.launch(headless=True, args=[
"--no-sandbox",
"--disable-dev-shm-usage",
"--disable-blink-features=AutomationControlled"
])
context = await browser.new_context(
viewport={'width': 1920, 'height': 1080},
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
)
page = await context.new_page()
try:
@@ -62,98 +75,126 @@ async def sync_data():
proj = page.locator("h4.list__contents_aria_group_body_list_item_label").nth(i)
project_name = (await proj.inner_text()).strip()
yield f"data: {json.dumps({'type': 'log', 'message': f'[{i+1}/{count}] {project_name} - 수집 시작...'})}\n\n"
yield f"data: {json.dumps({'type': 'log', 'message': f'[{i+1}/{count}] {project_name} - 시작'})}\n\n"
await proj.scroll_into_view_if_needed()
await proj.click(force=True)
await page.wait_for_selector("div.footer", timeout=20000)
recent_log = "없음"
# 프로젝트 로딩 대기 (Gitea 방식: 물리적 대기)
await asyncio.sleep(5)
await page.wait_for_selector("div.footer", state="visible", timeout=20000)
recent_log = "기존데이터유지"
file_count = 0
# 1단계: 활동로그 수집
# 1단계: 활동로그 수집 (Gitea 방식 복구 + 정밀 셀렉터)
try:
log_btn = page.locator("div.wrap.log-wrap > div.title.text").first
log_btn_sel = "body > div.footer > div.left > div.wrap.log-wrap > div.title.text"
log_btn = page.locator(log_btn_sel).first
if await log_btn.is_visible(timeout=5000):
await log_btn.click()
await page.wait_for_timeout(2000) # 로그 로딩 여유
log_content = page.locator("div.wrap.log-wrap .content-area").first
if await log_content.is_visible(timeout=5000):
content = await log_content.inner_text()
lines = [l.strip() for l in content.split("\n") if len(l.strip()) > 2]
if lines: recent_log = lines[0]
await page.locator("body > article.archive-modal div.close").first.click()
await page.wait_for_timeout(1000)
except: pass
# 2단계: 구성(파일 수) 수집 - 안정성 대폭 강화
try:
sitemap_btn = page.locator("div.wrap.site-map-wrap > div").first
if await sitemap_btn.is_visible(timeout=5000):
await sitemap_btn.click()
yield f"data: {json.dumps({'type': 'log', 'message': ' - [로그] 창 열기 시도...'})}\n\n"
await log_btn.click(force=True)
await asyncio.sleep(5) # 로딩 충분히 대기
modal_sel = "article.archive-modal"
if await page.locator(modal_sel).is_visible():
yield f"data: {json.dumps({'type': 'log', 'message': ' - [로그] 모달 발견. 데이터 추출 중...'})}\n\n"
# 사용자 제공 정밀 셀렉터 기반 추출
date_sel = "body > article.archive-modal > div > div > div.modal-body > div.log-wrap > div.log-item-wrap.log-body.scrollbar.scroll-container > div.date > div.text"
user_sel = "body > article.archive-modal > div > div > div.modal-body > div.log-wrap > div.log-item-wrap.log-body.scrollbar.scroll-container > div.user > div.text"
act_sel = "body > article.archive-modal > div > div > div.modal-body > div.log-wrap > div.log-item-wrap.log-body.scrollbar.scroll-container > div.activity > div.text"
# 데이터가 나타날 때까지 반복 대기
success_log = False
for _ in range(10):
if await page.locator(date_sel).count() > 0:
raw_date = (await page.locator(date_sel).first.inner_text()).strip()
if raw_date and "활동시간" not in raw_date:
success_log = True
break
await asyncio.sleep(1)
if success_log:
user_name = (await page.locator(user_sel).first.inner_text()).strip()
activity = (await page.locator(act_sel).first.inner_text()).strip()
formatted_date = re.sub(r'[-/]', '.', raw_date)[:10]
recent_log = f"{formatted_date}, {user_name}, {activity}"
yield f"data: {json.dumps({'type': 'log', 'message': f' - [로그] 성공: {recent_log[:30]}...'})}\n\n"
else:
yield f"data: {json.dumps({'type': 'log', 'message': ' - [로그] 데이터 추출 실패'})}\n\n"
await page.click("article.archive-modal div.close", timeout=3000)
await asyncio.sleep(1.5)
except Exception as e:
yield f"data: {json.dumps({'type': 'log', 'message': f' - [로그] 오류: {str(e)[:20]}'})}\n\n"
# 2단계: 구성(파일 수) 수집 (Gitea 순회 방식 복구 + 대기 시간 대폭 연장)
try:
sitemap_btn_sel = "body > div.footer > div.left > div.wrap.site-map-wrap"
sitemap_btn = page.locator(sitemap_btn_sel).first
if await sitemap_btn.is_visible(timeout=5000):
yield f"data: {json.dumps({'type': 'log', 'message': ' - [구성] 진입 시도...'})}\n\n"
await sitemap_btn.click(force=True)
# Gitea 방식: context.pages 직접 뒤져서 팝업 찾기
popup_page = None
for _ in range(20):
for _ in range(30): # 최대 15초 대기
for p_item in context.pages:
if "composition-tab.html" in p_item.url:
popup_page = p_item
break
try:
if "composition" in p_item.url:
popup_page = p_item
break
except: pass
if popup_page: break
await asyncio.sleep(0.5)
if popup_page:
yield f"data: {json.dumps({'type': 'log', 'message': f'[{i+1}/{count}] 구성 데이터 로딩 대기 중 (여유있게)...'})}\n\n"
await popup_page.wait_for_load_state("domcontentloaded")
yield f"data: {json.dumps({'type': 'log', 'message': ' - [구성] 창 발견. 데이터 로딩 대기 (최대 80초)...'})}\n\n"
target_selector = "#composition-list h6"
success_comp = False
# 데이터가 로드될 때까지 점진적으로 대기 (최대 7초)
for _ in range(7):
h6_check = popup_page.locator("#composition-list li h6:nth-child(3)")
if await h6_check.count() > 0:
# 최대 80초간 끝까지 대기
for _ in range(80):
h6_count = await popup_page.locator(target_selector).count()
if h6_count > 5: # 일정 개수 이상의 목록이 나타나면 로딩 시작으로 간주
success_comp = True
break
await asyncio.sleep(1)
# 최종 데이터를 가져오기 전 마지막 2초 추가 대기 (완전한 렌더링 확인)
await asyncio.sleep(2)
if success_comp:
yield f"data: {json.dumps({'type': 'log', 'message': ' - [구성] 데이터 감지됨. 15초간 최종 렌더링 대기...'})}\n\n"
await asyncio.sleep(15) # 완전한 로딩을 위한 강제 대기
# 유연한 데이터 수집
locators_h6 = popup_page.locator(target_selector)
h6_count = await locators_h6.count()
current_total = 0
for j in range(h6_count):
text = (await locators_h6.nth(j).inner_text()).strip()
nums = re.findall(r'\d+', text.split('\n')[-1])
if nums:
val = int(nums[0])
if val < 5000: current_total += val
file_count = current_total
yield f"data: {json.dumps({'type': 'log', 'message': f' - [구성] 성공 ({file_count}개)'})}\n\n"
else:
yield f"data: {json.dumps({'type': 'log', 'message': ' - [구성] 로딩 타임아웃'})}\n\n"
target_h6_locators = popup_page.locator("#composition-list li h6:nth-child(3)")
h6_count = await target_h6_locators.count()
yield f"data: {json.dumps({'type': 'log', 'message': f'[{i+1}/{count}] 총 {h6_count}개의 항목 로드됨. 합산 중...'})}\n\n"
current_total = 0
for j in range(h6_count):
text = (await target_h6_locators.nth(j).inner_text()).strip()
last_line = text.split('\n')[-1]
nums = re.findall(r'\d+', last_line)
if nums:
val = int(nums[0])
if val < 10000: # 1만개 미만만 합산 (연도 필터링)
current_total += val
file_count = current_total
await popup_page.close()
await page.bring_to_front()
else:
yield f"data: {json.dumps({'type': 'log', 'message': ' - [구성] 팝업창 발견 실패'})}\n\n"
except Exception as e:
yield f"data: {json.dumps({'type': 'log', 'message': f'!!! 구성 수집 지연: {str(e)[:30]}'})}\n\n"
summary_msg = f"[{i+1}/{count}] 수집 완료 - 파일: {file_count}개, 최근로그: {recent_log[:40]}..."
yield f"data: {json.dumps({'type': 'log', 'message': summary_msg})}\n\n"
yield f"data: {json.dumps({'type': 'log', 'message': f' - [구성] 오류: {str(e)[:20]}'})}\n\n"
results.append({"projectName": project_name, "recentLog": recent_log, "fileCount": file_count})
# 3단계: 복귀
home_btn = page.locator("div.header div.title div").first
try:
await home_btn.click(force=True, timeout=10000)
await page.wait_for_selector("h4.list__contents_aria_group_body_list_item_label", timeout=20000)
except:
await page.goto("https://overseas.projectmastercloud.com/dashboard", wait_until="domcontentloaded")
await page.wait_for_selector("h4.list__contents_aria_group_body_list_item_label", timeout=20000)
await page.wait_for_timeout(1500)
# 복귀
await page.locator("div.header div.title div").first.click(force=True)
await page.wait_for_selector("h4.list__contents_aria_group_body_list_item_label", timeout=20000)
await asyncio.sleep(2)
except Exception as e_proj:
yield f"data: {json.dumps({'type': 'log', 'message': f'!!! {i+1}번째 프로젝트 실패 (건너뜀)'})}\n\n"
await page.goto("https://overseas.projectmastercloud.com/", wait_until="domcontentloaded")
await page.wait_for_timeout(3000)
except Exception:
await page.goto("https://overseas.projectmastercloud.com/dashboard", wait_until="domcontentloaded")
yield f"data: {json.dumps({'type': 'done', 'data': results})}\n\n"