feat: 대시보드 데이터 파싱 로직 고도화 및 크롤링 서비스 개선

- 대시보드: 8컬럼 형식의 sheet.csv를 안정적으로 지원하도록 파싱 로직 개선

- 크롤러: Playwright 기반 크롤링 엔진 고도화 및 실시간 로그 전송 기능 강화

- UI/UX: 대시보드 동기화 버튼 및 헤더 레이아웃 최적화
This commit is contained in:
2026-03-06 18:10:19 +09:00
parent eebd3a89e5
commit 9369e18eb8
6 changed files with 233 additions and 104 deletions

View File

@@ -2,14 +2,27 @@ import os
import re
import asyncio
import json
import csv
import traceback
from datetime import datetime
from playwright.async_api import async_playwright
from dotenv import load_dotenv
load_dotenv()
def clean_date_string(date_str):
    """Extract a date in ``YY.MM.DD`` form from *date_str*.

    Returns "" for falsy input. When a two-digit day/month/year triple
    separated by '.', '/' or '-' is found anywhere in the string, it is
    normalized to dot separators; otherwise the first eight characters
    of the input are returned unchanged as a best-effort fallback.
    """
    if not date_str:
        return ""
    found = re.search(r'(\d{2})[./-](\d{2})[./-](\d{2})', date_str)
    if found is None:
        return date_str[:8]
    yy, mm, dd = found.groups()
    return f"{yy}.{mm}.{dd}"
async def run_crawler_service():
"""
Playwright를 이용해 데이터를 수집하고 SSE(Server-Sent Events)용 제너레이터를 반환합니다.
상세 패킷을 강제 호출하여 팝업 없이 상세 파일 개수를 수집하며 모든 성공 로직을 보존한 크롤러입니다.
"""
user_id = os.getenv("PM_USER_ID")
password = os.getenv("PM_PASSWORD")
@@ -23,118 +36,163 @@ async def run_crawler_service():
async with async_playwright() as p:
browser = None
try:
yield f"data: {json.dumps({'type': 'log', 'message': '브라우저 실행 중...'})}\n\n"
browser = await p.chromium.launch(headless=True, args=[
"--no-sandbox",
"--disable-dev-shm-usage",
"--disable-blink-features=AutomationControlled"
])
context = await browser.new_context(
viewport={'width': 1920, 'height': 1080},
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
)
yield f"data: {json.dumps({'type': 'log', 'message': '브라우저 엔진 (데이터 강제 유도 모드) 가동...'})}\n\n"
browser = await p.chromium.launch(headless=False, args=["--no-sandbox", "--disable-dev-shm-usage"])
context = await browser.new_context(viewport={'width': 1600, 'height': 900})
captured_data = {"log": None, "tree": None}
async def global_interceptor(response):
url = response.url
try:
# 상세 패킷 감시 (params[resourcePath]=/ 가 포함된 상세 응답 우선)
if "getTreeObject" in url:
data = await response.json()
if data.get('currentTreeObject', {}).get('folder'):
captured_data["tree"] = data
elif "Log" in url:
captured_data["log"] = await response.json()
except: pass
context.on("response", global_interceptor)
page = await context.new_page()
yield f"data: {json.dumps({'type': 'log', 'message': '사이트 접속 및 로그인 중...'})}\n\n"
await page.goto("https://overseas.projectmastercloud.com/", wait_until="domcontentloaded")
await page.click("#login-by-id", timeout=10000)
await page.fill("#user_id", user_id)
await page.fill("#user_pw", password)
await page.click("#login-btn")
# --- 1. 로그인 (안정 로직) ---
await page.goto("https://overseas.projectmastercloud.com/dashboard", wait_until="domcontentloaded")
if await page.locator("#login-by-id").is_visible(timeout=5000):
await page.click("#login-by-id")
await page.fill("#user_id", user_id)
await page.fill("#user_pw", password)
await page.click("#login-btn")
yield f"data: {json.dumps({'type': 'log', 'message': '대시보드 목록 대기 중...'})}\n\n"
await page.wait_for_selector("h4.list__contents_aria_group_body_list_item_label", timeout=60000)
await asyncio.sleep(5)
locators = page.locator("h4.list__contents_aria_group_body_list_item_label")
count = await locators.count()
yield f"data: {json.dumps({'type': 'log', 'message': f'{count}개의 프로젝트 발견. 수집 시작.'})}\n\n"
names = await page.locator("h4.list__contents_aria_group_body_list_item_label").all_inner_texts()
project_names = [n.strip() for n in names if n.strip()]
count = len(project_names)
yield f"data: {json.dumps({'type': 'log', 'message': f'{count}개의 프로젝트 수집 시작.'})}\n\n"
for i in range(count):
for i, project_name in enumerate(project_names):
captured_data["log"] = None
captured_data["tree"] = None
yield f"data: {json.dumps({'type': 'log', 'message': f'[{i+1}/{count}] {project_name} 수집 시작'})}\n\n"
try:
proj = page.locator("h4.list__contents_aria_group_body_list_item_label").nth(i)
project_name = (await proj.inner_text()).strip()
# 상세 페이지 진입 (안정 로직)
target_el = page.get_by_text(project_name).first
await target_el.scroll_into_view_if_needed()
box = await target_el.bounding_box()
if box: await page.mouse.click(box['x'] + 5, box['y'] + 5)
else: await target_el.click(force=True)
yield f"data: {json.dumps({'type': 'log', 'message': f'[{i+1}/{count}] {project_name} - 시작'})}\n\n"
await proj.scroll_into_view_if_needed()
await proj.click(force=True)
await asyncio.sleep(5)
await page.wait_for_selector("div.footer", state="visible", timeout=20000)
recent_log = "기존데이터유지"
await page.wait_for_selector("text=활동로그", timeout=30000)
await asyncio.sleep(3)
recent_log = "데이터 없음"
file_count = 0
# 로그 수집
# --- 2. 활동로그 수집 (100% 복구 로직) ---
try:
log_btn_sel = "body > div.footer > div.left > div.wrap.log-wrap > div.title.text"
log_btn = page.locator(log_btn_sel).first
if await log_btn.is_visible(timeout=5000):
await log_btn.click(force=True)
await asyncio.sleep(5)
modal_opened = False
for _ in range(3):
log_btn = page.get_by_text("활동로그").first
await page.evaluate("(el) => el.click()", await log_btn.element_handle())
try:
await page.wait_for_selector("article.archive-modal", timeout=5000)
modal_opened = True
break
except: await asyncio.sleep(1)
if modal_opened:
inputs = await page.locator("article.archive-modal input").all()
for inp in inputs:
itype = await inp.get_attribute("type")
iname = (await inp.get_attribute("name") or "").lower()
iclass = (await inp.get_attribute("class") or "").lower()
if itype == "date" or "start" in iname or "start" in iclass:
await inp.fill("2020-01-01")
break
date_sel = "article.archive-modal .log-body .date .text"
user_sel = "article.archive-modal .log-body .user .text"
act_sel = "article.archive-modal .log-body .activity .text"
if await page.locator(date_sel).count() > 0:
raw_date = (await page.locator(date_sel).first.inner_text()).strip()
user_name = (await page.locator(user_sel).first.inner_text()).strip()
activity = (await page.locator(act_sel).first.inner_text()).strip()
formatted_date = re.sub(r'[-/]', '.', raw_date)[:10]
recent_log = f"{formatted_date}, {user_name}, {activity}"
yield f"data: {json.dumps({'type': 'log', 'message': f' - [로그] 수집 완료'})}\n\n"
await page.click("article.archive-modal div.close", timeout=3000)
await asyncio.sleep(1.5)
except: pass
# 구성 수집
captured_data["log"] = None
apply_btn = page.locator("article.archive-modal").get_by_text("적용").first
if await apply_btn.is_visible():
await apply_btn.click()
await asyncio.sleep(4)
if captured_data["log"]:
data = captured_data["log"]
logs = data.get('logData', []) or data.get('result', [])
if logs and isinstance(logs, list) and len(logs) > 0:
top = logs[0]
rd = top.get('log_date') or top.get('date') or ""
u = top.get('user_name') or top.get('user') or ""
c = top.get('activity_content') or top.get('activity') or ""
recent_log = f"{clean_date_string(rd)}, {u}, {c}"
if recent_log == "데이터 없음":
modal = page.locator("article.archive-modal")
try:
d_v = (await modal.locator(".log-body .date .text").first.inner_text()).strip()
u_v = (await modal.locator(".log-body .user .text").first.inner_text()).strip()
a_v = (await modal.locator(".log-body .activity .text").first.inner_text()).strip()
if d_v: recent_log = f"{clean_date_string(d_v)}, {u_v}, {a_v}"
except: pass
yield f"data: {json.dumps({'type': 'log', 'message': f' - [로그 결과] {recent_log}'})}\n\n"
await page.keyboard.press("Escape")
else:
yield f"data: {json.dumps({'type': 'log', 'message': ' - [로그] 모달 진입 실패'})}\n\n"
except Exception as le:
yield f"data: {json.dumps({'type': 'log', 'message': f' - [로그] 오류: {str(le)}'})}\n\n"
# --- 3. 구성 수집 (상세 패킷 강제 유도) ---
try:
sitemap_btn_sel = "body > div.footer > div.left > div.wrap.site-map-wrap"
sitemap_btn = page.locator(sitemap_btn_sel).first
if await sitemap_btn.is_visible(timeout=5000):
await sitemap_btn.click(force=True)
popup_page = None
for _ in range(20):
for p_item in context.pages:
if "composition" in p_item.url:
popup_page = p_item
break
if popup_page: break
await asyncio.sleep(0.5)
if popup_page:
target_selector = "#composition-list h6:nth-child(3)"
await asyncio.sleep(5) # 로딩 대기
locators_h6 = popup_page.locator(target_selector)
h6_count = await locators_h6.count()
current_total = 0
for j in range(h6_count):
text = (await locators_h6.nth(j).inner_text()).strip()
nums = re.findall(r'\d+', text.split('\n')[-1])
if nums: current_total += int(nums[0])
file_count = current_total
yield f"data: {json.dumps({'type': 'log', 'message': f' - [구성] {file_count}개 확인'})}\n\n"
await popup_page.close()
# [지능형 유도] 상세 정보를 포함한 API 요청을 브라우저가 직접 날리게 함
await page.evaluate("""() => {
fetch('/api/getTreeObject?params[storageType]=CLOUD&params[resourcePath]=/');
}""")
# 패킷 대기 (최대 10초)
for _ in range(20):
if captured_data["tree"]: break
await asyncio.sleep(0.5)
if captured_data["tree"]:
data = captured_data["tree"]
# 분석된 딕셔너리 구조 합산
folders = data.get('currentTreeObject', {}).get('folder', {})
total_files = 0
if isinstance(folders, dict):
for folder_name, folder_info in folders.items():
total_files += int(folder_info.get('filesCount', 0))
file_count = total_files
yield f"data: {json.dumps({'type': 'log', 'message': f' - [구성] 상세 합산 성공 ({file_count}개)'})}\n\n"
else:
yield f"data: {json.dumps({'type': 'log', 'message': ' - [구성] 상세 데이터 응답 없음'})}\n\n"
except: pass
results.append({"projectName": project_name, "recentLog": recent_log, "fileCount": file_count})
# 홈 복귀
await page.locator("div.header div.title div").first.click(force=True)
await page.wait_for_selector("h4.list__contents_aria_group_body_list_item_label", timeout=20000)
await asyncio.sleep(2)
except Exception:
await page.goto("https://overseas.projectmastercloud.com/dashboard", wait_until="domcontentloaded")
await page.wait_for_selector("h4.list__contents_aria_group_body_list_item_label", timeout=20000)
except Exception as e:
yield f"data: {json.dumps({'type': 'log', 'message': f' - [{project_name}] 건너뜀 (사유: {str(e)})'})}\n\n"
await page.goto("https://overseas.projectmastercloud.com/dashboard")
# --- 4. CSV 파일 저장 ---
try:
today_str = datetime.now().strftime("%Y.%m.%d")
csv_path = f"crawling_result {today_str}.csv"
with open(csv_path, 'w', newline='', encoding='utf-8-sig') as f:
writer = csv.DictWriter(f, fieldnames=["projectName", "recentLog", "fileCount"])
writer.writeheader()
writer.writerows(results)
yield f"data: {json.dumps({'type': 'log', 'message': f'✅ 모든 데이터가 {csv_path}에 저장되었습니다.'})}\n\n"
except Exception as fe:
yield f"data: {json.dumps({'type': 'log', 'message': f'❌ CSV 저장 실패: {str(fe)}'})}\n\n"
yield f"data: {json.dumps({'type': 'done', 'data': results})}\n\n"
except GeneratorExit:
# SSE 연결이 클라이언트 측에서 먼저 끊겼을 때 실행
if browser: await browser.close()
except Exception as e:
yield f"data: {json.dumps({'type': 'log', 'message': f'치명적 오류: {str(e)}'})}\n\n"
finally:

Binary file not shown.

Before

Width:  |  Height:  |  Size: 289 KiB

View File

@@ -210,8 +210,13 @@ async function syncData() {
}
try {
console.log("Attempting to connect to /sync...");
const response = await fetch(`/sync`);
if (!response.ok) {
throw new Error(`HTTP error! status: ${response.status}`);
}
const reader = response.body.getReader();
const decoder = new TextDecoder();

View File

@@ -40,6 +40,7 @@ app.add_middleware(
allow_headers=["*"],
)
# --- HTML 라우팅 ---
import csv
@@ -50,29 +51,41 @@ async def get_project_data():
"""
projects = []
try:
if not os.path.exists("sheet.csv"):
return []
with open("sheet.csv", mode="r", encoding="utf-8-sig") as f:
reader = csv.reader(f)
rows = [row for row in reader if row] # 빈 행 제외
rows = list(reader)
# 실제 데이터 시작되는 지점 찾기 (No. 로 시작하는 행 다음부터)
start_idx = -1
# "No." 헤더를 찾아 데이터 시작점 결정
start_idx = None
for i, row in enumerate(rows):
if row and "No." in row[0]:
if any("No." in cell for cell in row):
start_idx = i + 1
break
if start_idx != -1:
if start_idx is not None:
for row in rows[start_idx:]:
if len(row) >= 8:
# [프로젝트명, 담당부서, 담당자, 최근활동로그, 파일수] 형식으로 추출
# [프로젝트명, 담당부서, 담당자, 최근활동로그, 파일수]
# 복구된 sheet.csv 형식에 맞춰 인덱스 추출 (1, 2, 3, 5, 7)
try:
# 파일 수 숫자로 변환 (공백 제거 후 처리)
raw_count = row[7].strip()
file_count = int(raw_count) if raw_count.isdigit() else 0
except (ValueError, IndexError):
file_count = 0
projects.append([
row[1], # 프로젝트 명
row[2], # 담당부서
row[3], # 담당자
row[5], # 최근 활동로그
int(row[7]) if row[7].isdigit() else 0 # 파일 수
file_count # 파일 수
])
except Exception as e:
print(f"Error reading sheet.csv: {e}")
return {"error": str(e)}
return projects
@@ -118,4 +131,5 @@ async def sync_data():
"""
크롤링 서비스(crawler_service.py) 호출
"""
print(">>> /sync request received")
return StreamingResponse(run_crawler_service(), media_type="text/event-stream")

View File

@@ -262,3 +262,57 @@
color: var(--text-sub);
font-weight: 600;
}
/* Header & Sync Button */
/* Page header bar: title left, action buttons right, divider underneath. */
header {
display: flex;
justify-content: space-between;
align-items: center;
margin-bottom: var(--space-lg);
padding-bottom: var(--space-md);
border-bottom: 1px solid var(--border-color);
}
/* Centered content column; top margin clears the fixed header area. */
.main-content {
margin-top: 36px;
padding: var(--space-lg);
max-width: 1400px;
margin-left: auto;
margin-right: auto;
}
/* Primary call-to-action button that triggers the crawl/sync. */
.sync-btn {
display: flex;
align-items: center;
gap: var(--space-sm);
background-color: var(--primary-color);
color: #fff;
padding: 8px 16px;
border-radius: var(--radius-lg);
font-size: 13px;
font-weight: 600;
box-shadow: var(--box-shadow);
}
.sync-btn:hover {
background-color: var(--primary-lv-8);
}
/* Spinner is hidden by default; shown only while a sync is in flight
   (the .loading class is toggled from JS). */
.sync-btn.loading .spinner {
display: inline-block;
}
/* Muted badge next to the sync button showing the logged-in admin. */
.admin-info {
font-size: 13px;
color: var(--text-sub);
margin-left: var(--space-md);
padding: 6px 12px;
background: var(--bg-muted);
border-radius: var(--radius-sm);
border: 1px solid var(--border-color);
}
.admin-info strong {
color: var(--primary-color);
font-weight: 700;
}

View File

@@ -31,10 +31,8 @@
<main class="main-content">
<header>
<div class="flex-center">
<h1>프로젝트 현황</h1>
</div>
<div class="flex-center">
<h1>프로젝트 현황</h1>
<div class="header-actions" style="display: flex; align-items: center;">
<button id="syncBtn" class="sync-btn" onclick="syncData()">
<span class="spinner"></span>
데이터 동기화 (크롤링)