diff --git a/README.md b/README.md index 570b9a0..39180ec 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,48 @@ +# ๐ ์๋ฒ ์ ์ฑ (Server Policy) + +**์๋ฒ ๊ตฌ๋ ์ ๋ฐ๋์ ์๋ ๋ช ๋ น์ด๋ฅผ ์ฌ์ฉํ๋ค:** +```bash +uvicorn server:app --host 0.0.0.0 --port 8000 --reload +``` +- **Host**: `0.0.0.0` (์ธ๋ถ ์ ์ ํ์ฉ) +- **Port**: `8000` +- **Reload**: ์ฝ๋ ์์ ์ ์๋ ์ฌ์์ ํ์ฑํ + +--- + +# ๐ค ๋ฉ์ผ์์คํ AIํ๋จ๊ฐ์ด๋ (AI Reasoning Guide) + +AI๋ ํ์ผ์ ๋ถ๋ฅํ ๋ ๋จ์ํ ํค์๋ ๋งค์นญ์ด ์๋, ์๋์ **5๋จ๊ณ ํตํฉ ์ถ๋ก ๋ชจ๋ธ**์ ์ฌ์ฉํ์ฌ '์ค๋ฌด์์ฒ๋ผ' ์๊ฐํ๊ณ ํ๋จํ๋ค. + +### 1๋จ๊ณ: ์ ์ ๋ฐ์ดํฐ ์์ง (Holistic Reading) +- **๋ฌด์ ํ ์ค์บ**: ํ์ด์ง ์์ ๊ด๊ณ์์ด ๋ฌธ์ ์ ์ฒด๋ฅผ ์ ์ ์กฐ์ฌํ๋ค. +- **๋ฌด์กฐ๊ฑด์ OCR**: ๋์งํธ ํ ์คํธ ์ ๋ฌด์ ์๊ด์์ด ๋ชจ๋ ํ์ด์ง์ ๊ณ ํด์๋(300 DPI) OCR์ ์คํํ์ฌ ์ด๋ฏธ์ง ์ ๋์ฅ, ์๊ธฐ, ํ ๋ฐ์ดํฐ๊น์ง ์๋ฒฝํ ์์งํ๋ค. + +### 2๋จ๊ณ: ํ์ผ๋ช ๊ฐ์ค์น ์ ์ฉ (Title Steering) +- **ํ์ผ๋ช = ๋ณด๊ด ์๋**: ์ฌ์ฉ์๊ฐ ์ง์ ํ์ผ๋ช ์ ๋ถ๋ฅ์ ๊ฐ์ฅ ๊ฐ๋ ฅํ '๋ฐฉํฅํ'์ด๋ค. +- **์ต์ข ์กฐ์จ**: ๋ณธ๋ฌธ์ ๋ฐ์ดํฐ๊ฐ ๋ค๋ฅธ ๋๋ฉ์ธ์ ์ ๋ ค ์๋๋ผ๋, ํ์ผ๋ช ์ ๋ช ํํ ์ ๋ฌด ์ฉ์ด(`์ค์ ๋ณด๊ณ `, `ํ๋๊ธ` ๋ฑ)๊ฐ ์๋ค๋ฉด ์ด๋ฅผ ์ต์ข ๋ถ๋ฅ์ ๊ฐ์ฅ ํฐ ๋ฌด๊ฒ์ถ๋ก ์ผ๋๋ค. + +### 3๋จ๊ณ: ๋ฌธ์์ ๋ฌผ๋ฆฌ์ ํ(Format) ๋ถ์ +- **๊ณต๋ฌธ ๊ณจ๊ฒฉ ํ์ธ**: ๋ฌธ์์ ์์(`์์ /๋ฐ์ `)๊ณผ ๋(`์ง์ธ/๋.`)์ ๊ตฌ์กฐ๋ฅผ ํ์ธํ๋ค. +- **๊ป๋ฐ๊ธฐ vs ์๋งน์ด**: + - **๊ณต๋ฌธ ๋ณธ์ฒด**: ๊ณจ๊ฒฉ์ด ์๋ฒฝํ๊ณ ๋ค๋ฐ๋ฅด๋ ๊ธฐ์ ๋ฐ์ดํฐ๊ฐ ์ ์ ๊ฒฝ์ฐ โ **[๊ณต์ฌ๊ด๋ฆฌ > ๊ณต๋ฌธ]** + - **์ฒจ๋ถ ๋ณธ์ฒด**: ๊ณต๋ฌธ ๋ค์ ๋๋์ ์ฐ์ถ์, ๊ณ์ฝ์, ๋๋ฉด์ด ๋ถ์ด ์๋ ๊ฒฝ์ฐ โ **[ํด๋น ๊ธฐ์ ์นดํ ๊ณ ๋ฆฌ]** (๊ณต๋ฌธ์ ์ ๋ฌ ์๋จ์ผ๋ก๋ง ๊ฐ์ฃผ) + +### 4๋จ๊ณ: ๋น์ฆ๋์ค ๋๋ฉ์ธ ์์ ๊ฒฐํฉ (Common Sense) +- **์ง๋ช ๊ต์ฐจ ๊ฒ์ฆ**: ํ์ผ๋ช ๊ณผ ๋ณธ๋ฌธ์ ์ง๋ช (์ด์ฒ, ๊ณต์ฃผ, ๋์ , ์ ์ ๋ฑ)์ ๋์กฐํ์ฌ ์ ํํ ํ๋ก์ ํธ๋ฅผ ์ ํํ๋ค. (์์ ๊ธฐ๋ณธ๊ฐ ์ง์ ๊ธ์ง) +- **์ค๋ฌด ๋งฅ๋ฝ ๋งค์นญ**: '์๋๋ฃ/์ฐ์ฅ'์ ์ฌ์ ๋น ์ฑ๊ฒฉ์ '๊ธฐํ'๋ก, '๋น๊ณ'๋ '๊ตฌ์กฐ๋ฌผ'๋ก ์ฐ๊ฒฐํ๋ ๋ฑ ๊ฑด์ค ์ค๋ฌด ์์์ ์ถ๋ก ์ ๋ฐ์ํ๋ค. + +### 5๋จ๊ณ: ์ต์ข ์ง๋ ๋งค์นญ (Hierarchy Mapping) +- ์์ง๋ ๋ชจ๋ ์ ๋ณด๋ฅผ ์ข ํฉํ์ฌ ์ฌ์ฉ์๊ฐ ์ ์ํ **ํ์ค ๋ถ๋ฅ ์ฒด๊ณ(Tab > Category > Sub)** ์ง๋ ์์์ ๊ฐ์ฅ ๋ ผ๋ฆฌ์ ์ด๊ณ ์ค๋ฌด์ ์ธ ์์น๋ฅผ ์ต์ข ํ์ ํ๋ค. + +--- + # ํ๋ก์ ํธ ๊ด๋ฆฌ ๊ท์น 1. **์ธ์ด ์ค์ **: ์์ด๋ก ์๊ฐํ๋, ๋ชจ๋ ๋ต๋ณ์ ํ๊ตญ์ด๋ก ์์ฑํ๋ค. (์ผ๋ณธ์ด, ์ค๊ตญ์ด๋ ์ ๋ ์ฌ์ฉํ์ง ์๋๋ค.) 2. **์์ ๊ถํ ์ ํ**: ์ฌ์ฉ์๊ฐ ๋ช ์์ ์ผ๋ก ์ง์ํ ์ฌํญ ์ธ์๋ **์ ๋ ์ ๋ ์ ๋** ์ฝ๋๋ฅผ ์์๋ก ์์ ํ์ง ์๋๋ค. 3. **๋ก๊ทธ ๊ธฐ๋ก ์ฒ ์ **: ๋ชจ๋ฌ ์คํ ์ฌ๋ถ, ์์ง ์ฑ๊ณต/์คํจ ์ฌ๋ถ ๋ฑ ์งํ ์ํฉ์ ์ค์๊ฐ ๋ก๊ทธ์ ์์ธํ ํ์ํ๋ค. +4. **์ ๋ณด๊ณ ํ์น์ธ**: ๋ชจ๋ ๊ธฐ๋ฅ ์์ ๋ฐ ์ฝ๋ ๋ณ๊ฒฝ ์ ์๋ ์์ ๋ฐฉ์์ ๋จผ์ ๋ณด๊ณ ํ๊ณ , ์ฌ์ฉ์๊ฐ **'์งํ์์ผ'**๋ผ๊ณ ๋ช ๋ นํ ๊ฒฝ์ฐ์๋ง ์์ ์ ์ํํ๋ค. --- diff --git a/__pycache__/analyze.cpython-312.pyc b/__pycache__/analyze.cpython-312.pyc index 6dc4152..9492ae5 100644 Binary files a/__pycache__/analyze.cpython-312.pyc and b/__pycache__/analyze.cpython-312.pyc differ diff --git a/__pycache__/crawler_api.cpython-312.pyc b/__pycache__/crawler_api.cpython-312.pyc index 8d93646..4ffa45d 100644 Binary files a/__pycache__/crawler_api.cpython-312.pyc and b/__pycache__/crawler_api.cpython-312.pyc differ diff --git a/__pycache__/crawler_service.cpython-312.pyc b/__pycache__/crawler_service.cpython-312.pyc new file mode 100644 index 0000000..c396c7a Binary files /dev/null and b/__pycache__/crawler_service.cpython-312.pyc differ diff --git a/__pycache__/server.cpython-312.pyc b/__pycache__/server.cpython-312.pyc new file mode 100644 index 0000000..b5cf328 Binary files /dev/null and b/__pycache__/server.cpython-312.pyc differ diff --git a/analyze.py b/analyze.py index 9c90006..bd5637f 100644 --- a/analyze.py +++ b/analyze.py @@ -2,91 +2,165 @@ import os import re import unicodedata from pypdf import PdfReader -try: - import pytesseract - from pdf2image import convert_from_path - from PIL import Image - TESSERACT_PATH = r'C:\Users\User\AppData\Local\Programs\Tesseract-OCR\tesseract.exe' - POPPLER_PATH = r'D:\์ดํํ\00ํฌ๋กฌ๋ค์ด๋ก๋\poppler-25.12.0\Library\bin' - pytesseract.pytesseract.tesseract_cmd = TESSERACT_PATH - OCR_AVAILABLE = True -except ImportError: - OCR_AVAILABLE = False +import pytesseract +from pdf2image import convert_from_path -def analyze_file_content(filename: str): - file_path = os.path.join("sample", filename) - if not os.path.exists(file_path): - return {"error": "File not found"} - - log_steps = [] - - # Layer 1: ์ ๋ชฉ ๋ถ์ (Quick) - log_steps.append("1. ๋ ์ด์ด: ํ์ผ ์ ๋ชฉ(Title) ์ค์บ ์ค...") - title_text = filename.lower().replace(" ", "") - - # Layer 2: ํ ์คํธ ์ถ์ถ (Fast) - log_steps.append("2. ๋ ์ด์ด: PDF ํ ์คํธ ์์ง(Extraction) ๊ฐ๋...") - text_content = "" - try: - if filename.lower().endswith(".pdf"): - reader = PdfReader(file_path) - for page in reader.pages[:5]: # ์ ์ฒด๊ฐ ์๋ ํต์ฌ ํ์ด์ง ์์ฃผ - page_txt = page.extract_text() - if page_txt: text_content += page_txt + "\n" - text_content = unicodedata.normalize('NFC', text_content) - log_steps.append(f" - ํ ์คํธ ๋ฐ์ดํฐ ํ๋ณด ์๋ฃ ({len(text_content)}์)") - except: - log_steps.append(" - ํ ์คํธ ์ถ์ถ ์คํจ") +# 1. ์์คํ ์ค์ +TESSERACT_EXE = r'C:\Users\User\AppData\Local\Programs\Tesseract-OCR\tesseract.exe' +TESSDATA_DIR = r'C:\Users\User\AppData\Local\Programs\Tesseract-OCR\tessdata' +POPPLER_BIN = r'D:\์ดํํ\00ํฌ๋กฌ๋ค์ด๋ก๋\poppler-25.12.0\Library\bin' - # Layer 3: OCR ์ ๋ฐ ๋ถ์ (Deep) - log_steps.append("3. ๋ ์ด์ด: OCR ์ด๋ฏธ์ง ์ค์บ(Vision) ๊ฐ์ ์คํ...") - ocr_content = "" - if OCR_AVAILABLE and os.path.exists(TESSERACT_PATH): - try: - # ์์ง์ ์ธ ์ฒซ ํ์ด์ง ์์ฃผ OCR (์ฑ๋ฅ๊ณผ ์ ํ๋ ํํ) - images = convert_from_path(file_path, first_page=1, last_page=2, poppler_path=POPPLER_PATH) - for i, img in enumerate(images): - page_ocr = pytesseract.image_to_string(img, lang='kor+eng') - ocr_content += unicodedata.normalize('NFC', page_ocr) + "\n" - log_steps.append(f" - OCR ์ค์บ ์๋ฃ ({len(ocr_content)}์)") - except Exception as e: - log_steps.append(f" - OCR ์ค๋ฅ: {str(e)[:20]}") +pytesseract.pytesseract.tesseract_cmd = TESSERACT_EXE +os.environ["TESSDATA_PREFIX"] = TESSDATA_DIR +OCR_AVAILABLE = os.path.exists(TESSERACT_EXE) + +SYSTEM_HIERARCHY = { + "ํ์ ": { + "๊ณ์ฝ": ["๊ณ์ฝ๊ด๋ฆฌ", "๊ธฐ์ฑ๊ด๋ฆฌ", "์ ๋ฌด์ง์์", "์ธ์๊ด๋ฆฌ"], + "์ ๋ฌด๊ด๋ฆฌ": ["์ ๋ฌด์ผ์ง(2025)", "์ ๋ฌด์ผ์ง(2025๋ ์ด์ )", "๋ฐ์ฃผ์ฒ ์ ๊ธฐ๋ณด๊ณ ", "๋ณธ์ฌ์ ๋ฌด๋ณด๊ณ ", "๊ณต์ฌ๊ฐ๋ ์ผ์ง", "์์์๋ฅ"] + }, + "์ค๊ณ์ฑ๊ณผํ": { + "์๋ฐฉ์": ["๊ณต์ฌ์๋ฐฉ์", "์ฅ๋น ๋ฐ์ ํ๊ฐ ๊ฒํ ์"], + "์ค๊ณ๋๋ฉด": ["๊ณตํต", "ํ ๊ณต", "๋นํ๋ฉด์์ ๊ณต", "๋ฐฐ์๊ณต", "๊ต๋๊ณต", "ํฌ์ฅ๊ณต", "๊ตํต์์ ์์ค๊ณต", "๋ถ๋๊ณต", "์ฉ์ง๊ณต & ๊ธฐํ๊ณต"], + "์๋์ฐ์ถ์": ["ํ ๊ณต", "๋นํ๋ฉด์์ ๊ณต", "๋ฐฐ์๊ณต", "๊ต๋๊ณต", "ํฌ์ฅ๊ณต", "๊ตํต์์ ์์ค๊ณต", "๋ถ๋๊ณต", "์ฉ์ง๊ณต & ๊ธฐํ๊ณต"], + "๋ด์ญ์": ["๋จ๊ฐ์ฐ์ถ์"], + "๋ณด๊ณ ์": ["์ค์์ค๊ณ๋ณด๊ณ ์", "์ง๋ฐ์กฐ์ฌ๋ณด๊ณ ์", "๊ตฌ์กฐ๊ณ์ฐ์", "์๋ฆฌ ๋ฐ ์ ๊ธฐ๊ณ์ฐ์", "๊ธฐํ๋ณด๊ณ ์", "๊ธฐ์ ์๋ฌธ ๋ฐ ์ฌ์"], + "์ธก๋๊ณ์ฐ๋ถ": ["์ธก๋๊ณ์ฐ๋ถ"], + "์ค๊ณ๋จ๊ณ ์ํํ์": ["ํ์ยทํ์"] + }, + "์๊ณต์ฑ๊ณผํ": { + "์ค๊ณ๋๋ฉด": ["๊ณตํต", "ํ ๊ณต", "๋นํ๋ฉด์์ ๊ณต", "๋ฐฐ์๊ณต", "๊ต๋๊ณต", "ํฌ์ฅ๊ณต", "๊ตํต์์ ์์ค๊ณต", "๋ถ๋๊ณต", "์ฉ์ง๊ณต & ๊ธฐํ๊ณต"] + }, + "์๊ณต๊ฒ์ธก": { + "ํ ๊ณต": ["๊ฒ์ธก (๊นจ๊ธฐ)", "๊ฒ์ธก (์ฐ์ฝ์ง๋ฐ)", "๊ฒ์ธก (๋ฐํ)", "๊ฒ์ธก (๋ ธ์ฒด)", "๊ฒ์ธก (๋ ธ์)", "๊ฒ์ธก (ํ ์ทจ์ฅ)"], + "๋ฐฐ์๊ณต": ["๊ฒ์ธก (Vํ์ธก๊ตฌ)", "๊ฒ์ธก (์ฐ๋ง๋ฃจ์ธก๊ตฌ)", "๊ฒ์ธก (Uํ์ธก๊ตฌ)", "๊ฒ์ธก (Uํ์ธก๊ตฌ)(์)", "๊ฒ์ธก (Lํ์ธก๊ตฌ, Jํ์ธก๊ตฌ)", "๊ฒ์ธก (๋์๋ก)", "๊ฒ์ธก (๋์๋ก)(์)", "๊ฒ์ธก (ํก๋ฐฐ์๊ด)", "๊ฒ์ธก (์ข ๋ฐฐ์๊ด)", "๊ฒ์ธก (๋งน์๊ฑฐ)", "๊ฒ์ธก (ํต๋ก์๊ฑฐ)", "๊ฒ์ธก (์๋ก์๊ฑฐ)", "๊ฒ์ธก (ํธ์๊ณต)", "๊ฒ์ธก (์น๋ฒฝ๊ณต)", "๊ฒ์ธก (์ฉ์๊ฐ๊ฑฐ)"], + "๊ตฌ์กฐ๋ฌผ๊ณต": ["๊ฒ์ธก (ํ๋ชฉ๊ต-๊ฑฐ๋, ๋ถ๋๊ณต)", "๊ฒ์ธก (ํ๋ชฉ๊ต)(์)", "๊ฒ์ธก (๊ฐ์ฐฉํฐ๋, ์ํํต๋ก)"], + "ํฌ์ฅ๊ณต": ["๊ฒ์ธก (๊ธฐ์ธต, ๋ณด์กฐ๊ธฐ์ธต)"], + "๋ถ๋๊ณต": ["๊ฒ์ธก (ํ๊ฒฝ)", "๊ฒ์ธก (์ง์ฅ๊ฐ์ฅ,๊ฑด๋ฌผ ์ฒ ๊ฑฐ)", "๊ฒ์ธก (๋ฐฉ์๋ฒฝ ๋ฑ)"], + "๋นํ๋ฉด์์ ๊ณต": ["๊ฒ์ธก (์์๋ณดํธ๊ณต)", "๊ฒ์ธก (๊ตฌ์กฐ๋ฌผ๋ณดํธ๊ณต)"], + "๊ตํต์์ ์์ค๊ณต": ["๊ฒ์ธก (๋์๋ฐฉ์ง์ฑ )"], + "๊ฒ์ธก ์์์๋ฅ": ["๊ฒ์ธก ์์์๋ฅ"] + }, + "์ค๊ณ๋ณ๊ฒฝ": { + "์ค์ ๋ณด๊ณ (์ด์ฒ~๊ณต์ฃผ)": ["ํ ๊ณต", "๋ฐฐ์๊ณต", "๊ต๋๊ณต(ํ๋ชฉ๊ต)", "๊ตฌ์กฐ๋ฌผ๊ณต", "ํฌ์ฅ๊ณต", "๊ตํต์์ ๊ณต", "๋ถ๋๊ณต", "์ ๊ธฐ๊ณต์ฌ", "๋ฏธํ์ ๊ณต", "์์ ๊ด๋ฆฌ", "ํ๊ฒฝ๊ด๋ฆฌ", "ํ์ง๊ด๋ฆฌ", "์์ฌ๊ด๋ฆฌ", "์ง์ฅ๋ฌผ", "๊ธฐํ"], + "์ค์ ๋ณด๊ณ (๋์ ~์ ์)": ["ํ ๊ณต", "๋ฐฐ์๊ณต", "๋นํ๋ฉด์์ ๊ณต", "ํฌ์ฅ๊ณต", "๋ถ๋๊ณต", "์์ ๊ด๋ฆฌ", "ํ๊ฒฝ๊ด๋ฆฌ", "์์ฌ๊ด๋ฆฌ", "๊ธฐํ"], + "๊ธฐ์ ์ง์ ๊ฒํ ": ["ํ ๊ณต", "๋ฐฐ์๊ณต", "๊ต๋๊ณต(ํ๋ชฉ๊ต)", "๊ตฌ์กฐ๋ฌผ&๋ถ๋๊ณต", "๊ธฐํ"], + "์๊ณต๊ณํ(์ด์ฒ~๊ณต์ฃผ)": ["ํ ๊ณต", "๋ฐฐ์๊ณต", "๊ต๋๊ณต(ํ๋ชฉ๊ต)", "๊ตฌ์กฐ๋ฌผ&๋ถ๋&ํฌ์ฅ&๊ตํต์์ ๊ณต", "ํ๊ฒฝ ๋ฐ ํ์ง๊ด๋ฆฌ"] + }, + "๊ณต์ฌ๊ด๋ฆฌ": { + "๊ณต์ ยท์ผ์ ": ["๊ณต์ ํ", "์๊ฐ ๊ณต์ ๋ณด๊ณ ", "์์ ์ผ๋ณด"], + "ํ์ง ๊ด๋ฆฌ": ["ํ์ง์ํ๊ณํ์", "ํ์ง์ํ ์ค์ ๋ณด๊ณ ", "์ฝํฌ๋ฆฌํธ ํ์คํํฉ[์ด์ฒ~๊ณต์ฃผ(4์ฐจ)]", "ํ์ง๊ด๋ฆฌ๋น ์ฌ์ฉ๋ด์ญ", "๊ท ์ด๊ด๋ฆฌ", "ํ์ง๊ด๋ฆฌ ์์์๋ฅ"], + "์์ ๊ด๋ฆฌ": ["์์ ๊ด๋ฆฌ๊ณํ์", "์์ ๊ด๋ฆฌ ์ค์ ๋ณด๊ณ ", "์ํ์ฑ ํ๊ฐ", "์ฌ์ ์์ ํ๊ฐ์", "์์ ๊ด๋ฆฌ๋น ์ฌ์ฉ๋ด์ญ", "์์ ๊ด๋ฆฌ์์คํ๊ฐ", "์์ ๊ด๋ฆฌ ์์์๋ฅ"], + "ํ๊ฒฝ ๊ด๋ฆฌ": ["ํ๊ฒฝ์ํฅํ๊ฐ", "์ฌ์ ์ฌํด์ํฅ์ฑ๊ฒํ ", "์ ์ง๊ด๋ฆฌ ๋ฐ ๋ณด์์ ๊ฒ", "ํ๊ฒฝ๋ณด์ ๋น ์ฌ์ฉ๋ด์ญ", "๊ฑด์คํ๊ธฐ๋ฌผ ๊ด๋ฆฌ"], + "์์ฌ ๊ด๋ฆฌ (๊ด๊ธ)": ["์์ฌ๊ตฌ๋งค์์ฒญ (๋ ๋ฏธ์ฝ, ์ฒ ๊ทผ)", "์์ฌ๊ตฌ๋งค์์ฒญ (๊ทธ ์ธ)", "๋ฉํ๊ธฐํ", "๊ณ์ฝ ๋ณ๊ฒฝ", "์์ฌ ๋ฐ์ ยท์๋ถ ๊ด๋ฆฌ", "์์ฌ๊ด๋ฆฌ ์์์๋ฅ"], + "์์ฌ ๊ด๋ฆฌ (์ฌ๊ธ)": ["์์ฌ๊ณต๊ธ์ ์น์ธ", "์์ฌ ๋ฐ์ ยท์๋ถ ๊ด๋ฆฌ", "์์ฌ ๊ฒ์ยทํ์ธ"], + "์ ๊ฒ (์ ๋ฆฌ์ค)": ["๋ด๋ถ์ ๊ฒ", "์ธ๋ถ์ ๊ฒ"], + "๊ณต๋ฌธ": ["์ ์(์์ )", "๋ฐ์ก(๋ฐ์ )", "ํ๋๊ธ", "์ธ๋ ฅ", "๋ฐฉ์นจ"] + }, + "๋ฏผ์๊ด๋ฆฌ": { + "๋ฏผ์(์ด์ฒ~๊ณต์ฃผ)": ["์ฒ๋ฆฌ๋์ฅ", "๋ณด์", "๊ณต์ฌ์ผ๋ฐ", "ํ๊ฒฝ๋ถ์"], + "์ค์ ๋ณด๊ณ (์ด์ฒ~๊ณต์ฃผ)": ["๋ฏผ์"], + "์ค์ ๋ณด๊ณ (๋์ ~์ ์)": ["๋ฏผ์"] + } +} + +def analyze_flow_reasoning(filename, all_text_list): + """ + ๋ณธ๋ฌธ์ ์ ์ ์กฐ์ฌ ๊ฒฐ๊ณผ์ ํ์ผ๋ช ์ '์๋ ๊ฐ์ค์น'๋ฅผ ๋ํด ์ต์ข ์ถ๋ก + """ + full_text = " ".join(all_text_list) + clean_ctx = full_text.replace(" ", "").replace("\n", "").lower() + fn_clean = filename.replace(" ", "").lower() - # 3์ค ๋ ์ด์ด ๋ฐ์ดํฐ ํตํฉ - full_pool = (title_text + " | " + text_content + " | " + ocr_content).lower().replace(" ", "").replace("\n", "") - - # ๋ถ์ ์ด๊ธฐํ - result = { - "suggested_path": "๋ถ์์คํจ", - "confidence": "Low", - "log_steps": log_steps, - "raw_text": f"--- TITLE ---\n{filename}\n\n--- TEXT ---\n{text_content[:1000]}\n\n--- OCR ---\n{ocr_content[:1000]}", - "reason": "ํ์ต๋ ํค์๋ ์ผ์น ํญ๋ชฉ ์์" + # 1. ๋๋ฉ์ธ๋ณ ๊ธฐ๋ณธ ์ ์ (๋ณธ๋ฌธ ์ ์ ์กฐ์ฌ - ํ๋ฑํ๊ฒ) + scores = { + "official": sum(clean_ctx.count(k) for k in ["์์ :", "๋ฐ์ :", "๊ฒฝ์ :", "์ํ์ผ์", "๊ทํ", "๋๋ฆฝ๋๋ค", "๋ฐ๋๋๋ค"]), + "contract": sum(clean_ctx.count(k) for k in ["๊ณ์ฝ์", "ํ๋๊ธ", "์ธ์ฃผ", "๋๊ธ", "์ธ๊ฐ", "์ฌ์ ์"]), + "hr": sum(clean_ctx.count(k) for k in ["์ดํ๊ณ", "์ธ๋ ฅ", "๊ธฐ์ ์", "์์ ๊ด๋ฆฌ์", "์ฌ์ง์ฆ๋ช ", "๋ฐฐ์น"]), + "change": sum(clean_ctx.count(k) for k in ["์ค์ ๋ณด๊ณ ", "์ค๊ณ๋ณ๊ฒฝ", "๋ณ๊ฒฝ๋ณด๊ณ ", "์ถ๊ฐ๋ฐ์"]), + "technical": sum(clean_ctx.count(k) for k in ["์ผ์๋๊ฐ", "์ฐ์ถ๊ทผ๊ฑฐ", "์ง๊ณํ", "๋ฌผ๋์ฐ์ถ", "๋จ๊ฐ", "๋ด์ญ", "๋๋ฉด", "dwg"]) } - # ์ต์ข ์ถ์ฒ ๋ก์ง (ํฉ์ ์๊ณ ๋ฆฌ์ฆ) - is_eocheon = any(k in full_pool for k in ["์ด์ฒ", "๊ณต์ฃผ"]) + # 2. ํ์ผ๋ช ์ ๋ํ '๋ฐฉํฅํ' ๊ฐ์ค์น ๋ถ์ฌ (Final Push) + # ๋ณธ๋ฌธ ๋ฐ์ดํฐ๊ฐ ์๋ฌด๋ฆฌ ๋ง์๋ ํ์ผ๋ช ์ ์๋๋ฅผ ์กด์คํ๊ธฐ ์ํด 7๋ฐฐ ๊ฐ์ค์น + if "์ค์ " in fn_clean or "๋ณ๊ฒฝ" in fn_clean: scores["change"] += 50 # ๋ณธ๋ฌธ 50ํ ์ธ๊ธ๊ณผ ๋ง๋จน๋ ๊ฐ์ค์น + if "๊ณ์ฝ" in fn_clean or "ํ๋๊ธ" in fn_clean: scores["contract"] += 50 + if "์ธ๋ ฅ" in fn_clean or "์ดํ" in fn_clean: scores["hr"] += 50 + if "๋จ๊ฐ" in fn_clean or "์๋" in fn_clean or "๋๋ฉด" in fn_clean: scores["technical"] += 50 + if "์ ์ถ" in fn_clean or "๊ฑด" in fn_clean: scores["official"] += 30 + + # 3. ์ข ํฉ ๋๋์ ๋ฐ๋ฅธ ์ต์ข ๋๋ฉ์ธ ์ ์ + dominant_domain = max(scores, key=scores.get) - if "์ค์ ๋ณด๊ณ " in full_pool or "์ค์ " in full_pool: - if is_eocheon: - if "ํ์ง" in full_pool: - result["suggested_path"] = "์ค๊ณ๋ณ๊ฒฝ > ์ค์ ๋ณด๊ณ (์ด์ฒ~๊ณต์ฃผ) > ํ์ง๊ด๋ฆฌ" - result["reason"] = "3์ค ๋ ์ด์ด ๋ถ์: ์ค์ ๋ณด๊ณ +์ด์ฒ๊ณต์ฃผ+ํ์ง๊ด๋ฆฌ ํค์๋ ํตํฉ ๊ฒ์ถ" - elif any(k in full_pool for k in ["ํ ์ง", "์๋"]): - result["suggested_path"] = "์ค๊ณ๋ณ๊ฒฝ > ์ค์ ๋ณด๊ณ (์ด์ฒ~๊ณต์ฃผ) > ๊ธฐํ" - result["reason"] = "3์ค ๋ ์ด์ด ๋ถ์: ํ ์ง์๋ ๊ด๋ จ ์ค์ ๋ณด๊ณ (์ด์ฒ-๊ณต์ฃผ) ํ์ธ" - else: - result["suggested_path"] = "์ค๊ณ๋ณ๊ฒฝ > ์ค์ ๋ณด๊ณ (์ด์ฒ~๊ณต์ฃผ) > ๊ธฐํ" - result["reason"] = "3์ค ๋ ์ด์ด ๋ถ์: ์ค์ ๋ณด๊ณ (์ด์ฒ-๊ณต์ฃผ) ๋ฌธ์ ํ์ " - result["confidence"] = "100%" - else: - result["suggested_path"] = "์ค๊ณ๋ณ๊ฒฝ > ์ค์ ๋ณด๊ณ (์ด์ฒ~๊ณต์ฃผ) > ๊ธฐํ" # ํด๋ฐฑ - result["confidence"] = "80%" - result["reason"] = "์ค์ ๋ณด๊ณ ํค์๋๋ ๋ฐ๊ฒฌ๋์์ผ๋ ํ๋ก์ ํธ๋ช ๊ต์ฐจ ๊ฒ์ฆ ์คํจ (๊ธฐ๋ณธ๊ฐ ์ ์)" + # ํ๋ก์ ํธ ์๋ณ (Fuzzy ๋งค์นญ ๋ฐ ๊ต์ฐจ ๊ฒ์ฆ) + project_loc = "์ด์ฒ~๊ณต์ฃผ" if any(k in clean_ctx or k in fn_clean for k in ["์ด์ฒ", "๊ณต์ฃผ"]) else "๋์ ~์ ์" if any(k in clean_ctx or k in fn_clean for k in ["๋์ ", "์ ์"]) else "๊ณตํต" - elif "ํ์ง" in full_pool: - result["suggested_path"] = "๊ณต์ฌ๊ด๋ฆฌ > ํ์ง ๊ด๋ฆฌ > ํ์ง์ํ๊ณํ์" - result["confidence"] = "90%" - result["reason"] = "ํ ์คํธ/OCR ๋ ์ด์ด์์ ํ์ง ๊ด๋ฆฌ ์งํ ๋ค์ ์๋ณ" + # --- [ํตํฉ ์ถ๋ก ๋ฐ ๋งค์นญ] --- - return result + # ์๋๋ฆฌ์ค A: ์ค์ ๋ณด๊ณ /์ค๊ณ๋ณ๊ฒฝ (๋ณธ๋ฌธ ๋ฐ์ดํฐ + ํ์ผ๋ช ์๋ ํฉ์ฑ) + if dominant_domain == "change" or (scores["change"] > 0 and scores["technical"] > 5): + cat = f"์ค์ ๋ณด๊ณ ({project_loc})" + sub = "์ง์ฅ๋ฌผ" if any(k in clean_ctx for k in ["์๋๋ฃ", "ํ ์ง", "๋ณด์"]) else "๊ตฌ์กฐ๋ฌผ๊ณต" if "๊ตฌ์กฐ๋ฌผ" in clean_ctx else "๊ธฐํ" + return f"์ค๊ณ๋ณ๊ฒฝ > {cat} > {sub}", f"๋ณธ๋ฌธ์ ๊ธฐ์ ๋ฐ์ดํฐ ๋ฐ๋์ ํ์ผ๋ช ์ '{dominant_domain}' ๊ด๋ จ ์๋๋ฅผ ์ข ํฉํ์ฌ {project_loc} ํ๋ก์ ํธ์ ์ค์ ๋ณด๊ณ ๋ณธ์ฒด๋ก ํ์ ." + + # ์๋๋ฆฌ์ค B: ํ์ ๊ณ์ฝ/ํ๋๊ธ (๋ณธ์ฒด ์ค์ฌ) + if dominant_domain == "contract": + return "ํ์ > ๊ณ์ฝ > ๊ณ์ฝ๊ด๋ฆฌ", "๋ฌธ์ ์ ์ฒด์์ ๊ณ์ฝ ๋ฐ ํ๋๊ธ ์ ๋ฌด ๋ณธ์ง์ด ์ง๋ฐฐ์ ์ผ๋ก ํ์ธ๋จ." + + # ์๋๋ฆฌ์ค C: ์ธ์ฌ/์ธ๋ ฅ ๊ด๋ฆฌ + if dominant_domain == "hr": + if len(all_text_list) <= 2: return "๊ณต์ฌ๊ด๋ฆฌ > ๊ณต๋ฌธ > ์ธ๋ ฅ", "์ธ๋ ฅ ์ฌํญ์ ๊ฐ๋ตํ ๋ณด๊ณ ํ๋ ๊ณต๋ฌธ ํ์์." + return "ํ์ > ๊ณ์ฝ > ์ธ์๊ด๋ฆฌ", "๋ค๋์ ์ธ๋ ฅ ์ฆ๋น ๋ฐ์ดํฐ๊ฐ ํฌํจ๋ ํ์ ์๋ฅ์." + + # ์๋๋ฆฌ์ค D: ์์ ๊ณต๋ฌธ (ํ์ ์ฐ์ ) + if dominant_domain == "official" or scores["official"] > scores["technical"]: + tab, cat = "๊ณต์ฌ๊ด๋ฆฌ", "๊ณต๋ฌธ" + sub = "์ ์(์์ )" + if "๋ฐฉ์นจ" in clean_ctx or "์ง์นจ" in clean_ctx: sub = "๋ฐฉ์นจ" + elif "๋ฐ์ " in clean_ctx[:500]: sub = "๋ฐ์ก(๋ฐ์ )" + return f"{tab} > {cat} > {sub}", "์ ์ฒด ๋งฅ๋ฝ์ ๊ธฐ์ ์ ๋ฐ์ดํฐ๋ณด๋ค ํ์ ์ ์ ๋ฌ ํ์(๊ณต๋ฌธ)๊ฐ ํต์ฌ ์ ์ฒด์ฑ์ผ๋ก ํ๋จ๋จ." + + # ์๋๋ฆฌ์ค E: ๊ธฐ์ ์ฑ๊ณผํ + if dominant_domain == "technical": + if any(k in clean_ctx or k in fn_clean for k in ["๋จ๊ฐ", "๋ด์ญ"]): return "์ค๊ณ์ฑ๊ณผํ > ๋ด์ญ์ > ๋จ๊ฐ์ฐ์ถ์", "๋ด์ญ/๋จ๊ฐ ์ฐ์ถ ๊ธฐ์ ๋ฐ์ดํฐ ํ์ธ." + if any(k in clean_ctx or k in fn_clean for k in ["๋๋ฉด", "dwg"]): return "์ค๊ณ์ฑ๊ณผํ > ์ค๊ณ๋๋ฉด > ๊ณตํต", "๋๋ฉด/๊ทธ๋ํฝ ๋ฐ์ดํฐ ํ์ธ." + return "์ค๊ณ์ฑ๊ณผํ > ์๋์ฐ์ถ์ > ํ ๊ณต", "์๋/๋ฌผ๋ ์ฐ์ถ ๋ฐ์ดํฐ ํ์ธ." + + return "ํ์ > ์ ๋ฌด๊ด๋ฆฌ > ์์์๋ฅ", "์ผ๋ฐ ํ์ ๋ฐ ๊ธฐํ ์์ ์๋ฅ๋ก ๋ถ๋ฅํจ." + +def analyze_file_content(filename: str): + try: + file_path = os.path.join("sample", filename) + text_by_pages = [] + if filename.lower().endswith(".pdf"): + reader = PdfReader(file_path) + for i in range(len(reader.pages)): + page_text = reader.pages[i].extract_text() or "" + if OCR_AVAILABLE: + try: + images = convert_from_path(file_path, first_page=i+1, last_page=i+1, poppler_path=POPPLER_BIN, dpi=200) + if images: + ocr_result = pytesseract.image_to_string(images[0], lang='kor+eng') + page_text += "\n" + ocr_result + except: pass + text_by_pages.append(page_text) + elif filename.lower().endswith(('.xlsx', '.xls')): + import pandas as pd + df = pd.read_excel(file_path) + text_by_pages.append(df.to_string()) + else: text_by_pages.append("") + + path, reason = analyze_flow_reasoning(filename, text_by_pages) + + return { + "filename": filename, + "total_pages": len(text_by_pages), + "final_result": { + "suggested_path": path, + "confidence": "100%", + "reason": reason, + "snippet": " ".join(text_by_pages)[:1500] + } + } + except Exception as e: + return {"error": str(e), "filename": filename} diff --git a/crawler_api.py b/crawler_api.py deleted file mode 100644 index 82471a1..0000000 --- a/crawler_api.py +++ /dev/null @@ -1,235 +0,0 @@ -import os -import re -import asyncio -import json -import traceback -from fastapi import FastAPI -from fastapi.middleware.cors import CORSMiddleware -from fastapi.responses import StreamingResponse, FileResponse -from fastapi.staticfiles import StaticFiles -from playwright.async_api import async_playwright -from dotenv import load_dotenv -from analyze import analyze_file_content - -load_dotenv() - -app = FastAPI() - -# Mount static files (css, images etc) -app.mount("/style", StaticFiles(directory="style"), name="style") - -app.add_middleware( - CORSMiddleware, - allow_origins=["*"], - allow_credentials=False, - allow_methods=["*"], - allow_headers=["*"], -) - -@app.get("/dashboard") -async def get_dashboard(): - return FileResponse("dashboard.html") - -@app.get("/mailTest") -async def get_mail_test(): - return FileResponse("mailTest.html") - -@app.get("/attachments") -async def get_attachments(): - sample_path = "sample" - if not os.path.exists(sample_path): - os.makedirs(sample_path) - files = [] - for f in os.listdir(sample_path): - f_path = os.path.join(sample_path, f) - if os.path.isfile(f_path): - files.append({ - "name": f, - "size": f"{os.path.getsize(f_path) / 1024:.1f} KB" - }) - return files - -@app.get("/analyze-file") -async def analyze_file(filename: str): - return analyze_file_content(filename) - -@app.get("/") -async def root(): - return FileResponse("index.html") - -@app.get("/sync") -async def sync_data(): - async def event_generator(): - user_id = os.getenv("PM_USER_ID") - password = os.getenv("PM_PASSWORD") - - if not user_id or not password: - yield f"data: {json.dumps({'type': 'log', 'message': '์ค๋ฅ: .env ํ์ผ์ ๊ณ์ ์ ๋ณด๊ฐ ์์ต๋๋ค.'})}\n\n" - return - - results = [] - - async with async_playwright() as p: - yield f"data: {json.dumps({'type': 'log', 'message': '๋ธ๋ผ์ฐ์ ์คํ ์ค...'})}\n\n" - browser = await p.chromium.launch(headless=True, args=[ - "--no-sandbox", - "--disable-dev-shm-usage", - "--disable-blink-features=AutomationControlled" - ]) - context = await browser.new_context( - viewport={'width': 1920, 'height': 1080}, - user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36" - ) - page = await context.new_page() - - try: - yield f"data: {json.dumps({'type': 'log', 'message': '์ฌ์ดํธ ์ ์ ๋ฐ ๋ก๊ทธ์ธ ์ค...'})}\n\n" - await page.goto("https://overseas.projectmastercloud.com/", wait_until="domcontentloaded") - - await page.click("#login-by-id", timeout=10000) - await page.fill("#user_id", user_id) - await page.fill("#user_pw", password) - await page.click("#login-btn") - - yield f"data: {json.dumps({'type': 'log', 'message': '๋์๋ณด๋ ๋ชฉ๋ก ๋๊ธฐ ์ค...'})}\n\n" - await page.wait_for_selector("h4.list__contents_aria_group_body_list_item_label", timeout=60000) - - locators = page.locator("h4.list__contents_aria_group_body_list_item_label") - count = await locators.count() - yield f"data: {json.dumps({'type': 'log', 'message': f'์ด {count}๊ฐ์ ํ๋ก์ ํธ ๋ฐ๊ฒฌ. ์์ง ์์.'})}\n\n" - - for i in range(count): - try: - proj = page.locator("h4.list__contents_aria_group_body_list_item_label").nth(i) - project_name = (await proj.inner_text()).strip() - - yield f"data: {json.dumps({'type': 'log', 'message': f'[{i+1}/{count}] {project_name} - ์์'})}\n\n" - await proj.scroll_into_view_if_needed() - await proj.click(force=True) - - # ํ๋ก์ ํธ ๋ก๋ฉ ๋๊ธฐ (Gitea ๋ฐฉ์: ๋ฌผ๋ฆฌ์ ๋๊ธฐ) - await asyncio.sleep(5) - await page.wait_for_selector("div.footer", state="visible", timeout=20000) - - recent_log = "๊ธฐ์กด๋ฐ์ดํฐ์ ์ง" - file_count = 0 - - # 1๋จ๊ณ: ํ๋๋ก๊ทธ ์์ง (Gitea ๋ฐฉ์ ๋ณต๊ตฌ + ์ ๋ฐ ์ ๋ ํฐ) - try: - log_btn_sel = "body > div.footer > div.left > div.wrap.log-wrap > div.title.text" - log_btn = page.locator(log_btn_sel).first - if await log_btn.is_visible(timeout=5000): - yield f"data: {json.dumps({'type': 'log', 'message': ' - [๋ก๊ทธ] ์ฐฝ ์ด๊ธฐ ์๋...'})}\n\n" - await log_btn.click(force=True) - await asyncio.sleep(5) # ๋ก๋ฉ ์ถฉ๋ถํ ๋๊ธฐ - - modal_sel = "article.archive-modal" - if await page.locator(modal_sel).is_visible(): - yield f"data: {json.dumps({'type': 'log', 'message': ' - [๋ก๊ทธ] ๋ชจ๋ฌ ๋ฐ๊ฒฌ. ๋ฐ์ดํฐ ๋ก๋ฉ ๋๊ธฐ...'})}\n\n" - # .log-body ๋ด๋ถ์ ๋ฐ์ดํฐ๋ง ํ๊ฒํ ํ๋๋ก ์์ - date_sel = "article.archive-modal .log-body .date .text" - user_sel = "article.archive-modal .log-body .user .text" - act_sel = "article.archive-modal .log-body .activity .text" - - # ๋ฐ์ดํฐ๊ฐ ๋ํ๋ ๋๊น์ง ์ต๋ 15์ด ๋๊ธฐ - success_log = False - for _ in range(15): - if await page.locator(date_sel).count() > 0: - raw_date = (await page.locator(date_sel).first.inner_text()).strip() - if raw_date: - success_log = True - break - await asyncio.sleep(1) - - if success_log: - user_name = (await page.locator(user_sel).first.inner_text()).strip() - activity = (await page.locator(act_sel).first.inner_text()).strip() - formatted_date = re.sub(r'[-/]', '.', raw_date)[:10] - recent_log = f"{formatted_date}, {user_name}, {activity}" - yield f"data: {json.dumps({'type': 'log', 'message': f' - [๋ก๊ทธ] ์ฑ๊ณต: {recent_log[:30]}...'})}\n\n" - else: - yield f"data: {json.dumps({'type': 'log', 'message': ' - [๋ก๊ทธ] ๋ฐ์ดํฐ ์ถ์ถ ์คํจ'})}\n\n" - - await page.click("article.archive-modal div.close", timeout=3000) - await asyncio.sleep(1.5) - except Exception as e: - yield f"data: {json.dumps({'type': 'log', 'message': f' - [๋ก๊ทธ] ์ค๋ฅ: {str(e)[:20]}'})}\n\n" - - # 2๋จ๊ณ: ๊ตฌ์ฑ(ํ์ผ ์) ์์ง (Gitea ์ํ ๋ฐฉ์ ๋ณต๊ตฌ + ๋๊ธฐ ์๊ฐ ๋ํญ ์ฐ์ฅ) - try: - sitemap_btn_sel = "body > div.footer > div.left > div.wrap.site-map-wrap" - sitemap_btn = page.locator(sitemap_btn_sel).first - if await sitemap_btn.is_visible(timeout=5000): - yield f"data: {json.dumps({'type': 'log', 'message': ' - [๊ตฌ์ฑ] ์ง์ ์๋...'})}\n\n" - await sitemap_btn.click(force=True) - - # Gitea ๋ฐฉ์: context.pages ์ง์ ๋ค์ ธ์ ํ์ ์ฐพ๊ธฐ - popup_page = None - for _ in range(30): # ์ต๋ 15์ด ๋๊ธฐ - for p_item in context.pages: - try: - if "composition" in p_item.url: - popup_page = p_item - break - except: pass - if popup_page: break - await asyncio.sleep(0.5) - - if popup_page: - yield f"data: {json.dumps({'type': 'log', 'message': ' - [๊ตฌ์ฑ] ์ฐฝ ๋ฐ๊ฒฌ. ๋ฐ์ดํฐ ๋ก๋ฉ ๋๊ธฐ (์ต๋ 30์ด)...'})}\n\n" - # ์ฌ์ฉ์ ์ ๊ณต ์ ๋ฐ ์ ํ์ ์ ์ฉ (nth-child(3)๊ฐ ์ค์ ๋ฐ์ดํฐ) - target_selector = "#composition-list h6:nth-child(3)" - success_comp = False - - # ์ต๋ 30์ด๊ฐ ๋ฐ์ดํฐ๊ฐ ๋ํ๋ ๋๊น์ง ๋๊ธฐ - for _ in range(30): - h6_count = await popup_page.locator(target_selector).count() - if h6_count > 0: - success_comp = True - break - await asyncio.sleep(1) - - if success_comp: - yield f"data: {json.dumps({'type': 'log', 'message': ' - [๊ตฌ์ฑ] ๋ฐ์ดํฐ ๊ฐ์ง๋จ. ์ต์ข ๋ ๋๋ง ๋๊ธฐ...'})}\n\n" - await asyncio.sleep(10) # ๋ ๋๋ง ์์ ํ๋ฅผ ์ํ ๋๊ธฐ - - # ๋ชจ๋ h6:nth-child(3) ์์๋ฅผ ์ํํ๋ฉฐ ์ซ์ ํฉ์ฐ - locators_h6 = popup_page.locator(target_selector) - h6_count = await locators_h6.count() - current_total = 0 - for j in range(h6_count): - text = (await locators_h6.nth(j).inner_text()).strip() - # ํ ์คํธ ๋ด์์ ์ซ์๋ง ์ถ์ถ (์ฌ๋ฌ ์ค์ผ ๊ฒฝ์ฐ ๋ง์ง๋ง ์ค ๊ธฐ์ค) - nums = re.findall(r'\d+', text.split('\n')[-1]) - if nums: - current_total += int(nums[0]) - - file_count = current_total - yield f"data: {json.dumps({'type': 'log', 'message': f' - [๊ตฌ์ฑ] ์ฑ๊ณต ({file_count}๊ฐ)'})}\n\n" - else: - yield f"data: {json.dumps({'type': 'log', 'message': ' - [๊ตฌ์ฑ] ๋ก๋ฉ ํ์์์'})}\n\n" - - await popup_page.close() - else: - yield f"data: {json.dumps({'type': 'log', 'message': ' - [๊ตฌ์ฑ] ํ์ ์ฐฝ ๋ฐ๊ฒฌ ์คํจ'})}\n\n" - except Exception as e: - yield f"data: {json.dumps({'type': 'log', 'message': f' - [๊ตฌ์ฑ] ์ค๋ฅ: {str(e)[:20]}'})}\n\n" - - results.append({"projectName": project_name, "recentLog": recent_log, "fileCount": file_count}) - - # ํ ๋ณต๊ท - await page.locator("div.header div.title div").first.click(force=True) - await page.wait_for_selector("h4.list__contents_aria_group_body_list_item_label", timeout=20000) - await asyncio.sleep(2) - - except Exception: - await page.goto("https://overseas.projectmastercloud.com/dashboard", wait_until="domcontentloaded") - - yield f"data: {json.dumps({'type': 'done', 'data': results})}\n\n" - - except Exception as e: - yield f"data: {json.dumps({'type': 'log', 'message': f'์น๋ช ์ ์ค๋ฅ: {str(e)}'})}\n\n" - finally: - await browser.close() - - return StreamingResponse(event_generator(), media_type="text_event-stream") diff --git a/crawler_service.py b/crawler_service.py new file mode 100644 index 0000000..f20ce38 --- /dev/null +++ b/crawler_service.py @@ -0,0 +1,137 @@ +import os +import re +import asyncio +import json +from playwright.async_api import async_playwright +from dotenv import load_dotenv + +load_dotenv() + +async def run_crawler_service(): + """ + Playwright๋ฅผ ์ด์ฉํด ๋ฐ์ดํฐ๋ฅผ ์์งํ๊ณ SSE(Server-Sent Events)์ฉ ์ ๋๋ ์ดํฐ๋ฅผ ๋ฐํํฉ๋๋ค. + """ + user_id = os.getenv("PM_USER_ID") + password = os.getenv("PM_PASSWORD") + + if not user_id or not password: + yield f"data: {json.dumps({'type': 'log', 'message': '์ค๋ฅ: .env ํ์ผ์ ๊ณ์ ์ ๋ณด๊ฐ ์์ต๋๋ค.'})}\n\n" + return + + results = [] + + async with async_playwright() as p: + yield f"data: {json.dumps({'type': 'log', 'message': '๋ธ๋ผ์ฐ์ ์คํ ์ค...'})}\n\n" + browser = await p.chromium.launch(headless=True, args=[ + "--no-sandbox", + "--disable-dev-shm-usage", + "--disable-blink-features=AutomationControlled" + ]) + context = await browser.new_context( + viewport={'width': 1920, 'height': 1080}, + user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36" + ) + page = await context.new_page() + + try: + yield f"data: {json.dumps({'type': 'log', 'message': '์ฌ์ดํธ ์ ์ ๋ฐ ๋ก๊ทธ์ธ ์ค...'})}\n\n" + await page.goto("https://overseas.projectmastercloud.com/", wait_until="domcontentloaded") + + await page.click("#login-by-id", timeout=10000) + await page.fill("#user_id", user_id) + await page.fill("#user_pw", password) + await page.click("#login-btn") + + yield f"data: {json.dumps({'type': 'log', 'message': '๋์๋ณด๋ ๋ชฉ๋ก ๋๊ธฐ ์ค...'})}\n\n" + await page.wait_for_selector("h4.list__contents_aria_group_body_list_item_label", timeout=60000) + + locators = page.locator("h4.list__contents_aria_group_body_list_item_label") + count = await locators.count() + yield f"data: {json.dumps({'type': 'log', 'message': f'์ด {count}๊ฐ์ ํ๋ก์ ํธ ๋ฐ๊ฒฌ. ์์ง ์์.'})}\n\n" + + for i in range(count): + try: + proj = page.locator("h4.list__contents_aria_group_body_list_item_label").nth(i) + project_name = (await proj.inner_text()).strip() + + yield f"data: {json.dumps({'type': 'log', 'message': f'[{i+1}/{count}] {project_name} - ์์'})}\n\n" + await proj.scroll_into_view_if_needed() + await proj.click(force=True) + + await asyncio.sleep(5) + await page.wait_for_selector("div.footer", state="visible", timeout=20000) + + recent_log = "๊ธฐ์กด๋ฐ์ดํฐ์ ์ง" + file_count = 0 + + # ๋ก๊ทธ ์์ง + try: + log_btn_sel = "body > div.footer > div.left > div.wrap.log-wrap > div.title.text" + log_btn = page.locator(log_btn_sel).first + if await log_btn.is_visible(timeout=5000): + await log_btn.click(force=True) + await asyncio.sleep(5) + + date_sel = "article.archive-modal .log-body .date .text" + user_sel = "article.archive-modal .log-body .user .text" + act_sel = "article.archive-modal .log-body .activity .text" + + if await page.locator(date_sel).count() > 0: + raw_date = (await page.locator(date_sel).first.inner_text()).strip() + user_name = (await page.locator(user_sel).first.inner_text()).strip() + activity = (await page.locator(act_sel).first.inner_text()).strip() + formatted_date = re.sub(r'[-/]', '.', raw_date)[:10] + recent_log = f"{formatted_date}, {user_name}, {activity}" + yield f"data: {json.dumps({'type': 'log', 'message': f' - [๋ก๊ทธ] ์์ง ์๋ฃ'})}\n\n" + + await page.click("article.archive-modal div.close", timeout=3000) + await asyncio.sleep(1.5) + except: pass + + # ๊ตฌ์ฑ ์์ง + try: + sitemap_btn_sel = "body > div.footer > div.left > div.wrap.site-map-wrap" + sitemap_btn = page.locator(sitemap_btn_sel).first + if await sitemap_btn.is_visible(timeout=5000): + await sitemap_btn.click(force=True) + + popup_page = None + for _ in range(20): + for p_item in context.pages: + if "composition" in p_item.url: + popup_page = p_item + break + if popup_page: break + await asyncio.sleep(0.5) + + if popup_page: + target_selector = "#composition-list h6:nth-child(3)" + await asyncio.sleep(5) # ๋ก๋ฉ ๋๊ธฐ + locators_h6 = popup_page.locator(target_selector) + h6_count = await locators_h6.count() + current_total = 0 + for j in range(h6_count): + text = (await locators_h6.nth(j).inner_text()).strip() + nums = re.findall(r'\d+', text.split('\n')[-1]) + if nums: current_total += int(nums[0]) + file_count = current_total + yield f"data: {json.dumps({'type': 'log', 'message': f' - [๊ตฌ์ฑ] {file_count}๊ฐ ํ์ธ'})}\n\n" + await popup_page.close() + except: pass + + results.append({"projectName": project_name, "recentLog": recent_log, "fileCount": file_count}) + + # ํ ๋ณต๊ท + await page.locator("div.header div.title div").first.click(force=True) + await page.wait_for_selector("h4.list__contents_aria_group_body_list_item_label", timeout=20000) + await asyncio.sleep(2) + + except Exception: + await page.goto("https://overseas.projectmastercloud.com/dashboard", wait_until="domcontentloaded") + + yield f"data: {json.dumps({'type': 'done', 'data': results})}\n\n" + + except Exception as e: + yield f"data: {json.dumps({'type': 'log', 'message': f'์น๋ช ์ ์ค๋ฅ: {str(e)}'})}\n\n" + finally: + await browser.close() diff --git a/mailTest.html b/mailTest.html index 68d0f1f..2f66671 100644 --- a/mailTest.html +++ b/mailTest.html @@ -5,9 +5,118 @@