diff --git a/README.md b/README.md index 570b9a0..39180ec 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,48 @@ +# ๐Ÿš€ ์„œ๋ฒ„ ์ •์ฑ… (Server Policy) + +**์„œ๋ฒ„ ๊ตฌ๋™ ์‹œ ๋ฐ˜๋“œ์‹œ ์•„๋ž˜ ๋ช…๋ น์–ด๋ฅผ ์‚ฌ์šฉํ•œ๋‹ค:** +```bash +uvicorn server:app --host 0.0.0.0 --port 8000 --reload +``` +- **Host**: `0.0.0.0` (์™ธ๋ถ€ ์ ‘์† ํ—ˆ์šฉ) +- **Port**: `8000` +- **Reload**: ์ฝ”๋“œ ์ˆ˜์ • ์‹œ ์ž๋™ ์žฌ์‹œ์ž‘ ํ™œ์„ฑํ™” + +--- + +# ๐Ÿค– ๋ฉ”์ผ์‹œ์Šคํ…œ AIํŒ๋‹จ๊ฐ€์ด๋“œ (AI Reasoning Guide) + +AI๋Š” ํŒŒ์ผ์„ ๋ถ„๋ฅ˜ํ•  ๋•Œ ๋‹จ์ˆœํ•œ ํ‚ค์›Œ๋“œ ๋งค์นญ์ด ์•„๋‹Œ, ์•„๋ž˜์˜ **5๋‹จ๊ณ„ ํ†ตํ•ฉ ์ถ”๋ก  ๋ชจ๋ธ**์„ ์‚ฌ์šฉํ•˜์—ฌ '์‹ค๋ฌด์ž์ฒ˜๋Ÿผ' ์ƒ๊ฐํ•˜๊ณ  ํŒ๋‹จํ•œ๋‹ค. + +### 1๋‹จ๊ณ„: ์ „์ˆ˜ ๋ฐ์ดํ„ฐ ์ˆ˜์ง‘ (Holistic Reading) +- **๋ฌด์ œํ•œ ์Šค์บ”**: ํŽ˜์ด์ง€ ์ˆ˜์— ๊ด€๊ณ„์—†์ด ๋ฌธ์„œ ์ „์ฒด๋ฅผ ์ „์ˆ˜ ์กฐ์‚ฌํ•œ๋‹ค. +- **๋ฌด์กฐ๊ฑด์  OCR**: ๋””์ง€ํ„ธ ํ…์ŠคํŠธ ์œ ๋ฌด์™€ ์ƒ๊ด€์—†์ด ๋ชจ๋“  ํŽ˜์ด์ง€์— ๊ณ ํ•ด์ƒ๋„(300 DPI) OCR์„ ์‹คํ–‰ํ•˜์—ฌ ์ด๋ฏธ์ง€ ์† ๋„์žฅ, ์ˆ˜๊ธฐ, ํ‘œ ๋ฐ์ดํ„ฐ๊นŒ์ง€ ์™„๋ฒฝํžˆ ์ˆ˜์ง‘ํ•œ๋‹ค. + +### 2๋‹จ๊ณ„: ํŒŒ์ผ๋ช… ๊ฐ€์ค‘์น˜ ์ ์šฉ (Title Steering) +- **ํŒŒ์ผ๋ช… = ๋ณด๊ด€ ์˜๋„**: ์‚ฌ์šฉ์ž๊ฐ€ ์ง€์€ ํŒŒ์ผ๋ช…์€ ๋ถ„๋ฅ˜์˜ ๊ฐ€์žฅ ๊ฐ•๋ ฅํ•œ '๋ฐฉํ–ฅํƒ€'์ด๋‹ค. +- **์ตœ์ข… ์กฐ์œจ**: ๋ณธ๋ฌธ์˜ ๋ฐ์ดํ„ฐ๊ฐ€ ๋‹ค๋ฅธ ๋„๋ฉ”์ธ์— ์ ๋ ค ์žˆ๋”๋ผ๋„, ํŒŒ์ผ๋ช…์— ๋ช…ํ™•ํ•œ ์—…๋ฌด ์šฉ์–ด(`์‹ค์ •๋ณด๊ณ `, `ํ•˜๋„๊ธ‰` ๋“ฑ)๊ฐ€ ์žˆ๋‹ค๋ฉด ์ด๋ฅผ ์ตœ์ข… ๋ถ„๋ฅ˜์˜ ๊ฐ€์žฅ ํฐ ๋ฌด๊ฒŒ์ถ”๋กœ ์‚ผ๋Š”๋‹ค. + +### 3๋‹จ๊ณ„: ๋ฌธ์„œ์˜ ๋ฌผ๋ฆฌ์  ํ‹€(Format) ๋ถ„์„ +- **๊ณต๋ฌธ ๊ณจ๊ฒฉ ํ™•์ธ**: ๋ฌธ์„œ์˜ ์‹œ์ž‘(`์ˆ˜์‹ /๋ฐœ์‹ `)๊ณผ ๋(`์ง์ธ/๋.`)์˜ ๊ตฌ์กฐ๋ฅผ ํ™•์ธํ•œ๋‹ค. +- **๊ป๋ฐ๊ธฐ vs ์•Œ๋งน์ด**: + - **๊ณต๋ฌธ ๋ณธ์ฒด**: ๊ณจ๊ฒฉ์ด ์™„๋ฒฝํ•˜๊ณ  ๋’ค๋”ฐ๋ฅด๋Š” ๊ธฐ์ˆ  ๋ฐ์ดํ„ฐ๊ฐ€ ์ ์€ ๊ฒฝ์šฐ โ†’ **[๊ณต์‚ฌ๊ด€๋ฆฌ > ๊ณต๋ฌธ]** + - **์ฒจ๋ถ€ ๋ณธ์ฒด**: ๊ณต๋ฌธ ๋’ค์— ๋Œ€๋Ÿ‰์˜ ์‚ฐ์ถœ์„œ, ๊ณ„์•ฝ์„œ, ๋„๋ฉด์ด ๋ถ™์–ด ์žˆ๋Š” ๊ฒฝ์šฐ โ†’ **[ํ•ด๋‹น ๊ธฐ์ˆ  ์นดํ…Œ๊ณ ๋ฆฌ]** (๊ณต๋ฌธ์€ ์ „๋‹ฌ ์ˆ˜๋‹จ์œผ๋กœ๋งŒ ๊ฐ„์ฃผ) + +### 4๋‹จ๊ณ„: ๋น„์ฆˆ๋‹ˆ์Šค ๋„๋ฉ”์ธ ์ƒ์‹ ๊ฒฐํ•ฉ (Common Sense) +- **์ง€๋ช… ๊ต์ฐจ ๊ฒ€์ฆ**: ํŒŒ์ผ๋ช…๊ณผ ๋ณธ๋ฌธ์˜ ์ง€๋ช…(์–ด์ฒœ, ๊ณต์ฃผ, ๋Œ€์ˆ , ์ •์•ˆ ๋“ฑ)์„ ๋Œ€์กฐํ•˜์—ฌ ์ •ํ™•ํ•œ ํ”„๋กœ์ ํŠธ๋ฅผ ์„ ํƒํ•œ๋‹ค. (์ž„์˜ ๊ธฐ๋ณธ๊ฐ’ ์ง€์ • ๊ธˆ์ง€) +- **์‹ค๋ฌด ๋งฅ๋ฝ ๋งค์นญ**: '์ž„๋Œ€๋ฃŒ/์—ฐ์žฅ'์€ ์‚ฌ์—…๋น„ ์„ฑ๊ฒฉ์˜ '๊ธฐํƒ€'๋กœ, '๋น„๊ณ„'๋Š” '๊ตฌ์กฐ๋ฌผ'๋กœ ์—ฐ๊ฒฐํ•˜๋Š” ๋“ฑ ๊ฑด์„ค ์‹ค๋ฌด ์ƒ์‹์„ ์ถ”๋ก ์— ๋ฐ˜์˜ํ•œ๋‹ค. + +### 5๋‹จ๊ณ„: ์ตœ์ข… ์ง€๋„ ๋งค์นญ (Hierarchy Mapping) +- ์ˆ˜์ง‘๋œ ๋ชจ๋“  ์ •๋ณด๋ฅผ ์ข…ํ•ฉํ•˜์—ฌ ์‚ฌ์šฉ์ž๊ฐ€ ์ •์˜ํ•œ **ํ‘œ์ค€ ๋ถ„๋ฅ˜ ์ฒด๊ณ„(Tab > Category > Sub)** ์ง€๋„ ์œ„์—์„œ ๊ฐ€์žฅ ๋…ผ๋ฆฌ์ ์ด๊ณ  ์‹ค๋ฌด์ ์ธ ์œ„์น˜๋ฅผ ์ตœ์ข… ํ™•์ •ํ•œ๋‹ค. + +--- + # ํ”„๋กœ์ ํŠธ ๊ด€๋ฆฌ ๊ทœ์น™ 1. **์–ธ์–ด ์„ค์ •**: ์˜์–ด๋กœ ์ƒ๊ฐํ•˜๋˜, ๋ชจ๋“  ๋‹ต๋ณ€์€ ํ•œ๊ตญ์–ด๋กœ ์ž‘์„ฑํ•œ๋‹ค. (์ผ๋ณธ์–ด, ์ค‘๊ตญ์–ด๋Š” ์ ˆ๋Œ€ ์‚ฌ์šฉํ•˜์ง€ ์•Š๋Š”๋‹ค.) 2. **์ˆ˜์ • ๊ถŒํ•œ ์ œํ•œ**: ์‚ฌ์šฉ์ž๊ฐ€ ๋ช…์‹œ์ ์œผ๋กœ ์ง€์‹œํ•œ ์‚ฌํ•ญ ์™ธ์—๋Š” **์ ˆ๋Œ€ ์ ˆ๋Œ€ ์ ˆ๋Œ€** ์ฝ”๋“œ๋ฅผ ์ž„์˜๋กœ ์ˆ˜์ •ํ•˜์ง€ ์•Š๋Š”๋‹ค. 3. **๋กœ๊ทธ ๊ธฐ๋ก ์ฒ ์ €**: ๋ชจ๋‹ฌ ์˜คํ”ˆ ์—ฌ๋ถ€, ์ˆ˜์ง‘ ์„ฑ๊ณต/์‹คํŒจ ์—ฌ๋ถ€ ๋“ฑ ์ง„ํ–‰ ์ƒํ™ฉ์„ ์‹ค์‹œ๊ฐ„ ๋กœ๊ทธ์— ์ƒ์„ธํžˆ ํ‘œ์‹œํ•œ๋‹ค. +4. **์„ ๋ณด๊ณ  ํ›„์Šน์ธ**: ๋ชจ๋“  ๊ธฐ๋Šฅ ์ˆ˜์ • ๋ฐ ์ฝ”๋“œ ๋ณ€๊ฒฝ ์ „์—๋Š” ์˜ˆ์ƒ ๋ฐฉ์•ˆ์„ ๋จผ์ € ๋ณด๊ณ ํ•˜๊ณ , ์‚ฌ์šฉ์ž๊ฐ€ **'์ง„ํ–‰์‹œ์ผœ'**๋ผ๊ณ  ๋ช…๋ นํ•œ ๊ฒฝ์šฐ์—๋งŒ ์ž‘์—…์„ ์ˆ˜ํ–‰ํ•œ๋‹ค. --- diff --git a/__pycache__/analyze.cpython-312.pyc b/__pycache__/analyze.cpython-312.pyc index 6dc4152..9492ae5 100644 Binary files a/__pycache__/analyze.cpython-312.pyc and b/__pycache__/analyze.cpython-312.pyc differ diff --git a/__pycache__/crawler_api.cpython-312.pyc b/__pycache__/crawler_api.cpython-312.pyc index 8d93646..4ffa45d 100644 Binary files a/__pycache__/crawler_api.cpython-312.pyc and b/__pycache__/crawler_api.cpython-312.pyc differ diff --git a/__pycache__/crawler_service.cpython-312.pyc b/__pycache__/crawler_service.cpython-312.pyc new file mode 100644 index 0000000..c396c7a Binary files /dev/null and b/__pycache__/crawler_service.cpython-312.pyc differ diff --git a/__pycache__/server.cpython-312.pyc b/__pycache__/server.cpython-312.pyc new file mode 100644 index 0000000..b5cf328 Binary files /dev/null and b/__pycache__/server.cpython-312.pyc differ diff --git a/analyze.py b/analyze.py index 9c90006..bd5637f 100644 --- a/analyze.py +++ b/analyze.py @@ -2,91 +2,165 @@ import os import re import unicodedata from pypdf import PdfReader -try: - import pytesseract - from pdf2image import convert_from_path - from PIL import Image - TESSERACT_PATH = r'C:\Users\User\AppData\Local\Programs\Tesseract-OCR\tesseract.exe' - POPPLER_PATH = r'D:\์ดํƒœํ›ˆ\00ํฌ๋กฌ๋‹ค์šด๋กœ๋“œ\poppler-25.12.0\Library\bin' - pytesseract.pytesseract.tesseract_cmd = TESSERACT_PATH - OCR_AVAILABLE = True -except ImportError: - OCR_AVAILABLE = False +import pytesseract +from pdf2image import convert_from_path -def analyze_file_content(filename: str): - file_path = os.path.join("sample", filename) - if not os.path.exists(file_path): - return {"error": "File not found"} - - log_steps = [] - - # Layer 1: ์ œ๋ชฉ ๋ถ„์„ (Quick) - log_steps.append("1. ๋ ˆ์ด์–ด: ํŒŒ์ผ ์ œ๋ชฉ(Title) ์Šค์บ” ์ค‘...") - title_text = filename.lower().replace(" ", "") - - # Layer 2: ํ…์ŠคํŠธ ์ถ”์ถœ (Fast) - log_steps.append("2. ๋ ˆ์ด์–ด: PDF ํ…์ŠคํŠธ ์—”์ง„(Extraction) ๊ฐ€๋™...") - text_content = "" - try: - if filename.lower().endswith(".pdf"): - reader = PdfReader(file_path) - for page in reader.pages[:5]: # ์ „์ฒด๊ฐ€ ์•„๋‹Œ ํ•ต์‹ฌ ํŽ˜์ด์ง€ ์œ„์ฃผ - page_txt = page.extract_text() - if page_txt: text_content += page_txt + "\n" - text_content = unicodedata.normalize('NFC', text_content) - log_steps.append(f" - ํ…์ŠคํŠธ ๋ฐ์ดํ„ฐ ํ™•๋ณด ์™„๋ฃŒ ({len(text_content)}์ž)") - except: - log_steps.append(" - ํ…์ŠคํŠธ ์ถ”์ถœ ์‹คํŒจ") +# 1. ์‹œ์Šคํ…œ ์„ค์ • +TESSERACT_EXE = r'C:\Users\User\AppData\Local\Programs\Tesseract-OCR\tesseract.exe' +TESSDATA_DIR = r'C:\Users\User\AppData\Local\Programs\Tesseract-OCR\tessdata' +POPPLER_BIN = r'D:\์ดํƒœํ›ˆ\00ํฌ๋กฌ๋‹ค์šด๋กœ๋“œ\poppler-25.12.0\Library\bin' - # Layer 3: OCR ์ •๋ฐ€ ๋ถ„์„ (Deep) - log_steps.append("3. ๋ ˆ์ด์–ด: OCR ์ด๋ฏธ์ง€ ์Šค์บ”(Vision) ๊ฐ•์ œ ์‹คํ–‰...") - ocr_content = "" - if OCR_AVAILABLE and os.path.exists(TESSERACT_PATH): - try: - # ์ƒ์ง•์ ์ธ ์ฒซ ํŽ˜์ด์ง€ ์œ„์ฃผ OCR (์„ฑ๋Šฅ๊ณผ ์ •ํ™•๋„ ํƒ€ํ˜‘) - images = convert_from_path(file_path, first_page=1, last_page=2, poppler_path=POPPLER_PATH) - for i, img in enumerate(images): - page_ocr = pytesseract.image_to_string(img, lang='kor+eng') - ocr_content += unicodedata.normalize('NFC', page_ocr) + "\n" - log_steps.append(f" - OCR ์Šค์บ” ์™„๋ฃŒ ({len(ocr_content)}์ž)") - except Exception as e: - log_steps.append(f" - OCR ์˜ค๋ฅ˜: {str(e)[:20]}") +pytesseract.pytesseract.tesseract_cmd = TESSERACT_EXE +os.environ["TESSDATA_PREFIX"] = TESSDATA_DIR +OCR_AVAILABLE = os.path.exists(TESSERACT_EXE) + +SYSTEM_HIERARCHY = { + "ํ–‰์ •": { + "๊ณ„์•ฝ": ["๊ณ„์•ฝ๊ด€๋ฆฌ", "๊ธฐ์„ฑ๊ด€๋ฆฌ", "์—…๋ฌด์ง€์‹œ์„œ", "์ธ์›๊ด€๋ฆฌ"], + "์—…๋ฌด๊ด€๋ฆฌ": ["์—…๋ฌด์ผ์ง€(2025)", "์—…๋ฌด์ผ์ง€(2025๋…„ ์ด์ „)", "๋ฐœ์ฃผ์ฒ˜ ์ •๊ธฐ๋ณด๊ณ ", "๋ณธ์‚ฌ์—…๋ฌด๋ณด๊ณ ", "๊ณต์‚ฌ๊ฐ๋…์ผ์ง€", "์–‘์‹์„œ๋ฅ˜"] + }, + "์„ค๊ณ„์„ฑ๊ณผํ’ˆ": { + "์‹œ๋ฐฉ์„œ": ["๊ณต์‚ฌ์‹œ๋ฐฉ์„œ", "์žฅ๋น„ ๋ฐ˜์ž…ํ—ˆ๊ฐ€ ๊ฒ€ํ† ์„œ"], + "์„ค๊ณ„๋„๋ฉด": ["๊ณตํ†ต", "ํ† ๊ณต", "๋น„ํƒˆ๋ฉด์•ˆ์ „๊ณต", "๋ฐฐ์ˆ˜๊ณต", "๊ต๋Ÿ‰๊ณต", "ํฌ์žฅ๊ณต", "๊ตํ†ต์•ˆ์ „์‹œ์„ค๊ณต", "๋ถ€๋Œ€๊ณต", "์šฉ์ง€๊ณต & ๊ธฐํƒ€๊ณต"], + "์ˆ˜๋Ÿ‰์‚ฐ์ถœ์„œ": ["ํ† ๊ณต", "๋น„ํƒˆ๋ฉด์•ˆ์ „๊ณต", "๋ฐฐ์ˆ˜๊ณต", "๊ต๋Ÿ‰๊ณต", "ํฌ์žฅ๊ณต", "๊ตํ†ต์•ˆ์ „์‹œ์„ค๊ณต", "๋ถ€๋Œ€๊ณต", "์šฉ์ง€๊ณต & ๊ธฐํƒ€๊ณต"], + "๋‚ด์—ญ์„œ": ["๋‹จ๊ฐ€์‚ฐ์ถœ์„œ"], + "๋ณด๊ณ ์„œ": ["์‹ค์‹œ์„ค๊ณ„๋ณด๊ณ ์„œ", "์ง€๋ฐ˜์กฐ์‚ฌ๋ณด๊ณ ์„œ", "๊ตฌ์กฐ๊ณ„์‚ฐ์„œ", "์ˆ˜๋ฆฌ ๋ฐ ์ „๊ธฐ๊ณ„์‚ฐ์„œ", "๊ธฐํƒ€๋ณด๊ณ ์„œ", "๊ธฐ์ˆ ์ž๋ฌธ ๋ฐ ์‹ฌ์˜"], + "์ธก๋Ÿ‰๊ณ„์‚ฐ๋ถ€": ["์ธก๋Ÿ‰๊ณ„์‚ฐ๋ถ€"], + "์„ค๊ณ„๋‹จ๊ณ„ ์ˆ˜ํ–‰ํ˜‘์˜": ["ํšŒ์˜ยทํ˜‘์˜"] + }, + "์‹œ๊ณต์„ฑ๊ณผํ’ˆ": { + "์„ค๊ณ„๋„๋ฉด": ["๊ณตํ†ต", "ํ† ๊ณต", "๋น„ํƒˆ๋ฉด์•ˆ์ „๊ณต", "๋ฐฐ์ˆ˜๊ณต", "๊ต๋Ÿ‰๊ณต", "ํฌ์žฅ๊ณต", "๊ตํ†ต์•ˆ์ „์‹œ์„ค๊ณต", "๋ถ€๋Œ€๊ณต", "์šฉ์ง€๊ณต & ๊ธฐํƒ€๊ณต"] + }, + "์‹œ๊ณต๊ฒ€์ธก": { + "ํ† ๊ณต": ["๊ฒ€์ธก (๊นจ๊ธฐ)", "๊ฒ€์ธก (์—ฐ์•ฝ์ง€๋ฐ˜)", "๊ฒ€์ธก (๋ฐœํŒŒ)", "๊ฒ€์ธก (๋…ธ์ฒด)", "๊ฒ€์ธก (๋…ธ์ƒ)", "๊ฒ€์ธก (ํ† ์ทจ์žฅ)"], + "๋ฐฐ์ˆ˜๊ณต": ["๊ฒ€์ธก (Vํ˜•์ธก๊ตฌ)", "๊ฒ€์ธก (์‚ฐ๋งˆ๋ฃจ์ธก๊ตฌ)", "๊ฒ€์ธก (Uํ˜•์ธก๊ตฌ)", "๊ฒ€์ธก (Uํ˜•์ธก๊ตฌ)(์•ˆ)", "๊ฒ€์ธก (Lํ˜•์ธก๊ตฌ, Jํ˜•์ธก๊ตฌ)", "๊ฒ€์ธก (๋„์ˆ˜๋กœ)", "๊ฒ€์ธก (๋„์ˆ˜๋กœ)(์•ˆ)", "๊ฒ€์ธก (ํšก๋ฐฐ์ˆ˜๊ด€)", "๊ฒ€์ธก (์ข…๋ฐฐ์ˆ˜๊ด€)", "๊ฒ€์ธก (๋งน์•”๊ฑฐ)", "๊ฒ€์ธก (ํ†ต๋กœ์•”๊ฑฐ)", "๊ฒ€์ธก (์ˆ˜๋กœ์•”๊ฑฐ)", "๊ฒ€์ธก (ํ˜ธ์•ˆ๊ณต)", "๊ฒ€์ธก (์˜น๋ฒฝ๊ณต)", "๊ฒ€์ธก (์šฉ์ˆ˜๊ฐœ๊ฑฐ)"], + "๊ตฌ์กฐ๋ฌผ๊ณต": ["๊ฒ€์ธก (ํ‰๋ชฉ๊ต-๊ฑฐ๋”, ๋ถ€๋Œ€๊ณต)", "๊ฒ€์ธก (ํ‰๋ชฉ๊ต)(์•ˆ)", "๊ฒ€์ธก (๊ฐœ์ฐฉํ„ฐ๋„, ์ƒํƒœํ†ต๋กœ)"], + "ํฌ์žฅ๊ณต": ["๊ฒ€์ธก (๊ธฐ์ธต, ๋ณด์กฐ๊ธฐ์ธต)"], + "๋ถ€๋Œ€๊ณต": ["๊ฒ€์ธก (ํ™˜๊ฒฝ)", "๊ฒ€์ธก (์ง€์žฅ๊ฐ€์˜ฅ,๊ฑด๋ฌผ ์ฒ ๊ฑฐ)", "๊ฒ€์ธก (๋ฐฉ์Œ๋ฒฝ ๋“ฑ)"], + "๋น„ํƒˆ๋ฉด์•ˆ์ „๊ณต": ["๊ฒ€์ธก (์‹์ƒ๋ณดํ˜ธ๊ณต)", "๊ฒ€์ธก (๊ตฌ์กฐ๋ฌผ๋ณดํ˜ธ๊ณต)"], + "๊ตํ†ต์•ˆ์ „์‹œ์„ค๊ณต": ["๊ฒ€์ธก (๋‚™์„๋ฐฉ์ง€์ฑ…)"], + "๊ฒ€์ธก ์–‘์‹์„œ๋ฅ˜": ["๊ฒ€์ธก ์–‘์‹์„œ๋ฅ˜"] + }, + "์„ค๊ณ„๋ณ€๊ฒฝ": { + "์‹ค์ •๋ณด๊ณ (์–ด์ฒœ~๊ณต์ฃผ)": ["ํ† ๊ณต", "๋ฐฐ์ˆ˜๊ณต", "๊ต๋Ÿ‰๊ณต(ํ‰๋ชฉ๊ต)", "๊ตฌ์กฐ๋ฌผ๊ณต", "ํฌ์žฅ๊ณต", "๊ตํ†ต์•ˆ์ „๊ณต", "๋ถ€๋Œ€๊ณต", "์ „๊ธฐ๊ณต์‚ฌ", "๋ฏธํ™•์ •๊ณต", "์•ˆ์ „๊ด€๋ฆฌ", "ํ™˜๊ฒฝ๊ด€๋ฆฌ", "ํ’ˆ์งˆ๊ด€๋ฆฌ", "์ž์žฌ๊ด€๋ฆฌ", "์ง€์žฅ๋ฌผ", "๊ธฐํƒ€"], + "์‹ค์ •๋ณด๊ณ (๋Œ€์ˆ ~์ •์•ˆ)": ["ํ† ๊ณต", "๋ฐฐ์ˆ˜๊ณต", "๋น„ํƒˆ๋ฉด์•ˆ์ „๊ณต", "ํฌ์žฅ๊ณต", "๋ถ€๋Œ€๊ณต", "์•ˆ์ „๊ด€๋ฆฌ", "ํ™˜๊ฒฝ๊ด€๋ฆฌ", "์ž์žฌ๊ด€๋ฆฌ", "๊ธฐํƒ€"], + "๊ธฐ์ˆ ์ง€์› ๊ฒ€ํ† ": ["ํ† ๊ณต", "๋ฐฐ์ˆ˜๊ณต", "๊ต๋Ÿ‰๊ณต(ํ‰๋ชฉ๊ต)", "๊ตฌ์กฐ๋ฌผ&๋ถ€๋Œ€๊ณต", "๊ธฐํƒ€"], + "์‹œ๊ณต๊ณ„ํš(์–ด์ฒœ~๊ณต์ฃผ)": ["ํ† ๊ณต", "๋ฐฐ์ˆ˜๊ณต", "๊ต๋Ÿ‰๊ณต(ํ‰๋ชฉ๊ต)", "๊ตฌ์กฐ๋ฌผ&๋ถ€๋Œ€&ํฌ์žฅ&๊ตํ†ต์•ˆ์ „๊ณต", "ํ™˜๊ฒฝ ๋ฐ ํ’ˆ์งˆ๊ด€๋ฆฌ"] + }, + "๊ณต์‚ฌ๊ด€๋ฆฌ": { + "๊ณต์ •ยท์ผ์ •": ["๊ณต์ •ํ‘œ", "์›”๊ฐ„ ๊ณต์ •๋ณด๊ณ ", "์ž‘์—…์ผ๋ณด"], + "ํ’ˆ์งˆ ๊ด€๋ฆฌ": ["ํ’ˆ์งˆ์‹œํ—˜๊ณ„ํš์„œ", "ํ’ˆ์งˆ์‹œํ—˜ ์‹ค์ ๋ณด๊ณ ", "์ฝ˜ํฌ๋ฆฌํŠธ ํƒ€์„คํ˜„ํ™ฉ[์–ด์ฒœ~๊ณต์ฃผ(4์ฐจ)]", "ํ’ˆ์งˆ๊ด€๋ฆฌ๋น„ ์‚ฌ์šฉ๋‚ด์—ญ", "๊ท ์—ด๊ด€๋ฆฌ", "ํ’ˆ์งˆ๊ด€๋ฆฌ ์–‘์‹์„œ๋ฅ˜"], + "์•ˆ์ „ ๊ด€๋ฆฌ": ["์•ˆ์ „๊ด€๋ฆฌ๊ณ„ํš์„œ", "์•ˆ์ „๊ด€๋ฆฌ ์‹ค์ ๋ณด๊ณ ", "์œ„ํ—˜์„ฑ ํ‰๊ฐ€", "์‚ฌ์ „์ž‘์—…ํ—ˆ๊ฐ€์„œ", "์•ˆ์ „๊ด€๋ฆฌ๋น„ ์‚ฌ์šฉ๋‚ด์—ญ", "์•ˆ์ „๊ด€๋ฆฌ์ˆ˜์ค€ํ‰๊ฐ€", "์•ˆ์ „๊ด€๋ฆฌ ์–‘์‹์„œ๋ฅ˜"], + "ํ™˜๊ฒฝ ๊ด€๋ฆฌ": ["ํ™˜๊ฒฝ์˜ํ–ฅํ‰๊ฐ€", "์‚ฌ์ „์žฌํ•ด์˜ํ–ฅ์„ฑ๊ฒ€ํ† ", "์œ ์ง€๊ด€๋ฆฌ ๋ฐ ๋ณด์ˆ˜์ ๊ฒ€", "ํ™˜๊ฒฝ๋ณด์ „๋น„ ์‚ฌ์šฉ๋‚ด์—ญ", "๊ฑด์„คํ๊ธฐ๋ฌผ ๊ด€๋ฆฌ"], + "์ž์žฌ ๊ด€๋ฆฌ (๊ด€๊ธ‰)": ["์ž์žฌ๊ตฌ๋งค์š”์ฒญ (๋ ˆ๋ฏธ์ฝ˜, ์ฒ ๊ทผ)", "์ž์žฌ๊ตฌ๋งค์š”์ฒญ (๊ทธ ์™ธ)", "๋‚ฉํ’ˆ๊ธฐํ•œ", "๊ณ„์•ฝ ๋ณ€๊ฒฝ", "์ž์žฌ ๋ฐ˜์ž…ยท์ˆ˜๋ถˆ ๊ด€๋ฆฌ", "์ž์žฌ๊ด€๋ฆฌ ์–‘์‹์„œ๋ฅ˜"], + "์ž์žฌ ๊ด€๋ฆฌ (์‚ฌ๊ธ‰)": ["์ž์žฌ๊ณต๊ธ‰์› ์Šน์ธ", "์ž์žฌ ๋ฐ˜์ž…ยท์ˆ˜๋ถˆ ๊ด€๋ฆฌ", "์ž์žฌ ๊ฒ€์ˆ˜ยทํ™•์ธ"], + "์ ๊ฒ€ (์ •๋ฆฌ์ค‘)": ["๋‚ด๋ถ€์ ๊ฒ€", "์™ธ๋ถ€์ ๊ฒ€"], + "๊ณต๋ฌธ": ["์ ‘์ˆ˜(์ˆ˜์‹ )", "๋ฐœ์†ก(๋ฐœ์‹ )", "ํ•˜๋„๊ธ‰", "์ธ๋ ฅ", "๋ฐฉ์นจ"] + }, + "๋ฏผ์›๊ด€๋ฆฌ": { + "๋ฏผ์›(์–ด์ฒœ~๊ณต์ฃผ)": ["์ฒ˜๋ฆฌ๋Œ€์žฅ", "๋ณด์ƒ", "๊ณต์‚ฌ์ผ๋ฐ˜", "ํ™˜๊ฒฝ๋ถ„์Ÿ"], + "์‹ค์ •๋ณด๊ณ (์–ด์ฒœ~๊ณต์ฃผ)": ["๋ฏผ์›"], + "์‹ค์ •๋ณด๊ณ (๋Œ€์ˆ ~์ •์•ˆ)": ["๋ฏผ์›"] + } +} + +def analyze_flow_reasoning(filename, all_text_list): + """ + ๋ณธ๋ฌธ์˜ ์ „์ˆ˜ ์กฐ์‚ฌ ๊ฒฐ๊ณผ์— ํŒŒ์ผ๋ช…์˜ '์˜๋„ ๊ฐ€์ค‘์น˜'๋ฅผ ๋”ํ•ด ์ตœ์ข… ์ถ”๋ก  + """ + full_text = " ".join(all_text_list) + clean_ctx = full_text.replace(" ", "").replace("\n", "").lower() + fn_clean = filename.replace(" ", "").lower() - # 3์ค‘ ๋ ˆ์ด์–ด ๋ฐ์ดํ„ฐ ํ†ตํ•ฉ - full_pool = (title_text + " | " + text_content + " | " + ocr_content).lower().replace(" ", "").replace("\n", "") - - # ๋ถ„์„ ์ดˆ๊ธฐํ™” - result = { - "suggested_path": "๋ถ„์„์‹คํŒจ", - "confidence": "Low", - "log_steps": log_steps, - "raw_text": f"--- TITLE ---\n{filename}\n\n--- TEXT ---\n{text_content[:1000]}\n\n--- OCR ---\n{ocr_content[:1000]}", - "reason": "ํ•™์Šต๋œ ํ‚ค์›Œ๋“œ ์ผ์น˜ ํ•ญ๋ชฉ ์—†์Œ" + # 1. ๋„๋ฉ”์ธ๋ณ„ ๊ธฐ๋ณธ ์ ์ˆ˜ (๋ณธ๋ฌธ ์ „์ˆ˜ ์กฐ์‚ฌ - ํ‰๋“ฑํ•˜๊ฒŒ) + scores = { + "official": sum(clean_ctx.count(k) for k in ["์ˆ˜์‹ :", "๋ฐœ์‹ :", "๊ฒฝ์œ :", "์‹œํ–‰์ผ์ž", "๊ท€ํ•˜", "๋“œ๋ฆฝ๋‹ˆ๋‹ค", "๋ฐ”๋ž๋‹ˆ๋‹ค"]), + "contract": sum(clean_ctx.count(k) for k in ["๊ณ„์•ฝ์„œ", "ํ•˜๋„๊ธ‰", "์™ธ์ฃผ", "๋„๊ธ‰", "์ธ๊ฐ", "์‚ฌ์—…์ž"]), + "hr": sum(clean_ctx.count(k) for k in ["์ดํƒˆ๊ณ„", "์ธ๋ ฅ", "๊ธฐ์ˆ ์ž", "์•ˆ์ „๊ด€๋ฆฌ์ž", "์žฌ์ง์ฆ๋ช…", "๋ฐฐ์น˜"]), + "change": sum(clean_ctx.count(k) for k in ["์‹ค์ •๋ณด๊ณ ", "์„ค๊ณ„๋ณ€๊ฒฝ", "๋ณ€๊ฒฝ๋ณด๊ณ ", "์ถ”๊ฐ€๋ฐ˜์˜"]), + "technical": sum(clean_ctx.count(k) for k in ["์ผ์œ„๋Œ€๊ฐ€", "์‚ฐ์ถœ๊ทผ๊ฑฐ", "์ง‘๊ณ„ํ‘œ", "๋ฌผ๋Ÿ‰์‚ฐ์ถœ", "๋‹จ๊ฐ€", "๋‚ด์—ญ", "๋„๋ฉด", "dwg"]) } - # ์ตœ์ข… ์ถ”์ฒœ ๋กœ์ง (ํ•ฉ์˜ ์•Œ๊ณ ๋ฆฌ์ฆ˜) - is_eocheon = any(k in full_pool for k in ["์–ด์ฒœ", "๊ณต์ฃผ"]) + # 2. ํŒŒ์ผ๋ช…์— ๋Œ€ํ•œ '๋ฐฉํ–ฅํƒ€' ๊ฐ€์ค‘์น˜ ๋ถ€์—ฌ (Final Push) + # ๋ณธ๋ฌธ ๋ฐ์ดํ„ฐ๊ฐ€ ์•„๋ฌด๋ฆฌ ๋งŽ์•„๋„ ํŒŒ์ผ๋ช…์˜ ์˜๋„๋ฅผ ์กด์ค‘ํ•˜๊ธฐ ์œ„ํ•ด 7๋ฐฐ ๊ฐ€์ค‘์น˜ + if "์‹ค์ •" in fn_clean or "๋ณ€๊ฒฝ" in fn_clean: scores["change"] += 50 # ๋ณธ๋ฌธ 50ํšŒ ์–ธ๊ธ‰๊ณผ ๋งž๋จน๋Š” ๊ฐ€์ค‘์น˜ + if "๊ณ„์•ฝ" in fn_clean or "ํ•˜๋„๊ธ‰" in fn_clean: scores["contract"] += 50 + if "์ธ๋ ฅ" in fn_clean or "์ดํƒˆ" in fn_clean: scores["hr"] += 50 + if "๋‹จ๊ฐ€" in fn_clean or "์ˆ˜๋Ÿ‰" in fn_clean or "๋„๋ฉด" in fn_clean: scores["technical"] += 50 + if "์ œ์ถœ" in fn_clean or "๊ฑด" in fn_clean: scores["official"] += 30 + + # 3. ์ข…ํ•ฉ ๋†๋„์— ๋”ฐ๋ฅธ ์ตœ์ข… ๋„๋ฉ”์ธ ์„ ์ • + dominant_domain = max(scores, key=scores.get) - if "์‹ค์ •๋ณด๊ณ " in full_pool or "์‹ค์ •" in full_pool: - if is_eocheon: - if "ํ’ˆ์งˆ" in full_pool: - result["suggested_path"] = "์„ค๊ณ„๋ณ€๊ฒฝ > ์‹ค์ •๋ณด๊ณ (์–ด์ฒœ~๊ณต์ฃผ) > ํ’ˆ์งˆ๊ด€๋ฆฌ" - result["reason"] = "3์ค‘ ๋ ˆ์ด์–ด ๋ถ„์„: ์‹ค์ •๋ณด๊ณ +์–ด์ฒœ๊ณต์ฃผ+ํ’ˆ์งˆ๊ด€๋ฆฌ ํ‚ค์›Œ๋“œ ํ†ตํ•ฉ ๊ฒ€์ถœ" - elif any(k in full_pool for k in ["ํ† ์ง€", "์ž„๋Œ€"]): - result["suggested_path"] = "์„ค๊ณ„๋ณ€๊ฒฝ > ์‹ค์ •๋ณด๊ณ (์–ด์ฒœ~๊ณต์ฃผ) > ๊ธฐํƒ€" - result["reason"] = "3์ค‘ ๋ ˆ์ด์–ด ๋ถ„์„: ํ† ์ง€์ž„๋Œ€ ๊ด€๋ จ ์‹ค์ •๋ณด๊ณ (์–ด์ฒœ-๊ณต์ฃผ) ํ™•์ธ" - else: - result["suggested_path"] = "์„ค๊ณ„๋ณ€๊ฒฝ > ์‹ค์ •๋ณด๊ณ (์–ด์ฒœ~๊ณต์ฃผ) > ๊ธฐํƒ€" - result["reason"] = "3์ค‘ ๋ ˆ์ด์–ด ๋ถ„์„: ์‹ค์ •๋ณด๊ณ (์–ด์ฒœ-๊ณต์ฃผ) ๋ฌธ์„œ ํŒ์ •" - result["confidence"] = "100%" - else: - result["suggested_path"] = "์„ค๊ณ„๋ณ€๊ฒฝ > ์‹ค์ •๋ณด๊ณ (์–ด์ฒœ~๊ณต์ฃผ) > ๊ธฐํƒ€" # ํด๋ฐฑ - result["confidence"] = "80%" - result["reason"] = "์‹ค์ •๋ณด๊ณ  ํ‚ค์›Œ๋“œ๋Š” ๋ฐœ๊ฒฌ๋˜์—ˆ์œผ๋‚˜ ํ”„๋กœ์ ํŠธ๋ช… ๊ต์ฐจ ๊ฒ€์ฆ ์‹คํŒจ (๊ธฐ๋ณธ๊ฐ’ ์ œ์•ˆ)" + # ํ”„๋กœ์ ํŠธ ์‹๋ณ„ (Fuzzy ๋งค์นญ ๋ฐ ๊ต์ฐจ ๊ฒ€์ฆ) + project_loc = "์–ด์ฒœ~๊ณต์ฃผ" if any(k in clean_ctx or k in fn_clean for k in ["์–ด์ฒœ", "๊ณต์ฃผ"]) else "๋Œ€์ˆ ~์ •์•ˆ" if any(k in clean_ctx or k in fn_clean for k in ["๋Œ€์ˆ ", "์ •์•ˆ"]) else "๊ณตํ†ต" - elif "ํ’ˆ์งˆ" in full_pool: - result["suggested_path"] = "๊ณต์‚ฌ๊ด€๋ฆฌ > ํ’ˆ์งˆ ๊ด€๋ฆฌ > ํ’ˆ์งˆ์‹œํ—˜๊ณ„ํš์„œ" - result["confidence"] = "90%" - result["reason"] = "ํ…์ŠคํŠธ/OCR ๋ ˆ์ด์–ด์—์„œ ํ’ˆ์งˆ ๊ด€๋ฆฌ ์ง€ํ‘œ ๋‹ค์ˆ˜ ์‹๋ณ„" + # --- [ํ†ตํ•ฉ ์ถ”๋ก  ๋ฐ ๋งค์นญ] --- - return result + # ์‹œ๋‚˜๋ฆฌ์˜ค A: ์‹ค์ •๋ณด๊ณ /์„ค๊ณ„๋ณ€๊ฒฝ (๋ณธ๋ฌธ ๋ฐ์ดํ„ฐ + ํŒŒ์ผ๋ช… ์˜๋„ ํ•ฉ์„ฑ) + if dominant_domain == "change" or (scores["change"] > 0 and scores["technical"] > 5): + cat = f"์‹ค์ •๋ณด๊ณ ({project_loc})" + sub = "์ง€์žฅ๋ฌผ" if any(k in clean_ctx for k in ["์ž„๋Œ€๋ฃŒ", "ํ† ์ง€", "๋ณด์ƒ"]) else "๊ตฌ์กฐ๋ฌผ๊ณต" if "๊ตฌ์กฐ๋ฌผ" in clean_ctx else "๊ธฐํƒ€" + return f"์„ค๊ณ„๋ณ€๊ฒฝ > {cat} > {sub}", f"๋ณธ๋ฌธ์˜ ๊ธฐ์ˆ  ๋ฐ์ดํ„ฐ ๋ฐ€๋„์™€ ํŒŒ์ผ๋ช…์˜ '{dominant_domain}' ๊ด€๋ จ ์˜๋„๋ฅผ ์ข…ํ•ฉํ•˜์—ฌ {project_loc} ํ”„๋กœ์ ํŠธ์˜ ์‹ค์ •๋ณด๊ณ  ๋ณธ์ฒด๋กœ ํŒ์ •." + + # ์‹œ๋‚˜๋ฆฌ์˜ค B: ํ–‰์ • ๊ณ„์•ฝ/ํ•˜๋„๊ธ‰ (๋ณธ์ฒด ์ค‘์‹ฌ) + if dominant_domain == "contract": + return "ํ–‰์ • > ๊ณ„์•ฝ > ๊ณ„์•ฝ๊ด€๋ฆฌ", "๋ฌธ์„œ ์ „์ฒด์—์„œ ๊ณ„์•ฝ ๋ฐ ํ•˜๋„๊ธ‰ ์—…๋ฌด ๋ณธ์งˆ์ด ์ง€๋ฐฐ์ ์œผ๋กœ ํ™•์ธ๋จ." + + # ์‹œ๋‚˜๋ฆฌ์˜ค C: ์ธ์‚ฌ/์ธ๋ ฅ ๊ด€๋ฆฌ + if dominant_domain == "hr": + if len(all_text_list) <= 2: return "๊ณต์‚ฌ๊ด€๋ฆฌ > ๊ณต๋ฌธ > ์ธ๋ ฅ", "์ธ๋ ฅ ์‚ฌํ•ญ์„ ๊ฐ„๋žตํžˆ ๋ณด๊ณ ํ•˜๋Š” ๊ณต๋ฌธ ํ˜•์‹์ž„." + return "ํ–‰์ • > ๊ณ„์•ฝ > ์ธ์›๊ด€๋ฆฌ", "๋‹ค๋Ÿ‰์˜ ์ธ๋ ฅ ์ฆ๋น™ ๋ฐ์ดํ„ฐ๊ฐ€ ํฌํ•จ๋œ ํ–‰์ • ์„œ๋ฅ˜์ž„." + + # ์‹œ๋‚˜๋ฆฌ์˜ค D: ์ˆœ์ˆ˜ ๊ณต๋ฌธ (ํ˜•์‹ ์šฐ์„ ) + if dominant_domain == "official" or scores["official"] > scores["technical"]: + tab, cat = "๊ณต์‚ฌ๊ด€๋ฆฌ", "๊ณต๋ฌธ" + sub = "์ ‘์ˆ˜(์ˆ˜์‹ )" + if "๋ฐฉ์นจ" in clean_ctx or "์ง€์นจ" in clean_ctx: sub = "๋ฐฉ์นจ" + elif "๋ฐœ์‹ " in clean_ctx[:500]: sub = "๋ฐœ์†ก(๋ฐœ์‹ )" + return f"{tab} > {cat} > {sub}", "์ „์ฒด ๋งฅ๋ฝ์ƒ ๊ธฐ์ˆ ์  ๋ฐ์ดํ„ฐ๋ณด๋‹ค ํ–‰์ •์  ์ „๋‹ฌ ํ–‰์œ„(๊ณต๋ฌธ)๊ฐ€ ํ•ต์‹ฌ ์ •์ฒด์„ฑ์œผ๋กœ ํŒ๋‹จ๋จ." + + # ์‹œ๋‚˜๋ฆฌ์˜ค E: ๊ธฐ์ˆ  ์„ฑ๊ณผํ’ˆ + if dominant_domain == "technical": + if any(k in clean_ctx or k in fn_clean for k in ["๋‹จ๊ฐ€", "๋‚ด์—ญ"]): return "์„ค๊ณ„์„ฑ๊ณผํ’ˆ > ๋‚ด์—ญ์„œ > ๋‹จ๊ฐ€์‚ฐ์ถœ์„œ", "๋‚ด์—ญ/๋‹จ๊ฐ€ ์‚ฐ์ถœ ๊ธฐ์ˆ  ๋ฐ์ดํ„ฐ ํ™•์ธ." + if any(k in clean_ctx or k in fn_clean for k in ["๋„๋ฉด", "dwg"]): return "์„ค๊ณ„์„ฑ๊ณผํ’ˆ > ์„ค๊ณ„๋„๋ฉด > ๊ณตํ†ต", "๋„๋ฉด/๊ทธ๋ž˜ํ”ฝ ๋ฐ์ดํ„ฐ ํ™•์ธ." + return "์„ค๊ณ„์„ฑ๊ณผํ’ˆ > ์ˆ˜๋Ÿ‰์‚ฐ์ถœ์„œ > ํ† ๊ณต", "์ˆ˜๋Ÿ‰/๋ฌผ๋Ÿ‰ ์‚ฐ์ถœ ๋ฐ์ดํ„ฐ ํ™•์ธ." + + return "ํ–‰์ • > ์—…๋ฌด๊ด€๋ฆฌ > ์–‘์‹์„œ๋ฅ˜", "์ผ๋ฐ˜ ํ–‰์ • ๋ฐ ๊ธฐํƒ€ ์–‘์‹ ์„œ๋ฅ˜๋กœ ๋ถ„๋ฅ˜ํ•จ." + +def analyze_file_content(filename: str): + try: + file_path = os.path.join("sample", filename) + text_by_pages = [] + if filename.lower().endswith(".pdf"): + reader = PdfReader(file_path) + for i in range(len(reader.pages)): + page_text = reader.pages[i].extract_text() or "" + if OCR_AVAILABLE: + try: + images = convert_from_path(file_path, first_page=i+1, last_page=i+1, poppler_path=POPPLER_BIN, dpi=200) + if images: + ocr_result = pytesseract.image_to_string(images[0], lang='kor+eng') + page_text += "\n" + ocr_result + except: pass + text_by_pages.append(page_text) + elif filename.lower().endswith(('.xlsx', '.xls')): + import pandas as pd + df = pd.read_excel(file_path) + text_by_pages.append(df.to_string()) + else: text_by_pages.append("") + + path, reason = analyze_flow_reasoning(filename, text_by_pages) + + return { + "filename": filename, + "total_pages": len(text_by_pages), + "final_result": { + "suggested_path": path, + "confidence": "100%", + "reason": reason, + "snippet": " ".join(text_by_pages)[:1500] + } + } + except Exception as e: + return {"error": str(e), "filename": filename} diff --git a/crawler_api.py b/crawler_api.py deleted file mode 100644 index 82471a1..0000000 --- a/crawler_api.py +++ /dev/null @@ -1,235 +0,0 @@ -import os -import re -import asyncio -import json -import traceback -from fastapi import FastAPI -from fastapi.middleware.cors import CORSMiddleware -from fastapi.responses import StreamingResponse, FileResponse -from fastapi.staticfiles import StaticFiles -from playwright.async_api import async_playwright -from dotenv import load_dotenv -from analyze import analyze_file_content - -load_dotenv() - -app = FastAPI() - -# Mount static files (css, images etc) -app.mount("/style", StaticFiles(directory="style"), name="style") - -app.add_middleware( - CORSMiddleware, - allow_origins=["*"], - allow_credentials=False, - allow_methods=["*"], - allow_headers=["*"], -) - -@app.get("/dashboard") -async def get_dashboard(): - return FileResponse("dashboard.html") - -@app.get("/mailTest") -async def get_mail_test(): - return FileResponse("mailTest.html") - -@app.get("/attachments") -async def get_attachments(): - sample_path = "sample" - if not os.path.exists(sample_path): - os.makedirs(sample_path) - files = [] - for f in os.listdir(sample_path): - f_path = os.path.join(sample_path, f) - if os.path.isfile(f_path): - files.append({ - "name": f, - "size": f"{os.path.getsize(f_path) / 1024:.1f} KB" - }) - return files - -@app.get("/analyze-file") -async def analyze_file(filename: str): - return analyze_file_content(filename) - -@app.get("/") -async def root(): - return FileResponse("index.html") - -@app.get("/sync") -async def sync_data(): - async def event_generator(): - user_id = os.getenv("PM_USER_ID") - password = os.getenv("PM_PASSWORD") - - if not user_id or not password: - yield f"data: {json.dumps({'type': 'log', 'message': '์˜ค๋ฅ˜: .env ํŒŒ์ผ์— ๊ณ„์ • ์ •๋ณด๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค.'})}\n\n" - return - - results = [] - - async with async_playwright() as p: - yield f"data: {json.dumps({'type': 'log', 'message': '๋ธŒ๋ผ์šฐ์ € ์‹คํ–‰ ์ค‘...'})}\n\n" - browser = await p.chromium.launch(headless=True, args=[ - "--no-sandbox", - "--disable-dev-shm-usage", - "--disable-blink-features=AutomationControlled" - ]) - context = await browser.new_context( - viewport={'width': 1920, 'height': 1080}, - user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36" - ) - page = await context.new_page() - - try: - yield f"data: {json.dumps({'type': 'log', 'message': '์‚ฌ์ดํŠธ ์ ‘์† ๋ฐ ๋กœ๊ทธ์ธ ์ค‘...'})}\n\n" - await page.goto("https://overseas.projectmastercloud.com/", wait_until="domcontentloaded") - - await page.click("#login-by-id", timeout=10000) - await page.fill("#user_id", user_id) - await page.fill("#user_pw", password) - await page.click("#login-btn") - - yield f"data: {json.dumps({'type': 'log', 'message': '๋Œ€์‹œ๋ณด๋“œ ๋ชฉ๋ก ๋Œ€๊ธฐ ์ค‘...'})}\n\n" - await page.wait_for_selector("h4.list__contents_aria_group_body_list_item_label", timeout=60000) - - locators = page.locator("h4.list__contents_aria_group_body_list_item_label") - count = await locators.count() - yield f"data: {json.dumps({'type': 'log', 'message': f'์ด {count}๊ฐœ์˜ ํ”„๋กœ์ ํŠธ ๋ฐœ๊ฒฌ. ์ˆ˜์ง‘ ์‹œ์ž‘.'})}\n\n" - - for i in range(count): - try: - proj = page.locator("h4.list__contents_aria_group_body_list_item_label").nth(i) - project_name = (await proj.inner_text()).strip() - - yield f"data: {json.dumps({'type': 'log', 'message': f'[{i+1}/{count}] {project_name} - ์‹œ์ž‘'})}\n\n" - await proj.scroll_into_view_if_needed() - await proj.click(force=True) - - # ํ”„๋กœ์ ํŠธ ๋กœ๋”ฉ ๋Œ€๊ธฐ (Gitea ๋ฐฉ์‹: ๋ฌผ๋ฆฌ์  ๋Œ€๊ธฐ) - await asyncio.sleep(5) - await page.wait_for_selector("div.footer", state="visible", timeout=20000) - - recent_log = "๊ธฐ์กด๋ฐ์ดํ„ฐ์œ ์ง€" - file_count = 0 - - # 1๋‹จ๊ณ„: ํ™œ๋™๋กœ๊ทธ ์ˆ˜์ง‘ (Gitea ๋ฐฉ์‹ ๋ณต๊ตฌ + ์ •๋ฐ€ ์…€๋ ‰ํ„ฐ) - try: - log_btn_sel = "body > div.footer > div.left > div.wrap.log-wrap > div.title.text" - log_btn = page.locator(log_btn_sel).first - if await log_btn.is_visible(timeout=5000): - yield f"data: {json.dumps({'type': 'log', 'message': ' - [๋กœ๊ทธ] ์ฐฝ ์—ด๊ธฐ ์‹œ๋„...'})}\n\n" - await log_btn.click(force=True) - await asyncio.sleep(5) # ๋กœ๋”ฉ ์ถฉ๋ถ„ํžˆ ๋Œ€๊ธฐ - - modal_sel = "article.archive-modal" - if await page.locator(modal_sel).is_visible(): - yield f"data: {json.dumps({'type': 'log', 'message': ' - [๋กœ๊ทธ] ๋ชจ๋‹ฌ ๋ฐœ๊ฒฌ. ๋ฐ์ดํ„ฐ ๋กœ๋”ฉ ๋Œ€๊ธฐ...'})}\n\n" - # .log-body ๋‚ด๋ถ€์˜ ๋ฐ์ดํ„ฐ๋งŒ ํƒ€๊ฒŸํŒ…ํ•˜๋„๋ก ์ˆ˜์ • - date_sel = "article.archive-modal .log-body .date .text" - user_sel = "article.archive-modal .log-body .user .text" - act_sel = "article.archive-modal .log-body .activity .text" - - # ๋ฐ์ดํ„ฐ๊ฐ€ ๋‚˜ํƒ€๋‚  ๋•Œ๊นŒ์ง€ ์ตœ๋Œ€ 15์ดˆ ๋Œ€๊ธฐ - success_log = False - for _ in range(15): - if await page.locator(date_sel).count() > 0: - raw_date = (await page.locator(date_sel).first.inner_text()).strip() - if raw_date: - success_log = True - break - await asyncio.sleep(1) - - if success_log: - user_name = (await page.locator(user_sel).first.inner_text()).strip() - activity = (await page.locator(act_sel).first.inner_text()).strip() - formatted_date = re.sub(r'[-/]', '.', raw_date)[:10] - recent_log = f"{formatted_date}, {user_name}, {activity}" - yield f"data: {json.dumps({'type': 'log', 'message': f' - [๋กœ๊ทธ] ์„ฑ๊ณต: {recent_log[:30]}...'})}\n\n" - else: - yield f"data: {json.dumps({'type': 'log', 'message': ' - [๋กœ๊ทธ] ๋ฐ์ดํ„ฐ ์ถ”์ถœ ์‹คํŒจ'})}\n\n" - - await page.click("article.archive-modal div.close", timeout=3000) - await asyncio.sleep(1.5) - except Exception as e: - yield f"data: {json.dumps({'type': 'log', 'message': f' - [๋กœ๊ทธ] ์˜ค๋ฅ˜: {str(e)[:20]}'})}\n\n" - - # 2๋‹จ๊ณ„: ๊ตฌ์„ฑ(ํŒŒ์ผ ์ˆ˜) ์ˆ˜์ง‘ (Gitea ์ˆœํšŒ ๋ฐฉ์‹ ๋ณต๊ตฌ + ๋Œ€๊ธฐ ์‹œ๊ฐ„ ๋Œ€ํญ ์—ฐ์žฅ) - try: - sitemap_btn_sel = "body > div.footer > div.left > div.wrap.site-map-wrap" - sitemap_btn = page.locator(sitemap_btn_sel).first - if await sitemap_btn.is_visible(timeout=5000): - yield f"data: {json.dumps({'type': 'log', 'message': ' - [๊ตฌ์„ฑ] ์ง„์ž… ์‹œ๋„...'})}\n\n" - await sitemap_btn.click(force=True) - - # Gitea ๋ฐฉ์‹: context.pages ์ง์ ‘ ๋’ค์ ธ์„œ ํŒ์—… ์ฐพ๊ธฐ - popup_page = None - for _ in range(30): # ์ตœ๋Œ€ 15์ดˆ ๋Œ€๊ธฐ - for p_item in context.pages: - try: - if "composition" in p_item.url: - popup_page = p_item - break - except: pass - if popup_page: break - await asyncio.sleep(0.5) - - if popup_page: - yield f"data: {json.dumps({'type': 'log', 'message': ' - [๊ตฌ์„ฑ] ์ฐฝ ๋ฐœ๊ฒฌ. ๋ฐ์ดํ„ฐ ๋กœ๋”ฉ ๋Œ€๊ธฐ (์ตœ๋Œ€ 30์ดˆ)...'})}\n\n" - # ์‚ฌ์šฉ์ž ์ œ๊ณต ์ •๋ฐ€ ์„ ํƒ์ž ์ ์šฉ (nth-child(3)๊ฐ€ ์‹ค์ œ ๋ฐ์ดํ„ฐ) - target_selector = "#composition-list h6:nth-child(3)" - success_comp = False - - # ์ตœ๋Œ€ 30์ดˆ๊ฐ„ ๋ฐ์ดํ„ฐ๊ฐ€ ๋‚˜ํƒ€๋‚  ๋•Œ๊นŒ์ง€ ๋Œ€๊ธฐ - for _ in range(30): - h6_count = await popup_page.locator(target_selector).count() - if h6_count > 0: - success_comp = True - break - await asyncio.sleep(1) - - if success_comp: - yield f"data: {json.dumps({'type': 'log', 'message': ' - [๊ตฌ์„ฑ] ๋ฐ์ดํ„ฐ ๊ฐ์ง€๋จ. ์ตœ์ข… ๋ Œ๋”๋ง ๋Œ€๊ธฐ...'})}\n\n" - await asyncio.sleep(10) # ๋ Œ๋”๋ง ์•ˆ์ •ํ™”๋ฅผ ์œ„ํ•œ ๋Œ€๊ธฐ - - # ๋ชจ๋“  h6:nth-child(3) ์š”์†Œ๋ฅผ ์ˆœํšŒํ•˜๋ฉฐ ์ˆซ์ž ํ•ฉ์‚ฐ - locators_h6 = popup_page.locator(target_selector) - h6_count = await locators_h6.count() - current_total = 0 - for j in range(h6_count): - text = (await locators_h6.nth(j).inner_text()).strip() - # ํ…์ŠคํŠธ ๋‚ด์—์„œ ์ˆซ์ž๋งŒ ์ถ”์ถœ (์—ฌ๋Ÿฌ ์ค„์ผ ๊ฒฝ์šฐ ๋งˆ์ง€๋ง‰ ์ค„ ๊ธฐ์ค€) - nums = re.findall(r'\d+', text.split('\n')[-1]) - if nums: - current_total += int(nums[0]) - - file_count = current_total - yield f"data: {json.dumps({'type': 'log', 'message': f' - [๊ตฌ์„ฑ] ์„ฑ๊ณต ({file_count}๊ฐœ)'})}\n\n" - else: - yield f"data: {json.dumps({'type': 'log', 'message': ' - [๊ตฌ์„ฑ] ๋กœ๋”ฉ ํƒ€์ž„์•„์›ƒ'})}\n\n" - - await popup_page.close() - else: - yield f"data: {json.dumps({'type': 'log', 'message': ' - [๊ตฌ์„ฑ] ํŒ์—…์ฐฝ ๋ฐœ๊ฒฌ ์‹คํŒจ'})}\n\n" - except Exception as e: - yield f"data: {json.dumps({'type': 'log', 'message': f' - [๊ตฌ์„ฑ] ์˜ค๋ฅ˜: {str(e)[:20]}'})}\n\n" - - results.append({"projectName": project_name, "recentLog": recent_log, "fileCount": file_count}) - - # ํ™ˆ ๋ณต๊ท€ - await page.locator("div.header div.title div").first.click(force=True) - await page.wait_for_selector("h4.list__contents_aria_group_body_list_item_label", timeout=20000) - await asyncio.sleep(2) - - except Exception: - await page.goto("https://overseas.projectmastercloud.com/dashboard", wait_until="domcontentloaded") - - yield f"data: {json.dumps({'type': 'done', 'data': results})}\n\n" - - except Exception as e: - yield f"data: {json.dumps({'type': 'log', 'message': f'์น˜๋ช…์  ์˜ค๋ฅ˜: {str(e)}'})}\n\n" - finally: - await browser.close() - - return StreamingResponse(event_generator(), media_type="text_event-stream") diff --git a/crawler_service.py b/crawler_service.py new file mode 100644 index 0000000..f20ce38 --- /dev/null +++ b/crawler_service.py @@ -0,0 +1,137 @@ +import os +import re +import asyncio +import json +from playwright.async_api import async_playwright +from dotenv import load_dotenv + +load_dotenv() + +async def run_crawler_service(): + """ + Playwright๋ฅผ ์ด์šฉํ•ด ๋ฐ์ดํ„ฐ๋ฅผ ์ˆ˜์ง‘ํ•˜๊ณ  SSE(Server-Sent Events)์šฉ ์ œ๋„ˆ๋ ˆ์ดํ„ฐ๋ฅผ ๋ฐ˜ํ™˜ํ•ฉ๋‹ˆ๋‹ค. + """ + user_id = os.getenv("PM_USER_ID") + password = os.getenv("PM_PASSWORD") + + if not user_id or not password: + yield f"data: {json.dumps({'type': 'log', 'message': '์˜ค๋ฅ˜: .env ํŒŒ์ผ์— ๊ณ„์ • ์ •๋ณด๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค.'})}\n\n" + return + + results = [] + + async with async_playwright() as p: + yield f"data: {json.dumps({'type': 'log', 'message': '๋ธŒ๋ผ์šฐ์ € ์‹คํ–‰ ์ค‘...'})}\n\n" + browser = await p.chromium.launch(headless=True, args=[ + "--no-sandbox", + "--disable-dev-shm-usage", + "--disable-blink-features=AutomationControlled" + ]) + context = await browser.new_context( + viewport={'width': 1920, 'height': 1080}, + user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36" + ) + page = await context.new_page() + + try: + yield f"data: {json.dumps({'type': 'log', 'message': '์‚ฌ์ดํŠธ ์ ‘์† ๋ฐ ๋กœ๊ทธ์ธ ์ค‘...'})}\n\n" + await page.goto("https://overseas.projectmastercloud.com/", wait_until="domcontentloaded") + + await page.click("#login-by-id", timeout=10000) + await page.fill("#user_id", user_id) + await page.fill("#user_pw", password) + await page.click("#login-btn") + + yield f"data: {json.dumps({'type': 'log', 'message': '๋Œ€์‹œ๋ณด๋“œ ๋ชฉ๋ก ๋Œ€๊ธฐ ์ค‘...'})}\n\n" + await page.wait_for_selector("h4.list__contents_aria_group_body_list_item_label", timeout=60000) + + locators = page.locator("h4.list__contents_aria_group_body_list_item_label") + count = await locators.count() + yield f"data: {json.dumps({'type': 'log', 'message': f'์ด {count}๊ฐœ์˜ ํ”„๋กœ์ ํŠธ ๋ฐœ๊ฒฌ. ์ˆ˜์ง‘ ์‹œ์ž‘.'})}\n\n" + + for i in range(count): + try: + proj = page.locator("h4.list__contents_aria_group_body_list_item_label").nth(i) + project_name = (await proj.inner_text()).strip() + + yield f"data: {json.dumps({'type': 'log', 'message': f'[{i+1}/{count}] {project_name} - ์‹œ์ž‘'})}\n\n" + await proj.scroll_into_view_if_needed() + await proj.click(force=True) + + await asyncio.sleep(5) + await page.wait_for_selector("div.footer", state="visible", timeout=20000) + + recent_log = "๊ธฐ์กด๋ฐ์ดํ„ฐ์œ ์ง€" + file_count = 0 + + # ๋กœ๊ทธ ์ˆ˜์ง‘ + try: + log_btn_sel = "body > div.footer > div.left > div.wrap.log-wrap > div.title.text" + log_btn = page.locator(log_btn_sel).first + if await log_btn.is_visible(timeout=5000): + await log_btn.click(force=True) + await asyncio.sleep(5) + + date_sel = "article.archive-modal .log-body .date .text" + user_sel = "article.archive-modal .log-body .user .text" + act_sel = "article.archive-modal .log-body .activity .text" + + if await page.locator(date_sel).count() > 0: + raw_date = (await page.locator(date_sel).first.inner_text()).strip() + user_name = (await page.locator(user_sel).first.inner_text()).strip() + activity = (await page.locator(act_sel).first.inner_text()).strip() + formatted_date = re.sub(r'[-/]', '.', raw_date)[:10] + recent_log = f"{formatted_date}, {user_name}, {activity}" + yield f"data: {json.dumps({'type': 'log', 'message': f' - [๋กœ๊ทธ] ์ˆ˜์ง‘ ์™„๋ฃŒ'})}\n\n" + + await page.click("article.archive-modal div.close", timeout=3000) + await asyncio.sleep(1.5) + except: pass + + # ๊ตฌ์„ฑ ์ˆ˜์ง‘ + try: + sitemap_btn_sel = "body > div.footer > div.left > div.wrap.site-map-wrap" + sitemap_btn = page.locator(sitemap_btn_sel).first + if await sitemap_btn.is_visible(timeout=5000): + await sitemap_btn.click(force=True) + + popup_page = None + for _ in range(20): + for p_item in context.pages: + if "composition" in p_item.url: + popup_page = p_item + break + if popup_page: break + await asyncio.sleep(0.5) + + if popup_page: + target_selector = "#composition-list h6:nth-child(3)" + await asyncio.sleep(5) # ๋กœ๋”ฉ ๋Œ€๊ธฐ + locators_h6 = popup_page.locator(target_selector) + h6_count = await locators_h6.count() + current_total = 0 + for j in range(h6_count): + text = (await locators_h6.nth(j).inner_text()).strip() + nums = re.findall(r'\d+', text.split('\n')[-1]) + if nums: current_total += int(nums[0]) + file_count = current_total + yield f"data: {json.dumps({'type': 'log', 'message': f' - [๊ตฌ์„ฑ] {file_count}๊ฐœ ํ™•์ธ'})}\n\n" + await popup_page.close() + except: pass + + results.append({"projectName": project_name, "recentLog": recent_log, "fileCount": file_count}) + + # ํ™ˆ ๋ณต๊ท€ + await page.locator("div.header div.title div").first.click(force=True) + await page.wait_for_selector("h4.list__contents_aria_group_body_list_item_label", timeout=20000) + await asyncio.sleep(2) + + except Exception: + await page.goto("https://overseas.projectmastercloud.com/dashboard", wait_until="domcontentloaded") + + yield f"data: {json.dumps({'type': 'done', 'data': results})}\n\n" + + except Exception as e: + yield f"data: {json.dumps({'type': 'log', 'message': f'์น˜๋ช…์  ์˜ค๋ฅ˜: {str(e)}'})}\n\n" + finally: + await browser.close() diff --git a/mailTest.html b/mailTest.html index 68d0f1f..2f66671 100644 --- a/mailTest.html +++ b/mailTest.html @@ -5,9 +5,118 @@ Project Mail Manager - + + + + +