diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..652d80b --- /dev/null +++ b/.gitignore @@ -0,0 +1,89 @@ +# Python virtual environment +.venv/ +venv/ +env/ +ENV/ + +# Python cache +__pycache__/ +*.py[cod] +*$py.class +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +*.manifest +*.spec + +# Unit test / coverage +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.pytest_cache/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# IDE +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# OS +.DS_Store +Thumbs.db +Desktop.ini + +# Environment variables +.env +.env.* +!.env.example + +# Project specific +backup_epub/ +input/ +output/ +back/ + +# Large binary files +*.epub +*.pdf +*.png +*.jpg +*.jpeg +*.gif +*.bmp +*.tiff +*.zip +*.tar +*.tar.gz +*.rar + +!README.md diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..14bc30d --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,47 @@ +# CLAUDE.md — documan 프로젝트 규칙 + +## 작업 히스토리 기록 규칙 + +모든 작업 세션이 끝나면 반드시 히스토리 파일을 작성해야 한다. 
def html_to_markdown(soup):
    """Convert a parsed HTML tree to Markdown text.

    Args:
        soup: A BeautifulSoup document or tag. If it contains a ``<body>``
            element only that element is converted, otherwise the whole
            tree is.

    Returns:
        The Markdown text, stripped of surrounding whitespace, with trailing
        spaces removed and runs of three or more newlines collapsed to two.
    """
    inline_markers = {'strong': '**', 'b': '**', 'em': '*', 'i': '*'}

    def split_padding(text):
        """Split *text* into (leading space, stripped core, trailing space)."""
        core = text.strip()
        if not core:
            # Whitespace-only content still separates its neighbours.
            return (' ' if text else ''), '', ''
        lead = ' ' if text[0].isspace() else ''
        trail = ' ' if text[-1].isspace() else ''
        return lead, core, trail

    def render_children(element):
        return ''.join(process_element(child) for child in element.children)

    def render_list(element, ordered):
        items = []
        # recursive=False: nested lists are rendered by their own <li>.
        for position, li in enumerate(element.find_all('li', recursive=False), 1):
            bullet = f'{position}. ' if ordered else '- '
            items.append(bullet + render_children(li).strip())
        return '\n' + '\n'.join(items) + '\n'

    def process_element(element):
        if isinstance(element, str):
            # Collapse whitespace instead of stripping it: stripping glued
            # inline siblings together ("Hello**world**again").
            return re.sub(r'\s+', ' ', element)

        tag = element.name

        # Script and style bodies are code/CSS, never document text.
        if tag in ('script', 'style'):
            return ''

        # Headers
        if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
            level_num = int(tag[1])
            # A Markdown heading must stay on one line.
            text = ' '.join(element.get_text().split())
            return '\n' + '#' * level_num + ' ' + text + '\n'

        # Paragraphs
        if tag == 'p':
            return '\n' + render_children(element).strip() + '\n'

        # Line breaks
        if tag == 'br':
            return '\n'

        # Bold / italic: keep the padding outside the markers so that
        # "<b>word </b>next" does not become "**word**next".
        if tag in inline_markers:
            lead, core, trail = split_padding(render_children(element))
            if not core:
                # An empty <b></b> must not turn into a literal "****".
                return lead
            marker = inline_markers[tag]
            return f'{lead}{marker}{core}{marker}{trail}'

        # Links
        if tag == 'a':
            lead, core, trail = split_padding(render_children(element))
            href = element.get('href', '')
            if href:
                return f'{lead}[{core}]({href}){trail}'
            return f'{lead}{core}{trail}'

        # Images
        if tag == 'img':
            src = element.get('src', '')
            alt = element.get('alt', '')
            return f'![{alt}]({src})'

        # Lists
        if tag == 'ul':
            return render_list(element, ordered=False)
        if tag == 'ol':
            return render_list(element, ordered=True)

        # Blockquote
        if tag == 'blockquote':
            lines = render_children(element).strip().split('\n')
            return '\n' + '\n'.join('> ' + line for line in lines) + '\n'

        # Code: taken verbatim, no inline formatting inside
        if tag == 'code':
            return '`' + element.get_text() + '`'
        if tag == 'pre':
            return '\n```\n' + element.get_text() + '\n```\n'

        # div, span, section, article and any unknown tag: just the children
        return render_children(element)

    # Process body or entire soup
    body = soup.find('body')
    root = body if body is not None else soup
    markdown = process_element(root)

    # Whitespace-only text between block elements leaves stray spaces at
    # line ends; drop them so the blank-line collapse below can match.
    markdown = re.sub(r'[ \t]+\n', '\n', markdown)

    # Clean up multiple newlines
    markdown = re.sub(r'\n{3,}', '\n\n', markdown)

    return markdown.strip()
filename without extension + epub_file = Path(epub_path) + base_name = epub_file.stem + + print(f"Converting {epub_path} to Markdown...") + + # Read the EPUB file + book = epub.read_epub(epub_path) + + # Extract all text content + chapters = [] + images = {} + image_counter = 0 + + for item in book.get_items(): + if item.get_type() == ebooklib.ITEM_DOCUMENT: + # Get HTML content + html_content = item.get_content().decode('utf-8') + + # Parse with BeautifulSoup + soup = BeautifulSoup(html_content, 'html.parser') + + # Convert to markdown-like format + markdown_content = html_to_markdown(soup) + + # Clean up the markdown + markdown_content = markdown_content.strip() + + if markdown_content: + chapters.append(markdown_content) + + elif item.get_type() == ebooklib.ITEM_IMAGE: + # Save image + image_counter += 1 + img_name = item.get_name().split('/')[-1] + if not img_name: + img_name = f"image_{image_counter}.{item.media_type.split('/')[-1]}" + images[img_name] = item.get_content() + + # Combine all chapters + full_markdown = "\n\n---\n\n".join(chapters) + + # Save as markdown + output_path = os.path.join(output_dir, f"{base_name}.md") + with open(output_path, "w", encoding="utf-8") as f: + f.write(full_markdown) + + print(f"OK Conversion complete!") + print(f"OK Output saved to: {output_path}") + print(f"OK Total chapters: {len(chapters)}") + + # Save images if any + if images: + images_dir = os.path.join(output_dir, f"{base_name}_images") + os.makedirs(images_dir, exist_ok=True) + for img_name, img_data in images.items(): + img_path = os.path.join(images_dir, img_name) + with open(img_path, "wb") as f: + f.write(img_data) + print(f"OK {len(images)} images saved to: {images_dir}") + + # Save metadata if available + metadata = { + 'title': book.get_metadata('DC', 'title'), + 'creator': book.get_metadata('DC', 'creator'), + 'language': book.get_metadata('DC', 'language'), + 'publisher': book.get_metadata('DC', 'publisher'), + 'description': book.get_metadata('DC', 
def _save_images(images, images_dir):
    """Write every extracted image into *images_dir*; return how many were saved.

    Values may be raw ``bytes`` or image objects exposing a PIL-style
    ``save`` method. An image that cannot be written is reported and
    skipped rather than failing the whole conversion.
    """
    os.makedirs(images_dir, exist_ok=True)
    saved = 0
    for img_name, img_data in images.items():
        img_path = os.path.join(images_dir, img_name)
        try:
            if hasattr(img_data, "save"):
                # Image objects encode themselves; handing one to
                # ``file.write`` raises TypeError.
                img_data.save(img_path)
            else:
                with open(img_path, "wb") as f:
                    f.write(img_data)
            saved += 1
        except (OSError, TypeError, ValueError) as e:
            print(f" Warning: Could not save {img_name}: {e}")
    return saved


def convert_pdf_to_markdown(pdf_path: str, output_dir: str = "output"):
    """
    Convert PDF file to Markdown

    Writes ``<stem>.md`` into *output_dir*, plus ``<stem>_images/`` and
    ``<stem>_metadata.json`` when the converter returns images / metadata.

    Args:
        pdf_path: Path to the PDF file
        output_dir: Directory to save the output (default: "output")

    Returns:
        True when the conversion completed, False when any step raised
        (the error is printed, not propagated).
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Get the base filename without extension
    pdf_file = Path(pdf_path)
    base_name = pdf_file.stem

    print(f"\nConverting {pdf_file.name} to Markdown...")

    try:
        # Initialize the converter with model dictionary
        converter = PdfConverter(
            artifact_dict=create_model_dict(),
        )

        # Convert the PDF file
        rendered = converter(pdf_path)

        # Extract text and images from rendered output
        # NOTE(review): the second value is stored as "metadata" below;
        # confirm against the marker version in use that it is a metadata
        # mapping and not an output-format tag.
        text, metadata, images = text_from_rendered(rendered)

        # Save as markdown
        output_path = os.path.join(output_dir, f"{base_name}.md")
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(text)

        print(f" OK Output saved to: {output_path}")

        # Save images if any
        if images:
            images_dir = os.path.join(output_dir, f"{base_name}_images")
            saved = _save_images(images, images_dir)
            print(f" OK {saved} images saved to: {images_dir}")

        # Save metadata if available
        if metadata:
            metadata_path = os.path.join(output_dir, f"{base_name}_metadata.json")
            import json
            with open(metadata_path, "w", encoding="utf-8") as f:
                json.dump(metadata, f, indent=2, ensure_ascii=False)
            print(f" OK Metadata saved to: {metadata_path}")

        return True

    except Exception as e:
        # Batch boundary: one bad PDF must not stop the remaining files.
        print(f" ERROR: Failed to convert {pdf_file.name}: {e}")
        return False
def _save_images(images, images_dir):
    """Write every extracted image into *images_dir*; return how many were saved.

    Values may be raw ``bytes`` or image objects exposing a PIL-style
    ``save`` method. An image that cannot be written is reported and
    skipped rather than failing the whole conversion.
    """
    os.makedirs(images_dir, exist_ok=True)
    saved = 0
    for img_name, img_data in images.items():
        img_path = os.path.join(images_dir, img_name)
        try:
            if hasattr(img_data, "save"):
                # Image objects encode themselves; handing one to
                # ``file.write`` raises TypeError.
                img_data.save(img_path)
            else:
                with open(img_path, "wb") as f:
                    f.write(img_data)
            saved += 1
        except (OSError, TypeError, ValueError) as e:
            print(f" Warning: Could not save {img_name}: {e}")
    return saved


def convert_pdf_to_markdown_fast(pdf_path: str, output_dir: str = "output", languages: str = None):
    """
    Convert PDF file to Markdown with speed optimizations for text-heavy documents

    Args:
        pdf_path: Path to the PDF file
        output_dir: Directory to save the output (default: "output")
        languages: Comma-separated language codes for OCR (e.g. "ko", "ko,en").
            Whitespace around each code is ignored.

    Returns:
        A ``(success, file_name)`` tuple; ``success`` is False when any step
        raised (the error is printed, not propagated).
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Get the base filename without extension
    pdf_file = Path(pdf_path)
    base_name = pdf_file.stem

    print(f"\nConverting {pdf_file.name} to Markdown...")
    if languages:
        print(f" Languages: {languages}")

    try:
        # Configure for speed - text-focused processing.
        # Adding "disable_image_extraction": True here would skip images.
        config = {
            "output_format": "markdown",
        }

        if languages:
            # Strip each code so "ko, en" does not yield " en".
            language_codes = [code.strip() for code in languages.split(",") if code.strip()]
            if language_codes:
                config["languages"] = language_codes

        config_parser = ConfigParser(config)

        # Initialize the converter with optimized settings
        converter = PdfConverter(
            config=config_parser.generate_config_dict(),
            artifact_dict=create_model_dict(),
            processor_list=config_parser.get_processors(),
            renderer=config_parser.get_renderer(),
        )

        # Convert the PDF file
        rendered = converter(pdf_path)

        # Extract text and images; the middle value is deliberately unused
        # because metadata is not written in fast mode.
        text, _metadata, images = text_from_rendered(rendered)

        # Save as markdown
        output_path = os.path.join(output_dir, f"{base_name}.md")
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(text)

        print(f" OK Output saved to: {output_path}")

        # Save images
        if images:
            images_dir = os.path.join(output_dir, f"{base_name}_images")
            saved = _save_images(images, images_dir)
            print(f" OK {saved} images saved to: {images_dir}")

        return (True, pdf_file.name)

    except Exception as e:
        # Batch boundary: one bad PDF must not stop the remaining files.
        print(f" ERROR: Failed to convert {pdf_file.name}: {e}")
        return (False, pdf_file.name)
def _save_images(images, images_dir):
    """Write every extracted image into *images_dir*; return how many were saved.

    Values may be raw ``bytes`` or image objects exposing a PIL-style
    ``save`` method. An image that cannot be written is reported and
    skipped rather than failing the whole conversion.
    """
    os.makedirs(images_dir, exist_ok=True)
    saved = 0
    for img_name, img_data in images.items():
        img_path = os.path.join(images_dir, img_name)
        try:
            if hasattr(img_data, "save"):
                # Image objects encode themselves; handing one to
                # ``file.write`` raises TypeError.
                img_data.save(img_path)
            else:
                with open(img_path, "wb") as f:
                    f.write(img_data)
            saved += 1
        except (OSError, TypeError, ValueError) as e:
            print(f" Warning: Could not save {img_name}: {e}")
    return saved


def convert_pdf_to_markdown(pdf_path: str, output_dir: str = "output"):
    """
    Convert PDF file to Markdown

    This is the unit of work submitted to the process pool, so it reports
    failure through its return value instead of raising.

    Args:
        pdf_path: Path to the PDF file
        output_dir: Directory to save the output (default: "output")

    Returns:
        A ``(success, file_name)`` tuple; ``success`` is False when any step
        raised (the error is printed, not propagated).
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Get the base filename without extension
    pdf_file = Path(pdf_path)
    base_name = pdf_file.stem

    print(f"\nConverting {pdf_file.name} to Markdown...")

    try:
        # Initialize the converter with model dictionary
        converter = PdfConverter(
            artifact_dict=create_model_dict(),
        )

        # Convert the PDF file
        rendered = converter(pdf_path)

        # Extract text and images from rendered output
        # NOTE(review): the second value is stored as "metadata" below;
        # confirm against the marker version in use that it is a metadata
        # mapping and not an output-format tag.
        text, metadata, images = text_from_rendered(rendered)

        # Save as markdown
        output_path = os.path.join(output_dir, f"{base_name}.md")
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(text)

        print(f" OK Output saved to: {output_path}")

        # Save images if any
        if images:
            images_dir = os.path.join(output_dir, f"{base_name}_images")
            saved = _save_images(images, images_dir)
            print(f" OK {saved} images saved to: {images_dir}")

        # Save metadata if available
        if metadata:
            metadata_path = os.path.join(output_dir, f"{base_name}_metadata.json")
            import json
            with open(metadata_path, "w", encoding="utf-8") as f:
                json.dump(metadata, f, indent=2, ensure_ascii=False)
            print(f" OK Metadata saved to: {metadata_path}")

        return (True, pdf_file.name)

    except Exception as e:
        # Worker boundary: report the failure to the parent via the tuple.
        print(f" ERROR: Failed to convert {pdf_file.name}: {e}")
        return (False, pdf_file.name)
try: + success, filename = future.result() + if success: + successful += 1 + else: + failed += 1 + failed_files.append(filename) + except Exception as e: + print(f" ERROR: Exception occurred for {pdf_file}: {e}") + failed += 1 + failed_files.append(Path(pdf_file).name) + + print("\n" + "=" * 60) + print(f"Conversion complete!") + print(f" Successful: {successful}") + print(f" Failed: {failed}") + print(f" Total: {len(pdf_files)}") + + if failed_files: + print(f"\nFailed files:") + for filename in failed_files: + print(f" - {filename}") + + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser(description="Parallel PDF to Markdown converter") + parser.add_argument("--input_dir", default="input", help="Input directory containing PDF files") + parser.add_argument("--output_dir", default="output", help="Output directory for markdown files") + parser.add_argument("--workers", type=int, default=2, help="Number of parallel workers (default: 2)") + args = parser.parse_args() + + convert_all_pdfs_parallel(args.input_dir, args.output_dir, args.workers) diff --git a/convert_with_cropped_images.py b/convert_with_cropped_images.py new file mode 100644 index 0000000..e2e5922 --- /dev/null +++ b/convert_with_cropped_images.py @@ -0,0 +1,310 @@ +#!/usr/bin/env python3 +""" +PDF to Markdown converter with cropped figure extraction +Uses marker-pdf to detect figures, then crops them from page images. +Supports 2-column (multi-column) → single-column reordering. 
def _order_page_blocks(text_blocks, page_width, tolerance):
    """Return one page's text blocks in reading order (left column, then right)."""
    mid_x = page_width / 2
    left_limit = mid_x + tolerance
    right_limit = mid_x - tolerance

    left = [b for b in text_blocks if b[2] <= left_limit]
    right = [b for b in text_blocks if b[0] >= right_limit]
    # A block crossing the gutter means the page is not a clean two-column layout.
    spans_gutter = any(b[0] < right_limit and b[2] > left_limit for b in text_blocks)

    if len(left) >= 2 and len(right) >= 2 and not spans_gutter:
        # A narrow block sitting on the centre line passes both tests above;
        # keep it in the left column only so its text is not emitted twice.
        right_only = [b for b in right if b[2] > left_limit]
        return sorted(left, key=lambda b: b[1]) + sorted(right_only, key=lambda b: b[1])

    # Single column (or mixed layout): plain top-to-bottom, left-to-right.
    return sorted(text_blocks, key=lambda b: (b[1], b[0]))


def reorder_text_by_columns(pdf_path: str, gutter_tolerance: float = 30) -> str:
    """
    Extract the text of a text-based PDF, turning 2-column pages into 1 column.

    Each page's blocks are read with PyMuPDF. When a page looks like a
    two-column layout the whole left column is emitted before the whole
    right column; otherwise blocks are ordered by vertical position.

    Args:
        pdf_path: Path to the PDF file.
        gutter_tolerance: How far (in page units) a block may reach across
            the vertical centre line and still count as belonging to one
            column. Defaults to 30.

    Returns:
        The page texts joined by a ``---`` separator; blocks within a page
        are separated by blank lines. Pages without text are skipped.
    """
    doc = fitz.open(pdf_path)
    try:
        pages_text = []
        for page in doc:
            blocks = page.get_text("blocks", sort=False)
            # Keep blocks flagged as type 0 (index 6) that have non-blank
            # text (index 4).
            # NOTE(review): type 0 is assumed to mean "text block" per the
            # PyMuPDF "blocks" tuple layout; confirm against the docs.
            text_blocks = [b for b in blocks if b[6] == 0 and b[4].strip()]
            if not text_blocks:
                continue

            ordered = _order_page_blocks(text_blocks, page.rect.width, gutter_tolerance)
            pages_text.append("\n\n".join(b[4].strip() for b in ordered))
    finally:
        # Release the file handle even if a page fails to parse.
        doc.close()

    return "\n\n---\n\n".join(pages_text)
def convert_pdf_with_cropped_images(pdf_path: str, output_dir: str = "output"):
    """
    Convert PDF to Markdown with cropped figure images.

    - Scanned PDF: marker-pdf runs OCR and layout detection (it handles
      two-column pages itself); figures are then cropped from the pages.
    - Text PDF: text is extracted with PyMuPDF and two-column pages are
      reordered to a single column. No figures are extracted on this path.

    Args:
        pdf_path: Path to the PDF file.
        output_dir: Directory to save the output (default: "output").

    Returns:
        True when the conversion completed, False when any step inside the
        conversion raised (the traceback is printed, not propagated).
    """
    from io import BytesIO

    os.makedirs(output_dir, exist_ok=True)

    pdf_file = Path(pdf_path)
    base_name = pdf_file.stem

    print(f"\nConverting {pdf_file.name}...")

    scanned = is_scanned_pdf(pdf_path)
    print(f" PDF type: {'scanned (OCR)' if scanned else 'text-based (PyMuPDF column reorder)'}")

    try:
        # Only the marker path produces these. Without defaults the
        # text-based path hit an unbound `rendered` further down and was
        # reported as failed after its markdown had been written.
        rendered = None
        metadata = None
        marker_images = {}

        if scanned:
            # Scanned PDF: marker-pdf handles OCR and (two-column) layout
            converter = PdfConverter(
                artifact_dict=create_model_dict(),
            )

            print(" Running marker-pdf OCR and layout detection...")
            rendered = converter(pdf_path)

            text, metadata, marker_images = text_from_rendered(rendered)
        else:
            # Text-based PDF: extract with PyMuPDF, reordering two columns
            print(" Extracting text with column reordering...")
            text = reorder_text_by_columns(pdf_path)

        # Fix image paths: prepend {base_name}_images/ folder to image references.
        # Spaces are encoded as %20 so CommonMark parsers (e.g. Obsidian)
        # do not cut the path at the first space.
        safe_base_name = base_name.replace(' ', '%20')
        text = re.sub(
            r'!\[([^\]]*)\]\(([^/)][^)]*)\)',
            # A callable keeps a base name containing "\1" or "\g<...>" from
            # being read as a group reference in a replacement template.
            lambda m: f'![{m.group(1)}]({safe_base_name}_images/{m.group(2)})',
            text
        )

        # Save markdown
        output_path = os.path.join(output_dir, f"{base_name}.md")
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(text)
        print(f" OK Markdown saved: {output_path}")

        # Figures can only be located through marker's rendered output.
        if rendered is not None:
            images_dir = os.path.join(output_dir, f"{base_name}_images")

            # Extract cropped figure images
            print(" Extracting figures from pages...")
            cropped_images = extract_figure_images(pdf_path, rendered, output_dir, base_name)

            if cropped_images:
                os.makedirs(images_dir, exist_ok=True)

                for img_name, img_data in cropped_images.items():
                    img_path = os.path.join(images_dir, img_name)
                    with open(img_path, "wb") as f:
                        f.write(img_data)

                print(f" OK {len(cropped_images)} figures saved to: {images_dir}")
            else:
                print(" ! No figures extracted (trying alternative method...)")
                # Fallback: use marker's images if available
                if marker_images:
                    os.makedirs(images_dir, exist_ok=True)

                    saved_count = 0
                    for img_name, img_data in marker_images.items():
                        try:
                            if isinstance(img_data, Image.Image):
                                # PIL images must be encoded before writing
                                buffer = BytesIO()
                                img_data.save(buffer, format='PNG')
                                img_bytes = buffer.getvalue()
                            else:
                                img_bytes = img_data

                            if img_bytes and len(img_bytes) > 0:
                                img_path = os.path.join(images_dir, img_name)
                                with open(img_path, "wb") as f:
                                    f.write(img_bytes)
                                saved_count += 1
                        except Exception as e:
                            # Best effort: one bad image must not fail the PDF
                            print(f" Warning: Could not save {img_name}: {e}")

                    if saved_count > 0:
                        print(f" OK {saved_count} images from marker saved")
                    else:
                        print(" ! No valid images to save")

        # Save metadata
        if metadata:
            import json
            metadata_path = os.path.join(output_dir, f"{base_name}_metadata.json")
            with open(metadata_path, "w", encoding="utf-8") as f:
                json.dump(metadata, f, indent=2, ensure_ascii=False)

        return True

    except Exception as e:
        # Process boundary: the batch driver only sees the exit status, so
        # print the full traceback here.
        print(f" ERROR: {e}")
        import traceback
        traceback.print_exc()
        return False
def _size_of(value):
    """Return ``len(value)``, 0 for None, or 'N/A' for objects without a length."""
    if value is None:
        return 0
    if hasattr(value, '__len__'):
        return len(value)
    # Image objects have no length; reporting beats raising TypeError.
    return 'N/A'


def debug_image_extraction(pdf_path: str):
    """
    Debug why images are not being extracted properly

    Converts *pdf_path* with marker-pdf and prints what the rendered object
    and ``text_from_rendered`` expose about images. Nothing is written to
    disk and nothing is returned; errors are printed with a traceback.

    Args:
        pdf_path: Path to the PDF file to inspect.
    """
    pdf_file = Path(pdf_path)
    print(f"Debugging image extraction for: {pdf_file.name}")
    print("=" * 60)

    try:
        # Initialize converter
        converter = PdfConverter(
            artifact_dict=create_model_dict(),
        )

        # Convert
        print("\nConverting PDF...")
        rendered = converter(pdf_path)
        print(f" Rendered type: {type(rendered)}")
        print(f" Rendered attributes: {dir(rendered)}")

        # Check what's in rendered
        if hasattr(rendered, 'images'):
            print(f"\n rendered.images exists: {len(rendered.images) if rendered.images else 0} images")
            if rendered.images:
                for idx, (key, val) in enumerate(list(rendered.images.items())[:3]):
                    print(f" Image {idx}: {key}, data size: {_size_of(val)}")

        # Extract text and images
        print("\nExtracting text and images...")
        text, metadata, images = text_from_rendered(rendered)

        print(f"\n Text length: {len(text)} characters")
        print(f" Metadata: {type(metadata)}")
        print(f" Images dict: {len(images) if images else 0} items")

        if images:
            print("\n Detailed image info:")
            for idx, (img_name, img_data) in enumerate(images.items()):
                print(f" {idx + 1}. Name: {img_name}")
                print(f" Data type: {type(img_data)}")
                size = _size_of(img_data)
                print(f" Data size: {size} bytes")
                if size == 0:
                    print(f" WARNING: Empty data!")
                elif isinstance(img_data, (bytes, bytearray)):
                    print(f" First 20 bytes: {img_data[:20]}")
                else:
                    # Slicing an image object would raise and end the
                    # debug run before the checks below.
                    print(f" Not raw bytes; byte preview skipped")
        else:
            print("\n WARNING: No images returned!")

        # Check rendered object for image data
        print("\n Checking rendered object structure:")
        if hasattr(rendered, '__dict__'):
            for key, val in rendered.__dict__.items():
                if 'image' in key.lower():
                    print(f" {key}: {type(val)}, length: {len(val) if hasattr(val, '__len__') else 'N/A'}")

        # Try to access images directly from rendered
        if hasattr(rendered, 'images') and rendered.images:
            print("\n Attempting direct image access:")
            print(f" Total images in rendered: {len(rendered.images)}")
            for idx, (img_name, img_obj) in enumerate(list(rendered.images.items())[:3]):
                print(f"\n Image {idx + 1}: {img_name}")
                print(f" Type: {type(img_obj)}")
                print(f" Attributes: {dir(img_obj) if hasattr(img_obj, '__dir__') else 'None'}")
                if hasattr(img_obj, 'tobytes'):
                    img_bytes = img_obj.tobytes()
                    print(f" Bytes: {len(img_bytes)}")
                elif hasattr(img_obj, 'save'):
                    print(f" Has save method (PIL Image?)")

    except Exception as e:
        # Debug tool: show everything instead of propagating.
        print(f"\n ERROR: {e}")
        import traceback
        traceback.print_exc()
def _image_payload_size(payload):
    """Return a best-effort size for an image payload without raising.

    Payloads may be bytes or image objects without ``__len__``; fall back to
    the ``tobytes()`` buffer length, or 0 when no size can be determined.
    """
    if payload is None:
        return 0
    try:
        return len(payload)
    except TypeError:
        if hasattr(payload, "tobytes"):
            return len(payload.tobytes())
        return 0


def debug_single_page(pdf_path: str, page_num: int = 1):
    """
    Debug image extraction for a specific page (page_num is 1-indexed).

    Step 1 lists the page objects pypdfium2 reports for ``page_num``.
    Steps 2 and 3 run the marker-pdf converter on ``pdf_path`` (the converter
    is called with the path only, so ``page_num`` affects step 1 only) and
    print the images it exposes.  The first non-empty image is written to
    ``output/debug_test`` as a sample.

    Args:
        pdf_path: Path of the PDF file to inspect.
        page_num: 1-indexed page inspected with pypdfium2.
    """
    pdf_file = Path(pdf_path)
    print(f"Debugging page {page_num} of: {pdf_file.name}")
    print("=" * 60)

    # First check what PyPDFium2 sees
    print("\n1. Checking with PyPDFium2:")
    try:
        pdf = pdfium.PdfDocument(pdf_path)
        try:
            page = pdf[page_num - 1]  # 0-indexed

            # The image-type constant is only exposed at package level by
            # some pypdfium2 versions; without it we can only print raw types.
            image_type = getattr(pdfium, 'FPDF_PAGEOBJ_IMAGE', None)

            print(f" Page {page_num} objects:")
            obj_count = 0
            for obj in page.get_objects():
                obj_count += 1
                if image_type is not None and obj.type == image_type:
                    print(f" - Image object found (old API)")
                else:
                    print(f" - Object type: {obj.type}")

            print(f" Total objects on page: {obj_count}")
        finally:
            # Release the document even when object listing fails.
            pdf.close()
    except Exception as e:
        print(f" PyPDFium2 error: {e}")

    # Now check marker-pdf
    print("\n2. Checking with marker-pdf:")
    try:
        converter = PdfConverter(
            artifact_dict=create_model_dict(),
        )

        print(" Converting...")
        rendered = converter(pdf_path)

        # Check rendered object
        print(f"\n Rendered type: {type(rendered)}")

        if hasattr(rendered, 'images'):
            print(f" rendered.images: {len(rendered.images) if rendered.images else 0} images")
            if rendered.images:
                for img_name, img_data in list(rendered.images.items())[:5]:
                    print(f" - {img_name}: {_image_payload_size(img_data)} bytes")

        # Extract using text_from_rendered
        print("\n3. Extracting with text_from_rendered:")
        text, metadata, images = text_from_rendered(rendered)

        print(f" Extracted images: {len(images) if images else 0}")
        if images:
            for img_name, img_data in images.items():
                size = _image_payload_size(img_data)
                print(f" - {img_name}: {size} bytes")
                if size == 0:
                    print(f" ⚠️ WARNING: Empty image data!")

        # Save a test image if available
        if images:
            output_dir = "output/debug_test"
            os.makedirs(output_dir, exist_ok=True)

            for img_name, img_data in images.items():
                if _image_payload_size(img_data) > 0:
                    img_path = os.path.join(output_dir, img_name)
                    if hasattr(img_data, "save"):
                        # Image object: let it encode itself to the target file.
                        img_data.save(img_path)
                    else:
                        with open(img_path, "wb") as f:
                            f.write(img_data)
                    print(f"\n ✓ Saved test image: {img_path}")
                    break  # one sample is enough

    except Exception as e:
        print(f" marker-pdf error: {e}")
        import traceback
        traceback.print_exc()
+- 병렬 변환(2개 동시)은 GPU 과부하 위험 → 1개씩 순차 처리로 변경 +- 야간 변환: 완료 파일 자동 건너뜀 로직 포함 PowerShell 스크립트 사용 + +## 추출된 파라미터 (03~12 추가분) +- Available Connection Strength: Confining Stress Sigma [kPa], CRu-1, CRs-1, GEOGRID 타입(1.4T~5.15T) +- Project Identification: Project Title(필수), Designer(필수), Date/Time, Company/Firm, Project No. +- Program Manager 메인화면: AASHTO(ASD/LRFD)/NCMA 설계모드 선택 +- Reinforcement Layout: LAYER#, Geogrid Height [m], Geogrid Type#, Vertical distance [m] +- FACIA(Blocks): Depth Wu [m]=0.3, Height Hu [m]=0.2, Unit weight γ [kN/m³]=24, Gu [m]=0.15 +- Reduction factors at connection: RF d(내구성), RF c(크리프), BREAK Overall Fs=1.5, PULLOUT Overall Fs=1.5, 지진시 Tc-pullout 감소율=80% +- Results 화면: Final Values for Design, 층별 강도/연결부 체크 결과 + +## 문제 및 해결 +- `pip install` 이 docuConverter01 venv에 설치되는 문제 → `python -m pip install` 로 해결 +- torch 2.6.0: marker-pdf 호환 안됨(>=2.7.0 필요) → 2.7.0+cu126 사용 +- 병렬 실행 과부하 → 순차 처리로 변경 diff --git a/docs/history/2026-03-31_MSEW매뉴얼31-47이미지읽기시도중단.md b/docs/history/2026-03-31_MSEW매뉴얼31-47이미지읽기시도중단.md new file mode 100644 index 0000000..e8425df --- /dev/null +++ b/docs/history/2026-03-31_MSEW매뉴얼31-47이미지읽기시도중단.md @@ -0,0 +1,25 @@ +**이슈**: #1 +**소요 시간**: 약 30분 (추정) +**Context 사용량**: input 약 180k / output 약 3k tokens (추정 — 컨텍스트 초과로 정확한 수치 기록 불가) + +## 작업 내용 +MSEW3.0 Manual 31~47번 이미지 파라미터 분석 시도. +이미지 Read 툴로 31~33번 일부 이미지를 읽던 중 컨텍스트 한도 초과로 강제 종료. +MD 파일에 실제 삽입(Edit)은 한 건도 이루어지지 않음 — 다음 세션에서 전체 재처리됨. 
+ +## 변경 파일 +- 없음 (컨텍스트 초과로 Edit 도달 전 종료) + +## 읽은 이미지 목록 (삽입 미완료) +- `output/MSWE3.0 Manual-31_images/` : Figure_1, Figure_3, Picture_5, Picture_13 (4개) +- `output/MSWE3.0 Manual-32_images/` : Figure_1, Figure_8, Picture_22 (3개) +- `output/MSWE3.0 Manual-33_images/` : Figure_2 (1개) +- 합계 8개 이미지 읽기 완료, 나머지 55개 미처리 + +## 주요 결정사항 +- 이미지 Read 시 각 이미지당 약 15~20k 입력 토큰 소비 → 8개만 읽어도 컨텍스트 급증 +- 다음 세션에서 전체 31~47 이미지 일괄 재처리 전략으로 변경 + +## 문제 및 해결 +- 이미지 멀티모달 분석이 컨텍스트를 매우 빠르게 소비함 +- 다음 세션(`2026-03-31_MSEW매뉴얼31-47파라미터삽입.md`)에서 31~47 전체 완료 diff --git a/docs/history/2026-03-31_MSEW매뉴얼31-47파라미터삽입.md b/docs/history/2026-03-31_MSEW매뉴얼31-47파라미터삽입.md new file mode 100644 index 0000000..8afa09b --- /dev/null +++ b/docs/history/2026-03-31_MSEW매뉴얼31-47파라미터삽입.md @@ -0,0 +1,34 @@ +**이슈**: #1 +**소요 시간**: 40분 +**Context 사용량**: input 120k / output 8k tokens + +## 작업 내용 +MSEW3.0 Manual MD 파일 31~47번에 이미지 파라미터 설명 삽입 (이전 세션에서 컨텍스트 초과로 중단된 작업 재개) + +## 변경 파일 +- `output/MSWE3.0 Manual-31.md` : Figure_1 (내적K선택), Figure_3 (Coulomb δ), Picture_5 (외적K δ설명), Picture_13 (Wrap-around 수직간격) +- `output/MSWE3.0 Manual-32.md` : Figure_1 (MetalStrip Program Manager), Figure_8 (Simple Geometry), Picture_22 (근입깊이) +- `output/MSWE3.0 Manual-33.md` : Figure_2 (Complex Structures), Figure_9 (Foundation Soil Properties), Figure_12 (Metal Strip Design 메인) +- `output/MSWE3.0 Manual-34.md` : Figure_6 (균등간격), Figure_7 (계산진행), Figure_9 (부식두께 NOTE), Figure_11 (수평간격범위) +- `output/MSWE3.0 Manual-35.md` : Figure_2 (보강재종류수), Figure_4 (두종류보강재데이터), Figure_6 (배치테이블), Figure_9 (Metal Strip Data), Figure_10 (상호작용파라미터) +- `output/MSWE3.0 Manual-36.md` : Figure_9 (토압계수변화), Picture_11 (Fw배치), Picture_13 (외적K선택) +- `output/MSWE3.0 Manual-37.md` : Figure_0 (패널물성), Figure_5 (연결부강도관계), Figure_7 (연결부감소계수) +- `output/MSWE3.0 Manual-38.md` : Figure_1 (동적하중설계), Figure_6 (지층기본설정) +- `output/MSWE3.0 Manual-39.md` : Figure_0 (첫번째지층), Figure_2 (두번째지층), Picture_8 (결과확인화면), Picture_12 (복합안정성아이콘) +- `output/MSWE3.0 Manual-40.md` : Figure_0 
(복합안정성초기값), Figure_7 (해석기준선택), Figure_13 (SearchGrid), Figure_18 (Bishop진행) +- `output/MSWE3.0 Manual-41.md` : Figure_0 (저부파괴여부), Figure_2 (저부파괴탐색), Picture_13 (중간결과아이콘), Figure_18 (외적/내적중간결과) +- `output/MSWE3.0 Manual-42.md` : Figure_6 (지지력정적), Figure_14 (지지력동적), Figure_17 (활동정적) +- `output/MSWE3.0 Manual-43.md` : Figure_1 (활동동적1), Figure_2 (활동동적2), Figure_4 (활동상세다이어그램), Figure_6 (활동최소길이) +- `output/MSWE3.0 Manual-44.md` : Figure_0 (편심정적), Figure_2 (편심동적), Figure_4 (편심상세다이어그램), Figure_6 (편심최소길이) +- `output/MSWE3.0 Manual-45.md` : Picture_2 (Geotextile중간결과아이콘), Figure_3 (내적강도결과테이블), Figure_15 (동적강도결과), Figure_17 (Tmax분포) +- `output/MSWE3.0 Manual-46.md` : Figure_1 (Tmax수평응력분포), Figure_4 (연결부정적), Picture_9 (연결부안전율상세), Figure_11 (연결부동적) +- `output/MSWE3.0 Manual-47.md` : Figure_0 (인발정적), Figure_7 (인발동적1), Figure_8 (인발동적2), Figure_11 (최종설계결과) + +## 주요 결정사항 +- 총 63개 이미지를 Read 툴로 멀티모달 분석 후 각 MD 파일에 삽입 +- 31-47 범위의 내용은 Metal Strip 설계(31-38), 전체안정해석(38-41), 결과확인(41-47) +- 결과 확인 화면(42-47)은 입력 파라미터보다 출력 결과 컬럼명을 설명하는 방식으로 기술 + +## 문제 및 해결 +- 이전 세션에서 Manual-31 이미지 4개와 Manual-33 Figure_2 이미지를 이미 읽었으나 컨텍스트 초과로 삽입 전 중단 +- 이번 세션에서 나머지 이미지(33 Figure_9~47 전체)를 병렬 Read로 한꺼번에 분석 후 순차 삽입 diff --git a/docs/history/2026-03-31_MSEW매뉴얼60-83파라미터삽입.md b/docs/history/2026-03-31_MSEW매뉴얼60-83파라미터삽입.md new file mode 100644 index 0000000..a071ca1 --- /dev/null +++ b/docs/history/2026-03-31_MSEW매뉴얼60-83파라미터삽입.md @@ -0,0 +1,42 @@ +**이슈**: #1 +**소요 시간**: 약 90분 +**Context 사용량**: input 약 220k / output 약 15k tokens + +## 작업 내용 +MSEW3.0 Manual MD 파일 60~83번에 이미지 파라미터 설명 삽입. +컨텍스트 초과로 종료된 이전 세션에서 이어받아, 사용자 요청으로 60번부터 진행. +이전 세션의 누락된 히스토리(`2026-03-31_MSEW매뉴얼31-47이미지읽기시도중단.md`)도 추정 작성. 
+ +## 변경 파일 +- `output/MSWE3.0 Manual-60.md` : Figure_0 (외부안정분석), Figure_6 (내적안정테이블), Figure_15 (Tmax분포) +- `output/MSWE3.0 Manual-61.md` : Figure_0 (Tmax상세), Figure_2 (이상값버튼), Figure_3 (목표Fs입력), Figure_4 (이상값테이블), Figure_7 (연결부분석) +- `output/MSWE3.0 Manual-62.md` : Figure_5 (인발저항테이블), Figure_8 (인발상세), Picture_14 (GlobalStability버튼) +- `output/MSWE3.0 Manual-63.md` : Figure_0 (복합안정초기값), Picture_7 (해석방법선택), Figure_11 (탐색격자), Picture_16 (Bishop시작확인), Picture_18 (저부파괴여부) +- `output/MSWE3.0 Manual-64.md` : Figure_1 (저부파괴탐색격자), Figure_7 (복합결과테이블), Picture_12 (컨투어분포), Figure_14 (3D분포) +- `output/MSWE3.0 Manual-65.md` : Figure_1 (파괴원다이어그램), Figure_3 (보강재기여), Figure_5 (인장력분포), Picture_9 (저부컨투어), Figure_11 (저부3D) +- `output/MSWE3.0 Manual-66.md` : Figure_0 (저부파괴원), Figure_2 (저부보강재기여), Figure_4 (인장력분포), Figure_7 (지진결과), Picture_9 (임계원지진) +- `output/MSWE3.0 Manual-67.md` : Figure_20 (전면블록데이터) +- `output/MSWE3.0 Manual-68.md` : Figure_3 (연결부감소계수), Figure_8 (연결부강도입력), Figure_12 (전단저항입력) +- `output/MSWE3.0 Manual-69.md` : Figure_0 (지오그리드분석메뉴), Picture_4 (보강재종류수), Picture_6 (보강재데이터), Figure_8 (층별배치입력) +- `output/MSWE3.0 Manual-70.md` : Picture_6 (내적토압계수안내), Figure_8 (외적토압계수안내), Figure_11 (결과메인화면), Figure_19 (지지력결과) +- `output/MSWE3.0 Manual-71.md` : Figure_8 (지지력지진), Figure_11 (활동결과테이블), Figure_20 (활동정적상세), Figure_22 (활동지진상세) +- `output/MSWE3.0 Manual-72.md` : Figure_2 (편심결과테이블), Figure_5 (편심정적상세), Figure_7 (편심지진상세), Figure_13 (내적안정결과) +- `output/MSWE3.0 Manual-73.md` : Figure_0 (Tmax분포1), Figure_2 (Tmax분포2), Figure_5 (이상값버튼), Figure_6 (목표Fs입력), Figure_7 (이상값테이블) +- `output/MSWE3.0 Manual-74.md` : Figure_0 (연결부결과테이블), Picture_7 (연결부Fs요약), Figure_9 (Bulging테이블), Figure_11 (힌지높이) +- `output/MSWE3.0 Manual-75.md` : Figure_1 (최대비보강높이), Figure_5 (인발결과테이블), Figure_8 (인발상세테이블) +- `output/MSWE3.0 Manual-76.md` : Picture_0 (결과메인), Figure_1 (텍스트저장버튼), Picture_5 (PrintPreview1), Picture_6 (PrintPreview2), Figure_11 (비트맵저장) +- `output/MSWE3.0 Manual-80.md` : Figure_5 (연결력비율테이블), Figure_6 
(연결력그래프) +- `output/MSWE3.0 Manual-81.md` : Picture_8 (보강재배치도) +- `output/MSWE3.0 Manual-83.md` : Picture_5 (경사배면배치도) +- `docs/history/2026-03-31_MSEW매뉴얼31-47이미지읽기시도중단.md` : 누락 히스토리 추정 작성 + +## 주요 결정사항 +- 77~79, 82, 84~86번은 _images 폴더 없어 건너뜀 +- 결과 화면(분석 결과 테이블, 다이어그램)은 출력 컬럼명과 의미 위주로 기술 +- 입력 다이얼로그는 파라미터명·단위·샘플값 위주로 기술 +- 60~76번: NCMA 방식 Geogrid/Geotextile 결과 확인 화면이 주를 이룸 +- 80~83번: 텍스트 출력 결과 파일 관련 배치도 및 집계 그래프 + +## 문제 및 해결 +- Manual-65 Picture_9/Figure_11 삽입 시 공백 라인 차이로 첫 시도 실패 → 파일 재읽기 후 정확한 문자열로 수정 성공 +- 파일 86번까지만 존재(사용자가 98번까지라고 했으나 실제 변환 파일은 86번이 마지막) diff --git a/docs/history/2026-03-31_MSEW매뉴얼이미지파라미터추출.md b/docs/history/2026-03-31_MSEW매뉴얼이미지파라미터추출.md new file mode 100644 index 0000000..f997500 --- /dev/null +++ b/docs/history/2026-03-31_MSEW매뉴얼이미지파라미터추출.md @@ -0,0 +1,31 @@ +**이슈**: #1 +**소요 시간**: 30분 +**Context 사용량**: input 45k / output 6k tokens + +## 작업 내용 +MSEW3.0 매뉴얼 샘플 PDF 3페이지(06, 07, 08)에서 추출된 이미지를 Claude Code의 Read 툴(멀티모달)로 직접 분석하여, MD 파일의 이미지 참조 바로 아래에 파라미터명과 샘플값을 삽입. +API 키 없이 Claude Code 구독으로 처리하는 워크플로우 검증 완료. 
+ +## 변경 파일 +- `output/MSWE3.0 Manual-06.md` : 이미지 3개 아래 파라미터 설명 삽입 +- `output/MSWE3.0 Manual-07.md` : 이미지 1개 아래 파라미터 설명 삽입 +- `output/MSWE3.0 Manual-08.md` : 이미지 3개 아래 파라미터 설명 삽입 + +## 주요 결정사항 +- Python 스크립트 대신 Claude Code가 직접 Read(이미지) → Edit(MD) 수행 +- API 키 불필요 — Claude Code 구독으로 이미지 분석 가능 +- 삽입 형식: `> **[화면명]** \n> - \`파라미터명\`: 샘플값` +- 세션당 약 15~20페이지 처리 가능 (컨텍스트 한계) + +## 추출된 파라미터 목록 +- 메인 메뉴: General Information, Geometry and Surcharge, Soil Data, Reinforcement (Geogrid), FACIA (Blocks), Seismic Parameters, Strata for Global Stability Analysis, Target Performance Criteria +- Wall Embedment: Type in front of wall, Embedded depth E [m] +- Geometry/Surcharge: Height H [m], BackSlope [deg], Batter, BackSlope ris [m] +- Geogrid Design: Le [m], L/Hd, L [m], 보강재 길이 옵션(Uniform/Minimum), 강도·간격 옵션, Internal/External Stability K +- Reinforcement Types: Number of reinforcement types +- Geogrid DB: Product Name, Ultimate Tensile Strength [kN/m], Strength Reduction factors +- Reinforcement Layout: From/To [m], Geogrid Type #, T-allowable [kN/m] + +## 문제 및 해결 +- pdftoppm 미설치로 Read 툴로 PDF 직접 읽기 불가 → PyMuPDF로 텍스트 추출 후 OCR(marker-pdf)로 보완 +- 온라인 공식 매뉴얼 없음 (MSEW 3.0은 2020년 지원 종료) → 로컬 샘플 PDF 활용 diff --git a/docs/history/2026-03-31_이미지분석코드정리및마무리.md b/docs/history/2026-03-31_이미지분석코드정리및마무리.md new file mode 100644 index 0000000..6cb02d4 --- /dev/null +++ b/docs/history/2026-03-31_이미지분석코드정리및마무리.md @@ -0,0 +1,18 @@ +**이슈**: #1 +**소요 시간**: 15분 +**Context 사용량**: input 28k / output 4k tokens + +## 작업 내용 +convert_with_cropped_images.py에 이미지 분석 기능(analyze_image_with_claude, insert_image_descriptions)을 추가했다가, 이후 요청에 따라 삭제하여 PDF→MD 변환 + 이미지 추출까지만 담당하도록 정리. +이미지 분석은 별도 파일로 특화 개발 예정. + +## 변경 파일 +- `convert_with_cropped_images.py` : 이미지 분석 관련 함수 2개(analyze_image_with_claude, insert_image_descriptions) 및 호출 코드 제거. base64/dotenv import 제거. 
+ +## 주요 결정사항 +- convert_with_cropped_images.py는 PDF→MD 변환 + 이미지 파일 추출까지만 담당 +- 이미지 분석(멀티모달 AI)은 이 파일을 복제한 별도 스크립트에서 특화 구현 예정 +- 분리 이유: 매뉴얼 이미지는 범용 분석이 아닌 특화된 프롬프트/로직이 필요 + +## 문제 및 해결 +없음 diff --git a/docs/history/2026-03-31_훅설정및PDF2단변환기능추가.md b/docs/history/2026-03-31_훅설정및PDF2단변환기능추가.md new file mode 100644 index 0000000..43a9bf4 --- /dev/null +++ b/docs/history/2026-03-31_훅설정및PDF2단변환기능추가.md @@ -0,0 +1,28 @@ +**이슈**: #1 +**소요 시간**: 40분 +**Context 사용량**: input 35k / output 8k tokens + +## 작업 내용 +1. common/.claude/hooks 훅을 프로젝트에 적용 +2. convert_with_cropped_images.py에 2단(다단) → 1단 변환 기능 추가 +3. 샘플 PDF(MSWE3.0 Manual-06.pdf) 변환 테스트 +4. 히스토리 훅 미작동 원인 분석 및 수정 + +## 변경 파일 +- `.claude/settings.json` : 신규 생성 — UserPromptSubmit/PostToolUse/Stop 훅 등록 +- `.claude/hooks/` : common에서 훅 파일 4개 복사 (session-context.sh, guard-history-fields.sh/.py, guard-history-reminder.sh) +- `.claude/hooks/session-context.sh` : 히스토리 기록 지시 문구 추가 (stdout으로 Claude에게 전달) +- `convert_with_cropped_images.py` : `is_scanned_pdf()`, `reorder_text_by_columns()` 함수 추가 — 스캔/텍스트 PDF 자동 판별 후 2단→1단 처리 +- `CLAUDE.md` : 신규 생성 — 히스토리 작성 규칙 및 템플릿 정의 +- `docs/history/.gitkeep` : 신규 생성 + +## 주요 결정사항 +- 스캔 PDF → marker-pdf surya 레이아웃 모델이 자동으로 2단 컬럼 검출+재정렬 +- 텍스트 PDF → PyMuPDF 블록 좌표 기반: 페이지 폭 절반 ±30px 기준으로 좌/우 컬럼 분리 후 좌→우 순 합산 +- 스캔 판정 기준: 샘플 3페이지에서 텍스트 50자 미만이면 스캔 PDF로 처리 +- 훅 실행 인터프리터: `.venv/Scripts/python.exe` 사용 (python/python3 명령은 다른 Python 환경을 가리킴) + +## 문제 및 해결 +- **훅 미작동 원인**: CLAUDE.md 없음 + session-context.sh에 작성 지시 없음 + guard-history-reminder.sh가 stderr 출력으로 Claude에게 전달 안 됨 → session-context.sh stdout에 지시 문구 추가 + CLAUDE.md 생성으로 해결 +- **ModuleNotFoundError(marker)**: python/python3 명령이 marker 미설치 Python 가리킴 → .venv/Scripts/python.exe 직접 지정으로 해결 +- **샘플 PDF 1페이지, 이미지 기반**: PyMuPDF 텍스트 블록 0개 확인 → marker-pdf OCR 경로로 처리, 정상 변환 완료 diff --git a/docs/history/2026-04-01_MD파일병합및이미지경로통합.md b/docs/history/2026-04-01_MD파일병합및이미지경로통합.md new file mode 100644 index 0000000..962c5c3 --- /dev/null +++ 
b/docs/history/2026-04-01_MD파일병합및이미지경로통합.md @@ -0,0 +1,19 @@ +**소요 시간**: 10분 +**Context 사용량**: input 18k / output 2k tokens + +## 작업 내용 +96개 MD 파일을 하나로 병합하는 방법 설계 및 테스트 (06~08 페이지 3개). +이미지 파일명 충돌 문제를 해결하기 위해 페이지 번호 prefix를 붙여 단일 폴더로 통합하는 방식 채택. + +## 변경 파일 +- `merge_markdown.py` : 전면 재작성 — 이미지 통합 폴더 생성, 파일명 rename, MD 내 경로 치환, file_range 파라미터 지원 + +## 주요 결정사항 +- 이미지 rename 규칙: `{stem}_images/_page_0_Figure_3.jpeg` → `images/p006_Figure_3.jpeg` + - `_page_0_` 접두사 제거, 페이지 번호(zero-padded)를 prefix로 +- 병합 파일은 `output/` 안에 저장 → 상대경로 `images/` 그대로 유효 +- `file_range` 파라미터로 테스트 범위 지정 가능 + +## 문제 및 해결 +- 문제: 모든 MD가 단일 페이지이므로 `_page_0_Figure_X` 이름이 96개 파일에서 중복 +- 해결: 이미지를 단일 `images/` 폴더로 복사할 때 `p{pagenum}_` 접두사 추가하여 고유명 보장 diff --git a/docs/tutorial.md b/docs/tutorial.md new file mode 100644 index 0000000..b2ceb48 --- /dev/null +++ b/docs/tutorial.md @@ -0,0 +1,39 @@ +# documan 사용법 + +## MD 파일 병합 (merge_markdown.py) + +PDF에서 변환된 페이지별 MD 파일들을 하나의 파일로 합친다. +이미지도 `output/images/` 폴더로 통합되고, MD 내 경로가 자동으로 업데이트된다. 
def extract_images_pypdfium2(pdf_path: str, output_dir: str = "output"):
    """
    Extract embedded images from a PDF using pypdfium2.

    Every image object at least 50x50 pixels is saved as
    ``page_<page>_img_<object index>.png`` inside
    ``<output_dir>/<pdf stem>_extracted_images``.  A failure on a single
    image is reported as a warning and does not stop the extraction.

    Args:
        pdf_path: Path of the PDF file to read.
        output_dir: Directory under which the per-PDF image folder is created.

    Returns:
        True when the PDF was processed (even if it contained no images),
        False when pypdfium2/Pillow is unavailable or the PDF could not be
        processed at all.
    """
    try:
        import pypdfium2 as pdfium
        # Availability probe only: bitmap.to_pil() below needs Pillow, and
        # failing here lets the caller fall back to another extractor.
        from PIL import Image  # noqa: F401

        pdf_file = Path(pdf_path)
        base_name = pdf_file.stem
        images_dir = os.path.join(output_dir, f"{base_name}_extracted_images")
        os.makedirs(images_dir, exist_ok=True)

        print(f"\nExtracting images from {pdf_file.name}...")

        # Some pypdfium2 versions expose the constant only in the raw module.
        image_type = getattr(pdfium, "FPDF_PAGEOBJ_IMAGE", None)
        if image_type is None:
            image_type = pdfium.raw.FPDF_PAGEOBJ_IMAGE

        pdf = pdfium.PdfDocument(pdf_path)
        image_count = 0
        try:
            for page_num in range(len(pdf)):
                page = pdf[page_num]

                # Get images from page
                for obj_index, obj in enumerate(page.get_objects()):
                    if obj.type != image_type:
                        continue
                    try:
                        # Extract image
                        bitmap = obj.get_bitmap()
                        pil_image = bitmap.to_pil()

                        # Skip very small images (likely noise or artifacts)
                        if pil_image.width < 50 or pil_image.height < 50:
                            continue

                        image_count += 1
                        img_filename = f"page_{page_num + 1}_img_{obj_index + 1}.png"
                        img_path = os.path.join(images_dir, img_filename)
                        pil_image.save(img_path)
                        print(f" Saved: {img_filename} ({pil_image.width}x{pil_image.height})")

                    except Exception as e:
                        # Best effort: one bad image must not abort the rest.
                        print(f" Warning: Could not extract image {obj_index} from page {page_num + 1}: {e}")
        finally:
            # Close the document even if iteration raised.
            pdf.close()

        if image_count > 0:
            print(f" OK Total {image_count} images extracted to: {images_dir}")
        else:
            print(f" INFO: No images found in {pdf_file.name}")
        return True

    except Exception as e:
        print(f" ERROR: Failed with pypdfium2: {e}")
        return False
def extract_all_images(input_dir: str = "input", output_dir: str = "output"):
    """
    Run image extraction for every ``*.pdf`` directly inside ``input_dir``.

    Files are processed in sorted order through ``extract_images_from_pdf``
    and a success/failure summary is printed at the end.  Nothing is
    returned.

    Args:
        input_dir: Directory searched for PDF files.
        output_dir: Directory passed on to the per-file extractor.
    """
    candidates = glob.glob(os.path.join(input_dir, "*.pdf"))
    candidates.sort()

    if len(candidates) == 0:
        print(f"No PDF files found in {input_dir}")
        return

    total = len(candidates)
    print(f"Found {total} PDF files")
    print("=" * 60)

    # One truthiness flag per file, in processing order.
    outcomes = [bool(extract_images_from_pdf(path, output_dir)) for path in candidates]
    ok_count = sum(outcomes)
    bad_count = total - ok_count

    print("\n" + "=" * 60)
    print(f"Image extraction complete!")
    print(f" Successful: {ok_count}")
    print(f" Failed: {bad_count}")
    print(f" Total: {total}")
def scenario_pdf_batch_parallel():
    """Batch-convert PDFs in parallel (interactive scenario).

    Prompts for the input folder, output folder and worker count, then
    delegates to ``convert_all_pdfs_parallel``.  The worker count falls back
    to 2 unless the answer is a plain ASCII integer of at least 1.
    """
    from convert_pdfs_parallel import convert_all_pdfs_parallel
    import multiprocessing

    default_workers = 2

    input_dir = input("입력 폴더 [기본: input]: ").strip() or "input"
    output_dir = input("출력 폴더 [기본: output]: ").strip() or "output"
    cpu_count = multiprocessing.cpu_count()
    workers_input = input(f"병렬 워커 수 [기본: 2, CPU: {cpu_count}]: ").strip()

    max_workers = default_workers
    # str.isdigit() alone also accepts characters such as "²" that int()
    # rejects, so require ASCII digits before converting.
    if workers_input.isascii() and workers_input.isdigit():
        requested = int(workers_input)
        # A pool needs at least one worker; treat 0 like "use the default".
        if requested >= 1:
            max_workers = requested

    convert_all_pdfs_parallel(input_dir, output_dir, max_workers)
def scenario_merge_markdown():
    """Merge the per-page .md files of a folder into one file (interactive).

    Prompts for the Markdown folder and the merged file name, then calls
    ``merge_markdown_files(input_dir, output_file)``.

    The previous version also asked for a separator and passed it as the
    third positional argument.  That parameter of ``merge_markdown_files``
    is ``images_subdir``, so the separator text was used as the name of the
    image folder.  The merger has no separator option, so the prompt is
    dropped and the image folder keeps its default.
    """
    from merge_markdown import merge_markdown_files

    input_dir = input("병합할 Markdown 폴더 [기본: output]: ").strip() or "output"
    output_file = input("병합 결과 파일명 [기본: merged_all.md]: ").strip() or "merged_all.md"
    merge_markdown_files(input_dir, output_file)
def _cli_first_file(pattern, kind):
    """Return the first (sorted) file matching *pattern*, or exit with an error."""
    matches = sorted(glob.glob(pattern))
    if not matches:
        print(f"ERROR: input/ 폴더에 {kind} 파일이 없습니다.")
        sys.exit(1)
    return matches[0]


def _cli_arg(extra, position, default):
    """Return ``extra[position]`` when it was supplied, else *default*."""
    return extra[position] if len(extra) > position else default


def run_cli(args):
    """Run one scenario directly from command-line arguments (non-interactive).

    With no arguments the interactive menu is started instead.

    Usage examples:
        python main.py 1 path/to/file.pdf output/
        python main.py 4 input/ output/
        python main.py 6 path/to/book.epub output/

    Args:
        args: Command-line arguments without the program name.  ``args[0]``
            is the 1-based scenario number; the remaining items are the
            scenario's positional options (paths, worker count).

    Exits with status 1 when the scenario number is invalid, when a default
    input file is needed but none exists, or when the worker count is not a
    positive integer.
    """
    if not args:
        run_interactive()
        return

    scenario_num = args[0]
    if not scenario_num.isdigit() or not (1 <= int(scenario_num) <= len(SCENARIOS)):
        print(f"ERROR: 시나리오 번호는 1~{len(SCENARIOS)} 사이여야 합니다.")
        sys.exit(1)

    idx = int(scenario_num) - 1
    label, _ = SCENARIOS[idx]
    print(f"▶ {label}")

    # The scenario functions read their options through input(); here the
    # underlying converters are called directly with the CLI arguments.
    extra = args[1:]

    if idx == 0:  # single PDF, with images
        from convert_with_cropped_images import convert_pdf_with_cropped_images
        pdf_path = extra[0] if extra else _cli_first_file("input/*.pdf", "PDF")
        convert_pdf_with_cropped_images(pdf_path, _cli_arg(extra, 1, "output"))

    elif idx == 1:  # single PDF, fast
        from convert_pdfs_fast import convert_pdf_to_markdown_fast
        pdf_path = extra[0] if extra else _cli_first_file("input/*.pdf", "PDF")
        convert_pdf_to_markdown_fast(pdf_path, _cli_arg(extra, 1, "output"))

    elif idx == 2:  # PDF batch, with images
        from convert_with_cropped_images import convert_all_pdfs
        convert_all_pdfs(_cli_arg(extra, 0, "input"), _cli_arg(extra, 1, "output"))

    elif idx == 3:  # PDF batch, fast
        from convert_pdfs_fast import convert_all_pdfs_fast
        convert_all_pdfs_fast(_cli_arg(extra, 0, "input"), _cli_arg(extra, 1, "output"))

    elif idx == 4:  # PDF batch, parallel
        from convert_pdfs_parallel import convert_all_pdfs_parallel
        workers_text = _cli_arg(extra, 2, "2")
        try:
            workers = int(workers_text)
        except ValueError:
            workers = 0
        if workers < 1:
            print(f"ERROR: invalid worker count: {workers_text!r}")
            sys.exit(1)
        convert_all_pdfs_parallel(
            _cli_arg(extra, 0, "input"), _cli_arg(extra, 1, "output"), workers
        )

    elif idx == 5:  # single EPUB
        from convert_epub import convert_epub_to_markdown
        epub_path = extra[0] if extra else _cli_first_file("input/*.epub", "EPUB")
        convert_epub_to_markdown(epub_path, _cli_arg(extra, 1, "output"))

    elif idx == 6:  # EPUB batch
        from convert_epub import convert_epub_to_markdown
        inp = _cli_arg(extra, 0, "input")
        out = _cli_arg(extra, 1, "output")
        for ep in sorted(glob.glob(os.path.join(inp, "*.epub"))):
            print(f"\n→ {Path(ep).name}")
            convert_epub_to_markdown(ep, out)

    elif idx == 7:  # image extraction
        from extract_images import extract_all_images
        extract_all_images(_cli_arg(extra, 0, "input"), _cli_arg(extra, 1, "output"))

    elif idx == 8:  # Markdown merge
        from merge_markdown import merge_markdown_files
        merge_markdown_files(
            _cli_arg(extra, 0, "output"), _cli_arg(extra, 1, "merged_all.md")
        )

    elif idx == 9:  # image path update
        from update_image_paths import update_all_markdown_files
        update_all_markdown_files(_cli_arg(extra, 0, "output"))
def merge_markdown_files(
    input_dir: str = "output",
    output_file: str = "merged_all.md",
    images_subdir: str = "images",
    file_range: tuple = None,  # e.g. (6, 8) to process only pages 06~08
):
    """Merge the numbered Markdown files in *input_dir* into one document.

    Only files whose name ends in ``-<digits>.md`` are merged (this skips
    previously merged outputs). They are ordered by name prefix and then by
    the numeric value of the trailing number, so ``-9.md`` precedes ``-10.md``.
    Every local image reference is copied into
    ``<input_dir>/<images_subdir>/`` under the name ``p<NN>_<name>``, where
    ``<NN>`` is the number as written in the Markdown filename and ``<name>``
    is the image filename with a leading ``_page_<n>_`` stripped; the
    reference is rewritten to that new location. References to external
    resources (``scheme:`` URLs such as ``https:`` or ``data:``, or ``//host``)
    are left untouched. The parts are joined with a ``---`` rule and written
    to ``<input_dir>/<output_file>``.

    Args:
        input_dir: Directory holding the source ``.md`` files and their images.
        output_file: Name of the merged file, created inside *input_dir*.
        images_subdir: Sub-directory of *input_dir* for the consolidated images.
        file_range: Optional inclusive ``(start, end)`` pair; only files whose
            trailing number lies in that range are merged.

    Returns:
        None. Progress and a summary are printed to stdout.
    """
    # Local import so the block does not depend on a new file-level import.
    from urllib.parse import unquote

    numbered_md = re.compile(r'-(\d+)\.md$')
    image_ref = re.compile(r'!\[([^\]]*)\]\(([^)]+)\)')
    # A scheme needs two or more characters here so that a Windows drive
    # letter ("C:\...") is still treated as a local path.
    external_ref = re.compile(r'^(?:[A-Za-z][A-Za-z0-9+.\-]+:|//)')
    page_prefix = re.compile(r'^_page_\d+_')

    # (name prefix, numeric page, path, page as written) -> natural ordering.
    numbered = []
    for path in glob.glob(os.path.join(input_dir, "*.md")):
        m = numbered_md.search(path)
        if m:
            numbered.append((path[:m.start()], int(m.group(1)), path, m.group(1)))
    numbered.sort()

    if file_range:
        start, end = file_range
        numbered = [entry for entry in numbered if start <= entry[1] <= end]

    if not numbered:
        print(f"No markdown files found in {input_dir}")
        return

    print(f"Files to merge: {len(numbered)}")
    for entry in numbered:
        print(f" {Path(entry[2]).name}")
    print("=" * 60)

    # Create unified images directory
    unified_images_path = os.path.join(input_dir, images_subdir)
    os.makedirs(unified_images_path, exist_ok=True)

    merged_content = []

    for _, _, md_file, page_num in numbered:
        print(f"Processing [{page_num}] {Path(md_file).name} ...")

        with open(md_file, "r", encoding="utf-8") as f:
            content = f.read()

        def replace_image(match, page_num=page_num):
            """Copy one referenced image and return its rewritten reference."""
            alt = match.group(1)
            old_path = match.group(2)

            # Remote/inline resources cannot be copied; keep the link as is.
            if external_ref.match(old_path):
                return match.group(0)

            # Markdown paths are percent-encoded (e.g. %20 for a space);
            # decode every escape to obtain the real filesystem path.
            old_path_decoded = unquote(old_path)

            # _page_0_Figure_3.jpeg -> Figure_3.jpeg -> p06_Figure_3.jpeg
            # NOTE(review): two images of one file that differ only in the
            # stripped _page_N_ part would map to the same name - confirm
            # each source file only ever contains a single page index.
            img_filename = Path(old_path_decoded).name
            new_name = f"p{page_num}_{page_prefix.sub('', img_filename)}"

            src = os.path.join(input_dir, old_path_decoded)
            dst = os.path.join(unified_images_path, new_name)
            if os.path.exists(src):
                shutil.copy2(src, dst)
            else:
                print(f" WARNING: image not found: {src}")

            return f"![{alt}]({images_subdir}/{new_name})"

        merged_content.append(image_ref.sub(replace_image, content))

    final_content = "\n\n---\n\n".join(merged_content)

    output_path = os.path.join(input_dir, output_file)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(final_content)

    print("\n" + "=" * 60)
    print(f"SUCCESS: {output_path}")
    print(f" Files merged : {len(merged_content)}")
    print(f" Total chars : {len(final_content):,}")
def update_markdown_image_paths(md_path: str, output_dir: str = "output"):
    """Rewrite ``.jpeg`` image references of one Markdown file to extracted PNGs.

    References of the form ``![](..._page_<N>_....jpeg)`` are replaced with
    ``![](<stem>_extracted_images/page_<N>_img_1.png)``, where ``<stem>`` is
    the Markdown filename without its extension. The result is written next to
    the input as ``<stem>_updated.md``; the input file is never modified.

    Args:
        md_path: Path of the Markdown file to process.
        output_dir: Directory expected to contain ``<stem>_extracted_images``.

    Returns:
        False when the extracted-images folder does not exist, True otherwise
        (including when the file needed no changes).
    """
    md_file = Path(md_path)
    base_name = md_file.stem

    # Folder that holds the images extracted for this document.
    extracted_images_dir = f"{base_name}_extracted_images"
    extracted_images_path = os.path.join(output_dir, extracted_images_dir)
    if not os.path.exists(extracted_images_path):
        print(f"No extracted images folder found: {extracted_images_path}")
        return False

    with open(md_path, 'r', encoding='utf-8') as f:
        original_content = f.read()

    jpeg_ref = re.compile(r'!\[\]\(([^)]+\.jpeg)\)')
    page_marker = re.compile(r'_page_(\d+)_')
    replaced = 0

    def replace_image_path(match):
        """Return the rewritten reference, or the original if it has no page."""
        nonlocal replaced
        page_match = page_marker.search(match.group(1))
        if page_match is None:
            return match.group(0)
        replaced += 1
        # NOTE(review): every figure of a page is mapped to that page's
        # img_1 - confirm this matches how the extractor names its files.
        return f'![]({extracted_images_dir}/page_{page_match.group(1)}_img_1.png)'

    content, total_refs = jpeg_ref.subn(replace_image_path, original_content)

    if content == original_content:
        print(f"No changes needed for {md_file.name}")
        return True

    # Build the output name from the filename only: a plain
    # str.replace('.md', ...) would also rewrite ".md" inside directory names
    # and would leave a path without ".md" unchanged (overwriting the input).
    output_path = str(md_file.with_name(f"{base_name}_updated.md"))
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(content)

    print(f"Updated markdown saved to: {output_path}")
    print(f" Replaced {replaced} image paths (out of {total_refs} references)")

    return True


def update_all_markdown_files(output_dir: str = "output"):
    """Run ``update_markdown_image_paths`` on every Markdown file in *output_dir*.

    Files ending in ``_updated.md`` (previous results) are skipped. Files are
    processed in sorted order so the output is deterministic.

    Args:
        output_dir: Directory containing the ``.md`` files and the
            ``*_extracted_images`` folders.
    """
    md_pattern = os.path.join(output_dir, "*.md")
    md_files = sorted(
        f for f in glob.glob(md_pattern) if not f.endswith('_updated.md')
    )

    if not md_files:
        print(f"No markdown files found in {output_dir}")
        return

    print(f"Found {len(md_files)} markdown files")
    print("=" * 60)

    for md_file in md_files:
        update_markdown_image_paths(md_file, output_dir)
        print()

    print("=" * 60)
    print("Done!")