diff --git a/03.Code/업로드용/converters/pipeline/step2_extract.py b/03.Code/업로드용/converters/pipeline/step2_extract.py
index fc4501d..0f4c4f1 100644
--- a/03.Code/업로드용/converters/pipeline/step2_extract.py
+++ b/03.Code/업로드용/converters/pipeline/step2_extract.py
@@ -2,12 +2,9 @@
 """
 extract_1_v2.py
 
-PDF에서 텍스트(md)와 이미지(png)를 추출하는 기능을 담당하는 모듈.
-- 원본 폴더 구조 유지
-- 이미지 추출 시 캡션(예: <그림 1>)과 연결
-- 헤더/푸터 제외 로직 포함
-- OCR 옵션 지원 (Tesseract 설치 필요)
-- JSON 기반 메타데이터 기록 (이미지경로, 캡션 등)
+PDF에서 텍스트(md)와 이미지(png)를 추출
+- 하위 폴더 구조 유지
+- 이미지 메타데이터 JSON 생성 (폴더경로, 파일명, 페이지, 위치, 캡션 등)
 """
 
 import fitz  # PyMuPDF
@@ -30,15 +27,14 @@ try:
     TESSERACT_AVAILABLE = True
 except ImportError:
     TESSERACT_AVAILABLE = False
-    print("[INFO] pytesseract 미설치. 이미지 텍스트 분석 기능이 제한됩니다.")
+    print("[INFO] pytesseract 미설치 - 텍스트 잘림 필터 비활성화")
+
 
-# ===== 설정 및 상수 =====
 CAPTION_PATTERN = re.compile(
-    r'^\s*(?:[<\[\(\{]\s*)?(그림|figure|fig)\s*\.?\s*(?:[<\[\(\{]\s*)?0*\d+(?:\s*[-~]\s*\d+)?',
+    r'^\s*(?:[<\[\(\{]\s*)?(그림|figure|fig)\s*\.?\s*(?:[<\[\(\{]\s*)?0*\d+(?:\s*[-–]\s*\d+)?',
     re.IGNORECASE
 )
 
-# ===== 이미지 추출 및 캡션 매칭 핵심 로직 =====
 
 def get_figure_rects(page):
     """
@@ -49,7 +45,7 @@ def get_figure_rects(page):
 
     blocks = page.get_text("blocks")
     captions = []
-
+     
     for i, b in enumerate(blocks):
         text = b[4]
         if CAPTION_PATTERN.search(text):
@@ -78,7 +74,7 @@ def get_figure_rects(page):
 
     remaining_drawings = filtered_drawings_rects + img_rects
     caption_clusters = {cap['index']: [cap['rect']] for cap in captions}
-
+    
     def is_text_between(r1, r2, text_blocks):
         if r1.intersects(r2):
             return False
@@ -86,7 +82,7 @@ def get_figure_rects(page):
         for b in text_blocks:
             b_rect = fitz.Rect(b[:4])
             text_content = b[4]
-            if len(text_content.strip()) < 20:
+            if len(text_content.strip()) < 20: 
                 continue
             if not b_rect.intersects(union):
                 continue
@@ -99,11 +95,11 @@ def get_figure_rects(page):
     while changed:
         changed = False
         to_remove = []
-
+        
         for d_rect in remaining_drawings:
             best_cluster_key = None
             min_dist = float('inf')
-
+            
             for cap_index, cluster_rects in caption_clusters.items():
                 for r in cluster_rects:
                     dist = 0
@@ -113,57 +109,57 @@ def get_figure_rects(page):
                         x_dist = 0
                         if d_rect.x1 < r.x0: x_dist = r.x0 - d_rect.x1
                         elif d_rect.x0 > r.x1: x_dist = d_rect.x0 - r.x1
-
+                        
                         y_dist = 0
                         if d_rect.y1 < r.y0: y_dist = r.y0 - d_rect.y1
                         elif d_rect.y0 > r.y1: y_dist = d_rect.y0 - r.y1
-
+                        
                         if x_dist < 150 and y_dist < 150:
-                            dist = max(x_dist, y_dist) + 0.1
+                            dist = max(x_dist, y_dist) + 0.1 
                         else:
                             dist = float('inf')
-
+                    
                     if dist < min_dist:
                          if not is_text_between(r, d_rect, blocks):
                              min_dist = dist
                              best_cluster_key = cap_index
-
-                if min_dist == 0:
+                
+                if min_dist == 0: 
                     break
-
+            
             if best_cluster_key is not None and min_dist < 150:
                 caption_clusters[best_cluster_key].append(d_rect)
                 to_remove.append(d_rect)
                 changed = True
-
+        
         for r in to_remove:
             remaining_drawings.remove(r)
-
+            
     figure_regions = []
-
+    
     for cap in captions:
         cluster_rects = caption_clusters[cap['index']]
-        content_rects = cluster_rects[1:]
-
+        content_rects = cluster_rects[1:] 
+        
         if not content_rects:
             continue
-
+            
         union_rect = content_rects[0]
         for r in content_rects[1:]:
             union_rect = union_rect | r
-
+            
         union_rect.x0 = max(0, union_rect.x0 - 5)
         union_rect.x1 = min(page.rect.width, union_rect.x1 + 5)
         union_rect.y0 = max(0, union_rect.y0 - 5)
         union_rect.y1 = min(page.rect.height, union_rect.y1 + 5)
-
+        
         cap_rect = cap['rect']
-
+        
         if cap_rect.y0 + cap_rect.height/2 < union_rect.y0 + union_rect.height/2:
-             if union_rect.y0 < cap_rect.y1: union_rect.y0 = cap_rect.y1 + 2
+             if union_rect.y0 < cap_rect.y1: union_rect.y0 = cap_rect.y1 + 2 
         else:
-             if union_rect.y1 > cap_rect.y0: union_rect.y1 = cap_rect.y0 - 2
-
+             if union_rect.y1 > cap_rect.y0: union_rect.y1 = cap_rect.y0 - 2 
+             
         area = union_rect.get_area()
         page_area = page.rect.get_area()
 
@@ -194,7 +190,7 @@ def get_figure_rects(page):
             'rect': union_rect,
             'caption_index': cap['index'],
             'caption_rect': cap['rect'],
-            'caption_text': cap['text'].strip()  # 원본 캡션 텍스트 유지
+            'caption_text': cap['text'].strip()  # ★ 캡션 텍스트 저장
         })
 
     return figure_regions
@@ -224,28 +220,29 @@ def keep_figure(pix):
     return True, nonwhite_ratio, edge_ratio, var
 
 
-# ===== 추가 이미지 필터링 알고리즘 (v2.1) =====
+# ===== 추가 이미지 필터 함수들 (v2.1) =====
 
 def pix_to_pil(pix):
     """PyMuPDF Pixmap을 PIL Image로 변환"""
-    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
-    return img
+    img_data = pix.tobytes("png")
+    return Image.open(io.BytesIO(img_data))
+
 
 def has_cut_text_at_boundary(pix, margin=5):
     """
-    이미지 경계선에 텍스트가 잘려 있는지 확인
-    - 이미지 주변에 근접한 텍스트 박스가 있으면 필터링 대상으로 판단
+    이미지 경계에서 텍스트가 잘렸는지 감지
+    - 이미지 테두리 근처에 텍스트 박스가 있으면 잘린 것으로 판단
     
     Args:
         pix: PyMuPDF Pixmap
-        margin: 경계선으로부터의 여백 (기본 5px)
-        
+        margin: 경계로부터의 여유 픽셀 (기본 5px)
+    
     Returns:
-        bool: 텍스트가 잘린 경우 True
+        bool: 텍스트가 잘렸으면 True
     """
     if not TESSERACT_AVAILABLE:
-        return False  # OCR 없으면 우선 통과
-
+        return False  # OCR 없으면 필터 비활성화
+    
     try:
         img = pix_to_pil(pix)
         width, height = img.size
@@ -255,68 +252,69 @@ def has_cut_text_at_boundary(pix, margin=5):
         
         for i, text in enumerate(data['text']):
             text = str(text).strip()
-            if len(text) < 2:  # 너무 짧은 텍스트 무시
+            if len(text) < 2:  # 너무 짧은 텍스트는 무시
                 continue
-                
+            
             x = data['left'][i]
             y = data['top'][i]
             w = data['width'][i]
             h = data['height'][i]
             
-            # 텍스트가 상하좌우 경계선에 너무 가깝다면 = 잘린 텍스트 박스일 가능성 높음
-            # 좌측 경계
+            # 텍스트가 이미지 경계에 너무 가까우면 = 잘린 것
+            # 왼쪽 경계
             if x <= margin:
                 return True
-            # 우측 경계
+            # 오른쪽 경계
             if x + w >= width - margin:
                 return True
-            # 상단 경계 (제목 형태는 제외하기 위해 높이 제한 추가)
+            # 상단 경계 (헤더 제외를 위해 좀 더 여유)
             if y <= margin and h < height * 0.3:
                 return True
             # 하단 경계
             if y + h >= height - margin:
                 return True
-                
+        
         return False
         
     except Exception as e:
-        # OCR 실패 시 필터링 없이 통과 (보수적 접근)
+        # OCR 실패 시 필터 통과 (이미지 유지)
         return False
 
 
 def is_decorative_background(pix, edge_threshold=0.02, color_var_threshold=500):
     """
-    배경 패턴(장식) 이미지인지 확인
-    - 엣지 비율이 낮고 (복잡한 도형/사진이 아님)
-    - 색상 분산이 낮거나 특정 범위 내인 경우 (단조로운 그라데이션 등)
+    배경 패턴 + 텍스트만 있는 장식용 이미지인지 감지
+    - 엣지가 적고 (복잡한 도표/사진이 아님)
+    - 색상 다양성이 낮으면 (단순 그라데이션 배경)
     
     Args:
         pix: PyMuPDF Pixmap
         edge_threshold: 엣지 비율 임계값 (기본 0.02 = 2%)
         color_var_threshold: 색상 분산 임계값
-        
+    
     Returns:
-        bool: 배경 이미지인 경우 True
+        bool: 장식용 배경이면 True
     """
     try:
         nonwhite_ratio, edge_ratio, var = pixmap_metrics(pix)
         
-        # 엣지 비율이 2% 미만이면서 단조로운 색상 분포라면 배경 패턴 가능성 높음
+        # 엣지가 거의 없고 (단순한 이미지)
+        # 색상 분산도 낮으면 (배경 패턴)
         if edge_ratio < edge_threshold and var < color_var_threshold:
-            # 추가적으로 텍스트 이미지인지 OCR로 체크 가능
+            # 추가 확인: 텍스트만 있는지 OCR로 체크
             if TESSERACT_AVAILABLE:
                 try:
                     img = pix_to_pil(pix)
                     text = pytesseract.image_to_string(img, lang='kor+eng').strip()
                     
-                    # 텍스트가 있고, 엣지 비율이 아주 낮다면 = 텍스트 배경 장식
+                    # 텍스트가 있고, 이미지가 단순하면 = 텍스트 배경
                     if len(text) > 3 and edge_ratio < 0.015:
                         return True
                 except:
                     pass
-                    
-            return True
             
+            return True
+        
         return False
         
     except Exception:
@@ -325,14 +323,15 @@ def is_decorative_background(pix, edge_threshold=0.02, color_var_threshold=500):
 
 def is_header_footer_region(rect, page_rect, height_threshold=0.12):
     """
-    헤더/푸터 영역에 포함되는지 확인
-    - 상단 12% 또는 하단 12%에 위치한 작은 이미지는 필터링
+    헤더/푸터 영역에 있는 이미지인지 감지
+    - 페이지 상단 12% 또는 하단 12%에 위치
+    - 높이가 낮은 strip 형태
     
     Args:
         rect: 이미지 영역 (fitz.Rect)
-        page_rect: 전체 페이지 영역 (fitz.Rect)
+        page_rect: 페이지 전체 영역 (fitz.Rect)
         height_threshold: 헤더/푸터 영역 비율 (기본 12%)
-        
+    
     Returns:
         bool: 헤더/푸터 영역이면 True
     """
@@ -341,68 +340,68 @@ def is_header_footer_region(rect, page_rect, height_threshold=0.12):
     
     # 상단 영역 체크
     if rect.y0 < page_height * height_threshold:
-        # 매우 얇은 이미지(구분선 등)나 작은 로고 등
+        # 높이가 페이지의 15% 미만인 strip이면 헤더
         if img_height < page_height * 0.15:
             return True
-            
+    
     # 하단 영역 체크
     if rect.y1 > page_height * (1 - height_threshold):
-        # 푸터 영역의 작은 이미지
+        # 높이가 페이지의 15% 미만인 strip이면 푸터
         if img_height < page_height * 0.15:
             return True
-            
+    
     return False
 
 
 def should_filter_image(pix, rect, page_rect):
     """
-    여러 필터링 규칙을 종합하여 이미지 보존 여부 결정
+    이미지를 필터링해야 하는지 종합 판단
     
     Args:
         pix: PyMuPDF Pixmap
         rect: 이미지 영역
-        page_rect: 전체 페이지 영역
-        
+        page_rect: 페이지 전체 영역
+    
     Returns:
-        tuple: (필터링 여부, 필터링 이유)
+        tuple: (필터링 여부, 필터링 사유)
     """
     # 1. 헤더/푸터 영역 체크
     if is_header_footer_region(rect, page_rect):
         return True, "header_footer"
-        
-    # 2. 잘린 텍스트 포함 여부 체크
+    
+    # 2. 텍스트 잘림 체크
     if has_cut_text_at_boundary(pix):
         return True, "cut_text"
-        
-    # 3. 배경 장식 여부 체크
+    
+    # 3. 장식용 배경 체크
     if is_decorative_background(pix):
         return True, "decorative_background"
-        
+    
     return False, None
 
 
 def extract_pdf_content(pdf_path, output_md_path, img_dir, metadata):
     """
-    PDF 내용 추출 메인 함수
-
+    PDF 내용 추출
+    
     Args:
-        pdf_path: PDF 경로
-        output_md_path: 출력 MD 경로
+        pdf_path: PDF 파일 경로
+        output_md_path: 출력 MD 파일 경로
         img_dir: 이미지 저장 폴더
-        metadata: 메타데이터 정보 (폴더 경로, 파일명 등)
-
+        metadata: 메타데이터 딕셔너리 (폴더 경로, 파일명 등)
+    
     Returns:
-        image_metadata_list: 추출된 이미지 메타데이터 리스트
+        image_metadata_list: 추출된 이미지들의 메타데이터 리스트
     """
     os.makedirs(img_dir, exist_ok=True)
-
-    image_metadata_list = []  # 이미지 메타데이터 정보 수집
-
+    
+    image_metadata_list = []  # ★ 이미지 메타데이터 수집
+    
     doc = fitz.open(pdf_path)
     total_pages = len(doc)
-
+    
     with open(output_md_path, "w", encoding="utf-8") as md_file:
-        # 문서 메타데이터 정보 추가
+        # ★ 메타데이터 헤더 추가
         md_file.write(f"---\n")
         md_file.write(f"source_pdf: {metadata['pdf_name']}\n")
         md_file.write(f"source_folder: {metadata['relative_folder']}\n")
@@ -410,13 +409,13 @@ def extract_pdf_content(pdf_path, output_md_path, img_dir, metadata):
         md_file.write(f"extracted_at: {datetime.now().isoformat()}\n")
         md_file.write(f"---\n\n")
         md_file.write(f"# {metadata['pdf_name']}\n\n")
-
+        
         for page_num, page in enumerate(doc):
             md_file.write(f"\n## Page {page_num + 1}\n\n")
             img_rel_dir = os.path.basename(img_dir)
-
+            
             figure_regions = get_figure_rects(page)
-
+            
             kept_figures = []
             for i, fig in enumerate(figure_regions):
                 rect = fig['rect']
@@ -427,11 +426,11 @@ def extract_pdf_content(pdf_path, output_md_path, img_dir, metadata):
 
                 pix = page.get_pixmap(clip=rect, dpi=150, colorspace=fitz.csRGB)
                 
-                # 추가 필터링 로직 적용 (v2.1)
+                # ★ 추가 필터 적용 (v2.1)
                 should_filter, filter_reason = should_filter_image(pix, rect, page.rect)
                 if should_filter:
                     continue
-
+                
                 img_name = f"p{page_num + 1:03d}_fig{len(kept_figures):02d}.png"
                 img_path = os.path.join(img_dir, img_name)
                 pix.save(img_path)
@@ -439,8 +438,8 @@ def extract_pdf_content(pdf_path, output_md_path, img_dir, metadata):
                 fig['img_path'] = os.path.join(img_rel_dir, img_name).replace("\\", "/")
                 fig['img_name'] = img_name
                 kept_figures.append(fig)
-
-                # 이미지 메타데이터 수집
+                
+                # ★ 이미지 메타데이터 수집
                 image_metadata_list.append({
                     "image_file": img_name,
                     "image_path": str(Path(img_dir) / img_name),
@@ -541,11 +540,11 @@ def extract_pdf_content(pdf_path, output_md_path, img_dir, metadata):
                         clip=block_rect, dpi=150, colorspace=fitz.csRGB
                     )
                     
-                    # 추가 필터링 로직 적용 (v2.1)
+                    # ★ 추가 필터 적용 (v2.1)
                     should_filter, filter_reason = should_filter_image(pix, block_rect, page.rect)
                     if should_filter:
                         continue
-
+                    
                     img_name = f"p{page_num + 1:03d}_photo{uncaptioned_idx:02d}.png"
                     img_path = os.path.join(img_dir, img_name)
                     pix.save(img_path)
@@ -563,8 +562,8 @@ def extract_pdf_content(pdf_path, output_md_path, img_dir, metadata):
                         "kind_order": 1,
                         "md": md,
                     })
-
-                    # 이미지 메타데이터 수집
+                    
+                    # ★ 캡션 없는 이미지 메타데이터
                     image_metadata_list.append({
                         "image_file": img_name,
                         "image_path": str(Path(img_dir) / img_name),
@@ -586,7 +585,7 @@ def extract_pdf_content(pdf_path, output_md_path, img_dir, metadata):
                     uncaptioned_idx += 1
                     continue
 
-            # 레이아웃 정렬
+            # 읽기 순서 정렬
             text_items = [it for it in items if it["kind"] == "text"]
             page_w = page.rect.width
             mid = page_w / 2.0
@@ -669,18 +668,17 @@ def extract_pdf_content(pdf_path, output_md_path, img_dir, metadata):
 
 def process_all_pdfs(input_dir, output_dir):
     """
-    BASE_DIR 내의 모든 PDF를 순차적으로 처리
-    폴더 구조를 유지하여 OUTPUT_BASE에 저장
+    BASE_DIR 하위의 모든 PDF를 재귀적으로 처리
+    폴더 구조를 유지하면서 OUTPUT_BASE에 저장
     """
     BASE_DIR = Path(input_dir)
     OUTPUT_BASE = Path(output_dir)
-    
     # 출력 폴더 생성
     OUTPUT_BASE.mkdir(parents=True, exist_ok=True)
-
-    # 전체 추출 된 이미지 메타데이터 통합
+    
+    # 전체 이미지 메타데이터 수집
     all_image_metadata = []
-
+    
     # 처리 통계
     stats = {
         "total_pdfs": 0,
@@ -688,22 +686,22 @@ def process_all_pdfs(input_dir, output_dir):
         "failed": 0,
         "total_images": 0
     }
-
+    
     # 실패 로그
     failed_files = []
-
+    
     print(f"=" * 60)
-    print(f"PDF 콘텐츠 추출 시작")
-    print(f"소스 폴더: {BASE_DIR}")
+    print(f"PDF 추출 시작")
+    print(f"원본 폴더: {BASE_DIR}")
     print(f"출력 폴더: {OUTPUT_BASE}")
     print(f"=" * 60)
-
+    
     # 모든 PDF 파일 찾기
-    pdf_files = list(BASE_DIR.rglob("*.pdf")) + list(BASE_DIR.rglob("*.PDF"))
+    pdf_files = list(BASE_DIR.rglob("*.[pP][dD][fF]"))  # one pass: any-case extension, no duplicates
     stats["total_pdfs"] = len(pdf_files)
-
-    print(f"발견된 PDF: {len(pdf_files)}개\n")
-
+    
+    print(f"\n총 {len(pdf_files)}개 PDF 발견\n")
+    
     for idx, pdf_path in enumerate(pdf_files, 1):
         try:
             # 상대 경로 계산
@@ -714,14 +712,14 @@ def process_all_pdfs(input_dir, output_dir):
             
             pdf_name = pdf_path.name
             pdf_stem = pdf_path.stem
-
+            
             # 출력 경로 설정 (폴더 구조 유지)
             output_folder = OUTPUT_BASE / relative_path.parent
             output_folder.mkdir(parents=True, exist_ok=True)
-
+            
             output_md = output_folder / f"{pdf_stem}.md"
             img_folder = output_folder / f"{pdf_stem}_img"
-
+            
             # 메타데이터 준비
             metadata = {
                 "pdf_name": pdf_name,
@@ -729,37 +727,37 @@ def process_all_pdfs(input_dir, output_dir):
                 "relative_folder": relative_folder,
                 "full_path": str(relative_path),
             }
-
+            
             print(f"[{idx}/{len(pdf_files)}] {relative_path}")
-
+            
             # PDF 처리
             image_metas = extract_pdf_content(
-                str(pdf_path), 
-                str(output_md), 
-                str(img_folder), 
+                str(pdf_path),
+                str(output_md),
+                str(img_folder),
                 metadata
             )
-
+            
             all_image_metadata.extend(image_metas)
             stats["success"] += 1
             stats["total_images"] += len(image_metas)
-
-            print(f"    완료 (약 {len(image_metas)}개 이미지 추출)")
-
+            
+            print(f"    ✓ 완료 (이미지 {len(image_metas)}개)")
+            
         except Exception as e:
             stats["failed"] += 1
             failed_files.append({
                 "file": str(pdf_path),
                 "error": str(e)
             })
-            print(f"    오류 발생: {e}")
-
+            print(f"    ✗ 실패: {e}")
+    
     # 전체 이미지 메타데이터 저장
     meta_output_path = OUTPUT_BASE / "image_metadata.json"
     with open(meta_output_path, "w", encoding="utf-8") as f:
         json.dump(all_image_metadata, f, ensure_ascii=False, indent=2)
-
-    # 처리 결과 요약 저장
+    
+    # 처리 요약 저장
     summary = {
         "processed_at": datetime.now().isoformat(),
         "source_dir": str(BASE_DIR),
@@ -767,21 +765,33 @@ def process_all_pdfs(input_dir, output_dir):
         "statistics": stats,
         "failed_files": failed_files
     }
-
+    
     summary_path = OUTPUT_BASE / "extraction_summary.json"
     with open(summary_path, "w", encoding="utf-8") as f:
         json.dump(summary, f, ensure_ascii=False, indent=2)
-
+    
     # 결과 출력
     print(f"\n" + "=" * 60)
-    print(f"추출 작업 완료!")
+    print(f"추출 완료!")
     print(f"=" * 60)
-    print(f"총 대상: {stats['total_pdfs']}개")
+    print(f"총 PDF: {stats['total_pdfs']}개")
     print(f"성공: {stats['success']}개")
     print(f"실패: {stats['failed']}개")
     print(f"추출된 이미지: {stats['total_images']}개")
     print(f"\n이미지 메타데이터: {meta_output_path}")
     print(f"처리 요약: {summary_path}")
-
+    
     if failed_files:
-        print(f"\n실패한 파일 목록은 summary_path에서 확인 가능합니다.")
+        print(f"\n실패한 파일:")
+        for f in failed_files:
+            print(f"  - {f['file']}: {f['error']}")
+
+
+if __name__ == "__main__":
+    import sys
+
+    # process_all_pdfs() requires both directories and has no defaults,
+    # so read them from the command line instead of calling it bare.
+    if len(sys.argv) != 3:
+        sys.exit(f"usage: {sys.argv[0]} <input_dir> <output_dir>")
+    process_all_pdfs(sys.argv[1], sys.argv[2])