파일분류, csv output, 세션유지 기능 추가

2025-08-07 11:15:51 +09:00
parent 47b7ecf34e
commit 1b96840c83
5 changed files with 328 additions and 81 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -31,3 +31,4 @@ ENV/
 /results
 script_run.log
 /ocr_data
 /workspace/shared_sessions
--- a/copy_files.py
+++ b/copy_files.py
@@ -0,0 +1,112 @@
 import os
 import shutil
 def copy_target_files_with_structure(
        root_source_dir='/home/jackjack/test/ocr_macro/ocr_data',
        dest_dir='/home/jackjack/test/ocr_macro/filtered_data',
        files_to_copy=None):
    """Copy selected files out of a directory tree, keeping their relative layout.

    The tree under ``root_source_dir`` is walked recursively. Every file whose
    name is in ``files_to_copy`` is copied (with metadata, via ``shutil.copy2``)
    to the same relative sub-directory under ``dest_dir``. If the same name
    occurs in several directories, only the first one met during the walk is
    copied. Names that were never found are listed on stdout at the end.

    Args:
        root_source_dir: Directory to search. Defaults to the original
            hard-coded OCR data location.
        dest_dir: Directory receiving the copies; created when missing.
        files_to_copy: Iterable of bare file names to look for. ``None``
            selects the built-in list below.

    Returns:
        None. Results are reported through ``print``.
    """
    if files_to_copy is None:
        # Built-in selection used when the caller does not supply its own list.
        files_to_copy = [
            "20250701101504-789-402-926.jpg", "20250707164925-895-935-673.jpg",
            "20250708092450-131-769-592.jpg", "20250708124502-268-927-842.jpg",
            "20250709105123-169-457-765.jpg", "20250714150847-882-936-950.jpg",
            "20250721090407-508-883-473.jpg", "20250724145851-721-283-914.jpg",
            "20250729105852-697-150-153.jpg", "20250730180509-798-917-821.jpg",
            "20170619133715-134-321-633.jpg", "20171017141811-255-321-370.jpg",
            "20180103094436-462-212-348.jpg", "20180131103459-868-481-465.jpg",
            "20180411134455-600-132-301.jpg", "20180412092830-356-712-939.jpg",
            "20180807102155-126-746-229.jpg", "20190507165642-222-795-363.jpg",
            "20191227103340-434-827-409.jpg", "20200113103330-999-251-437.png",
            "20200313140454-282-318-706.jpg", "20201203162517-973-818-382.jpg",
            "20250305150305-354-816-193.jpg", "2018-0319102207-217049.pdf",
            "2018-0319114254-217049.pdf", "2021-0713114710-219044.pdf",
            "2021-0713114843-219044.pdf", "2024-1129132456-223033.pdf",
            "2024-1202134504-223033.pdf", "2024-1202134828-223033.pdf",
            "2024-1216141625-211046.pdf", "2024-1231131430-223033.pdf",
            "2025-0102114806-223033.pdf", "2025-0102115602-223033.pdf",
            "20250715092937-779-181-466.jpg", "20250715110944-951-537-524.jpg",
            "20250715111622-358-588-698.jpg", "20250715112411-186-289-669.jpg",
            "20250715135137-801-844-961.jpg", "20250715161950-712-251-637.jpg",
            "20250715162045-552-568-375.jpg", "20250715165509-176-474-591.jpg",
            "20250715172557-573-573-629.jpg", "20250716093130-913-217-747.jpg",
            "20250716105706-162-939-389.jpg", "20250716110134-808-994-942.jpg",
            "20250716134023-322-796-383.jpg", "20250716163458-700-360-433.jpg",
            "20250717093052-782-277-690.jpg", "20250717103222-584-701-241.jpg",
            "20250717103712-214-193-157.jpg", "20250717110901-449-871-865.jpg",
            "20250717155048-253-564-315.jpg", "20250717172043-664-630-683.jpg",
            "20250718080610-968-626-824.jpg", "20250718093242-193-502-326.jpg",
            "20250718105942-802-175-536.jpg", "20250718154510-618-961-614.jpg",
            "20250718171201-832-262-559.jpg", "20250721103440-887-127-453.jpg",
            "20250721103440-949-954-201.jpg", "20250721103556-832-150-503.jpg",
            "20250721111443-531-701-811.jpg", "20250721111443-912-880-634.jpg",
            "20250721112249-956-647-309.jpg", "20250721130808-958-549-703.jpg",
            "20250721133831-152-461-423.jpg", "20250721145455-511-434-514.jpg",
            "20250721145455-875-554-320.jpg", "20250721145456-782-822-874.jpg",
            "20250721155757-121-923-232.jpg", "20250721160111-763-493-901.jpg",
            "20250721160359-227-567-869.jpg", "20250721160359-337-126-571.jpg",
            "20250721172118-534-854-174.jpg", "20250722083248-564-741-719.jpg",
            "20250722101426-428-671-780.jpg", "20250722101619-869-994-366.jpg",
            "20250722113040-790-828-516.jpg", "20250722113435-988-461-994.jpg",
            "20250722132834-142-640-698.jpg", "20250722151220-665-449-414.jpg",
            "20250722151447-194-809-212.jpg", "20250722151659-492-562-414.jpg",
            "20250722155515-295-661-246.jpg", "20250722164044-771-951-768.jpg",
            "20250723090127-752-277-978.jpg", "20250723103830-197-217-803.jpg",
            "20250723110935-882-617-879.jpg", "20250723113848-341-499-399.jpg",
            "20250723113849-860-361-766.jpg", "20250723135403-994-597-524.jpg",
            "20250723135644-957-724-435.jpg", "20250723140727-539-276-326.jpg",
            "20250723151024-958-230-632.jpg", "20250723160751-628-951-424.jpg",
            "20250723160846-651-369-917.jpg", "20250723162424-328-470-393.jpg",
            "20250724083131-482-629-632.jpg", "20250724084439-705-558-529.jpg",
            "20250724085219-940-177-263.jpg", "20250724112248-515-638-257.jpg",
            "20250724140126-814-266-218.jpg", "20250724165128-348-167-761.jpg",
            "20250724170756-316-660-852.jpg", "20250725084748-172-127-509.jpg",
            "20250725090550-647-253-595.jpg", "20250725103854-127-797-609.jpg",
            "20250725112611-877-225-953.jpg", "20250725150958-785-430-943.jpg",
            "20250725160005-618-961-614.jpg", "20250725160006-645-814-611.jpg",
            "20250728110536-229-869-218.jpg", "20250728110536-422-535-360.jpg",
            "20250728110536-848-126-746.jpg", "20250728133331-290-838-249.jpg",
            "20250728133631-893-551-661.jpg", "20250728133731-800-849-608.jpg",
            "20250728133919-745-435-884.jpg", "20250728141244-723-384-786.jpg",
            "20250728163719-158-329-264.jpg", "20250729091304-312-462-757.jpg",
            "20250729101639-845-837-748.jpg", "20250729150847-216-665-480.jpg",
            "20250729152047-863-915-863.jpg", "20250729152047-872-458-985.jpg",
            "20250729152047-915-601-759.jpg", "20250730093300-400-680-981.jpg",
            "20250730101956-808-881-885.jpg"
        ]
    # Build the lookup set once: the walk tests every file it meets, and a
    # list would make each of those tests a linear scan.
    wanted = set(files_to_copy)
    # Create the destination root if it does not exist yet.
    if not os.path.exists(dest_dir):
        os.makedirs(dest_dir)
        print(f"'{dest_dir}' 디렉토리를 생성했습니다.")
    copied_files = set()
    # Recursively visit every sub-directory below root_source_dir.
    for dirpath, _, filenames in os.walk(root_source_dir):
        for filename in filenames:
            # Skip names we were not asked for, and names already copied from
            # an earlier directory (first occurrence wins).
            if filename not in wanted or filename in copied_files:
                continue
            source_file = os.path.join(dirpath, filename)
            # Mirror the source layout: reuse the path relative to the root.
            relative_path = os.path.relpath(dirpath, root_source_dir)
            new_dest_dir = os.path.join(dest_dir, relative_path)
            os.makedirs(new_dest_dir, exist_ok=True)
            dest_file = os.path.join(new_dest_dir, filename)
            shutil.copy2(source_file, dest_file)
            print(f"'{filename}'을(를) '{new_dest_dir}'(으)로 복사했습니다.")
            copied_files.add(filename)
    # Report every requested name that was never found.
    missing_files = wanted - copied_files
    if missing_files:
        print("\n다음 파일들은 찾지 못했습니다:")
        for filename in sorted(missing_files):
            print(f"- {filename}")
 # Run the copy only when this file is executed as a script, not on import.
 if __name__ == "__main__":
    copy_target_files_with_structure()
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,3 +2,4 @@
 streamlit
 requests
 python-dotenv
 pandas
--- a/workspace/app.py
+++ b/workspace/app.py
@@ -1,19 +1,59 @@
-# app.py (개별 네비게이션 및 선택 드롭다운 동시 지원)
+
 # app.py (시드 기반 서버 사이드 세션 공유 기능)
 import streamlit as st
 import json
 from pathlib import Path
 import base64
 import uuid
 import shutil
 # --- 상수 ---
 # 스크립트 파일의 위치를 기준으로 경로 설정
 SESSION_BASE_PATH = Path(__file__).parent / "shared_sessions"
 # --- 헬퍼 함수 ---
-def match_uploaded_files(doc_files, json_files):
+def get_session_path(seed):
-    """
+    """시드에 해당하는 세션 디렉토리 경로를 반환합니다."""
-    업로드된 두 파일 목록을 받아, 이름(확장자 제외)을 기준으로 매칭하고
+    return SESSION_BASE_PATH / seed
-    결과를 딕셔너리로 반환합니다.
+
-    """
+def save_files_to_session(seed, doc_files, json_files):
    """업로드된 파일들을 서버의 세션 디렉토리에 저장합니다."""
    session_path = get_session_path(seed)
    doc_path = session_path / "docs"
    json_path = session_path / "jsons"
    # 기존 디렉토리가 있으면 삭제하고 새로 생성
    if session_path.exists():
        shutil.rmtree(session_path)
    doc_path.mkdir(parents=True, exist_ok=True)
    json_path.mkdir(parents=True, exist_ok=True)
    for file in doc_files:
        with open(doc_path / file.name, "wb") as f:
            f.write(file.getbuffer())
    for file in json_files:
        with open(json_path / file.name, "wb") as f:
            f.write(file.getbuffer())
 def load_files_from_session(seed):
    """Load the stored document and JSON file paths of a shared session.

    Args:
        seed: Session identifier; resolved to a directory by
            ``get_session_path``.

    Returns:
        A ``(doc_files, json_files)`` tuple of sorted ``Path`` lists taken
        from the session's ``docs`` and ``jsons`` sub-directories, or
        ``(None, None)`` when the session (or either sub-directory) does not
        exist on disk.
    """
    # NOTE(review): main() passes the raw "seed" URL query parameter here;
    # consider validating it before it is turned into a filesystem path.
    session_path = get_session_path(seed)
    doc_path = session_path / "docs"
    json_path = session_path / "jsons"
    # Require both sub-directories, not just the session root: a session that
    # is being rewritten by save_files_to_session (rmtree + mkdir) or was
    # pruned by hand would otherwise make iterdir() raise FileNotFoundError.
    if not (doc_path.is_dir() and json_path.is_dir()):
        return None, None
    return sorted(doc_path.iterdir()), sorted(json_path.iterdir())
 def match_disk_files(doc_files, json_files):
    """디스크에 저장된 두 파일 목록(Path 객체)을 매칭합니다."""
    matched_pairs = {}
-    docs_map = {Path(f.name).stem: f for f in doc_files}
+    docs_map = {f.stem: f for f in doc_files}
-    jsons_map = {Path(f.name).stem: f for f in json_files}
+    jsons_map = {f.stem: f for f in json_files}
    for stem, doc_file in docs_map.items():
        if stem in jsons_map:
@@ -23,111 +63,116 @@ def match_uploaded_files(doc_files, json_files):
            }
    return matched_pairs
-def display_pdf(file_object):
+def display_pdf(file_path_or_obj):
-    """
+    """파일 경로 또는 업로드된 파일 객체를 받아 PDF를 표시합니다."""
    업로드된 파일 객체(UploadedFile)를 읽어 PDF를 표시합니다.
    """
    try:
-        file_object.seek(0)
+        if isinstance(file_path_or_obj, Path):
-        base64_pdf = base64.b64encode(file_object.read()).decode('utf-8')
+            with open(file_path_or_obj, "rb") as f:
                bytes_data = f.read()
        else: # UploadedFile
            file_path_or_obj.seek(0)
            bytes_data = file_path_or_obj.read()
        base64_pdf = base64.b64encode(bytes_data).decode('utf-8')
        pdf_display = f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="100%" height="800" type="application/pdf"></iframe>'
        st.markdown(pdf_display, unsafe_allow_html=True)
    except Exception as e:
        st.error(f"PDF 파일을 표시하는 중 오류가 발생했습니다: {e}")
 # --- 콜백 함수 ---
 def handle_nav_button(direction, total_files):
    """Callback for the previous/next buttons: step current_index within bounds."""
    state = st.session_state
    if direction == "prev":
        # Never step below the first entry.
        if state.current_index > 0:
            state.current_index -= 1
        return
    # Never step past the last entry.
    if direction == "next" and state.current_index < total_files - 1:
        state.current_index += 1
 def handle_selectbox_change():
    """Callback for the selectbox: sync current_index with the chosen entry."""
    # The selected label looks like "1. filename"; the text before the first
    # ". " is the 1-based position of the entry.
    position_text, _, _ = st.session_state.selectbox_key.partition('. ')
    st.session_state.current_index = int(position_text) - 1
 # --- 메인 UI 로직 ---
 def main():
    st.set_page_config(layout="wide", page_title="결과 비교 도구")
-    st.title("📑 파일 업로드 기반 결과 비교 도구")
+    st.title("📑 결과 비교 및 공유 도구")
    st.markdown("---")
    # --- 1. 파일 업로드 ---
    st.sidebar.header("파일 업로드")
    uploaded_docs = st.sidebar.file_uploader(
        "1. 원본 문서 파일(들)을 업로드하세요.",
        accept_multiple_files=True,
        type=['png', 'jpg', 'jpeg', 'pdf']
    )
    uploaded_jsons = st.sidebar.file_uploader(
        "2. 결과 JSON 파일(들)을 업로드하세요.",
        accept_multiple_files=True,
        type=['json']
    )
    if not uploaded_docs or not uploaded_jsons:
        st.info("사이드바에서 원본 문서와 결과 JSON 파일을 모두 업로드해주세요.")
        return
    try:
        matched_files = match_uploaded_files(uploaded_docs, uploaded_jsons)
    except Exception as e:
        st.error(f"업로드된 파일을 매칭하는 중 오류가 발생했습니다: {e}")
        return
    if not matched_files:
        st.warning("업로드된 파일 중 일치하는 문서-JSON 쌍을 찾을 수 없습니다. 파일 이름(확장자 제외)이 동일한지 확인하세요.")
        return
    # --- 2. 파일 네비게이션 및 선택 ---
    st.sidebar.header("파일 탐색")
    sorted_basenames = sorted(list(matched_files.keys()))
    total_files = len(sorted_basenames)
    # 세션 상태 초기화
    if 'current_index' not in st.session_state:
        st.session_state.current_index = 0
    # 세션 저장 기본 경로 생성
    SESSION_BASE_PATH.mkdir(parents=True, exist_ok=True)
    matched_files = None
    doc_files, json_files = None, None
    # URL에서 시드 확인
    query_params = st.query_params
    url_seed = query_params.get("seed")
    if url_seed:
        doc_files, json_files = load_files_from_session(url_seed)
        if doc_files is None:
            st.error(f"'{url_seed}'에 해당하는 공유 세션을 찾을 수 없습니다. 시드가 정확한지 확인하거나, 파일을 새로 업로드하세요.")
        else:
            st.success(f"'{url_seed}' 시드에서 공유된 파일을 불러왔습니다.")
            matched_files = match_disk_files(doc_files, json_files)
-    # 인덱스가 유효한 범위를 벗어나지 않도록 조정
+    # 시드가 없거나, 시드로 로드 실패 시 파일 업로더 표시
    if not matched_files:
        st.sidebar.header("파일 업로드")
        uploaded_docs = st.sidebar.file_uploader(
            "1. 원본 문서 파일(들)을 업로드하세요.",
            accept_multiple_files=True,
            type=['png', 'jpg', 'jpeg', 'pdf']
        )
        uploaded_jsons = st.sidebar.file_uploader(
            "2. 결과 JSON 파일(들)을 업로드하세요.",
            accept_multiple_files=True,
            type=['json']
        )
        if uploaded_docs and uploaded_jsons:
            if st.sidebar.button("업로드 및 세션 생성"):
                new_seed = str(uuid.uuid4())[:8]
                save_files_to_session(new_seed, uploaded_docs, uploaded_jsons)
                st.query_params["seed"] = new_seed # URL 업데이트 및 앱 재실행
                st.rerun()
    # 공유 UI
    if url_seed and matched_files:
        st.sidebar.header("세션 공유")
        # 현재 페이지의 전체 URL을 가져오는 것은 Streamlit에서 직접 지원하지 않으므로,
        # 사용자에게 주소창의 URL을 복사하라고 안내합니다.
        st.sidebar.success("세션이 활성화되었습니다!")
        st.sidebar.info("다른 사람과 공유하려면 현재 브라우저 주소창의 URL을 복사하여 전달하세요.")
        st.sidebar.text_input("공유 시드", url_seed, disabled=True)
    # --- 결과 표시 로직 (matched_files가 있을 때만 실행) ---
    if not matched_files:
        st.info("사이드바에서 파일을 업로드하고 '업로드 및 세션 생성' 버튼을 누르거나, 공유받은 URL로 접속하세요.")
        return
    st.sidebar.header("파일 탐색")
    sorted_basenames = sorted(list(matched_files.keys()))
    total_files = len(sorted_basenames)
    st.session_state.current_index = max(0, min(st.session_state.current_index, total_files - 1))
    # Selectbox
    display_options = [f"{i + 1}. {name}" for i, name in enumerate(sorted_basenames)]
    st.selectbox(
        "파일을 직접 선택하세요:",
        options=display_options,
        index=st.session_state.current_index,
-        key='selectbox_key', # 콜백에서 값을 참조하기 위한 키
+        key='selectbox_key',
        on_change=handle_selectbox_change
    )
    # 네비게이션 컨트롤
    col1, col2, col3 = st.sidebar.columns([1, 2, 1])
-    
+    col1.button("◀ 이전", on_click=handle_nav_button, args=("prev", total_files), use_container_width=True)
    col1.button(
        "◀ 이전", 
        on_click=handle_nav_button, 
        args=("prev", total_files), 
        use_container_width=True
    )
    col2.markdown(f"<p style='text-align: center;'>{st.session_state.current_index + 1} / {total_files}</p>", unsafe_allow_html=True)
-    col3.button(
+    col3.button("다음 ▶", on_click=handle_nav_button, args=("next", total_files), use_container_width=True)
        "다음 ▶", 
        on_click=handle_nav_button, 
        args=("next", total_files), 
        use_container_width=True
    )
    # --- 3. 결과 표시 ---
    current_basename = sorted_basenames[st.session_state.current_index]
    st.header(f"🔎 비교 결과: `{current_basename}`")
@@ -138,24 +183,20 @@ def main():
    res_col1, res_col2 = st.columns(2)
    with res_col1:
        st.subheader(f"원본 문서: `{doc_file.name}`")
-        doc_suffix = Path(doc_file.name).suffix.lower()
+        if doc_file.suffix.lower() == ".pdf":
        if doc_suffix == ".pdf":
            display_pdf(doc_file)
        else:
-            st.image(doc_file, caption=f"원본 이미지: {doc_file.name}", use_container_width=True)
+            st.image(str(doc_file), caption=f"원본 이미지: {doc_file.name}", use_container_width=True)
    with res_col2:
        st.subheader(f"추출된 데이터: `{json_file.name}`")
        try:
-            json_file.seek(0)
+            with open(json_file, "r", encoding="utf-8") as f:
-            data = json.load(json_file)
+                data = json.load(f)
            result_to_display = data[0] if isinstance(data, list) and data else data
            if isinstance(result_to_display, dict) and 'fields' in result_to_display:
                del result_to_display['fields']
            st.json(result_to_display)
        except Exception as e:
            st.error(f"JSON 파일을 읽거나 처리하는 중 오류가 발생했습니다: {e}")
--- a/workspace/show_summary.py
+++ b/workspace/show_summary.py
@@ -0,0 +1,92 @@
 # workspace/show_summary.py
 import os
 import json
 import argparse
 import pandas as pd
 def generate_summary(directory_path):
    """Build a summary table from the JSON result files in a directory.

    Each ``*.json`` file directly inside ``directory_path`` is read in sorted
    name order. Only files whose top-level value is a non-empty list are
    used, and only the first list element is summarised. The row holds the
    element's ``filename``, ``time.duration_sec`` and, when ``processed`` is a
    dict, all of its key/value pairs as extra columns.

    Args:
        directory_path: Directory containing the JSON files.

    Returns:
        A pandas ``DataFrame`` with one row per usable file, or ``None`` when
        the directory does not exist or no row could be produced. Problems
        with individual files are printed and the file is skipped.
    """
    summary_data = []
    if not os.path.isdir(directory_path):
        print(f"오류: 디렉터리를 찾을 수 없습니다 - {directory_path}")
        return None
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith('.json'):
            continue
        file_path = os.path.join(directory_path, filename)
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            # Result files are expected to be a list; anything else is skipped.
            if not (isinstance(data, list) and data):
                continue
            item = data[0]
            # "time" may be present but null (or not a dict); treat that as
            # "no duration" instead of losing the whole row to an
            # AttributeError, mirroring the guard used for "processed" below.
            time_info = item.get('time')
            if not isinstance(time_info, dict):
                time_info = {}
            row_data = {
                'filename': item.get('filename'),
                'duration_sec': time_info.get('duration_sec'),
            }
            # Flatten every key/value of the 'processed' dict into the row.
            processed_info = item.get('processed', {})
            if isinstance(processed_info, dict):
                row_data.update(processed_info)
            summary_data.append(row_data)
        except (json.JSONDecodeError, IndexError) as e:
            print(f"파일 처리 중 오류 발생 ({filename}): {e}")
        except Exception as e:
            # Best-effort batch job: report the file and keep going.
            print(f"알 수 없는 오류 발생 ({filename}): {e}")
    if not summary_data:
        print("처리할 JSON 파일이 없습니다.")
        return None
    return pd.DataFrame(summary_data)
 def main():
    """Command-line entry point: print the JSON summary and optionally save it as CSV."""
    cli = argparse.ArgumentParser(description="JSON 파일들을 읽어 요약 테이블을 생성하고 CSV로 저장하는 스크립트")
    cli.add_argument("input_dir", help="JSON 파일들이 포함된 입력 디렉터리 경로")
    cli.add_argument("-o", "--output", help="요약 결과를 저장할 CSV 파일 경로")
    args = cli.parse_args()
    # Widen pandas' console output so the printed table is not truncated.
    display_settings = (
        ('display.max_rows', 500),
        ('display.max_columns', 50),
        ('display.width', 200),
    )
    for option_name, option_value in display_settings:
        pd.set_option(option_name, option_value)
    summary_df = generate_summary(args.input_dir)
    if summary_df is None:
        return
    print("\n--- JSON 처리 결과 요약 ---")
    print(summary_df)
    print("\n")
    # CSV export happens only when an output path was given.
    if not args.output:
        return
    # Append the .csv extension when the given name lacks it.
    if args.output.lower().endswith('.csv'):
        csv_path = args.output
    else:
        csv_path = args.output + '.csv'
    try:
        # 'utf-8-sig' keeps Korean text readable when the CSV is opened in Excel.
        summary_df.to_csv(csv_path, index=False, encoding='utf-8-sig')
        print(f"요약 결과가 '{csv_path}' 파일로 성공적으로 저장되었습니다.")
    except Exception as e:
        print(f"CSV 파일 저장 중 오류가 발생했습니다: {e}")
    print("\n")
 # Run the command-line interface only when executed as a script, not on import.
 if __name__ == "__main__":
    main()