Files
ocr_macro/workspace/copy_groundtruth_files.py
2025-10-30 09:38:52 +09:00

71 lines
2.4 KiB
Python

import os
import json
import shutil
from pathlib import Path
# Session directory processed when the caller does not supply one.
_DEFAULT_BASE_DIR = Path("/home/jackjack/test/ocr_macro/workspace/shared_sessions/b66123d5")


def _read_target_filename(json_file):
    """Return the non-empty string stored under "filename" in *json_file*.

    Returns None when the document is not a JSON object, or when the value
    is missing, empty, or not a string. Read and decode errors propagate to
    the caller.
    """
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)
    if not isinstance(data, dict):
        return None
    name = data.get("filename")
    if isinstance(name, str) and name:
        return name
    return None


def copy_files_from_groundtruth(base_dir: "str | os.PathLike[str]" = _DEFAULT_BASE_DIR) -> None:
    """Copy every document referenced by a groundtruth JSON file.

    Each ``*.json`` file in ``<base_dir>/groundtruth`` is read, its
    "filename" value is looked up under ``<base_dir>/docs``, and the
    matching file is copied to ``<base_dir>/aa`` (same relative path).
    Progress, warnings and a final summary are printed to stdout.

    Processing is best-effort: a problem with one JSON file is reported and
    counted as skipped, and the remaining files are still processed.

    Args:
        base_dir: Session directory containing the ``groundtruth`` and
            ``docs`` sub-directories. Defaults to the original hard-coded
            session path.

    Returns:
        None.
    """
    base_dir = Path(base_dir)
    groundtruth_dir = base_dir / "groundtruth"
    docs_dir = base_dir / "docs"
    destination_dir = base_dir / "aa"

    # Validate the inputs before creating anything: creating the destination
    # first would raise FileNotFoundError when base_dir itself is missing and
    # would leave an empty output directory behind on every aborted run.
    if not groundtruth_dir.is_dir():
        print(f"Error: Groundtruth directory not found at {groundtruth_dir}")
        return
    if not docs_dir.is_dir():
        print(f"Error: Docs directory not found at {docs_dir}")
        return

    destination_dir.mkdir(exist_ok=True)
    print(f"Destination directory created or already exists: {destination_dir}")

    # Sorted so that the processing order (and the log) is deterministic.
    json_files = sorted(groundtruth_dir.glob("*.json"))
    if not json_files:
        print(f"No JSON files found in {groundtruth_dir}")
        return

    print(f"Found {len(json_files)} JSON files to process...")

    copied_count = 0
    not_found_count = 0
    skipped_count = 0

    for json_file in json_files:
        try:
            filename_to_find = _read_target_filename(json_file)
            if filename_to_find is None:
                print(f"Warning: 'filename' key not found in {json_file.name}. Skipping.")
                skipped_count += 1
                continue

            # The name comes from file content, so never let it point outside
            # docs/ or the destination (absolute paths replace the left-hand
            # side of "/", and ".." climbs out of the directory).
            relative_path = Path(filename_to_find)
            if relative_path.is_absolute() or ".." in relative_path.parts:
                print(
                    f"Warning: Unsafe 'filename' value in {json_file.name}: "
                    f"{filename_to_find}. Skipping."
                )
                skipped_count += 1
                continue

            source_file_path = docs_dir / relative_path
            destination_file_path = destination_dir / relative_path

            # is_file() rather than exists(): a directory of that name is not
            # something shutil.copy can copy.
            if source_file_path.is_file():
                print(f"Copying '{source_file_path}' to '{destination_file_path}'...")
                # Names such as "sub/a.pdf" need their parent created first.
                destination_file_path.parent.mkdir(parents=True, exist_ok=True)
                shutil.copy(source_file_path, destination_file_path)
                copied_count += 1
            else:
                print(f"Warning: File not found in docs directory: {filename_to_find}")
                not_found_count += 1
        except json.JSONDecodeError:
            print(f"Error: Could not decode JSON from {json_file.name}. Skipping.")
            skipped_count += 1
        except Exception as e:
            # Deliberate per-file boundary: one unreadable or uncopyable file
            # must not abort the rest of the batch.
            print(f"An unexpected error occurred while processing {json_file.name}: {e}")
            skipped_count += 1

    print("\n--- Operation Summary ---")
    print(f"Files copied successfully: {copied_count}")
    print(f"Files not found: {not_found_count}")
    print(f"Files skipped: {skipped_count}")
    print("-------------------------")
# Script entry point: perform the copy only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    copy_files_from_groundtruth()