First commit

2025-11-05 15:20:21 +09:00
commit fe8601ac63
22 changed files with 3414 additions and 0 deletions
--- a/config/init.py
+++ b/config/init.py
--- a/config/env_setup.py
+++ b/config/env_setup.py
@@ -0,0 +1,12 @@
+import os
+import torch
+
+def setup_environment():
+    """
+    Set the environment variables required to run the OCR model.
+    """
+    if torch.version.cuda == "11.8":  # only when torch reports CUDA version 11.8
+        os.environ["TRITON_PTXAS_PATH"] = "/usr/local/cuda-11.8/bin/ptxas"
+
+    os.environ["VLLM_USE_V1"] = "0"  # presumably selects vLLM's pre-V1 engine -- TODO confirm
+    os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # NOTE(review): overwrites any value the caller already set
--- a/config/model_settings.py
+++ b/config/model_settings.py
@@ -0,0 +1,42 @@
+# TODO: pick a resolution mode by setting BASE_SIZE / IMAGE_SIZE / CROP_MODE below.
+# Tiny: base_size = 512, image_size = 512, crop_mode = False
+# Small: base_size = 640, image_size = 640, crop_mode = False
+# Base: base_size = 1024, image_size = 1024, crop_mode = False
+# Large: base_size = 1280, image_size = 1280, crop_mode = False
+# Gundam: base_size = 1024, image_size = 640, crop_mode = True
+
+BASE_SIZE = 1024  # BASE_SIZE / IMAGE_SIZE / CROP_MODE currently match the "Gundam" row above
+IMAGE_SIZE = 640
+CROP_MODE = True
+MIN_CROPS = 2
+MAX_CROPS = 6  # upper limit is 9; 6 is recommended when GPU memory is small.
+MAX_CONCURRENCY = 100  # If you have limited GPU memory, lower the concurrency count.
+NUM_WORKERS = 64  # image pre-process (resize/padding) workers
+PRINT_NUM_VIS_TOKENS = False
+SKIP_REPEAT = False
+MODEL_PATH = "deepseek-ai/DeepSeek-OCR"  # change to your model path
+
+# TODO: set INPUT_PATH according to the script you run:
+# .pdf: run_dpsk_ocr_pdf.py;
+# .jpg, .png, .jpeg: run_dpsk_ocr_image.py;
+# Omnidocbench images path: run_dpsk_ocr_eval_batch.py
+
+
+INPUT_PATH = "/workspace/input"
+OUTPUT_PATH = "/workspace/output"
+# PROMPT = f"{PROMPT_TEXT.strip()}"
+PROMPT = "<image>\n<|grounding|>Convert the document to markdown."
+# PROMPT = '<image>\nFree OCR.'
+# TODO: commonly used prompts
+# document: <image>\n<|grounding|>Convert the document to markdown.
+# other image: <image>\n<|grounding|>OCR this image.
+# without layouts: <image>\nFree OCR.
+# figures in document: <image>\nParse the figure.
+# general: <image>\nDescribe this image in detail.
+# rec: <image>\nLocate <|ref|>xxxx<|/ref|> in the image.
+# .......
+
+
+from transformers import AutoTokenizer  # NOTE(review): imported below the constants, not at the top of the module
+
+TOKENIZER = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)  # NOTE(review): runs at import time; trust_remote_code=True permits code shipped with the model repo to run -- confirm MODEL_PATH is trusted