Deepseek-OCR environment setup

Author: kyy
Date: 2025-10-27 13:50:19 +09:00
Parent: 153129649d
Commit: 96b47df624
21 changed files with 62 additions and 3481 deletions

View File

@@ -1,34 +0,0 @@
from transformers import AutoModel, AutoTokenizer
import torch
import os
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
model_name = 'deepseek-ai/DeepSeek-OCR'
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(model_name, _attn_implementation='flash_attention_2', trust_remote_code=True, use_safetensors=True)
model = model.eval().cuda().to(torch.bfloat16)
# prompt = "<image>\nFree OCR. "
prompt = "<image>\n<|grounding|>Convert the document to markdown. "
image_file = '/workspace/2025-27484-M21472.pdf'
output_path = '/workspace/output_hf'
# infer(self, tokenizer, prompt='', image_file='', output_path = ' ', base_size = 1024, image_size = 640, crop_mode = True, test_compress = False, save_results = False):
# Tiny: base_size = 512, image_size = 512, crop_mode = False
# Small: base_size = 640, image_size = 640, crop_mode = False
# Base: base_size = 1024, image_size = 1024, crop_mode = False
# Large: base_size = 1280, image_size = 1280, crop_mode = False
# Gundam: base_size = 1024, image_size = 640, crop_mode = True
res = model.infer(tokenizer, prompt=prompt, image_file=image_file, output_path=output_path, base_size=1024, image_size=640, crop_mode=True, save_results=True, test_compress=True)
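The mode table above maps directly onto the infer() arguments. A minimal sketch of the Tiny preset, reusing the model/tokenizer objects from this script (the image path here is hypothetical):

# Tiny preset: smaller base/image size, no cropping
res_tiny = model.infer(
    tokenizer,
    prompt="<image>\nFree OCR. ",
    image_file='/workspace/sample_page.png',  # hypothetical image path
    output_path=output_path,
    base_size=512,
    image_size=512,
    crop_mode=False,
    save_results=True,
)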

View File

@@ -1,42 +0,0 @@
# TODO: change modes
# Tiny: base_size = 512, image_size = 512, crop_mode = False
# Small: base_size = 640, image_size = 640, crop_mode = False
# Base: base_size = 1024, image_size = 1024, crop_mode = False
# Large: base_size = 1280, image_size = 1280, crop_mode = False
# Gundam: base_size = 1024, image_size = 640, crop_mode = True
BASE_SIZE = 1024
IMAGE_SIZE = 640
CROP_MODE = True
MIN_CROPS = 2
MAX_CROPS = 6  # max: 9; if GPU memory is limited, 6 is recommended
MAX_CONCURRENCY = 100 # If you have limited GPU memory, lower the concurrency count.
NUM_WORKERS = 64 # image pre-process (resize/padding) workers
PRINT_NUM_VIS_TOKENS = False
SKIP_REPEAT = True
MODEL_PATH = 'deepseek-ai/DeepSeek-OCR' # change to your model path
# TODO: change INPUT_PATH
# .pdf: run_dpsk_ocr_pdf.py;
# .jpg, .png, .jpeg: run_dpsk_ocr_image.py;
# Omnidocbench images path: run_dpsk_ocr_eval_batch.py
INPUT_PATH = '/workspace/2018-0802140959-217049.pdf'
OUTPUT_PATH = '/workspace/output/'
PROMPT = '<image>\n<|grounding|>Convert the document to markdown.'
# PROMPT = '<image>\nFree OCR.'
# TODO commonly used prompts
# document: <image>\n<|grounding|>Convert the document to markdown.
# other image: <image>\n<|grounding|>OCR this image.
# without layouts: <image>\nFree OCR.
# figures in document: <image>\nParse the figure.
# general: <image>\nDescribe this image in detail.
# rec: <image>\nLocate <|ref|>xxxx<|/ref|> in the image.
# e.g. '先天下之忧而忧' (a sample phrase to locate in a Chinese document)
# .......
from transformers import AutoTokenizer
TOKENIZER = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
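The comments above pair each input type with a runner script. A small dispatch sketch along those lines (pick_runner is a hypothetical helper; the module names are the ones listed in the comments):

import os

_RUNNERS = {
    '.pdf': 'run_dpsk_ocr_pdf',
    '.jpg': 'run_dpsk_ocr_image',
    '.jpeg': 'run_dpsk_ocr_image',
    '.png': 'run_dpsk_ocr_image',
}

def pick_runner(input_path: str) -> str:
    """Return the runner module matching INPUT_PATH's extension."""
    ext = os.path.splitext(input_path)[1].lower()
    if ext not in _RUNNERS:
        raise ValueError(f'unsupported input type: {ext}')
    return _RUNNERS[ext]

# pick_runner(INPUT_PATH) -> 'run_dpsk_ocr_pdf'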

View File

@@ -1,174 +0,0 @@
import torch.nn as nn
import torch
import torch.nn.functional as F
import copy
class MlpProjector(nn.Module):
def __init__(self, cfg):
super().__init__()
self.cfg = cfg
if cfg.projector_type == "identity":
modules = nn.Identity()
elif cfg.projector_type == "linear":
modules = nn.Linear(cfg.input_dim, cfg.n_embed)
elif cfg.projector_type == "mlp_gelu":
mlp_depth = cfg.get("depth", 1)
modules = [nn.Linear(cfg.input_dim, cfg.n_embed)]
for _ in range(1, mlp_depth):
modules.append(nn.GELU())
modules.append(nn.Linear(cfg.n_embed, cfg.n_embed))
modules = nn.Sequential(*modules)
elif cfg.projector_type == "normlayer_downsample_mlp_gelu":
mlp_depth = cfg.get("depth", 1)
mlp_ratio = cfg.get("mlp_ratio", 1)
modules = [
nn.LayerNorm(cfg.input_dim * cfg.downsample_ratio * cfg.downsample_ratio),
nn.Linear(cfg.input_dim * cfg.downsample_ratio * cfg.downsample_ratio, cfg.n_embed * mlp_ratio)
]
for _ in range(1, mlp_depth - 1):
modules.append(nn.GELU())
modules.append(nn.Linear(cfg.n_embed * mlp_ratio, cfg.n_embed * mlp_ratio))
modules.append(nn.GELU())
modules.append(nn.Linear(cfg.n_embed * mlp_ratio, cfg.n_embed))
modules = nn.Sequential(*modules)
elif cfg.projector_type == "downsample_mlp_gelu":
mlp_depth = cfg.get("depth", 1)
mlp_ratio = cfg.get("mlp_ratio", 1)
modules = [nn.Linear(cfg.input_dim * cfg.downsample_ratio * cfg.downsample_ratio, cfg.n_embed * mlp_ratio)]
for _ in range(1, mlp_depth - 1):
modules.append(nn.GELU())
modules.append(nn.Linear(cfg.n_embed * mlp_ratio, cfg.n_embed * mlp_ratio))
modules.append(nn.GELU())
modules.append(nn.Linear(cfg.n_embed * mlp_ratio, cfg.n_embed))
modules = nn.Sequential(*modules)
elif cfg.projector_type == "low_high_hybrid_split_mlp_gelu":
mlp_depth = cfg.get("depth", 1)
self.high_up_proj = nn.Linear(cfg.input_dim, cfg.n_embed // 2)
self.low_up_proj = nn.Linear(cfg.input_dim, cfg.n_embed // 2)
modules = []
for _ in range(1, mlp_depth):
modules.append(nn.GELU())
modules.append(nn.Linear(cfg.n_embed, cfg.n_embed))
modules = nn.Sequential(*modules)
elif cfg.projector_type == "hybrid_split_feature_mlp_gelu":
mlp_depth = cfg.get("depth", 1)
channel_div = cfg.get("channel_div", 0.5)
self.high_up_proj = nn.Linear(cfg.input_dim[0], int(cfg.n_embed * channel_div))
self.low_up_proj = nn.Linear(cfg.input_dim[1], cfg.n_embed - int(cfg.n_embed * channel_div))
modules = []
for _ in range(1, mlp_depth):
modules.append(nn.GELU())
modules.append(nn.Linear(cfg.n_embed, cfg.n_embed))
modules = nn.Sequential(*modules)
elif cfg.projector_type == "low_high_split_mlp_gelu":
mlp_depth = cfg.get("depth", 1)
modules = []
for _ in range(1, mlp_depth):
modules.append(nn.GELU())
modules.append(nn.Linear(cfg.n_embed // 2, cfg.n_embed // 2))
modules = nn.Sequential(*modules)
self.high_layers = nn.Sequential(*modules)
self.low_layers = copy.deepcopy(modules)
else:
raise ValueError(f"Unknown projector type: {cfg.projector_type}")
if cfg.get("token_pooling", False):
self.token_pooling_layer = nn.Linear(cfg.input_dim * 4, cfg.input_dim)
if cfg.get("conv_fusion_high_low_features", False):
self.fusion_layer = nn.Linear(cfg.input_dim, cfg.input_dim)
self.layers = modules
def forward(self, x):
if self.cfg.get("token_pooling", False):
batch_size, wxh, channels = x.shape
w = h = int(wxh**0.5)
x = x.view(batch_size, w, h, channels)
x = x.permute(0, 3, 1, 2)
# import ipdb; ipdb.set_trace()
patches = x.unfold(2, 2, 2).unfold(3, 2, 2)
batch_size, channels, h_patches, w_patches, _, _ = patches.size()
# concatenate along the channel dimension
patches = patches.contiguous().view(batch_size, channels, h_patches * w_patches, -1)
# pass through the linear layer
patches = patches.permute(0, 2, 1, 3).contiguous()
patches = patches.view(batch_size, h_patches * w_patches, channels * 4)
x = self.token_pooling_layer(patches)
if self.cfg.get("conv_fusion_high_low_features", False):
x = self.fusion_layer(x[:, 0]) + x[:, 1]
if self.cfg.projector_type == 'low_high_hybrid_split_mlp_gelu':
high_x, low_x = x[0], x[1]
high_x = self.high_up_proj(high_x)
low_x = self.low_up_proj(low_x)
x = torch.concat([high_x, low_x], dim=-1)
if self.cfg.projector_type == 'hybrid_split_feature_mlp_gelu':
high_x = x[...,:self.cfg.input_dim[0]]
low_x = x[...,self.cfg.input_dim[0]:]
high_x = self.high_up_proj(high_x)
low_x = self.low_up_proj(low_x)
x = torch.concat([high_x, low_x], dim=-1)
if self.cfg.projector_type == 'low_high_split_mlp_gelu':
high_x, low_x = x[0], x[1]
high_x = self.high_layers(high_x)
low_x = self.low_layers(low_x)
x = torch.concat([high_x, low_x], dim=-1)
return x
if self.cfg.projector_type == 'downsample_mlp_gelu' or self.cfg.projector_type == 'normlayer_downsample_mlp_gelu':
bs, hw, input_dim = x.shape
h = w = int((hw) ** 0.5)
"""compute padding"""
if h % self.cfg.downsample_ratio:
pad = self.cfg.downsample_ratio - h % self.cfg.downsample_ratio
else:
pad = 0
x = x.reshape(bs, h, w, input_dim)
if pad > 0:
x = F.pad(x, (0, 0, 0, pad, 0, pad), "constant", 0)
"""4 to 1 concat"""
x = x.permute(0, 3, 1, 2) # B, C, H, W
x = F.unfold(x, kernel_size=self.cfg.downsample_ratio, stride=self.cfg.downsample_ratio, padding=0) # B, C*4, HW // 4
x = x.permute(0, 2, 1)
return self.layers(x)
@staticmethod
def get_flops_per_sample(cfg):
if cfg.projector_type == "linear":
fwd = 2 * cfg.input_dim * cfg.n_embed
elif "mlp_gelu" in cfg.projector_type :
mlp_depth = cfg.get("depth", 1)
downsample_ratio = cfg.get("downsample_ratio", 1)
input_dim = sum(cfg.input_dim) if isinstance(cfg.input_dim, list) else cfg.input_dim
input_dim = input_dim * downsample_ratio * downsample_ratio
fwd = 2 * input_dim * cfg.n_embed + (mlp_depth - 1) * 2 * cfg.n_embed * cfg.n_embed
else:
fwd = 0
return fwd * 3
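A quick usage sketch for the projector, using the projector_type='linear' configuration that the vLLM wrapper later in this commit instantiates (the tensor sizes below are arbitrary, and addict.Dict stands in for the attribute-style config, as in that wrapper):

import torch
from addict import Dict

proj = MlpProjector(Dict(projector_type='linear', input_dim=2048, n_embed=1280))
feats = torch.randn(1, 256, 2048)  # (batch, vision tokens, concatenated SAM+CLIP feature dim)
out = proj(feats)
print(out.shape)  # torch.Size([1, 256, 1280])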

View File

@@ -1,504 +0,0 @@
from contextlib import nullcontext
import logging
import math
from typing import Optional, Tuple
logger = logging.getLogger(__name__)  # used when cfg.fp32norm is enabled
# from megatron.model import LayerNorm
from easydict import EasyDict as adict
import torch
from torch.nn import functional as F
from torch import nn
from flash_attn import flash_attn_qkvpacked_func, flash_attn_func
# from optimus import flash_attn_func
# from megatron.core import tensor_parallel
# from megatron.core import parallel_state as mpu
# from megatron.core.utils import make_viewless_tensor, divide
# from megatron.model.fused_rms_norm import RMSNorm
# from megatron.model.transformer import (
# FlashSelfAttention,
# NoopTransformerLayer,
# _cfg_to_kwargs,
# )
# from megatron.model.enums import AttnMaskType, AttnType
# from megatron.model.fused_softmax import FusedScaleMaskSoftmax
# from megatron.model.utils import attention_mask_func
# from megatron.model.module import MegatronModule
# try:
# from einops import rearrange
# except ImportError:
# rearrange = None
# from flash_attn import flash_attn_varlen_func as flash_attn_unpadded_func
# try:
# # flash attention 2.x
# from flash_attn import flash_attn_varlen_func as flash_attn_unpadded_func
# except ImportError:
# try:
# # flash attention 1.x
# from flash_attn.flash_attn_interface import flash_attn_unpadded_func
# except ImportError:
# flash_attn_unpadded_func = None
# try:
# from flash_attn.flash_attn_interface import flash_attn_unpadded_relative_attention_bias_func
# except ImportError:
# flash_attn_unpadded_relative_attention_bias_func = None
# try:
# from flash_attn.flash_attn_interface import mask_flash_attn_unpadded_func
# except ImportError:
# mask_flash_attn_unpadded_func = None
class LayerNormfp32(torch.nn.LayerNorm):
"""Subclass torch's LayerNorm to handle fp16."""
def forward(self, x: torch.Tensor):
orig_type = x.dtype
ret = super().forward(x.type(torch.float32))
return ret.type(orig_type)
def get_abs_pos(abs_pos, tgt_size):
# abs_pos: L, C
# tgt_size: M
# return: M, C
# print(tgt_size)
# print(abs_pos.shape)
# exit()
dim = abs_pos.size(-1)
# print(dim)
abs_pos_new = abs_pos.squeeze(0)
cls_token, old_pos_embed = abs_pos_new[:1], abs_pos_new[1:]
src_size = int(math.sqrt(abs_pos_new.shape[0] - 1))
tgt_size = int(math.sqrt(tgt_size))
dtype = abs_pos.dtype
if src_size != tgt_size:
old_pos_embed = old_pos_embed.view(1, src_size, src_size, dim).permute(0, 3, 1,
2).contiguous()
old_pos_embed = old_pos_embed.to(torch.float32)
new_pos_embed = F.interpolate(
old_pos_embed,
size=(tgt_size, tgt_size),
mode='bicubic',
antialias=True,
align_corners=False,
).to(dtype)
new_pos_embed = new_pos_embed.permute(0, 2, 3, 1)
new_pos_embed = new_pos_embed.view(tgt_size * tgt_size, dim)
vision_pos_embed = torch.cat([cls_token, new_pos_embed], dim=0)
vision_pos_embed = vision_pos_embed.view(1, tgt_size * tgt_size + 1, dim)
return vision_pos_embed
else:
return abs_pos
@torch.jit.script
def quick_gelu(x):
return x * torch.sigmoid(1.702 * x)
class CLIPVisionEmbeddings(nn.Module):
def __init__(self, hidden_size=1024, image_size=224, patch_size=14, num_channels=3):
super().__init__()
self.embed_dim = hidden_size
self.image_size = image_size
self.patch_size = patch_size
self.class_embedding = torch.nn.Parameter(torch.randn(self.embed_dim))
self.patch_embedding = torch.nn.Conv2d(
in_channels=num_channels,
out_channels=self.embed_dim,
kernel_size=self.patch_size,
stride=self.patch_size,
bias=False,
)
self.num_patches = (self.image_size // self.patch_size) ** 2
self.num_positions = self.num_patches + 1
self.position_embedding = torch.nn.Embedding(self.num_positions, self.embed_dim)
self.register_buffer(
"position_ids", torch.arange(self.num_positions).expand((1, -1))
)
def forward(self, pixel_values, patch_embeds):
batch_size = pixel_values.shape[0]
# patch_embeds = self.patch_embedding(
# pixel_values
# ) # shape = [*, width, grid, grid]
if patch_embeds is not None:
patch_embeds = patch_embeds
# print(patch_embeds.shape)
else:
patch_embeds = self.patch_embedding(pixel_values)
# print(111111)
# shape = [*, width, grid, grid]
# patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
class_embeds = self.class_embedding.expand(batch_size, 1, -1)
embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
# x = torch.cat([cls_token, x], dim=1)
embeddings = embeddings + get_abs_pos(self.position_embedding(self.position_ids), embeddings.size(1))
# embeddings = embeddings + self.position_embedding(self.position_ids)
return embeddings
class NoTPFeedForward(nn.Module):
def __init__(
self,
cfg,
dim: int,
hidden_dim: int,
):
super().__init__()
self.fc1 = torch.nn.Linear(dim, hidden_dim, bias=True)
self.fc2 = torch.nn.Linear(hidden_dim, dim, bias=True)
def forward(self, x):
output = self.fc2(quick_gelu(self.fc1(x)))
return output
# from optimus.flash_attn_interface import flash_attn_qkvpacked_func
# class NoTPAttention(nn.Module):
# def __init__(self, cfg):
# super().__init__()
# self.num_heads = cfg.num_attention_heads
# self.n_local_heads = cfg.num_attention_heads
# self.head_dim = cfg.hidden_size // cfg.num_attention_heads
# self.max_seq_len = cfg.seq_length
# self.use_flash_attention = cfg.use_flash_attn
# self.qkv_proj = torch.nn.Linear(cfg.hidden_size, cfg.hidden_size * 3, bias=True)
# self.out_proj = torch.nn.Linear(cfg.hidden_size, cfg.hidden_size, bias=True)
# # self.core_attention = CoreAttention(cfg, AttnType.self_attn)
# self.attn_drop = cfg.attention_dropout
# def forward(
# self,
# x: torch.Tensor,
# ):
# bsz, seqlen, _ = x.shape
# xqkv = self.qkv_proj(x)
# xqkv = xqkv.view(bsz, seqlen, 3, self.num_heads, self.head_dim)
# if self.use_flash_attention:
# output = flash_attn_qkvpacked_func(xqkv)
# output = output.view(bsz, seqlen, -1)
# else:
# xq, xk, xv = torch.split(xqkv, 1, dim=2)
# xq = xq.squeeze(2)
# xk = xk.squeeze(2)
# xv = xv.squeeze(2)
# # xq, xk, xv = xqkv[:, :, 0, ...], xqkv[:, :, 1, ...], xqkv[:, :, 2, ...]
# # B, num_head, S, head_size)
# xq = xq.permute(0, 2, 1, 3)
# xk = xk.permute(0, 2, 1, 3)
# xv = xv.permute(0, 2, 1, 3)
# output = torch.nn.functional.scaled_dot_product_attention(xq, xk, xv, attn_mask=None)
# utput = output.permute(0, 2, 1, 3).view(bsz, seqlen, -1)
# output = self.out_proj(output)
# return output
# from optimus.flash_attn_interface import flash_attn_qkvpacked_func
class NoTPAttention(torch.nn.Module):
def __init__(self, cfg):
super().__init__()
self.num_heads = cfg.num_attention_heads
self.n_local_heads = cfg.num_attention_heads
self.head_dim = cfg.hidden_size // cfg.num_attention_heads
self.max_seq_len = cfg.seq_length
self.use_flash_attention = cfg.use_flash_attn
self.qkv_proj = torch.nn.Linear(cfg.hidden_size, cfg.hidden_size * 3, bias=True)
self.out_proj = torch.nn.Linear(cfg.hidden_size, cfg.hidden_size, bias=True)
# self.core_attention = CoreAttention(cfg, AttnType.self_attn)
self.attn_drop = cfg.attention_dropout
def forward(
self,
x: torch.Tensor,
):
bsz, seqlen, _ = x.shape
xqkv = self.qkv_proj(x)
xqkv = xqkv.view(bsz, seqlen, 3, self.num_heads, self.head_dim)
if self.use_flash_attention:
output = flash_attn_qkvpacked_func(xqkv)
output = output.view(bsz, seqlen, -1)
# xq, xk, xv = torch.split(xqkv, 1, dim=2)
# xq = xq.squeeze(2)
# xk = xk.squeeze(2)
# xv = xv.squeeze(2)
# # xq, xk, xv = xqkv[:, :, 0, ...], xqkv[:, :, 1, ...], xqkv[:, :, 2, ...]
# # B, num_head, S, head_size)
# xq = xq.permute(0, 2, 1, 3)
# xk = xk.permute(0, 2, 1, 3)
# xv = xv.permute(0, 2, 1, 3)
# # with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
# output = torch.nn.functional.scaled_dot_product_attention(xq, xk, xv, attn_mask=None)
# output = output.permute(0, 2, 1, 3).reshape(bsz, seqlen, -1)
# output = output.permute(0, 2, 1, 3).contiguous().view(bsz, seqlen, -1)
else:
# output = flash_attn_qkvpacked_func(xqkv)
xq, xk, xv = torch.split(xqkv, 1, dim=2)
xq = xq.squeeze(2)
xk = xk.squeeze(2)
xv = xv.squeeze(2)
# xq, xk, xv = xqkv[:, :, 0, ...], xqkv[:, :, 1, ...], xqkv[:, :, 2, ...]
# B, num_head, S, head_size)
xq = xq.permute(0, 2, 1, 3)
xk = xk.permute(0, 2, 1, 3)
xv = xv.permute(0, 2, 1, 3)
# with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
output = torch.nn.functional.scaled_dot_product_attention(xq, xk, xv, attn_mask=None)
output = output.permute(0, 2, 1, 3).reshape(bsz, seqlen, -1)
output = self.out_proj(output)
return output
class NoTPTransformerBlock(nn.Module):
def __init__(self, cfg, layer_id: int, multiple_of=256):
super().__init__()
self.n_heads = cfg.num_attention_heads
self.dim = cfg.hidden_size
self.head_dim = cfg.hidden_size // cfg.num_attention_heads
self.self_attn = NoTPAttention(cfg)
self.mlp = NoTPFeedForward(
cfg, dim=cfg.hidden_size, hidden_dim=cfg.ffn_hidden_size
)
self.layer_id = layer_id
self.layer_norm1 = torch.nn.LayerNorm(
cfg.hidden_size, eps=cfg.layernorm_epsilon
)
self.layer_norm2 = torch.nn.LayerNorm(
cfg.hidden_size, eps=cfg.layernorm_epsilon
)
def forward(self, x: torch.Tensor):
residual = self.self_attn.forward(self.layer_norm1(x))
h = x + residual
out = h + self.mlp.forward(self.layer_norm2(h))
return out
class NoTPTransformer(nn.Module):
def __init__(self, cfg):
super().__init__()
self.cfg = cfg
# self.recompute_list = self.cfg.get("recompute_list", [])
self.num_layers = cfg.num_layers # _get_num_layers(cfg)
self.layers = torch.nn.ModuleList()
for layer_id in range(self.num_layers):
self.layers.append(
NoTPTransformerBlock(
cfg,
layer_id + 1,
)
)
def forward(
self,
hidden_states,
):
for lid, layer in enumerate(self.layers):
# if lid in self.recompute_list:
# def custom(layer_id):
# def custom_forward(*args, **kwargs):
# x_ = self.layers[layer_id](*args, **kwargs)
# return x_
# return custom_forward
# assert hidden_states.requires_grad == True, logger.warning(
# "When using recalculation, the input must have grad fn"
# )
# hidden_states = tensor_parallel.checkpoint(
# custom(lid),
# False,
# hidden_states.contiguous()
# )
# else:
hidden_states = layer(hidden_states)
return hidden_states
# from megatron.core.tensor_parallel.layers import non_tensor_paralleled, local_dp_reduce, local_dp_scatter
class VitModel(nn.Module):
def __init__(
self,
cfg,
freeze_embed=False,
freeze_pre_norm=False
) -> None:
super().__init__()
self.embeddings = CLIPVisionEmbeddings(hidden_size=cfg.hidden_size, image_size=cfg.image_size, patch_size=cfg.patch_size)
if freeze_embed:
for name, param in self.embeddings.named_parameters():
param.requires_grad = False
self.transformer = NoTPTransformer(cfg=cfg)
if cfg.get("fp32norm", False):
logger.info("Load fp32 layernorm for ViT.")
self.pre_layrnorm = LayerNormfp32(
cfg.hidden_size,
eps=cfg.get("pre_layernorm_epsilon", 1e-5),
)
else:
self.pre_layrnorm = torch.nn.LayerNorm(
cfg.hidden_size,
eps=cfg.get("pre_layernorm_epsilon", 1e-5),
)
# self.pre_layrnorm = RMSNorm(
# cfg.hidden_size,
# eps=cfg.get("pre_layernorm_epsilon", 1e-5),
# sequence_parallel=False,
# use_fp32=True,
# use_optimus=True,
# )
if freeze_pre_norm:
for name, param in self.pre_layrnorm.named_parameters():
param.requires_grad = False
for p in self.parameters():
p.micro_dp = True
def set_input_tensor(self, input_tensor):
if not isinstance(input_tensor, list):
input_tensor = [input_tensor]
self.transformer.set_input_tensor(input_tensor[0])
def __str__(self) -> str:
return "open_clip"
def forward(
self,
x,
patch_embeds
):
x = self.embeddings(x, patch_embeds)
hidden_states = self.pre_layrnorm(x)
# hidden_states, dis = local_dp_scatter(hidden_states)
output = self.transformer(hidden_states)
# output = local_dp_reduce(output, dis)
return output
vit_model_cfg = adict(
num_layers=24,
hidden_size=1024,
num_heads = 16,
num_attention_heads=16,
ffn_hidden_size=4096,
seq_length=256,
max_position_embeddings=256,
use_flash_attn=False,
understand_projector_stride=2,
hidden_dropout = 0.0,
attention_dropout = 0.0,
no_persist_layer_norm = False,
layernorm_epsilon = 1e-5,
pre_layernorm_epsilon = 1e-5,
image_size = 224,
patch_size = 14,
recompute_list = []
)
def build_clip_l():
return VitModel(
cfg=vit_model_cfg,
freeze_embed=False,
freeze_pre_norm=False,
)
if __name__ == '__main__':
from mmgpt.model.vision_encoder.sam_b import build_sam_vit_b
vit_model_cfg = adict(
num_layers=24,
hidden_size=1024,
num_attention_heads=16,
ffn_hidden_size=4096,
seq_length=256,
max_position_embeddings=256,
use_flash_attn=False,
understand_projector_stride=2,
hidden_dropout = 0.0,
attention_dropout = 0.0,
no_persist_layer_norm = False,
layernorm_epsilon = 1e-5,
pre_layernorm_epsilon = 1e-5,
image_size = 224,
patch_size = 14,
recompute_list = []
)
sam_model = build_sam_vit_b()
vision_model = VitModel(
cfg=vit_model_cfg,
freeze_embed=False,
freeze_pre_norm=False,
)
# model = VitModel(1344)
# x = torch.zeros(2, 3, 224, 224)
x = torch.zeros(2, 3, 1024, 1024)
with torch.no_grad():
# y = vision_model(x)
patch_embed = sam_model(x)
print(patch_embed.shape)
y = vision_model(x, patch_embed)
print(y.shape)
image_feature = torch.add(y[:, 1:], patch_embed.flatten(2).permute(0, 2, 1))
print(image_feature.shape)

View File

@@ -1,528 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Optional, Tuple, Type
from functools import partial
from flash_attn import flash_attn_qkvpacked_func
# from .common import LayerNorm2d, MLPBlock
# from mmgpt.model.vision_encoder.flash_4 import _attention_rel_h_rel_w
def get_abs_pos(abs_pos, tgt_size):
dtype = abs_pos.dtype
src_size = abs_pos.size(1)
if src_size != tgt_size:
old_pos_embed = abs_pos.permute(0, 3, 1, 2)
old_pos_embed = old_pos_embed.to(torch.float32)
new_pos_embed = F.interpolate(
old_pos_embed,
size=(tgt_size, tgt_size),
mode='bicubic',
antialias=True,
align_corners=False,
).to(dtype)
new_pos_embed = new_pos_embed.permute(0, 2, 3, 1)
return new_pos_embed
else:
return abs_pos
class MLPBlock(nn.Module):
def __init__(
self,
embedding_dim: int,
mlp_dim: int,
act: Type[nn.Module] = nn.GELU,
) -> None:
super().__init__()
self.lin1 = nn.Linear(embedding_dim, mlp_dim)
self.lin2 = nn.Linear(mlp_dim, embedding_dim)
self.act = act()
def forward(self, x: torch.Tensor) -> torch.Tensor:
return self.lin2(self.act(self.lin1(x)))
# From https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py # noqa
# Itself from https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa
class LayerNorm2d(nn.Module):
def __init__(self, num_channels: int, eps: float = 1e-6) -> None:
super().__init__()
self.weight = nn.Parameter(torch.ones(num_channels))
self.bias = nn.Parameter(torch.zeros(num_channels))
self.eps = eps
def forward(self, x: torch.Tensor) -> torch.Tensor:
u = x.mean(1, keepdim=True)
s = (x - u).pow(2).mean(1, keepdim=True)
x = (x - u) / torch.sqrt(s + self.eps)
x = self.weight[:, None, None] * x + self.bias[:, None, None]
return x
# This class and its supporting functions below lightly adapted from the ViTDet backbone available at: https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/backbone/vit.py # noqa
class ImageEncoderViT(nn.Module):
def __init__(
self,
img_size: int = 1024,
patch_size: int = 16,
in_chans: int = 3,
embed_dim: int = 768,
depth: int = 12,
num_heads: int = 12,
mlp_ratio: float = 4.0,
out_chans: int = 256,
qkv_bias: bool = True,
norm_layer: Type[nn.Module] = nn.LayerNorm,
act_layer: Type[nn.Module] = nn.GELU,
use_abs_pos: bool = True,
use_rel_pos: bool = False,
rel_pos_zero_init: bool = True,
window_size: int = 0,
global_attn_indexes: Tuple[int, ...] = (),
) -> None:
"""
Args:
img_size (int): Input image size.
patch_size (int): Patch size.
in_chans (int): Number of input image channels.
embed_dim (int): Patch embedding dimension.
depth (int): Depth of ViT.
num_heads (int): Number of attention heads in each ViT block.
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
qkv_bias (bool): If True, add a learnable bias to query, key, value.
norm_layer (nn.Module): Normalization layer.
act_layer (nn.Module): Activation layer.
use_abs_pos (bool): If True, use absolute positional embeddings.
use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
window_size (int): Window size for window attention blocks.
global_attn_indexes (list): Indexes for blocks using global attention.
"""
super().__init__()
self.img_size = img_size
self.patch_embed = PatchEmbed(
kernel_size=(patch_size, patch_size),
stride=(patch_size, patch_size),
in_chans=in_chans,
embed_dim=embed_dim,
)
self.pos_embed: Optional[nn.Parameter] = None
if use_abs_pos:
# Initialize absolute positional embedding with pretrain image size.
self.pos_embed = nn.Parameter(
torch.zeros(1, img_size // patch_size, img_size // patch_size, embed_dim)
)
self.blocks = nn.ModuleList()
for i in range(depth):
block = Block(
dim=embed_dim,
num_heads=num_heads,
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias,
norm_layer=norm_layer,
act_layer=act_layer,
use_rel_pos=use_rel_pos,
rel_pos_zero_init=rel_pos_zero_init,
window_size=window_size if i not in global_attn_indexes else 0,
input_size=(img_size // patch_size, img_size // patch_size),
)
self.blocks.append(block)
self.neck = nn.Sequential(
nn.Conv2d(
embed_dim,
out_chans,
kernel_size=1,
bias=False,
),
LayerNorm2d(out_chans),
nn.Conv2d(
out_chans,
out_chans,
kernel_size=3,
padding=1,
bias=False,
),
LayerNorm2d(out_chans),
)
self.net_2 = nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1, bias=False)
self.net_3 = nn.Conv2d(512, 1024, kernel_size=3, stride=2, padding=1, bias=False)
def forward(self, x: torch.Tensor) -> torch.Tensor:
x = self.patch_embed(x)
if self.pos_embed is not None:
# x = x + self.pos_embed
x = x + get_abs_pos(self.pos_embed, x.size(1))
for blk in self.blocks:
x = blk(x)
neck_output = self.neck(x.permute(0, 3, 1, 2))
conv2_output = self.net_2(neck_output)
# print(f"conv2_output shape: {conv2_output.shape}")
conv3_output = self.net_3(conv2_output)
return conv3_output
class Block(nn.Module):
"""Transformer blocks with support of window attention and residual propagation blocks"""
def __init__(
self,
dim: int,
num_heads: int,
mlp_ratio: float = 4.0,
qkv_bias: bool = True,
norm_layer: Type[nn.Module] = nn.LayerNorm,
act_layer: Type[nn.Module] = nn.GELU,
use_rel_pos: bool = False,
rel_pos_zero_init: bool = True,
window_size: int = 0,
input_size: Optional[Tuple[int, int]] = None,
) -> None:
"""
Args:
dim (int): Number of input channels.
num_heads (int): Number of attention heads in each ViT block.
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
qkv_bias (bool): If True, add a learnable bias to query, key, value.
norm_layer (nn.Module): Normalization layer.
act_layer (nn.Module): Activation layer.
use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
window_size (int): Window size for window attention blocks. If it equals 0, then
use global attention.
input_size (tuple(int, int) or None): Input resolution for calculating the relative
positional parameter size.
"""
super().__init__()
self.norm1 = norm_layer(dim)
self.attn = Attention(
dim,
num_heads=num_heads,
qkv_bias=qkv_bias,
use_rel_pos=use_rel_pos,
rel_pos_zero_init=rel_pos_zero_init,
input_size=input_size if window_size == 0 else (window_size, window_size),
)
self.norm2 = norm_layer(dim)
self.mlp = MLPBlock(embedding_dim=dim, mlp_dim=int(dim * mlp_ratio), act=act_layer)
self.window_size = window_size
def forward(self, x: torch.Tensor) -> torch.Tensor:
shortcut = x
x = self.norm1(x)
# Window partition
if self.window_size > 0:
H, W = x.shape[1], x.shape[2]
x, pad_hw = window_partition(x, self.window_size)
x = self.attn(x)
# Reverse window partition
if self.window_size > 0:
x = window_unpartition(x, self.window_size, pad_hw, (H, W))
x = shortcut + x
x = x + self.mlp(self.norm2(x))
return x
class Attention(nn.Module):
"""Multi-head Attention block with relative position embeddings."""
def __init__(
self,
dim: int,
num_heads: int = 8,
qkv_bias: bool = True,
use_rel_pos: bool = False,
rel_pos_zero_init: bool = True,
input_size: Optional[Tuple[int, int]] = None,
) -> None:
"""
Args:
dim (int): Number of input channels.
num_heads (int): Number of attention heads.
qkv_bias (bool): If True, add a learnable bias to query, key, value.
rel_pos (bool): If True, add relative positional embeddings to the attention map.
rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
input_size (tuple(int, int) or None): Input resolution for calculating the relative
positional parameter size.
"""
super().__init__()
self.num_heads = num_heads
head_dim = dim // num_heads
self.scale = head_dim**-0.5
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
self.proj = nn.Linear(dim, dim)
self.use_rel_pos = use_rel_pos
if self.use_rel_pos:
assert (
input_size is not None
), "Input size must be provided if using relative positional encoding."
# initialize relative positional embeddings
self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim))
self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim))
def forward(self, x: torch.Tensor) -> torch.Tensor:
B, H, W, _ = x.shape
# qkv with shape (3, B, nHead, H * W, C)
qkv = self.qkv(x).reshape(B, H * W, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
# q, k, v with shape (B * nHead, H * W, C)
q, k, v = qkv.reshape(3, B * self.num_heads, H * W, -1).unbind(0)
rel_h, rel_w = None, None
if self.use_rel_pos:
rel_h, rel_w = add_decomposed_rel_pos(q, self.rel_pos_h, self.rel_pos_w, (H, W), (H, W))
q = q.view(B, self.num_heads, H * W, -1)
k = k.view(B, self.num_heads, H * W, -1)
v = v.view(B, self.num_heads, H * W, -1)
if self.use_rel_pos:
rel_h = rel_h.view(B, self.num_heads, rel_h.size(1), rel_h.size(2), rel_h.size(3))
rel_w = rel_w.view(B, self.num_heads, rel_w.size(1), rel_w.size(2), rel_w.size(3))
attn_bias = (rel_h + rel_w).view(B, self.num_heads, rel_h.size(2), rel_h.size(3) * rel_w.size(4))
x = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_bias)
# x = _attention_rel_h_rel_w(q, k, v, rel_h, rel_w)
else:
x = torch.nn.functional.scaled_dot_product_attention(q, k, v)
# qkv = torch.stack([q, k, v], dim=1).transpose(1, 3).reshape(B, H * W, 3, self.num_heads, -1)
# x = flash_attn_qkvpacked_func(qkv, dropout_p=0.0, causal=False).transpose(1, 2)
x = x.view(B, self.num_heads, H, W, -1).permute(0, 2, 3, 1, 4).reshape(B, H, W, -1)
x = self.proj(x)
return x
def window_partition(x: torch.Tensor, window_size: int) -> Tuple[torch.Tensor, Tuple[int, int]]:
"""
Partition into non-overlapping windows with padding if needed.
Args:
x (tensor): input tokens with [B, H, W, C].
window_size (int): window size.
Returns:
windows: windows after partition with [B * num_windows, window_size, window_size, C].
(Hp, Wp): padded height and width before partition
"""
B, H, W, C = x.shape
pad_h = (window_size - H % window_size) % window_size
pad_w = (window_size - W % window_size) % window_size
if pad_h > 0 or pad_w > 0:
x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h))
Hp, Wp = H + pad_h, W + pad_w
x = x.view(B, Hp // window_size, window_size, Wp // window_size, window_size, C)
windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
return windows, (Hp, Wp)
def window_unpartition(
windows: torch.Tensor, window_size: int, pad_hw: Tuple[int, int], hw: Tuple[int, int]
) -> torch.Tensor:
"""
Window unpartition into original sequences and removing padding.
Args:
windows (tensor): input tokens with [B * num_windows, window_size, window_size, C].
window_size (int): window size.
pad_hw (Tuple): padded height and width (Hp, Wp).
hw (Tuple): original height and width (H, W) before padding.
Returns:
x: unpartitioned sequences with [B, H, W, C].
"""
Hp, Wp = pad_hw
H, W = hw
B = windows.shape[0] // (Hp * Wp // window_size // window_size)
x = windows.view(B, Hp // window_size, Wp // window_size, window_size, window_size, -1)
x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, -1)
if Hp > H or Wp > W:
x = x[:, :H, :W, :].contiguous()
return x
def get_rel_pos(q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor:
"""
Get relative positional embeddings according to the relative positions of
query and key sizes.
Args:
q_size (int): size of query q.
k_size (int): size of key k.
rel_pos (Tensor): relative position embeddings (L, C).
Returns:
Extracted positional embeddings according to relative positions.
"""
max_rel_dist = int(2 * max(q_size, k_size) - 1)
# Interpolate rel pos if needed.
if rel_pos.shape[0] != max_rel_dist:
# Interpolate rel pos.
dtype = rel_pos.dtype
rel_pos = rel_pos.to(torch.float32)
rel_pos_resized = F.interpolate(
rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1),
size=max_rel_dist,
mode="linear",
).to(dtype)
rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0)
else:
rel_pos_resized = rel_pos
# Scale the coords with short length if shapes for q and k are different.
q_coords = torch.arange(q_size, device=rel_pos.device)[:, None] * max(k_size / q_size, 1.0)
k_coords = torch.arange(k_size, device=rel_pos.device)[None, :] * max(q_size / k_size, 1.0)
relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0)
return rel_pos_resized[relative_coords.long()]
def add_decomposed_rel_pos(
q: torch.Tensor,
rel_pos_h: torch.Tensor,
rel_pos_w: torch.Tensor,
q_size: Tuple[int, int],
k_size: Tuple[int, int],
) -> torch.Tensor:
"""
Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py # noqa B950
Args:
q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C).
rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis.
rel_pos_w (Tensor): relative position embeddings (Lw, C) for width axis.
q_size (Tuple): spatial sequence size of query q with (q_h, q_w).
k_size (Tuple): spatial sequence size of key k with (k_h, k_w).
Returns:
attn (Tensor): attention map with added relative positional embeddings.
"""
q_h, q_w = q_size
k_h, k_w = k_size
Rh = get_rel_pos(q_h, k_h, rel_pos_h)
Rw = get_rel_pos(q_w, k_w, rel_pos_w)
B, _, dim = q.shape
r_q = q.reshape(B, q_h, q_w, dim)
rel_h = torch.einsum("bhwc,hkc->bhwk", r_q, Rh)
rel_w = torch.einsum("bhwc,wkc->bhwk", r_q, Rw)
rel_h = rel_h.unsqueeze(-1)
rel_w = rel_w.unsqueeze(-2)
rel_h = rel_h.reshape(B, q_h * q_w, k_h, 1)
rel_w = rel_w.reshape(B, q_h * q_w, 1, k_w)
return rel_h, rel_w
class PatchEmbed(nn.Module):
"""
Image to Patch Embedding.
"""
def __init__(
self,
kernel_size: Tuple[int, int] = (16, 16),
stride: Tuple[int, int] = (16, 16),
padding: Tuple[int, int] = (0, 0),
in_chans: int = 3,
embed_dim: int = 768,
) -> None:
"""
Args:
kernel_size (Tuple): kernel size of the projection layer.
stride (Tuple): stride of the projection layer.
padding (Tuple): padding size of the projection layer.
in_chans (int): Number of input image channels.
embed_dim (int): Patch embedding dimension.
"""
super().__init__()
self.proj = nn.Conv2d(
in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding
)
def forward(self, x: torch.Tensor) -> torch.Tensor:
x = self.proj(x)
# B C H W -> B H W C
x = x.permute(0, 2, 3, 1)
return x
def build_sam_vit_b(checkpoint=None):
return _build_sam(
encoder_embed_dim=768,
encoder_depth=12,
encoder_num_heads=12,
encoder_global_attn_indexes=[2, 5, 8, 11],
checkpoint=checkpoint,
)
def _build_sam(
encoder_embed_dim,
encoder_depth,
encoder_num_heads,
encoder_global_attn_indexes,
checkpoint=None,
):
prompt_embed_dim = 256
image_size = 1024
vit_patch_size = 16
image_embedding_size = image_size // vit_patch_size
image_encoder=ImageEncoderViT(
depth=encoder_depth,
embed_dim=encoder_embed_dim,
img_size=image_size,
mlp_ratio=4,
norm_layer=partial(torch.nn.LayerNorm, eps=1e-6),
num_heads=encoder_num_heads,
patch_size=vit_patch_size,
qkv_bias=True,
use_rel_pos=True,
global_attn_indexes=encoder_global_attn_indexes,
window_size=14,
out_chans=prompt_embed_dim,
)
if checkpoint is not None:
# with open(checkpoint, "rb") as f:
state_dict = torch.load(checkpoint)
# print(state_dict.keys())
# for key in state_dict:
# image_encoder.load_state_dict({k[14:]: v for k, v in state_dict.items() if 'image_encoder' in k}, strict=False)
# ocr-anyting
# image_encoder.load_state_dict(state_dict, strict=True)
# tob
image_encoder.load_state_dict({k[30:]: v for k, v in state_dict.items() if 'vision_tower_high' in k}, strict=True)
print(checkpoint)
return image_encoder
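A minimal shape sketch for build_sam_vit_b() on a 1024x1024 input with random weights (no checkpoint), tracing the convolutions defined above:

import torch

sam = build_sam_vit_b()  # checkpoint=None: randomly initialized
x = torch.zeros(1, 3, 1024, 1024)
with torch.no_grad():
    feats = sam(x)
# patch_embed (16x16 patches): (1, 64, 64, 768)
# neck:                        (1, 256, 64, 64)
# net_2 (stride 2):            (1, 512, 32, 32)
# net_3 (stride 2):            (1, 1024, 16, 16)  -> 16*16 = 256 vision tokens
print(feats.shape)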

View File

@@ -1,582 +0,0 @@
"""Inference-only Deepseek-OCR model compatible with HuggingFace weights."""
import math
from collections.abc import Iterable, Mapping, Sequence
from typing import List, Literal, Optional, Set, Tuple, TypedDict, Union
import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange, repeat
from transformers import BatchFeature
from vllm.config import VllmConfig
from vllm.model_executor import SamplingMetadata
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.model_loader.utils import set_default_torch_dtype
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalKwargs, NestedTensors)
from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
ImageSize, MultiModalDataItems)
from vllm.multimodal.processing import (BaseMultiModalProcessor,
BaseProcessingInfo, PromptReplacement,
PromptUpdate)
from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.configs.deepseek_vl2 import (DeepseekVLV2Config,
MlpProjectorConfig,
VisionEncoderConfig)
from process.image_process import (
DeepseekOCRProcessor, count_tiles)
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
# from vllm.utils import is_list_of
from vllm.model_executor.models.interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
from vllm.model_executor.models.utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
init_vllm_registered_model, maybe_prefix,
merge_multimodal_embeddings)
from deepencoder.sam_vary_sdpa import build_sam_vit_b
from deepencoder.clip_sdpa import build_clip_l
from deepencoder.build_linear import MlpProjector
from addict import Dict
# import time
from config import IMAGE_SIZE, BASE_SIZE, CROP_MODE, PRINT_NUM_VIS_TOKENS, PROMPT
# The image token id may vary
_IMAGE_TOKEN = "<image>"
class DeepseekOCRProcessingInfo(BaseProcessingInfo):
def get_hf_config(self):
return self.ctx.get_hf_config(DeepseekVLV2Config)
def get_hf_processor(self, **kwargs: object):
return self.ctx.get_hf_processor(DeepseekOCRProcessor, **kwargs)
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"image": None}
def get_num_image_tokens(self,
*,
image_width: int,
image_height: int,
cropping: bool = True) -> int:
hf_processor = self.get_hf_processor()
# image_size = hf_processor.image_size
# patch_size = hf_processor.patch_size
# downsample_ratio = hf_processor.downsample_ratio
image_size = IMAGE_SIZE
base_size = BASE_SIZE
patch_size = 16
downsample_ratio = 4
if CROP_MODE:
if image_width <= 640 and image_height <= 640:
crop_ratio = [1, 1]
else:
# images_crop_raw, crop_ratio = hf_processor.dynamic_preprocess(image)
# find the closest aspect ratio to the target
crop_ratio = count_tiles(image_width, image_height, image_size=IMAGE_SIZE)
# print('===========')
# print('crop_ratio ', crop_ratio)
# print('============')
num_width_tiles, num_height_tiles = crop_ratio
else:
num_width_tiles = num_height_tiles = 1
h = w = math.ceil((base_size // patch_size) / downsample_ratio)
h2 = w2 = math.ceil((image_size // patch_size) / downsample_ratio)
global_views_tokens = h * (w + 1)
if num_width_tiles >1 or num_height_tiles>1:
local_views_tokens = (num_height_tiles * h2) * (num_width_tiles * w2 + 1)
else:
local_views_tokens = 0
return global_views_tokens + local_views_tokens + 1
def get_image_size_with_most_features(self) -> ImageSize:
if IMAGE_SIZE == 1024 and BASE_SIZE == 1280:
return ImageSize(width=1024*2, height=1024*2)
return ImageSize(width=640*2, height=640*2)
class DeepseekOCRDummyInputsBuilder(
BaseDummyInputsBuilder[DeepseekOCRProcessingInfo]):
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
num_images = mm_counts.get("image", 0)
processor = self.info.get_hf_processor()
image_token = processor.image_token
return image_token * num_images
def get_dummy_mm_data(
self,
seq_len: int,
mm_counts: Mapping[str, int],
) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0)
max_image_size = self.info.get_image_size_with_most_features()
if '<image>' in PROMPT:
return {
"image":
DeepseekOCRProcessor().tokenize_with_images(images = self._get_dummy_images(width=max_image_size.width,
height=max_image_size.height,
num_images=num_images), bos=True, eos=True, cropping=CROP_MODE)
}
else:
return {
"image": []
}
class DeepseekOCRMultiModalProcessor(
BaseMultiModalProcessor[DeepseekOCRProcessingInfo]):
def _call_hf_processor(
self,
prompt: str,
mm_data: Mapping[str, object],
mm_kwargs: Mapping[str, object],
) -> BatchFeature:
# print(mm_data)
if mm_data:
processed_outputs = self.info.ctx.call_hf_processor(
self.info.get_hf_processor(**mm_kwargs),
dict(prompt=prompt, **mm_data),
mm_kwargs,
)
else:
tokenizer = self.info.get_tokenizer()
processed_outputs = tokenizer(prompt,
add_special_tokens=True,
return_tensors="pt")
return processed_outputs
def _get_mm_fields_config(
self,
hf_inputs: BatchFeature,
hf_processor_mm_kwargs: Mapping[str, object],
) -> Mapping[str, MultiModalFieldConfig]:
return dict(
pixel_values=MultiModalFieldConfig.batched("image"),
images_spatial_crop=MultiModalFieldConfig.batched("image"),
# image_embeds=MultiModalFieldConfig.batched("image2"),
images_crop=MultiModalFieldConfig.batched("image"),
)
def _get_prompt_updates(
self,
mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs,
) -> Sequence[PromptUpdate]:
hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
image_token_id = hf_processor.image_token_id
assert isinstance(image_token_id, int)
def get_replacement_deepseek_vl2(item_idx: int):
images = mm_items.get_items(
"image", (ImageEmbeddingItems, ImageProcessorItems))
if isinstance(images, ImageEmbeddingItems):
num_image_tokens = images.get_feature_size(item_idx)
else:
width = images[0][-1][0][0]
height = images[0][-1][0][1]
num_image_tokens = self.info.get_num_image_tokens(
image_width=width,
image_height=height,
# flag = True,
cropping=CROP_MODE,
)
return [image_token_id] * num_image_tokens
return [
PromptReplacement(
modality="image",
target=[image_token_id],
replacement=get_replacement_deepseek_vl2,
)
]
def _cached_apply_hf_processor(
self,
prompt: Union[str, list[int]],
mm_data_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object],
) -> tuple[list[int], MultiModalKwargs, bool]:
# The processor logic is different for len(images) <= 2 vs > 2
# Since the processing cache assumes that the processor output is
# invariant of how many images are passed per prompt, we only
# perform caching for the most common case
if mm_data_items.get_count("image", strict=False) > 2:
# This code path corresponds to the cache being disabled
return self._apply_hf_processor_main(
prompt=prompt,
mm_items=mm_data_items,
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
enable_hf_prompt_update=True,
)
return super()._cached_apply_hf_processor(
prompt=prompt,
mm_data_items=mm_data_items,
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
)
@MULTIMODAL_REGISTRY.register_processor(
DeepseekOCRMultiModalProcessor,
info=DeepseekOCRProcessingInfo,
dummy_inputs=DeepseekOCRDummyInputsBuilder)
class DeepseekOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={
"language.": "language_model.",
})
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
config: DeepseekVLV2Config = vllm_config.model_config.hf_config
quant_config = vllm_config.quant_config
multimodal_config = vllm_config.model_config.multimodal_config
# config.model_type ='deepseek_vl_v2'
self.config = config
self.multimodal_config = multimodal_config
self.vision_config = config.vision_config
self.projector_config = config.projector_config
self.text_config = config.text_config
model_config = vllm_config.model_config
tokenizer = cached_tokenizer_from_config(model_config)
self.image_token_id = tokenizer.vocab[_IMAGE_TOKEN]
self.sam_model = build_sam_vit_b()
self.vision_model = build_clip_l()
n_embed = 1280
self.projector = MlpProjector(Dict(projector_type="linear", input_dim=2048, n_embed=n_embed))
self.tile_tag = config.tile_tag
self.global_view_pos = config.global_view_pos
# self.sam_model = torch.compile(self.sam_model, mode="reduce-overhead")
# self.vision_model = torch.compile(self.vision_model, mode="reduce-overhead")
# self.projector = torch.compile(self.projector, mode="max-autotune")
# special token for image token sequence format
embed_std = 1 / torch.sqrt(torch.tensor(n_embed, dtype=torch.float32))
if self.tile_tag == "2D":
# <|view_separator|>, <|\n|>
self.image_newline = nn.Parameter(torch.randn(n_embed) * embed_std)
self.view_seperator = nn.Parameter(torch.randn(n_embed) * embed_std)
else:
raise ValueError(
f"Only 2D tile_tag is supported currently, got: {self.tile_tag}"
)
if self.text_config.topk_method == "noaux_tc":
architectures = ["DeepseekV3ForCausalLM"]
elif not self.text_config.use_mla:
architectures = ["DeepseekForCausalLM"]
else:
architectures = ["DeepseekV2ForCausalLM"]
self.language_model = init_vllm_registered_model(
vllm_config=vllm_config,
hf_config=self.text_config,
prefix=maybe_prefix(prefix, "language"),
architectures=architectures,
)
self.make_empty_intermediate_tensors = (
self.language_model.make_empty_intermediate_tensors)
def _parse_and_validate_image_input(
self, **kwargs: object):
pixel_values = kwargs.pop("pixel_values", None)
images_spatial_crop = kwargs.pop("images_spatial_crop", None)
images_crop = kwargs.pop("images_crop", None)
if pixel_values is None or torch.sum(pixel_values).item() == 0:
return None
if pixel_values is not None:
if not isinstance(pixel_values, (torch.Tensor, list)):
raise ValueError("Incorrect type of pixel values. "
f"Got type: {type(pixel_values)}")
if not isinstance(images_spatial_crop, (torch.Tensor, list)):
raise ValueError("Incorrect type of image sizes. "
f"Got type: {type(images_spatial_crop)}")
if not isinstance(images_crop, (torch.Tensor, list)):
raise ValueError("Incorrect type of image crop. "
f"Got type: {type(images_crop)}")
return [pixel_values, images_crop, images_spatial_crop]
raise AssertionError("This line should be unreachable.")
def _pixel_values_to_embedding(
self,
pixel_values: torch.Tensor,
images_crop: torch.Tensor,
images_spatial_crop: torch.Tensor,
) -> NestedTensors:
# pixel_values (global view): [n_image, batch_size, 3, height, width]
# images_spatial_crop: [n_image, batch_size, [num_tiles_w, num_tiles_h]]
# images_crop (local view): [n_image, batch_size, num_patches, 3, h, w]
# split pixel_values and images_crop; batch_size is always 1
images_in_this_batch = []
# print(type(images_crop))
# print(pixel_values.shape)
with torch.no_grad():
for jdx in range(images_spatial_crop.size(0)):
# with torch.set_grad_enabled(False):
patches = images_crop[jdx][0].to(torch.bfloat16) # batch_size = 1
image_ori = pixel_values[jdx]
crop_shape = images_spatial_crop[jdx][0]
if torch.sum(patches).item() != 0: # if all values = 0, no crop
# P, C, H, W = patches.shape
# crop_flag = 1
local_features_1 = self.sam_model(patches)
#TODO del patches
# torch.compiler.cudagraph_mark_step_begin()
local_features_2 = self.vision_model(patches, local_features_1)
local_features = torch.cat((local_features_2[:, 1:], local_features_1.flatten(2).permute(0, 2, 1)), dim=-1)
local_features = self.projector(local_features)
global_features_1 = self.sam_model(image_ori)
global_features_2 = self.vision_model(image_ori, global_features_1)
global_features = torch.cat((global_features_2[:, 1:], global_features_1.flatten(2).permute(0, 2, 1)), dim=-1)
global_features = self.projector(global_features)
if PRINT_NUM_VIS_TOKENS:
print('=====================')
print('BASE: ', global_features.shape)
print('PATCHES: ', local_features.shape)
print('=====================')
_, hw, n_dim = global_features.shape
h = w = int(hw ** 0.5)
_2, hw2, n_dim2 = local_features.shape
h2 = w2 = int(hw2 ** 0.5)
width_crop_num, height_crop_num = crop_shape[0], crop_shape[1]
global_features = global_features.view(h, w, n_dim)
global_features = torch.cat(
[global_features, self.image_newline[None, None, :].expand(h, 1, n_dim)], dim=1
)
global_features = global_features.view(-1, n_dim)
local_features = local_features.view(height_crop_num, width_crop_num, h2, w2, n_dim2).permute(0, 2, 1, 3, 4).reshape(height_crop_num*h2, width_crop_num*w2, n_dim2)
local_features = torch.cat(
[local_features, self.image_newline[None, None, :].expand(height_crop_num * h2, 1, n_dim2)], dim=1
)
local_features = local_features.view(-1, n_dim2)
global_local_features = torch.cat([local_features, global_features, self.view_seperator[None, :]], dim=0)
else:
global_features_1 = self.sam_model(image_ori)
global_features_2 = self.vision_model(image_ori, global_features_1)
global_features = torch.cat((global_features_2[:, 1:], global_features_1.flatten(2).permute(0, 2, 1)), dim=-1)
global_features = self.projector(global_features)
if PRINT_NUM_VIS_TOKENS:
print('=====================')
print('BASE: ', global_features.shape)
print('NO PATCHES')
print('=====================')
_, hw, n_dim = global_features.shape
h = w = int(hw ** 0.5)
global_features = global_features.view(h, w, n_dim)
global_features = torch.cat(
[global_features, self.image_newline[None, None, :].expand(h, 1, n_dim)], dim=1
)
global_features = global_features.view(-1, n_dim)
global_local_features = torch.cat([global_features, self.view_seperator[None, :]], dim=0)
images_in_this_batch.append(global_local_features)
return images_in_this_batch
def _process_image_input(
self, image_input) -> torch.Tensor:
# image_input: [pixel_values, images_crop, images_spatial_crop]
pixel_values = image_input[0].to(torch.bfloat16)
# print(image_input[1][0].shape)
# print(type(image_input[1]))
# exit()
# images_crop = image_input[1].to(torch.bfloat16)
images_crop = image_input[1]
# images_crop = image_input[1]
images_spatial_crop = image_input[2].to(dtype=torch.long)
# local_start = time.time()
vision_features = self._pixel_values_to_embedding(
pixel_values=pixel_values, images_crop = images_crop, images_spatial_crop=images_spatial_crop)
# local_total_time = time.time() - local_start
# print('encoder_time: ', local_total_time)
# exit()
return vision_features
def get_language_model(self) -> torch.nn.Module:
return self.language_model
def get_multimodal_embeddings(
self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is None:
return None
vision_embeddings = self._process_image_input(image_input)
return vision_embeddings
def get_input_embeddings(
self,
input_ids: torch.Tensor,
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
) -> torch.Tensor:
inputs_embeds = self.language_model.get_input_embeddings(input_ids)
if multimodal_embeddings is not None:
inputs_embeds = merge_multimodal_embeddings(
input_ids, inputs_embeds, multimodal_embeddings,
self.image_token_id)
# print(len(multimodal_embeddings))
# print(input_ids.shape)
# print(type(inputs_embeds))
# print(inputs_embeds.shape)
return inputs_embeds
def forward(self,
input_ids: torch.Tensor,
positions: torch.Tensor,
intermediate_tensors: Optional[IntermediateTensors] = None,
inputs_embeds: Optional[torch.Tensor] = None,
**kwargs: object):
if intermediate_tensors is not None:
inputs_embeds = None
# NOTE: In v1, inputs_embeds is always generated at model runner, this
# condition is for v0 compatibility
elif inputs_embeds is None:
vision_embeddings = self.get_multimodal_embeddings(**kwargs)
inputs_embeds = self.get_input_embeddings(input_ids,
vision_embeddings)
input_ids = None
hidden_states = self.language_model(input_ids,
positions,
intermediate_tensors,
inputs_embeds=inputs_embeds)
return hidden_states
def compute_logits(
self,
hidden_states: torch.Tensor,
sampling_metadata: SamplingMetadata,
) -> Optional[torch.Tensor]:
return self.language_model.compute_logits(hidden_states,
sampling_metadata)
def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]:
processed_weights = []
for name, tensor in weights:
if 'sam_model' in name or 'vision_model' in name or 'projector' in name or 'image_newline' in name or 'view_seperator' in name:
new_name = name.replace('model.', '', 1)
else:
new_name = 'language.' + name
processed_weights.append((new_name, tensor))
loader = AutoWeightsLoader(self)
autoloaded_weights = loader.load_weights(processed_weights, mapper=self.hf_to_vllm_mapper)
return autoloaded_weights
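To make the placeholder accounting concrete, the arithmetic of get_num_image_tokens can be written out standalone for the Gundam setting (BASE_SIZE=1024, IMAGE_SIZE=640, patch_size=16, downsample_ratio=4); the 2x2 crop grid below is an assumed example of what count_tiles might return:

import math

base_size, image_size, patch, down = 1024, 640, 16, 4
num_width_tiles, num_height_tiles = 2, 2  # example crop grid
h = w = math.ceil((base_size // patch) / down)     # 16
h2 = w2 = math.ceil((image_size // patch) / down)  # 10
global_tokens = h * (w + 1)                        # 272 (one newline token per row)
local_tokens = (num_height_tiles * h2) * (num_width_tiles * w2 + 1)  # 420
print(global_tokens + local_tokens + 1)            # 693; the +1 is the view separator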

View File

@@ -1,502 +0,0 @@
import math
from typing import List, Tuple
import torch
import torchvision.transforms as T
from PIL import Image, ImageOps
from transformers import AutoProcessor, BatchFeature, LlamaTokenizerFast
from transformers.processing_utils import ProcessorMixin
from config import IMAGE_SIZE, BASE_SIZE, CROP_MODE, MIN_CROPS, MAX_CROPS, PROMPT, TOKENIZER
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
best_ratio_diff = float('inf')
best_ratio = (1, 1)
area = width * height
for ratio in target_ratios:
target_aspect_ratio = ratio[0] / ratio[1]
ratio_diff = abs(aspect_ratio - target_aspect_ratio)
if ratio_diff < best_ratio_diff:
best_ratio_diff = ratio_diff
best_ratio = ratio
elif ratio_diff == best_ratio_diff:
if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
best_ratio = ratio
# print(f'width: {width}, height: {height}, best_ratio: {best_ratio}')
return best_ratio
def count_tiles(orig_width, orig_height, min_num=MIN_CROPS, max_num=MAX_CROPS, image_size=640, use_thumbnail=False):
aspect_ratio = orig_width / orig_height
# calculate the existing image aspect ratio
target_ratios = set(
(i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
i * j <= max_num and i * j >= min_num)
# print(target_ratios)
target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
# find the closest aspect ratio to the target
target_aspect_ratio = find_closest_aspect_ratio(
aspect_ratio, target_ratios, orig_width, orig_height, image_size)
return target_aspect_ratio
def dynamic_preprocess(image, min_num=MIN_CROPS, max_num=MAX_CROPS, image_size=640, use_thumbnail=False):
orig_width, orig_height = image.size
aspect_ratio = orig_width / orig_height
# calculate the existing image aspect ratio
target_ratios = set(
(i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
i * j <= max_num and i * j >= min_num)
# print(target_ratios)
target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
# find the closest aspect ratio to the target
target_aspect_ratio = find_closest_aspect_ratio(
aspect_ratio, target_ratios, orig_width, orig_height, image_size)
# print(target_aspect_ratio)
# calculate the target width and height
target_width = image_size * target_aspect_ratio[0]
target_height = image_size * target_aspect_ratio[1]
blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
# resize the image
resized_img = image.resize((target_width, target_height))
processed_images = []
for i in range(blocks):
box = (
(i % (target_width // image_size)) * image_size,
(i // (target_width // image_size)) * image_size,
((i % (target_width // image_size)) + 1) * image_size,
((i // (target_width // image_size)) + 1) * image_size
)
# split the image
split_img = resized_img.crop(box)
processed_images.append(split_img)
assert len(processed_images) == blocks
if use_thumbnail and len(processed_images) != 1:
thumbnail_img = image.resize((image_size, image_size))
processed_images.append(thumbnail_img)
return processed_images, target_aspect_ratio
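# Worked example (a sketch using the defaults above): a 1280x800 page has aspect
# ratio 1.6; the closest candidate grid with MIN_CROPS=2..MAX_CROPS=6 tiles is
# (3, 2), so the image is resized to 1920x1280 and split into six 640x640 tiles
# (the padded global view is handled separately in tokenize_with_images below).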
class ImageTransform:
def __init__(self,
mean: Tuple[float, float, float] = (0.5, 0.5, 0.5),
std: Tuple[float, float, float] = (0.5, 0.5, 0.5),
normalize: bool = True):
self.mean = mean
self.std = std
self.normalize = normalize
transform_pipelines = [T.ToTensor()]
if normalize:
transform_pipelines.append(T.Normalize(mean, std))
self.transform = T.Compose(transform_pipelines)
def __call__(self, pil_img: Image.Image):
x = self.transform(pil_img)
return x
class DeepseekOCRProcessor(ProcessorMixin):
tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast")
attributes = ["tokenizer"]
def __init__(
self,
tokenizer: LlamaTokenizerFast = TOKENIZER,
candidate_resolutions: Tuple[Tuple[int, int]] = [[1024, 1024]],
patch_size: int = 16,
downsample_ratio: int = 4,
image_mean: Tuple[float, float, float] = (0.5, 0.5, 0.5),
image_std: Tuple[float, float, float] = (0.5, 0.5, 0.5),
normalize: bool = True,
image_token: str = "<image>",
pad_token: str = "<▁pad▁>",
add_special_token: bool = False,
sft_format: str = "deepseek",
mask_prompt: bool = True,
ignore_id: int = -100,
**kwargs,
):
# self.candidate_resolutions = candidate_resolutions # placeholder no use
self.image_size = IMAGE_SIZE
self.base_size = BASE_SIZE
# self.patch_size = patch_size
self.patch_size = 16
self.image_mean = image_mean
self.image_std = image_std
self.normalize = normalize
# self.downsample_ratio = downsample_ratio
self.downsample_ratio = 4
self.image_transform = ImageTransform(mean=image_mean, std=image_std, normalize=normalize)
self.tokenizer = tokenizer
# self.tokenizer = add_special_token(tokenizer)
self.tokenizer.padding_side = 'left' # must be set: the padding side makes a difference in batch inference
# add the pad_token as special token to use 'tokenizer.pad_token' and 'tokenizer.pad_token_id'
if self.tokenizer.pad_token is None:
self.tokenizer.add_special_tokens({'pad_token': pad_token})
# add image token
# image_token_id = self.tokenizer.vocab.get(image_token)
# if image_token_id is None:
# special_tokens = [image_token]
# special_tokens_dict = {"additional_special_tokens": special_tokens}
# self.tokenizer.add_special_tokens(special_tokens_dict)
self.image_token_id = self.tokenizer.vocab.get(image_token)
# add five special tokens for grounding-related tasks
# <|ref|>, <|/ref|>, <|det|>, <|/det|>, <|grounding|>
# special_tokens = ['<|ref|>', '<|/ref|>', '<|det|>', '<|/det|>', '<|grounding|>']
# special_tokens_dict = {"additional_special_tokens": special_tokens}
# special_tokens = ['<image>','<|ref|>', '<|/ref|>', '<|det|>', '<|/det|>', '<|grounding|>', '<td>', '</td>', '<tr>', '</tr>']
# special_tokens_dict = {"additional_special_tokens": special_tokens}
# self.tokenizer.add_special_tokens(special_tokens_dict)
# # add special tokens for SFT data
# special_tokens = ["<|User|>", "<|Assistant|>"]
# special_tokens_dict = {"additional_special_tokens": special_tokens}
# self.tokenizer.add_special_tokens(special_tokens_dict)
self.image_token = image_token
self.pad_token = pad_token
self.add_special_token = add_special_token
self.sft_format = sft_format
self.mask_prompt = mask_prompt
self.ignore_id = ignore_id
super().__init__(
tokenizer,
**kwargs,
)
# def select_best_resolution(self, image_size):
# # used for cropping
# original_width, original_height = image_size
# best_fit = None
# max_effective_resolution = 0
# min_wasted_resolution = float("inf")
# for width, height in self.candidate_resolutions:
# scale = min(width / original_width, height / original_height)
# downscaled_width, downscaled_height = int(
# original_width * scale), int(original_height * scale)
# effective_resolution = min(downscaled_width * downscaled_height,
# original_width * original_height)
# wasted_resolution = (width * height) - effective_resolution
# if effective_resolution > max_effective_resolution or (
# effective_resolution == max_effective_resolution
# and wasted_resolution < min_wasted_resolution):
# max_effective_resolution = effective_resolution
# min_wasted_resolution = wasted_resolution
# best_fit = (width, height)
# return best_fit
@property
def bos_id(self):
return self.tokenizer.bos_token_id
@property
def eos_id(self):
return self.tokenizer.eos_token_id
@property
def pad_id(self):
return self.tokenizer.pad_token_id
def encode(self, text: str, bos: bool = True, eos: bool = False):
t = self.tokenizer.encode(text, add_special_tokens=False)
if bos:
t = [self.bos_id] + t
if eos:
t = t + [self.eos_id]
return t
def decode(self, t: List[int], **kwargs) -> str:
return self.tokenizer.decode(t, **kwargs)
def process_one(
self,
prompt: str,
images: List,
inference_mode: bool = True,
**kwargs,
):
"""
Args:
prompt (str): the formatted prompt;
conversations (List[Dict]): conversations with a list of messages;
images (List[ImageType]): the list of images;
inference_mode (bool): if True, then remove the last eos token;
system_prompt (str): the system prompt;
**kwargs:
Returns:
outputs (BaseProcessorOutput): the output of the processor,
- input_ids (torch.LongTensor): [N + image tokens]
- target_ids (torch.LongTensor): [N + image tokens]
- pixel_values (torch.FloatTensor): [n_patches, 3, H, W]
- image_id (int): the id of the image token
- num_image_tokens (List[int]): the number of image tokens
"""
assert (prompt is not None and images is not None
), "prompt and images must be used at the same time."
sft_format = prompt
input_ids, pixel_values, images_crop, images_seq_mask, images_spatial_crop, num_image_tokens, _ = images[0]
return {
"input_ids": input_ids,
"pixel_values": pixel_values,
"images_crop": images_crop,
"images_seq_mask": images_seq_mask,
"images_spatial_crop": images_spatial_crop,
"num_image_tokens": num_image_tokens,
}
# prepare = BatchFeature(
# data=dict(
# input_ids=input_ids,
# pixel_values=pixel_values,
# images_crop = images_crop,
# images_seq_mask=images_seq_mask,
# images_spatial_crop=images_spatial_crop,
# num_image_tokens=num_image_tokens,
# ),
# tensor_type="pt",
# )
# return prepare
def __call__(
self,
*,
prompt: str,
images: List,
inference_mode: bool = True,
**kwargs,
):
"""
Args:
prompt (str): the formatted prompt;
images (List[ImageType]): the list of images;
inference_mode (bool): if True, then remove the last eos token;
**kwargs:
Returns:
outputs (BaseProcessorOutput): the output of the processor,
- input_ids (torch.LongTensor): [N + image tokens]
- images (torch.FloatTensor): [n_images, 3, H, W]
- image_id (int): the id of the image token
- num_image_tokens (List[int]): the number of image tokens
"""
prepare = self.process_one(
prompt=prompt,
images=images,
inference_mode=inference_mode,
)
return prepare
def tokenize_with_images(
self,
# conversation: str,
images: List[Image.Image],
bos: bool = True,
eos: bool = True,
cropping: bool = True,
):
"""Tokenize text with <image> tags."""
# print(conversation)
conversation = PROMPT
assert conversation.count(self.image_token) == len(images)
text_splits = conversation.split(self.image_token)
images_list, images_crop_list, images_seq_mask, images_spatial_crop = [], [], [], []
image_shapes = []
num_image_tokens = []
tokenized_str = []
# print('image: ', len(images))
for text_sep, image in zip(text_splits, images):
"""encode text_sep"""
tokenized_sep = self.encode(text_sep, bos=False, eos=False)
tokenized_str += tokenized_sep
images_seq_mask += [False] * len(tokenized_sep)
"""select best resolution for anyres"""
# if cropping:
# best_width, best_height = self.select_best_resolution(image.size)
# else:
# best_width, best_height = self.image_size, self.image_size
image_shapes.append(image.size)
if image.size[0] <= 640 and image.size[1] <= 640:
crop_ratio = [1, 1]
else:
if cropping:
# print('image-size: ', image.size)
# best_width, best_height = select_best_resolution(image.size, self.candidate_resolutions)
# print('image ', image.size)
# print('open_size:', image.size)
images_crop_raw, crop_ratio = dynamic_preprocess(image, image_size=IMAGE_SIZE)
# print('crop_ratio: ', crop_ratio)
else:
# best_width, best_height = self.image_size, self.image_size
crop_ratio = [1, 1]
# print(image.size, (best_width, best_height)) # check the select_best_resolutions func
# print(crop_ratio)
"""process the global view"""
# if cropping
if self.image_size <= 640 and not cropping:
# print('directly resize')
image = image.resize((self.image_size, self.image_size))
global_view = ImageOps.pad(image, (self.base_size, self.base_size),
color=tuple(int(x * 255) for x in self.image_transform.mean))
images_list.append(self.image_transform(global_view))
"""record height / width crop num"""
# width_crop_num, height_crop_num = best_width // self.image_size, best_height // self.image_size
num_width_tiles, num_height_tiles = crop_ratio
images_spatial_crop.append([num_width_tiles, num_height_tiles])
if num_width_tiles > 1 or num_height_tiles > 1:
"""process the local views"""
# local_view = ImageOps.pad(image, (best_width, best_height),
# color=tuple(int(x * 255) for x in self.image_transform.mean))
# for i in range(0, best_height, self.image_size):
# for j in range(0, best_width, self.image_size):
# images_crop_list.append(
# self.image_transform(local_view.crop((j, i, j + self.image_size, i + self.image_size))))
for i in range(len(images_crop_raw)):
images_crop_list.append(self.image_transform(images_crop_raw[i]))
# """process the global view"""
# global_view = ImageOps.pad(image, (self.image_size, self.image_size),
# color=tuple(int(x * 255) for x in self.image_transform.mean))
# images_list.append(self.image_transform(global_view))
# """process the local views"""
# local_view = ImageOps.pad(image, (best_width, best_height),
# color=tuple(int(x * 255) for x in self.image_transform.mean))
# for i in range(0, best_height, self.image_size):
# for j in range(0, best_width, self.image_size):
# images_list.append(
# self.image_transform(local_view.crop((j, i, j + self.image_size, i + self.image_size))))
# """add image tokens"""
"""add image tokens"""
num_queries = math.ceil((self.image_size // self.patch_size) / self.downsample_ratio)
num_queries_base = math.ceil((self.base_size // self.patch_size) / self.downsample_ratio)
tokenized_image = ([self.image_token_id] * num_queries_base + [self.image_token_id]) * num_queries_base
tokenized_image += [self.image_token_id]
if num_width_tiles > 1 or num_height_tiles > 1:
tokenized_image += ([self.image_token_id] * (num_queries * num_width_tiles) + [self.image_token_id]) * (
num_queries * num_height_tiles)
tokenized_str += tokenized_image
images_seq_mask += [True] * len(tokenized_image)
num_image_tokens.append(len(tokenized_image))
"""process the last text split"""
tokenized_sep = self.encode(text_splits[-1], bos=False, eos=False)
tokenized_str += tokenized_sep
images_seq_mask += [False] * len(tokenized_sep)
"""add the bos and eos tokens"""
if bos:
tokenized_str = [self.bos_id] + tokenized_str
images_seq_mask = [False] + images_seq_mask
if eos:
tokenized_str = tokenized_str + [self.eos_id]
images_seq_mask = images_seq_mask + [False]
assert len(tokenized_str) == len(
images_seq_mask), f"tokenize_with_images func: tokenized_str's length {len(tokenized_str)} is not equal to images_seq_mask's length {len(images_seq_mask)}"
masked_tokenized_str = []
for token_index in tokenized_str:
if token_index != self.image_token_id:
masked_tokenized_str.append(token_index)
else:
masked_tokenized_str.append(self.ignore_id)
assert len(tokenized_str) == len(images_seq_mask) == len(masked_tokenized_str), \
(f"tokenized_str's length {len(tokenized_str)}, input_ids' length {len(masked_tokenized_str)}, "
f"imags_seq_mask's length {len(images_seq_mask)}, are not equal")
input_ids = torch.LongTensor(tokenized_str)
target_ids = torch.LongTensor(masked_tokenized_str)
images_seq_mask = torch.tensor(images_seq_mask, dtype=torch.bool)
# set input_ids < 0 | input_ids == self.image_token_id as ignore_id
target_ids[(input_ids < 0) |
(input_ids == self.image_token_id)] = self.ignore_id
input_ids[input_ids < 0] = self.pad_id
inference_mode = True
if inference_mode:
# Remove the ending eos token
assert input_ids[-1] == self.eos_id
input_ids = input_ids[:-1]
target_ids = target_ids[:-1]
images_seq_mask = images_seq_mask[:-1]
if len(images_list) == 0:
pixel_values = torch.zeros((1, 3, self.base_size, self.base_size))
images_spatial_crop = torch.zeros((1, 1), dtype=torch.long)
images_crop = torch.zeros((1, 3, self.image_size, self.image_size)).unsqueeze(0)
else:
pixel_values = torch.stack(images_list, dim=0)
images_spatial_crop = torch.tensor(images_spatial_crop, dtype=torch.long)
if images_crop_list:
images_crop = torch.stack(images_crop_list, dim=0).unsqueeze(0)
else:
images_crop = torch.zeros((1, 3, self.image_size, self.image_size)).unsqueeze(0)
input_ids = input_ids.unsqueeze(0)
return [[input_ids, pixel_values, images_crop, images_seq_mask, images_spatial_crop, num_image_tokens, image_shapes]]
AutoProcessor.register("DeepseekVLV2Processor", DeepseekOCRProcessor)
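# Token-count sketch for the Gundam defaults (BASE_SIZE=1024, IMAGE_SIZE=640,
# patch_size=16, downsample_ratio=4):
#   num_queries_base = ceil((1024 // 16) / 4) = 16 -> global view: (16 + 1) * 16 + 1 = 273 image tokens
#   num_queries      = ceil((640  // 16) / 4) = 10 -> a 3x2 crop grid adds (10 * 3 + 1) * (10 * 2) = 620 tokens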

View File

@@ -1,40 +0,0 @@
import torch
from transformers import LogitsProcessor
from transformers.generation.logits_process import _calc_banned_ngram_tokens
from typing import List, Set
class NoRepeatNGramLogitsProcessor(LogitsProcessor):
def __init__(self, ngram_size: int, window_size: int = 100, whitelist_token_ids: set = None):
if not isinstance(ngram_size, int) or ngram_size <= 0:
raise ValueError(f"`ngram_size` has to be a strictly positive integer, but is {ngram_size}")
if not isinstance(window_size, int) or window_size <= 0:
raise ValueError(f"`window_size` has to be a strictly positive integer, but is {window_size}")
self.ngram_size = ngram_size
self.window_size = window_size
self.whitelist_token_ids = whitelist_token_ids or set()
def __call__(self, input_ids: List[int], scores: torch.FloatTensor) -> torch.FloatTensor:
if len(input_ids) < self.ngram_size:
return scores
current_prefix = tuple(input_ids[-(self.ngram_size - 1):])
search_start = max(0, len(input_ids) - self.window_size)
search_end = len(input_ids) - self.ngram_size + 1
banned_tokens = set()
for i in range(search_start, search_end):
ngram = tuple(input_ids[i:i + self.ngram_size])
if ngram[:-1] == current_prefix:
banned_tokens.add(ngram[-1])
banned_tokens = banned_tokens - self.whitelist_token_ids
if banned_tokens:
scores = scores.clone()
for token in banned_tokens:
scores[token] = -float("inf")
return scores
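# Toy example (hypothetical token ids): with ngram_size=3 and generated ids
# [..., 7, 8, 9, 7, 8], the current prefix is (7, 8); the earlier occurrence
# (7, 8, 9) inside the last `window_size` tokens bans id 9 for the next step,
# unless 9 is in `whitelist_token_ids`.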

View File

@@ -1,10 +0,0 @@
transformers
tokenizers
PyMuPDF
img2pdf
einops
easydict
addict
Pillow
numpy
matplotlib

View File

@@ -1,161 +0,0 @@
import os
import re
from tqdm import tqdm
import torch
if torch.version.cuda == '11.8':
os.environ["TRITON_PTXAS_PATH"] = "/usr/local/cuda-11.8/bin/ptxas"
os.environ['VLLM_USE_V1'] = '0'
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
from config import MODEL_PATH, INPUT_PATH, OUTPUT_PATH, PROMPT, MAX_CONCURRENCY, CROP_MODE, NUM_WORKERS
from concurrent.futures import ThreadPoolExecutor
import glob
from PIL import Image
from deepseek_ocr import DeepseekOCRForCausalLM
from vllm.model_executor.models.registry import ModelRegistry
from vllm import LLM, SamplingParams
from process.ngram_norepeat import NoRepeatNGramLogitsProcessor
from process.image_process import DeepseekOCRProcessor
ModelRegistry.register_model("DeepseekOCRForCausalLM", DeepseekOCRForCausalLM)
llm = LLM(
model=MODEL_PATH,
hf_overrides={"architectures": ["DeepseekOCRForCausalLM"]},
block_size=256,
enforce_eager=False,
trust_remote_code=True,
max_model_len=8192,
swap_space=0,
max_num_seqs = MAX_CONCURRENCY,
tensor_parallel_size=1,
gpu_memory_utilization=0.9,
)
logits_processors = [NoRepeatNGramLogitsProcessor(ngram_size=40, window_size=90, whitelist_token_ids={128821, 128822})] # small window for speed; whitelist ids 128821/128822 are <td> and </td>
sampling_params = SamplingParams(
temperature=0.0,
max_tokens=8192,
logits_processors=logits_processors,
skip_special_tokens=False,
)
class Colors:
RED = '\033[31m'
GREEN = '\033[32m'
YELLOW = '\033[33m'
BLUE = '\033[34m'
RESET = '\033[0m'
def clean_formula(text):
formula_pattern = r'\\\[(.*?)\\\]'
def process_formula(match):
formula = match.group(1)
formula = re.sub(r'\\quad\s*\([^)]*\)', '', formula)
formula = formula.strip()
return r'\[' + formula + r'\]'
cleaned_text = re.sub(formula_pattern, process_formula, text)
return cleaned_text
def re_match(text):
pattern = r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)'
matches = re.findall(pattern, text, re.DOTALL)
# mathes_image = []
mathes_other = []
for a_match in matches:
mathes_other.append(a_match[0])
return matches, mathes_other
def process_single_image(image):
"""single image"""
prompt_in = prompt
cache_item = {
"prompt": prompt_in,
"multi_modal_data": {"image": DeepseekOCRProcessor().tokenize_with_images(images = [image], bos=True, eos=True, cropping=CROP_MODE)},
}
return cache_item
if __name__ == "__main__":
# INPUT_PATH = OmniDocBench images path
os.makedirs(OUTPUT_PATH, exist_ok=True)
# print('image processing until processing prompts.....')
print(f'{Colors.RED}glob images.....{Colors.RESET}')
images_path = glob.glob(f'{INPUT_PATH}/*')
images = []
for image_path in images_path:
image = Image.open(image_path).convert('RGB')
images.append(image)
prompt = PROMPT
# batch_inputs = []
# for image in tqdm(images):
# prompt_in = prompt
# cache_list = [
# {
# "prompt": prompt_in,
# "multi_modal_data": {"image": Image.open(image).convert('RGB')},
# }
# ]
# batch_inputs.extend(cache_list)
with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
batch_inputs = list(tqdm(
executor.map(process_single_image, images),
total=len(images),
desc="Pre-processed images"
))
outputs_list = llm.generate(
batch_inputs,
sampling_params=sampling_params
)
output_path = OUTPUT_PATH
os.makedirs(output_path, exist_ok=True)
for output, image in zip(outputs_list, images_path):
content = output.outputs[0].text
mmd_det_path = output_path + image.split('/')[-1].replace('.jpg', '_det.md')
with open(mmd_det_path, 'w', encoding='utf-8') as afile:
afile.write(content)
content = clean_formula(content)
matches_ref, mathes_other = re_match(content)
for idx, a_match_other in enumerate(tqdm(mathes_other, desc="other")):
content = content.replace(a_match_other, '').replace('\n\n\n\n', '\n\n').replace('\n\n\n', '\n\n').replace('<center>', '').replace('</center>', '')
mmd_path = output_path + image.split('/')[-1].replace('.jpg', '.md')
with open(mmd_path, 'w', encoding='utf-8') as afile:
afile.write(content)

View File

@@ -1,303 +0,0 @@
import asyncio
import re
import os
import torch
if torch.version.cuda == '11.8':
os.environ["TRITON_PTXAS_PATH"] = "/usr/local/cuda-11.8/bin/ptxas"
os.environ['VLLM_USE_V1'] = '0'
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
from vllm import AsyncLLMEngine, SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.model_executor.models.registry import ModelRegistry
import time
from deepseek_ocr import DeepseekOCRForCausalLM
from PIL import Image, ImageDraw, ImageFont, ImageOps
import numpy as np
from tqdm import tqdm
from process.ngram_norepeat import NoRepeatNGramLogitsProcessor
from process.image_process import DeepseekOCRProcessor
from config import MODEL_PATH, INPUT_PATH, OUTPUT_PATH, PROMPT, CROP_MODE
ModelRegistry.register_model("DeepseekOCRForCausalLM", DeepseekOCRForCausalLM)
def load_image(image_path):
try:
image = Image.open(image_path)
corrected_image = ImageOps.exif_transpose(image)
return corrected_image
except Exception as e:
print(f"error: {e}")
try:
return Image.open(image_path)
except:
return None
def re_match(text):
pattern = r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)'
matches = re.findall(pattern, text, re.DOTALL)
mathes_image = []
mathes_other = []
for a_match in matches:
if '<|ref|>image<|/ref|>' in a_match[0]:
mathes_image.append(a_match[0])
else:
mathes_other.append(a_match[0])
return matches, mathes_image, mathes_other
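# A grounding span produced by the model looks roughly like
# <|ref|>title<|/ref|><|det|>[[x1, y1, x2, y2]]<|/det|>, with box coordinates on
# a 0-999 grid that draw_bounding_boxes below rescales to pixel positions.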
def extract_coordinates_and_label(ref_text, image_width, image_height):
try:
label_type = ref_text[1]
cor_list = eval(ref_text[2])
except Exception as e:
print(e)
return None
return (label_type, cor_list)
def draw_bounding_boxes(image, refs):
image_width, image_height = image.size
img_draw = image.copy()
draw = ImageDraw.Draw(img_draw)
overlay = Image.new('RGBA', img_draw.size, (0, 0, 0, 0))
draw2 = ImageDraw.Draw(overlay)
# fall back to the default bitmap font
font = ImageFont.load_default()
img_idx = 0
for i, ref in enumerate(refs):
try:
result = extract_coordinates_and_label(ref, image_width, image_height)
if result:
label_type, points_list = result
color = (np.random.randint(0, 200), np.random.randint(0, 200), np.random.randint(0, 255))
color_a = color + (20, )
for points in points_list:
x1, y1, x2, y2 = points
x1 = int(x1 / 999 * image_width)
y1 = int(y1 / 999 * image_height)
x2 = int(x2 / 999 * image_width)
y2 = int(y2 / 999 * image_height)
if label_type == 'image':
try:
cropped = image.crop((x1, y1, x2, y2))
cropped.save(f"{OUTPUT_PATH}/images/{img_idx}.jpg")
except Exception as e:
print(e)
pass
img_idx += 1
try:
if label_type == 'title':
draw.rectangle([x1, y1, x2, y2], outline=color, width=4)
draw2.rectangle([x1, y1, x2, y2], fill=color_a, outline=(0, 0, 0, 0), width=1)
else:
draw.rectangle([x1, y1, x2, y2], outline=color, width=2)
draw2.rectangle([x1, y1, x2, y2], fill=color_a, outline=(0, 0, 0, 0), width=1)
text_x = x1
text_y = max(0, y1 - 15)
text_bbox = draw.textbbox((0, 0), label_type, font=font)
text_width = text_bbox[2] - text_bbox[0]
text_height = text_bbox[3] - text_bbox[1]
draw.rectangle([text_x, text_y, text_x + text_width, text_y + text_height],
fill=(255, 255, 255, 30))
draw.text((text_x, text_y), label_type, font=font, fill=color)
except:
pass
except:
continue
img_draw.paste(overlay, (0, 0), overlay)
return img_draw
def process_image_with_refs(image, ref_texts):
result_image = draw_bounding_boxes(image, ref_texts)
return result_image
async def stream_generate(image=None, prompt=''):
engine_args = AsyncEngineArgs(
model=MODEL_PATH,
hf_overrides={"architectures": ["DeepseekOCRForCausalLM"]},
block_size=256,
max_model_len=8192,
enforce_eager=False,
trust_remote_code=True,
tensor_parallel_size=1,
gpu_memory_utilization=0.75,
)
engine = AsyncLLMEngine.from_engine_args(engine_args)
logits_processors = [NoRepeatNGramLogitsProcessor(ngram_size=30, window_size=90, whitelist_token_ids= {128821, 128822})] #whitelist: <td>, </td>
sampling_params = SamplingParams(
temperature=0.0,
max_tokens=8192,
logits_processors=logits_processors,
skip_special_tokens=False,
# ignore_eos=False,
)
request_id = f"request-{int(time.time())}"
printed_length = 0
if image and '<image>' in prompt:
request = {
"prompt": prompt,
"multi_modal_data": {"image": image}
}
elif prompt:
request = {
"prompt": prompt
}
else:
raise ValueError('prompt must not be empty')
async for request_output in engine.generate(
request, sampling_params, request_id
):
if request_output.outputs:
full_text = request_output.outputs[0].text
new_text = full_text[printed_length:]
print(new_text, end='', flush=True)
printed_length = len(full_text)
final_output = full_text
print('\n')
return final_output
if __name__ == "__main__":
os.makedirs(OUTPUT_PATH, exist_ok=True)
os.makedirs(f'{OUTPUT_PATH}/images', exist_ok=True)
image = load_image(INPUT_PATH).convert('RGB')
if '<image>' in PROMPT:
image_features = DeepseekOCRProcessor().tokenize_with_images(images = [image], bos=True, eos=True, cropping=CROP_MODE)
else:
image_features = ''
prompt = PROMPT
result_out = asyncio.run(stream_generate(image_features, prompt))
save_results = 1
if save_results and '<image>' in prompt:
print('='*15 + 'save results:' + '='*15)
image_draw = image.copy()
outputs = result_out
with open(f'{OUTPUT_PATH}/result_ori.mmd', 'w', encoding = 'utf-8') as afile:
afile.write(outputs)
matches_ref, matches_images, mathes_other = re_match(outputs)
# print(matches_ref)
result = process_image_with_refs(image_draw, matches_ref)
for idx, a_match_image in enumerate(tqdm(matches_images, desc="image")):
outputs = outputs.replace(a_match_image, f'![](images/' + str(idx) + '.jpg)\n')
for idx, a_match_other in enumerate(tqdm(mathes_other, desc="other")):
outputs = outputs.replace(a_match_other, '').replace('\\coloneqq', ':=').replace('\\eqqcolon', '=:')
# if 'structural formula' in conversation[0]['content']:
# outputs = '<smiles>' + outputs + '</smiles>'
with open(f'{OUTPUT_PATH}/result.mmd', 'w', encoding = 'utf-8') as afile:
afile.write(outputs)
if 'line_type' in outputs:
import matplotlib.pyplot as plt
from matplotlib.patches import Circle
lines = eval(outputs)['Line']['line']
line_type = eval(outputs)['Line']['line_type']
# print(lines)
endpoints = eval(outputs)['Line']['line_endpoint']
fig, ax = plt.subplots(figsize=(3,3), dpi=200)
ax.set_xlim(-15, 15)
ax.set_ylim(-15, 15)
for idx, line in enumerate(lines):
try:
p0 = eval(line.split(' -- ')[0])
p1 = eval(line.split(' -- ')[-1])
if line_type[idx] == '--':
ax.plot([p0[0], p1[0]], [p0[1], p1[1]], linewidth=0.8, color='k')
else:
ax.plot([p0[0], p1[0]], [p0[1], p1[1]], linewidth = 0.8, color = 'k')
ax.scatter(p0[0], p0[1], s=5, color = 'k')
ax.scatter(p1[0], p1[1], s=5, color = 'k')
except:
pass
for endpoint in endpoints:
label = endpoint.split(': ')[0]
(x, y) = eval(endpoint.split(': ')[1])
ax.annotate(label, (x, y), xytext=(1, 1), textcoords='offset points',
fontsize=5, fontweight='light')
try:
if 'Circle' in eval(outputs).keys():
circle_centers = eval(outputs)['Circle']['circle_center']
radius = eval(outputs)['Circle']['radius']
for center, r in zip(circle_centers, radius):
center = eval(center.split(': ')[1])
circle = Circle(center, radius=r, fill=False, edgecolor='black', linewidth=0.8)
ax.add_patch(circle)
except:
pass
plt.savefig(f'{OUTPUT_PATH}/geo.jpg')
plt.close()
result.save(f'{OUTPUT_PATH}/result_with_boxes.jpg')

View File

@@ -1,330 +0,0 @@
import os
import fitz
import img2pdf
import io
import re
from tqdm import tqdm
import torch
from concurrent.futures import ThreadPoolExecutor
if torch.version.cuda == '11.8':
os.environ["TRITON_PTXAS_PATH"] = "/usr/local/cuda-11.8/bin/ptxas"
os.environ['VLLM_USE_V1'] = '0'
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
from config import MODEL_PATH, INPUT_PATH, OUTPUT_PATH, PROMPT, SKIP_REPEAT, MAX_CONCURRENCY, NUM_WORKERS, CROP_MODE
from PIL import Image, ImageDraw, ImageFont
import numpy as np
from deepseek_ocr import DeepseekOCRForCausalLM
from vllm.model_executor.models.registry import ModelRegistry
from vllm import LLM, SamplingParams
from process.ngram_norepeat import NoRepeatNGramLogitsProcessor
from process.image_process import DeepseekOCRProcessor
ModelRegistry.register_model("DeepseekOCRForCausalLM", DeepseekOCRForCausalLM)
llm = LLM(
model=MODEL_PATH,
hf_overrides={"architectures": ["DeepseekOCRForCausalLM"]},
block_size=256,
enforce_eager=False,
trust_remote_code=True,
max_model_len=8192,
swap_space=0,
max_num_seqs=MAX_CONCURRENCY,
tensor_parallel_size=1,
gpu_memory_utilization=0.9,
disable_mm_preprocessor_cache=True
)
logits_processors = [NoRepeatNGramLogitsProcessor(ngram_size=20, window_size=50, whitelist_token_ids={128821, 128822})] # small window for speed; whitelist ids 128821/128822 are <td> and </td>
sampling_params = SamplingParams(
temperature=0.0,
max_tokens=8192,
logits_processors=logits_processors,
skip_special_tokens=False,
include_stop_str_in_output=True,
)
class Colors:
RED = '\033[31m'
GREEN = '\033[32m'
YELLOW = '\033[33m'
BLUE = '\033[34m'
RESET = '\033[0m'
def pdf_to_images_high_quality(pdf_path, dpi=144, image_format="PNG"):
"""
Render each PDF page to a PIL image at the given DPI.
"""
images = []
pdf_document = fitz.open(pdf_path)
zoom = dpi / 72.0
matrix = fitz.Matrix(zoom, zoom)
for page_num in range(pdf_document.page_count):
page = pdf_document[page_num]
pixmap = page.get_pixmap(matrix=matrix, alpha=False)
Image.MAX_IMAGE_PIXELS = None
if image_format.upper() == "PNG":
img_data = pixmap.tobytes("png")
img = Image.open(io.BytesIO(img_data))
else:
img_data = pixmap.tobytes("png")
img = Image.open(io.BytesIO(img_data))
if img.mode in ('RGBA', 'LA'):
background = Image.new('RGB', img.size, (255, 255, 255))
background.paste(img, mask=img.split()[-1] if img.mode == 'RGBA' else None)
img = background
images.append(img)
pdf_document.close()
return images
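# e.g. dpi=144 gives zoom = 144 / 72 = 2.0, so each page is rasterized at twice
# its 72-dpi point size (an A4 page of ~595x842 pt becomes ~1190x1684 px).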
def pil_to_pdf_img2pdf(pil_images, output_path):
if not pil_images:
return
image_bytes_list = []
for img in pil_images:
if img.mode != 'RGB':
img = img.convert('RGB')
img_buffer = io.BytesIO()
img.save(img_buffer, format='JPEG', quality=95)
img_bytes = img_buffer.getvalue()
image_bytes_list.append(img_bytes)
try:
pdf_bytes = img2pdf.convert(image_bytes_list)
with open(output_path, "wb") as f:
f.write(pdf_bytes)
except Exception as e:
print(f"error: {e}")
def re_match(text):
pattern = r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)'
matches = re.findall(pattern, text, re.DOTALL)
mathes_image = []
mathes_other = []
for a_match in matches:
if '<|ref|>image<|/ref|>' in a_match[0]:
mathes_image.append(a_match[0])
else:
mathes_other.append(a_match[0])
return matches, mathes_image, mathes_other
def extract_coordinates_and_label(ref_text, image_width, image_height):
try:
label_type = ref_text[1]
cor_list = eval(ref_text[2])
except Exception as e:
print(e)
return None
return (label_type, cor_list)
def draw_bounding_boxes(image, refs, jdx):
image_width, image_height = image.size
img_draw = image.copy()
draw = ImageDraw.Draw(img_draw)
overlay = Image.new('RGBA', img_draw.size, (0, 0, 0, 0))
draw2 = ImageDraw.Draw(overlay)
# fall back to the default bitmap font
font = ImageFont.load_default()
img_idx = 0
for i, ref in enumerate(refs):
try:
result = extract_coordinates_and_label(ref, image_width, image_height)
if result:
label_type, points_list = result
color = (np.random.randint(0, 200), np.random.randint(0, 200), np.random.randint(0, 255))
color_a = color + (20, )
for points in points_list:
x1, y1, x2, y2 = points
x1 = int(x1 / 999 * image_width)
y1 = int(y1 / 999 * image_height)
x2 = int(x2 / 999 * image_width)
y2 = int(y2 / 999 * image_height)
if label_type == 'image':
try:
cropped = image.crop((x1, y1, x2, y2))
cropped.save(f"{OUTPUT_PATH}/images/{jdx}_{img_idx}.jpg")
except Exception as e:
print(e)
pass
img_idx += 1
try:
if label_type == 'title':
draw.rectangle([x1, y1, x2, y2], outline=color, width=4)
draw2.rectangle([x1, y1, x2, y2], fill=color_a, outline=(0, 0, 0, 0), width=1)
else:
draw.rectangle([x1, y1, x2, y2], outline=color, width=2)
draw2.rectangle([x1, y1, x2, y2], fill=color_a, outline=(0, 0, 0, 0), width=1)
text_x = x1
text_y = max(0, y1 - 15)
text_bbox = draw.textbbox((0, 0), label_type, font=font)
text_width = text_bbox[2] - text_bbox[0]
text_height = text_bbox[3] - text_bbox[1]
draw.rectangle([text_x, text_y, text_x + text_width, text_y + text_height],
fill=(255, 255, 255, 30))
draw.text((text_x, text_y), label_type, font=font, fill=color)
except:
pass
except:
continue
img_draw.paste(overlay, (0, 0), overlay)
return img_draw
def process_image_with_refs(image, ref_texts, jdx):
result_image = draw_bounding_boxes(image, ref_texts, jdx)
return result_image
def process_single_image(image):
"""single image"""
prompt_in = prompt
cache_item = {
"prompt": prompt_in,
"multi_modal_data": {"image": DeepseekOCRProcessor().tokenize_with_images(images = [image], bos=True, eos=True, cropping=CROP_MODE)},
}
return cache_item
if __name__ == "__main__":
os.makedirs(OUTPUT_PATH, exist_ok=True)
os.makedirs(f'{OUTPUT_PATH}/images', exist_ok=True)
print(f'{Colors.RED}PDF loading .....{Colors.RESET}')
images = pdf_to_images_high_quality(INPUT_PATH)
prompt = PROMPT
# batch_inputs = []
with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
batch_inputs = list(tqdm(
executor.map(process_single_image, images),
total=len(images),
desc="Pre-processed images"
))
# for image in tqdm(images):
# prompt_in = prompt
# cache_list = [
# {
# "prompt": prompt_in,
# "multi_modal_data": {"image": DeepseekOCRProcessor().tokenize_with_images(images = [image], bos=True, eos=True, cropping=CROP_MODE)},
# }
# ]
# batch_inputs.extend(cache_list)
outputs_list = llm.generate(
batch_inputs,
sampling_params=sampling_params
)
output_path = OUTPUT_PATH
os.makedirs(output_path, exist_ok=True)
mmd_det_path = output_path + '/' + INPUT_PATH.split('/')[-1].replace('.pdf', '_det.mmd')
mmd_path = output_path + '/' + INPUT_PATH.split('/')[-1].replace('pdf', 'mmd')
pdf_out_path = output_path + '/' + INPUT_PATH.split('/')[-1].replace('.pdf', '_layouts.pdf')
contents_det = ''
contents = ''
draw_images = []
jdx = 0
for output, img in zip(outputs_list, images):
content = output.outputs[0].text
if '<end▁of▁sentence>' in content: # outputs missing the EOS token are treated as repetition failures below
content = content.replace('<end▁of▁sentence>', '')
else:
if SKIP_REPEAT:
continue
page_num = f'\n<--- Page Split --->'
contents_det += content + f'\n{page_num}\n'
image_draw = img.copy()
matches_ref, matches_images, mathes_other = re_match(content)
# print(matches_ref)
result_image = process_image_with_refs(image_draw, matches_ref, jdx)
draw_images.append(result_image)
for idx, a_match_image in enumerate(matches_images):
content = content.replace(a_match_image, f'![](images/' + str(jdx) + '_' + str(idx) + '.jpg)\n')
for idx, a_match_other in enumerate(mathes_other):
content = content.replace(a_match_other, '').replace('\\coloneqq', ':=').replace('\\eqqcolon', '=:').replace('\n\n\n\n', '\n\n').replace('\n\n\n', '\n\n')
contents += content + f'\n{page_num}\n'
jdx += 1
with open(mmd_det_path, 'w', encoding='utf-8') as afile:
afile.write(contents_det)
with open(mmd_path, 'w', encoding='utf-8') as afile:
afile.write(contents)
pil_to_pdf_img2pdf(draw_images, pdf_out_path)

View File

@@ -1,48 +0,0 @@
# PyTorch 2.6.0 + CUDA 12.6 + cuDNN9
FROM pytorch/pytorch:2.6.0-cuda12.6-cudnn9-devel
ENV DEBIAN_FRONTEND=noninteractive \
HF_HOME=/workspace/.cache/huggingface \
CUDA_HOME=/usr/local/cuda \
LD_LIBRARY_PATH=/usr/local/cuda/lib64:${LD_LIBRARY_PATH} \
PIP_DISABLE_PIP_VERSION_CHECK=1 \
PYTHONUNBUFFERED=1 \
HF_HUB_DISABLE_TELEMETRY=1
ARG TORCH_CUDA_ARCH_LIST=80
ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
WORKDIR /workspace
# Build tools
RUN apt-get update && apt-get install -y --no-install-recommends \
git build-essential ninja-build cmake \
&& rm -rf /var/lib/apt/lists/*
RUN python -m pip install -U pip setuptools wheel packaging ninja
# Explicit reinstall (for version consistency)
RUN pip install --no-cache-dir \
torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0
# Stack compatible with DeepSeek-OCR (HF)
RUN pip install --no-cache-dir \
"transformers==4.43.3" \
"accelerate==0.33.0" \
"tokenizers==0.19.1" \
"numpy==1.26.4" \
"safetensors>=0.4.2" \
"einops" "timm>=0.9"
# flash-attn 2.7.3 (built against CUDA 12.6)
RUN pip install --no-cache-dir --no-build-isolation --no-binary=flash-attn flash-attn==2.7.3
# Remove vLLM if it sneaks in (avoid conflicts)
RUN pip uninstall -y vllm || true
# Pin an OpenCV build compatible with numpy 1.26
RUN pip install --no-cache-dir "opencv-python-headless==4.8.1.78"
# Application source
COPY DeepSeek-OCR-master/DeepSeek-OCR-hf/ /workspace/DeepSeek-OCR-hf/

View File

@@ -1,39 +0,0 @@
# PyTorch 2.6.0 + CUDA 12.6 + cuDNN9
FROM pytorch/pytorch:2.6.0-cuda12.6-cudnn9-devel
# Default environment variables
ENV DEBIAN_FRONTEND=noninteractive \
HF_HOME=/workspace/.cache/huggingface \
CUDA_HOME=/usr/local/cuda \
LD_LIBRARY_PATH=/usr/local/cuda/lib64:${LD_LIBRARY_PATH} \
PIP_DISABLE_PIP_VERSION_CHECK=1 \
PYTHONUNBUFFERED=1 \
TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas \
TORCH_CUDA_ARCH_LIST="8.0"
WORKDIR /workspace
# Install required build tools
RUN apt-get update && apt-get install -y --no-install-recommends \
git build-essential ninja-build \
&& rm -rf /var/lib/apt/lists/*
# Upgrade pip
RUN python -m pip install -U pip setuptools wheel
# Remove existing libraries and reinstall pinned versions
RUN pip uninstall -y vllm torch torchvision torchaudio triton flash-attn || true
RUN pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0
# Install project dependencies
COPY requirements.txt /tmp/requirements.txt
RUN pip install -r /tmp/requirements.txt
# Install a pinned vLLM version
RUN pip install vllm==0.8.5
# Build and install FlashAttention from source
RUN pip cache purge && \
pip install --no-cache-dir --no-build-isolation --no-binary=flash-attn flash-attn==2.7.3
WORKDIR /workspace

View File

@@ -1,37 +0,0 @@
version: '3.8'
services:
api_gateway:
build:
context: ./api_gateway
ports:
- "80:8000" # 호스트의 80번 포트를 게이트웨이의 8000번 포트로 연결
networks:
- vlm_network
depends_on:
- deepseek_ocr # start the gateway only after the deepseek_ocr service
restart: always
deepseek_ocr:
build:
context: ./model_services/deepseek_ocr
# Request GPU resources via the deploy key.
# This is officially supported with docker stack deploy or docker compose up --deploy rather than a plain docker-compose up --build.
# It usually works with a regular docker-compose up as well.
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: 1 # allocate one GPU
capabilities: [gpu]
environment:
- NVIDIA_VISIBLE_DEVICES=all # make every GPU visible to the container
- MODEL_PATH=deepseek-ai/deepseek-vl-7b-base # model to use (adjust if needed)
networks:
- vlm_network
restart: on-failure
networks:
vlm_network:
driver: bridge

View File

@@ -1,32 +1,39 @@
# 1. Choose an NVIDIA CUDA base image for vLLM compatibility
# Note: the appropriate CUDA version may depend on the vLLM version.
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04
# PyTorch 2.6.0 + CUDA 12.6 + cuDNN9
FROM pytorch/pytorch:2.6.0-cuda12.6-cudnn9-devel
# 2. Set environment variables and install base packages
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && apt-get install -y \
python3.9 \
python3.9-pip \
git \
# Default environment variables
ENV DEBIAN_FRONTEND=noninteractive \
HF_HOME=/workspace/.cache/huggingface \
CUDA_HOME=/usr/local/cuda \
LD_LIBRARY_PATH=/usr/local/cuda/lib64:${LD_LIBRARY_PATH} \
PIP_DISABLE_PIP_VERSION_CHECK=1 \
PYTHONUNBUFFERED=1 \
TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas \
TORCH_CUDA_ARCH_LIST="8.0"
WORKDIR /workspace
# Install required build tools
RUN apt-get update && apt-get install -y --no-install-recommends \
git build-essential ninja-build \
&& rm -rf /var/lib/apt/lists/*
# Make python3.9 the default python/pip
RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.9 1 && \
update-alternatives --install /usr/bin/pip pip /usr/bin/pip 1
# Upgrade pip
RUN python -m pip install -U pip setuptools wheel
# 3. Set the working directory
WORKDIR /app
# Remove existing libraries and reinstall pinned versions
RUN pip uninstall -y vllm torch torchvision torchaudio triton flash-attn || true
RUN pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0
# 4. Copy source code and dependency files
# (all files: main.py, requirements.txt, deepseek_ocr.py, process/, config.py, etc.)
COPY . .
# Install project dependencies
COPY requirements.txt /tmp/requirements.txt
RUN pip install -r /tmp/requirements.txt
# 5. Install Python dependencies
# vLLM requires torch, so they are installed together.
RUN pip install --no-cache-dir -r requirements.txt
# Install a pinned vLLM version
RUN pip install vllm==0.8.5
# 6. Expose the service port
EXPOSE 8000
# Build and install FlashAttention from source
RUN pip cache purge && \
pip install --no-cache-dir --no-build-isolation --no-binary=flash-attn flash-attn==2.7.3
# 7. Run the FastAPI server
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
WORKDIR /workspace

View File

@@ -8,25 +8,24 @@
BASE_SIZE = 1024
IMAGE_SIZE = 640
CROP_MODE = True
MIN_CROPS= 2
MAX_CROPS= 6 # max:9; If your GPU memory is small, it is recommended to set it to 6.
MAX_CONCURRENCY = 100 # If you have limited GPU memory, lower the concurrency count.
NUM_WORKERS = 64 # image pre-process (resize/padding) workers
MIN_CROPS = 2
MAX_CROPS = 6 # max:9; If your GPU memory is small, it is recommended to set it to 6.
MAX_CONCURRENCY = 100 # If you have limited GPU memory, lower the concurrency count.
NUM_WORKERS = 64 # image pre-process (resize/padding) workers
PRINT_NUM_VIS_TOKENS = False
SKIP_REPEAT = True
MODEL_PATH = 'deepseek-ai/DeepSeek-OCR' # change to your model path
MODEL_PATH = "deepseek-ai/DeepSeek-OCR" # change to your model path
# TODO: change INPUT_PATH
# .pdf: run_dpsk_ocr_pdf.py;
# .jpg, .png, .jpeg: run_dpsk_ocr_image.py;
# Omnidocbench images path: run_dpsk_ocr_eval_batch.py
FILE_NAME='2025-27484-M21472.pdf'
INPUT_PATH = f'/workspace/2025-27484-M21472.pdf'
OUTPUT_PATH = '/workspace/output/'
INPUT_PATH = "/workspace/2018-0802140959-217049.pdf"
OUTPUT_PATH = "/workspace/output/"
PROMPT = '<image>\n<|grounding|>Convert the document to markdown.'
PROMPT = "<image>\n<|grounding|>Convert the document to markdown."
# PROMPT = '<image>\nFree OCR.'
# TODO commonly used prompts
# document: <image>\n<|grounding|>Convert the document to markdown.

View File

@@ -0,0 +1,21 @@
services:
deepseek_ocr_vllm:
build:
context: .
dockerfile: Dockerfile
image: deepseek-ocr-vllm:cu126
container_name: deepseek_ocr_vllm
working_dir: /workspace
volumes:
- ./:/workspace
gpus: all
shm_size: "8g"
ipc: "host"
environment:
- HF_HOME=/workspace/.cache/huggingface
- CUDA_HOME=/usr/local/cuda
- LD_LIBRARY_PATH=/usr/local/cuda/lib64:${LD_LIBRARY_PATH}
- PIP_DISABLE_PIP_VERSION_CHECK=1
- PYTHONUNBUFFERED=1
tty: true
entrypoint: ["/bin/bash"]
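# Example usage (a sketch; the script path may need adjusting to wherever
# run_dpsk_ocr_pdf.py lives inside the mounted workspace):
#   docker compose up -d --build
#   docker compose exec deepseek_ocr_vllm python run_dpsk_ocr_pdf.py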

View File

@@ -1,112 +0,0 @@
import os
import base64
import io
import time
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from PIL import Image
# vLLM and model-related imports
from vllm import AsyncLLMEngine, SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.model_executor.models.registry import ModelRegistry
# DeepSeek-OCR local imports
from deepseek_ocr import DeepseekOCRForCausalLM
from process.image_process import DeepseekOCRProcessor
from process.ngram_norepeat import NoRepeatNGramLogitsProcessor
# --- Configuration ---
# In a Docker environment it is better to use environment variables or to download the model in the Dockerfile.
# Here the config.py default is used, but it can be overridden via an environment variable.
MODEL_PATH = os.environ.get("MODEL_PATH", "deepseek-ai/deepseek-vl-7b-base")
# Note: the actual `config.py` uses a local path, so a Hugging Face model ID is substituted here.
# Using this model requires an internet connection; it is downloaded on first run.
# --- Model Initialization ---
# 1. Register the custom model
ModelRegistry.register_model("DeepseekOCRForCausalLM", DeepseekOCRForCausalLM)
# 2. Configure the vLLM engine
engine_args = AsyncEngineArgs(
model=MODEL_PATH,
hf_overrides={"architectures": ["DeepseekOCRForCausalLM"]},
max_model_len=8192,
enforce_eager=False,
trust_remote_code=True,
tensor_parallel_size=1, # single GPU
gpu_memory_utilization=0.90, # GPU memory utilization
)
engine = AsyncLLMEngine.from_engine_args(engine_args)
# 3. Initialize the DeepSeek-OCR processor
processor = DeepseekOCRProcessor()
# 4. Initialize the FastAPI app
app = FastAPI()
# --- Pydantic Models ---
class InferenceRequest(BaseModel):
# base64-encoded image string
base64_image: str
class InferenceResponse(BaseModel):
text: str
# --- API Endpoints ---
@app.get("/")
def health_check():
return {"status": "DeepSeek-OCR service is running"}
@app.post("/process", response_model=InferenceResponse)
async def process_image(request: InferenceRequest):
"""
Receive a base64-encoded image and run OCR inference.
"""
try:
# 1. Decode the base64 image
image_data = base64.b64decode(request.base64_image)
image = Image.open(io.BytesIO(image_data)).convert('RGB')
# 2. Preprocess the image
prompt = "<image>"
image_features = processor.tokenize_with_images(
images=[image],
bos=True,
eos=True,
cropping=False # cropping disabled here instead of the CROP_MODE config default
)
# 3. Set sampling parameters (following the existing scripts)
logits_processors = [NoRepeatNGramLogitsProcessor(ngram_size=30, window_size=90, whitelist_token_ids={128821, 128822})]
sampling_params = SamplingParams(
temperature=0.0,
max_tokens=8192,
logits_processors=logits_processors,
skip_special_tokens=False,
)
# 4. Run inference with vLLM
request_id = f"dpsk-request-{int(time.time())}"
vllm_request = {
"prompt": prompt,
"multi_modal_data": {"image": image_features}
}
final_output = None
async for request_output in engine.generate(vllm_request, sampling_params, request_id):
# keep the final text from the streamed outputs
final_output = request_output
if final_output and final_output.outputs:
generated_text = final_output.outputs[0].text
return InferenceResponse(text=generated_text)
else:
raise Exception("Model generated no output.")
except Exception as e:
# In production, add proper logging here.
raise HTTPException(status_code=500, detail=f"An error occurred: {str(e)}")