Apply lint

kyy
2025-11-06 11:57:29 +09:00
parent f9975620cb
commit 2c3b417f3b
3 changed files with 155 additions and 82 deletions

View File

@@ -1,20 +1,25 @@
 import logging

 from fastapi import APIRouter, File, HTTPException, UploadFile

 from services.ocr_engine import process_document

 router = APIRouter(prefix="/ocr", tags=["OCR"])
 logger = logging.getLogger(__name__)


-@router.post("", description="요청된 파일에서 Deepseek OCR을 수행하고 텍스트를 추출합니다.")
-async def perform_ocr(document: UploadFile = File(..., description="OCR을 수행할 PDF 또는 이미지 파일")):
+@router.post(
+    "", description="요청된 파일에서 Deepseek OCR을 수행하고 텍스트를 추출합니다."
+)
+async def perform_ocr(
+    document: UploadFile = File(..., description="OCR을 수행할 PDF 또는 이미지 파일"),
+):
     """
     클라이언트로부터 받은 파일을 OCR 엔진에 전달하고, 추출된 텍스트를 반환합니다.

     - **document**: `multipart/form-data` 형식으로 전송된 파일.
     """
-    logger.info(f"'{document.filename}' 파일에 대한 OCR 요청 수신 (Content-Type: {document.content_type})")
+    logger.info(
+        f"'{document.filename}' 파일에 대한 OCR 요청 수신 (Content-Type: {document.content_type})"
+    )

     try:
         file_content = await document.read()
@@ -36,6 +41,8 @@ async def perform_ocr(document: UploadFile = File(..., description="OCR을 수
     except Exception as e:
         # 예상치 못한 서버 내부 오류
         logger.exception(f"OCR 처리 중 예상치 못한 오류 발생: {e}")
-        raise HTTPException(status_code=500, detail=f"서버 내부 오류가 발생했습니다: {e}")
+        raise HTTPException(
+            status_code=500, detail=f"서버 내부 오류가 발생했습니다: {e}"
+        )
     finally:
         await document.close()
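
For orientation, a minimal client-side sketch of exercising the reformatted endpoint above. The base URL, the sample file name, the use of the third-party requests library, and the assumption that the response body is JSON are illustrative only; the actual response shape is whatever services.ocr_engine.process_document returns.

# Hypothetical smoke test for POST /ocr (URL and file name are assumptions).
import requests

with open("sample.pdf", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/ocr",  # assumed local dev address
        # field name "document" matches the UploadFile parameter in the router
        files={"document": ("sample.pdf", f, "application/pdf")},
    )

resp.raise_for_status()
print(resp.json())  # extracted text, in whatever structure process_document returns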

View File

@@ -3,13 +3,21 @@ from typing import List, Tuple
 import torch
 import torchvision.transforms as T
+from config.model_settings import (
+    BASE_SIZE,
+    IMAGE_SIZE,
+    MAX_CROPS,
+    MIN_CROPS,
+    PROMPT,
+    TOKENIZER,
+)
 from PIL import Image, ImageOps
-from transformers import AutoProcessor, BatchFeature, LlamaTokenizerFast
+from transformers import AutoProcessor, LlamaTokenizerFast
 from transformers.processing_utils import ProcessorMixin
-from config import IMAGE_SIZE, BASE_SIZE, CROP_MODE, MIN_CROPS, MAX_CROPS, PROMPT, TOKENIZER


 def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
-    best_ratio_diff = float('inf')
+    best_ratio_diff = float("inf")
     best_ratio = (1, 1)
     area = width * height
     for ratio in target_ratios:
@@ -25,37 +33,56 @@ def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_
     return best_ratio


-def count_tiles(orig_width, orig_height, min_num=MIN_CROPS, max_num=MAX_CROPS, image_size=640, use_thumbnail=False):
+def count_tiles(
+    orig_width,
+    orig_height,
+    min_num=MIN_CROPS,
+    max_num=MAX_CROPS,
+    image_size=640,
+    use_thumbnail=False,
+):
     aspect_ratio = orig_width / orig_height

     # calculate the existing image aspect ratio
     target_ratios = set(
-        (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
-        i * j <= max_num and i * j >= min_num)
+        (i, j)
+        for n in range(min_num, max_num + 1)
+        for i in range(1, n + 1)
+        for j in range(1, n + 1)
+        if i * j <= max_num and i * j >= min_num
+    )
     # print(target_ratios)
     target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

     # find the closest aspect ratio to the target
     target_aspect_ratio = find_closest_aspect_ratio(
-        aspect_ratio, target_ratios, orig_width, orig_height, image_size)
+        aspect_ratio, target_ratios, orig_width, orig_height, image_size
+    )

     return target_aspect_ratio


-def dynamic_preprocess(image, min_num=MIN_CROPS, max_num=MAX_CROPS, image_size=640, use_thumbnail=False):
+def dynamic_preprocess(
+    image, min_num=MIN_CROPS, max_num=MAX_CROPS, image_size=640, use_thumbnail=False
+):
     orig_width, orig_height = image.size
     aspect_ratio = orig_width / orig_height

     # calculate the existing image aspect ratio
     target_ratios = set(
-        (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
-        i * j <= max_num and i * j >= min_num)
+        (i, j)
+        for n in range(min_num, max_num + 1)
+        for i in range(1, n + 1)
+        for j in range(1, n + 1)
+        if i * j <= max_num and i * j >= min_num
+    )
     # print(target_ratios)
     target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

     # find the closest aspect ratio to the target
     target_aspect_ratio = find_closest_aspect_ratio(
-        aspect_ratio, target_ratios, orig_width, orig_height, image_size)
+        aspect_ratio, target_ratios, orig_width, orig_height, image_size
+    )

     # print(target_aspect_ratio)

     # calculate the target width and height
@@ -71,7 +98,7 @@ def dynamic_preprocess(image, min_num=MIN_CROPS, max_num=MAX_CROPS, image_size=6
             (i % (target_width // image_size)) * image_size,
             (i // (target_width // image_size)) * image_size,
             ((i % (target_width // image_size)) + 1) * image_size,
-            ((i // (target_width // image_size)) + 1) * image_size
+            ((i // (target_width // image_size)) + 1) * image_size,
         )
         # split the image
         split_img = resized_img.crop(box)
@@ -83,15 +110,13 @@ def dynamic_preprocess(image, min_num=MIN_CROPS, max_num=MAX_CROPS, image_size=6
     return processed_images, target_aspect_ratio


 class ImageTransform:
-    def __init__(self,
-                 mean: Tuple[float, float, float] = (0.5, 0.5, 0.5),
-                 std: Tuple[float, float, float] = (0.5, 0.5, 0.5),
-                 normalize: bool = True):
+    def __init__(
+        self,
+        mean: Tuple[float, float, float] = (0.5, 0.5, 0.5),
+        std: Tuple[float, float, float] = (0.5, 0.5, 0.5),
+        normalize: bool = True,
+    ):
         self.mean = mean
         self.std = std
         self.normalize = normalize
@@ -129,28 +154,28 @@ class DeepseekOCRProcessor(ProcessorMixin):
         ignore_id: int = -100,
         **kwargs,
     ):
        # self.candidate_resolutions = candidate_resolutions # placeholder no use
        self.image_size = IMAGE_SIZE
        self.base_size = BASE_SIZE
        # self.patch_size = patch_size
        self.patch_size = 16
        self.image_mean = image_mean
        self.image_std = image_std
        self.normalize = normalize
        # self.downsample_ratio = downsample_ratio
        self.downsample_ratio = 4
-        self.image_transform = ImageTransform(mean=image_mean, std=image_std, normalize=normalize)
+        self.image_transform = ImageTransform(
+            mean=image_mean, std=image_std, normalize=normalize
+        )
        self.tokenizer = tokenizer
        # self.tokenizer = add_special_token(tokenizer)
-        self.tokenizer.padding_side = 'left' # must set thispadding side with make a difference in batch inference
+        self.tokenizer.padding_side = "left"  # must set thispadding side with make a difference in batch inference
        # add the pad_token as special token to use 'tokenizer.pad_token' and 'tokenizer.pad_token_id'
        if self.tokenizer.pad_token is None:
-            self.tokenizer.add_special_tokens({'pad_token': pad_token})
+            self.tokenizer.add_special_tokens({"pad_token": pad_token})

        # add image token
        # image_token_id = self.tokenizer.vocab.get(image_token)
@@ -186,9 +211,6 @@ class DeepseekOCRProcessor(ProcessorMixin):
            **kwargs,
        )

    # def select_best_resolution(self, image_size):
    #     # used for cropping
    #     original_width, original_height = image_size
@@ -264,13 +286,21 @@ class DeepseekOCRProcessor(ProcessorMixin):
             - num_image_tokens (List[int]): the number of image tokens
         """

-        assert (prompt is not None and images is not None
-                ), "prompt and images must be used at the same time."
+        assert (
+            prompt is not None and images is not None
+        ), "prompt and images must be used at the same time."

         sft_format = prompt
-        input_ids, pixel_values, images_crop, images_seq_mask, images_spatial_crop, num_image_tokens, _ = images[0]
+        (
+            input_ids,
+            pixel_values,
+            images_crop,
+            images_seq_mask,
+            images_spatial_crop,
+            num_image_tokens,
+            _,
+        ) = images[0]

         return {
             "input_ids": input_ids,
@@ -281,7 +311,6 @@ class DeepseekOCRProcessor(ProcessorMixin):
             "num_image_tokens": num_image_tokens,
         }

         # prepare = BatchFeature(
         #     data=dict(
         #         input_ids=input_ids,
@@ -341,7 +370,12 @@ class DeepseekOCRProcessor(ProcessorMixin):
         conversation = PROMPT
         assert conversation.count(self.image_token) == len(images)
         text_splits = conversation.split(self.image_token)
-        images_list, images_crop_list, images_seq_mask, images_spatial_crop = [], [], [], []
+        images_list, images_crop_list, images_seq_mask, images_spatial_crop = (
+            [],
+            [],
+            [],
+            [],
+        )
         image_shapes = []
         num_image_tokens = []
         tokenized_str = []
@@ -368,7 +402,9 @@ class DeepseekOCRProcessor(ProcessorMixin):
                 # best_width, best_height = select_best_resolution(image.size, self.candidate_resolutions)
                 # print('image ', image.size)
                 # print('open_size:', image.size)
-                images_crop_raw, crop_ratio = dynamic_preprocess(image, image_size=IMAGE_SIZE)
+                images_crop_raw, crop_ratio = dynamic_preprocess(
+                    image, image_size=IMAGE_SIZE
+                )
                 # print('crop_ratio: ', crop_ratio)
             else:
                 # best_width, best_height = self.image_size, self.image_size
@@ -383,8 +419,11 @@ class DeepseekOCRProcessor(ProcessorMixin):
                 # print('directly resize')
                 image = image.resize((self.image_size, self.image_size))

-            global_view = ImageOps.pad(image, (self.base_size, self.base_size),
-                                       color=tuple(int(x * 255) for x in self.image_transform.mean))
+            global_view = ImageOps.pad(
+                image,
+                (self.base_size, self.base_size),
+                color=tuple(int(x * 255) for x in self.image_transform.mean),
+            )
             images_list.append(self.image_transform(global_view))

             """record height / width crop num"""
@@ -392,9 +431,6 @@ class DeepseekOCRProcessor(ProcessorMixin):
             num_width_tiles, num_height_tiles = crop_ratio
             images_spatial_crop.append([num_width_tiles, num_height_tiles])

             if num_width_tiles > 1 or num_height_tiles > 1:
                 """process the local views"""
                 # local_view = ImageOps.pad(image, (best_width, best_height),
@@ -421,15 +457,22 @@ class DeepseekOCRProcessor(ProcessorMixin):
             # """add image tokens"""
             """add image tokens"""
-            num_queries = math.ceil((self.image_size // self.patch_size) / self.downsample_ratio)
-            num_queries_base = math.ceil((self.base_size // self.patch_size) / self.downsample_ratio)
+            num_queries = math.ceil(
+                (self.image_size // self.patch_size) / self.downsample_ratio
+            )
+            num_queries_base = math.ceil(
+                (self.base_size // self.patch_size) / self.downsample_ratio
+            )

-            tokenized_image = ([self.image_token_id] * num_queries_base + [self.image_token_id]) * num_queries_base
+            tokenized_image = (
+                [self.image_token_id] * num_queries_base + [self.image_token_id]
+            ) * num_queries_base
             tokenized_image += [self.image_token_id]
             if num_width_tiles > 1 or num_height_tiles > 1:
-                tokenized_image += ([self.image_token_id] * (num_queries * num_width_tiles) + [self.image_token_id]) * (
-                    num_queries * num_height_tiles)
+                tokenized_image += (
+                    [self.image_token_id] * (num_queries * num_width_tiles)
+                    + [self.image_token_id]
+                ) * (num_queries * num_height_tiles)
             tokenized_str += tokenized_image
             images_seq_mask += [True] * len(tokenized_image)
             num_image_tokens.append(len(tokenized_image))
@@ -447,10 +490,9 @@ class DeepseekOCRProcessor(ProcessorMixin):
         tokenized_str = tokenized_str + [self.eos_id]
         images_seq_mask = images_seq_mask + [False]

-        assert len(tokenized_str) == len(
-            images_seq_mask), f"tokenize_with_images func: tokenized_str's length {len(tokenized_str)} is not equal to imags_seq_mask's length {len(images_seq_mask)}"
+        assert (
+            len(tokenized_str) == len(images_seq_mask)
+        ), f"tokenize_with_images func: tokenized_str's length {len(tokenized_str)} is not equal to imags_seq_mask's length {len(images_seq_mask)}"

         masked_tokenized_str = []
         for token_index in tokenized_str:
@@ -459,17 +501,21 @@ class DeepseekOCRProcessor(ProcessorMixin):
             else:
                 masked_tokenized_str.append(self.ignore_id)

-        assert len(tokenized_str) == len(images_seq_mask) == len(masked_tokenized_str), \
-            (f"tokenized_str's length {len(tokenized_str)}, input_ids' length {len(masked_tokenized_str)}, "
-             f"imags_seq_mask's length {len(images_seq_mask)}, are not equal")
+        assert (
+            len(tokenized_str) == len(images_seq_mask) == len(masked_tokenized_str)
+        ), (
+            f"tokenized_str's length {len(tokenized_str)}, input_ids' length {len(masked_tokenized_str)}, "
+            f"imags_seq_mask's length {len(images_seq_mask)}, are not equal"
+        )

         input_ids = torch.LongTensor(tokenized_str)
         target_ids = torch.LongTensor(masked_tokenized_str)
         images_seq_mask = torch.tensor(images_seq_mask, dtype=torch.bool)

         # set input_ids < 0 | input_ids == self.image_token_id as ignore_id
-        target_ids[(input_ids < 0) |
-                   (input_ids == self.image_token_id)] = self.ignore_id
+        target_ids[(input_ids < 0) | (input_ids == self.image_token_id)] = (
+            self.ignore_id
+        )
         input_ids[input_ids < 0] = self.pad_id

         inference_mode = True
@@ -484,19 +530,32 @@ class DeepseekOCRProcessor(ProcessorMixin):
         if len(images_list) == 0:
             pixel_values = torch.zeros((1, 3, self.base_size, self.base_size))
             images_spatial_crop = torch.zeros((1, 1), dtype=torch.long)
-            images_crop = torch.zeros((1, 3, self.image_size, self.image_size)).unsqueeze(0)
+            images_crop = torch.zeros(
+                (1, 3, self.image_size, self.image_size)
+            ).unsqueeze(0)
         else:
             pixel_values = torch.stack(images_list, dim=0)
             images_spatial_crop = torch.tensor(images_spatial_crop, dtype=torch.long)
             if images_crop_list:
                 images_crop = torch.stack(images_crop_list, dim=0).unsqueeze(0)
             else:
-                images_crop = torch.zeros((1, 3, self.image_size, self.image_size)).unsqueeze(0)
+                images_crop = torch.zeros(
+                    (1, 3, self.image_size, self.image_size)
+                ).unsqueeze(0)

         input_ids = input_ids.unsqueeze(0)

-        return [[input_ids, pixel_values, images_crop, images_seq_mask, images_spatial_crop, num_image_tokens, image_shapes]]
+        return [
+            [
+                input_ids,
+                pixel_values,
+                images_crop,
+                images_seq_mask,
+                images_spatial_crop,
+                num_image_tokens,
+                image_shapes,
+            ]
+        ]


 AutoProcessor.register("DeepseekVLV2Processor", DeepseekOCRProcessor)
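
As a side note on the tiling logic reformatted above (count_tiles / find_closest_aspect_ratio), the sketch below re-traces the candidate-grid search outside the processor. The MIN_CROPS/MAX_CROPS stand-in values and the example page size are assumptions, and the area-based tie-break that find_closest_aspect_ratio applies is omitted for brevity.

# Standalone sketch of the candidate-grid search (illustrative values only).
min_num, max_num = 2, 6            # assumed stand-ins for MIN_CROPS / MAX_CROPS
orig_width, orig_height = 1280, 800
aspect_ratio = orig_width / orig_height

# Enumerate (cols, rows) grids whose tile count lies in [min_num, max_num],
# exactly as the reformatted set comprehension does.
target_ratios = sorted(
    {
        (i, j)
        for n in range(min_num, max_num + 1)
        for i in range(1, n + 1)
        for j in range(1, n + 1)
        if min_num <= i * j <= max_num
    },
    key=lambda x: x[0] * x[1],
)

# Pick the grid whose aspect ratio is closest to the page's (no area tie-break here).
best_ratio = min(target_ratios, key=lambda r: abs(aspect_ratio - r[0] / r[1]))
print(best_ratio)  # (3, 2): a 3x2 grid of crops best matches a 1280x800 page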

View File

@@ -1,40 +1,47 @@
+from typing import List
+
 import torch
 from transformers import LogitsProcessor
-from transformers.generation.logits_process import _calc_banned_ngram_tokens
-from typing import List, Set


 class NoRepeatNGramLogitsProcessor(LogitsProcessor):
-    def __init__(self, ngram_size: int, window_size: int = 100, whitelist_token_ids: set = None):
+    def __init__(
+        self, ngram_size: int, window_size: int = 100, whitelist_token_ids: set = None
+    ):
         if not isinstance(ngram_size, int) or ngram_size <= 0:
-            raise ValueError(f"`ngram_size` has to be a strictly positive integer, but is {ngram_size}")
+            raise ValueError(
+                f"`ngram_size` has to be a strictly positive integer, but is {ngram_size}"
+            )
         if not isinstance(window_size, int) or window_size <= 0:
-            raise ValueError(f"`window_size` has to be a strictly positive integer, but is {window_size}")
+            raise ValueError(
+                f"`window_size` has to be a strictly positive integer, but is {window_size}"
+            )
         self.ngram_size = ngram_size
         self.window_size = window_size
         self.whitelist_token_ids = whitelist_token_ids or set()

-    def __call__(self, input_ids: List[int], scores: torch.FloatTensor) -> torch.FloatTensor:
+    def __call__(
+        self, input_ids: List[int], scores: torch.FloatTensor
+    ) -> torch.FloatTensor:
         if len(input_ids) < self.ngram_size:
             return scores

-        current_prefix = tuple(input_ids[-(self.ngram_size - 1):])
+        current_prefix = tuple(input_ids[-(self.ngram_size - 1) :])

         search_start = max(0, len(input_ids) - self.window_size)
         search_end = len(input_ids) - self.ngram_size + 1

         banned_tokens = set()
         for i in range(search_start, search_end):
-            ngram = tuple(input_ids[i:i + self.ngram_size])
+            ngram = tuple(input_ids[i : i + self.ngram_size])
             if ngram[:-1] == current_prefix:
                 banned_tokens.add(ngram[-1])

         banned_tokens = banned_tokens - self.whitelist_token_ids

         if banned_tokens:
             scores = scores.clone()
             for token in banned_tokens:
                 scores[token] = -float("inf")

         return scores
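
A toy check of the processor above, calling __call__ directly. Its signature (a plain list of token ids plus a per-sequence score vector) follows the per-sequence logits-processor convention rather than transformers' batched LogitsProcessorList, so how it is wired into generation depends on the serving stack; the token ids and vocabulary size below are made up for illustration.

import torch

# With ngram_size=2 and history [5, 7, 9, 5], the current prefix is (5,); token 7
# already followed 5 once inside the window, so its score is pushed to -inf.
processor = NoRepeatNGramLogitsProcessor(ngram_size=2, window_size=100)

input_ids = [5, 7, 9, 5]   # hypothetical generated token ids
scores = torch.zeros(12)   # hypothetical next-token scores over a 12-token vocab

out = processor(input_ids, scores)
print(out[7])  # tensor(-inf) -> banned repeat of the 2-gram (5, 7)
print(out[9])  # tensor(0.)   -> unaffected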