Files
fletimageanalysis/comprehensive_text_extractor.py
2025-07-16 17:33:20 +09:00

549 lines
23 KiB
Python

# -*- coding: utf-8 -*-
"""
포괄적 텍스트 추출 모듈
DXF 파일에서 도곽 블록 외의 모든 텍스트 엔티티를 추출하여 표시 및 저장
- 모델스페이스의 독립적인 TEXT/MTEXT 엔티티
- 페이퍼스페이스의 독립적인 TEXT/MTEXT 엔티티
- 모든 블록 내부의 TEXT/MTEXT 엔티티
- 블록 속성(ATTRIB) 중 도곽이 아닌 것들
"""
import os
import csv
import json
import logging
from typing import Dict, List, Optional, Tuple, Any
from dataclasses import dataclass, asdict, field
from datetime import datetime
try:
import ezdxf
from ezdxf.document import Drawing
from ezdxf.entities import Insert, Attrib, AttDef, Text, MText
from ezdxf.layouts import BlockLayout, Modelspace, Paperspace
from ezdxf import bbox
EZDXF_AVAILABLE = True
except ImportError:
EZDXF_AVAILABLE = False
logging.warning("ezdxf 라이브러리가 설치되지 않았습니다.")
from config import Config
@dataclass
class ComprehensiveTextEntity:
    """Flat record describing one text-bearing DXF entity.

    Instances are created by ``ComprehensiveTextExtractor`` for TEXT, MTEXT
    and ATTRIB entities.  Most fields are copied from the DXF attribute of
    the same name on the source entity; the remaining fields record where the
    entity was found.  Field order matters: it is the positional constructor
    order and the key order produced by ``dataclasses.asdict``.
    """
    entity_type: str  # TEXT, MTEXT, ATTRIB
    text: str
    # Insertion point of the entity, split into components.
    position_x: float
    position_y: float
    position_z: float
    height: float  # for MTEXT the extractor reads ``char_height`` instead
    rotation: float
    layer: str
    color: Optional[int] = None
    style: Optional[str] = None
    entity_handle: Optional[str] = None
    # Location information
    location_type: str = "Unknown"  # ModelSpace, PaperSpace, Block
    parent_block: Optional[str] = None
    layout_name: Optional[str] = None
    # Block attribute information (when the entity is an ATTRIB)
    attribute_tag: Optional[str] = None
    is_title_block_attribute: bool = False
    # Bounding box (all four stay None when no box could be computed)
    bbox_min_x: Optional[float] = None
    bbox_min_y: Optional[float] = None
    bbox_max_x: Optional[float] = None
    bbox_max_y: Optional[float] = None
    # Additional text attributes
    width_factor: float = 1.0
    oblique_angle: float = 0.0
    text_generation_flag: int = 0
@dataclass
class ComprehensiveExtractionResult:
    """Container for everything a comprehensive extraction run produced.

    ``all_text_entities`` receives every extracted entity; the other lists
    are views by origin that the extractor fills alongside it.  The counters
    are filled in by ``ComprehensiveTextExtractor._classify_and_analyze``.

    NOTE(review): attributes of INSERTs that are judged to be title blocks
    are still appended to ``all_text_entities`` (flagged via
    ``is_title_block_attribute``); only ``non_title_block_attributes``
    excludes them -- confirm this is the intended contract.
    """
    all_text_entities: List[ComprehensiveTextEntity] = field(default_factory=list)
    # TEXT/MTEXT/ATTRIB entities found directly in the model space layout.
    modelspace_texts: List[ComprehensiveTextEntity] = field(default_factory=list)
    # TEXT/MTEXT/ATTRIB entities found directly in a paper space layout.
    paperspace_texts: List[ComprehensiveTextEntity] = field(default_factory=list)
    # TEXT/MTEXT entities found inside block definitions.
    block_texts: List[ComprehensiveTextEntity] = field(default_factory=list)
    # ATTRIBs attached to block references that were not judged title blocks.
    non_title_block_attributes: List[ComprehensiveTextEntity] = field(default_factory=list)
    # Statistics
    total_count: int = 0
    by_type_count: Dict[str, int] = field(default_factory=dict)
    by_location_count: Dict[str, int] = field(default_factory=dict)
    by_layer_count: Dict[str, int] = field(default_factory=dict)
class ComprehensiveTextExtractor:
    """Extract every text-bearing entity from a DXF drawing.

    The extractor walks the model space, every paper space layout and every
    block definition, collects TEXT / MTEXT / ATTRIB entities as
    ``ComprehensiveTextEntity`` records, and flags attributes that belong to
    a block reference which looks like a title block.

    All annotations that name ezdxf or sibling types are written as string
    forward references so that this class can be defined even when the
    optional ``ezdxf`` import at the top of the module failed; the
    constructor is what reports the missing dependency.
    """

    # Keywords matched (as substrings) against attribute tags and values to
    # recognise title-block attributes.
    TITLE_BLOCK_KEYWORDS = {
        '건설분야', '건설단계', '도면명', '축척', '도면번호', '설계자',
        '프로젝트', '날짜', '리비전', '위치', 'title', 'scale', 'drawing',
        'project', 'designer', 'date', 'revision', 'dwg', 'construction'
    }

    # Substrings of a (lower-cased) block name that mark it as a title block.
    TITLE_BLOCK_NAME_KEYWORDS = ('title', 'border', '도곽', '표제')

    # CSV header.  Each column name lower-cased is exactly the name of the
    # ComprehensiveTextEntity field it is filled from.
    CSV_FIELDNAMES = (
        'Entity_Type', 'Text', 'Position_X', 'Position_Y', 'Position_Z',
        'Height', 'Rotation', 'Layer', 'Color', 'Style', 'Entity_Handle',
        'Location_Type', 'Parent_Block', 'Layout_Name', 'Attribute_Tag',
        'Is_Title_Block_Attribute', 'BBox_Min_X', 'BBox_Min_Y',
        'BBox_Max_X', 'BBox_Max_Y', 'Width_Factor', 'Oblique_Angle'
    )

    # Width of one character relative to the text height, used only for the
    # rough bounding-box estimate.
    _ESTIMATED_CHAR_ASPECT = 0.6

    def __init__(self):
        """Create the extractor.

        Raises:
            ImportError: if the ezdxf library could not be imported.
        """
        self.logger = logging.getLogger(__name__)
        if not EZDXF_AVAILABLE:
            raise ImportError("ezdxf 라이브러리가 필요합니다.")

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def extract_all_texts_comprehensive(self, file_path: str) -> "ComprehensiveExtractionResult":
        """Read ``file_path`` and collect all of its text entities.

        Args:
            file_path: Path of the DXF file to read.

        Returns:
            A populated ``ComprehensiveExtractionResult`` including statistics.

        Raises:
            Exception: whatever loading or traversal raised; the error is
                logged and then re-raised unchanged.
        """
        try:
            self.logger.info(f"포괄적 텍스트 추출 시작: {file_path}")
            doc = ezdxf.readfile(file_path)
            result = ComprehensiveExtractionResult()

            # 1. Stand-alone text in the model space.
            self._extract_layout_texts(doc.modelspace(), "ModelSpace", "Model", result)

            # 2. Stand-alone text in every paper space layout.
            for layout_name in doc.layout_names_in_taborder():
                if layout_name == "Model":
                    continue  # the model space was handled above
                try:
                    layout = doc.paperspace(layout_name)
                    self._extract_layout_texts(layout, "PaperSpace", layout_name, result)
                except Exception as e:
                    self.logger.warning(f"페이퍼스페이스 {layout_name} 처리 실패: {e}")

            # 3. Text inside block definitions; names starting with '*' are
            #    skipped (the original code treats them as system blocks).
            for block_layout in doc.blocks:
                if not block_layout.name.startswith('*'):
                    self._extract_block_texts(block_layout, result)

            # 4. Attributes attached to block references.
            self._extract_block_attributes(doc, result)

            # 5. Statistics.
            self._classify_and_analyze(result)
            self.logger.info(f"포괄적 텍스트 추출 완료: 총 {result.total_count}")
            return result
        except Exception as e:
            self.logger.error(f"포괄적 텍스트 추출 실패: {e}")
            raise

    # ------------------------------------------------------------------
    # Traversal
    # ------------------------------------------------------------------
    def _extract_layout_texts(self, layout, location_type: str, layout_name: str,
                              result: "ComprehensiveExtractionResult"):
        """Collect TEXT, MTEXT and stand-alone ATTRIB entities of a layout.

        Entities are appended to ``result.all_text_entities`` and to either
        ``modelspace_texts`` (when ``location_type`` is ``"ModelSpace"``) or
        ``paperspace_texts``.  Failures are logged as warnings, not raised.
        """
        try:
            if location_type == "ModelSpace":
                bucket = result.modelspace_texts
            else:
                bucket = result.paperspace_texts
            # Same order as before: all TEXT first, then MTEXT, then ATTRIB.
            for entity_type in ('TEXT', 'MTEXT', 'ATTRIB'):
                for entity in layout.query(entity_type):
                    text_info = self._create_text_entity_info(
                        entity, entity_type, location_type, layout_name
                    )
                    if text_info and text_info.text.strip():
                        result.all_text_entities.append(text_info)
                        bucket.append(text_info)
        except Exception as e:
            self.logger.warning(f"레이아웃 {layout_name} 텍스트 추출 실패: {e}")

    def _extract_block_texts(self, block_layout: "BlockLayout",
                             result: "ComprehensiveExtractionResult"):
        """Collect TEXT and MTEXT entities stored inside a block definition.

        Entities are appended to ``result.all_text_entities`` and
        ``result.block_texts``.  Failures are logged as warnings, not raised.
        """
        try:
            block_name = block_layout.name
            for entity_type in ('TEXT', 'MTEXT'):
                for entity in block_layout.query(entity_type):
                    text_info = self._create_text_entity_info(
                        entity, entity_type, "Block", None, block_name
                    )
                    if text_info and text_info.text.strip():
                        result.all_text_entities.append(text_info)
                        result.block_texts.append(text_info)
        except Exception as e:
            self.logger.warning(f"블록 {block_layout.name} 텍스트 추출 실패: {e}")

    def _extract_block_attributes(self, doc: "Drawing", result: "ComprehensiveExtractionResult"):
        """Collect the attributes of every block reference in all layouts.

        A failure in one paper space layout is logged and the remaining
        layouts are still processed; a failure in the model space aborts the
        whole step (logged as a warning).
        """
        try:
            self._process_layout_block_references(doc.modelspace(), "ModelSpace", "Model", result)
            for layout_name in doc.layout_names_in_taborder():
                if layout_name == "Model":
                    continue
                try:
                    layout = doc.paperspace(layout_name)
                    self._process_layout_block_references(layout, "PaperSpace", layout_name, result)
                except Exception as e:
                    self.logger.warning(f"페이퍼스페이스 {layout_name} 블록 참조 처리 실패: {e}")
        except Exception as e:
            self.logger.warning(f"블록 속성 추출 실패: {e}")

    def _process_layout_block_references(self, layout, location_type: str, layout_name: str,
                                         result: "ComprehensiveExtractionResult"):
        """Record the attributes of every INSERT found in ``layout``.

        Every non-blank attribute goes to ``result.all_text_entities``; those
        whose INSERT is not a title block additionally go to
        ``result.non_title_block_attributes``.
        """
        for insert in layout.query('INSERT'):
            block_name = insert.dxf.name
            is_title_block = self._is_title_block(insert)
            for attrib in insert.attribs:
                text_info = self._create_attrib_entity_info(
                    attrib, location_type, layout_name, block_name, is_title_block
                )
                if text_info and text_info.text.strip():
                    result.all_text_entities.append(text_info)
                    if not is_title_block:
                        result.non_title_block_attributes.append(text_info)

    def _is_title_block(self, insert: "Insert") -> bool:
        """Return True when the block reference looks like a title block.

        The reference qualifies when its block name contains one of
        ``TITLE_BLOCK_NAME_KEYWORDS``, or when at least two of its attributes
        contain one of ``TITLE_BLOCK_KEYWORDS`` in their tag or value
        (case-insensitive substring match).  Any error while inspecting the
        reference yields False.
        """
        try:
            block_name = insert.dxf.name.lower()
            if any(keyword in block_name for keyword in self.TITLE_BLOCK_NAME_KEYWORDS):
                return True

            title_block_attrs = 0
            for attrib in insert.attribs:
                tag = attrib.dxf.tag.lower()
                text = attrib.dxf.text.lower()
                if any(keyword in tag or keyword in text for keyword in self.TITLE_BLOCK_KEYWORDS):
                    title_block_attrs += 1
            # Two or more title-block-like attributes are taken as sufficient.
            return title_block_attrs >= 2
        except Exception:
            # Best effort: an entity we cannot inspect is not a title block.
            return False

    # ------------------------------------------------------------------
    # Record construction
    # ------------------------------------------------------------------
    @staticmethod
    def _dxf_value(dxf, name: str, default=None):
        """Read attribute ``name`` from a DXF namespace, never returning None
        when a default was supplied (unset attributes may read back as None)."""
        value = getattr(dxf, name, default)
        return default if value is None else value

    @staticmethod
    def _point_to_xyz(point) -> Tuple[float, float, float]:
        """Convert a vector-like object or a 2/3-item sequence to ``(x, y, z)``."""
        if hasattr(point, 'x'):
            return (point.x, point.y, point.z)
        return (point[0], point[1], point[2] if len(point) > 2 else 0)

    def _build_entity_info(self, entity, entity_type: str, text_content: str,
                           location_type: str, layout_name: Optional[str],
                           parent_block: Optional[str],
                           attribute_tag: Optional[str] = None,
                           is_title_block: bool = False) -> "ComprehensiveTextEntity":
        """Build the record shared by plain text entities and block attributes.

        Exceptions are not handled here; the two public-facing builders wrap
        this call so that each keeps its own log message.
        """
        dxf = entity.dxf
        position = self._point_to_xyz(self._dxf_value(dxf, 'insert', (0, 0, 0)))

        height = self._dxf_value(dxf, 'height', 1.0)
        if entity_type == 'MTEXT':
            # MTEXT stores its text height under a different attribute name.
            height = self._dxf_value(dxf, 'char_height', height)

        bbox_info = self._calculate_entity_bbox(entity)
        min_x, min_y, max_x, max_y = bbox_info if bbox_info else (None, None, None, None)

        return ComprehensiveTextEntity(
            entity_type=entity_type,
            text=text_content,
            position_x=position[0],
            position_y=position[1],
            position_z=position[2],
            height=height,
            rotation=self._dxf_value(dxf, 'rotation', 0.0),
            layer=self._dxf_value(dxf, 'layer', '0'),
            color=getattr(dxf, 'color', None),
            style=getattr(dxf, 'style', None),
            entity_handle=getattr(dxf, 'handle', None),
            location_type=location_type,
            parent_block=parent_block,
            layout_name=layout_name,
            attribute_tag=attribute_tag,
            is_title_block_attribute=is_title_block,
            bbox_min_x=min_x,
            bbox_min_y=min_y,
            bbox_max_x=max_x,
            bbox_max_y=max_y,
            width_factor=self._dxf_value(dxf, 'width_factor', 1.0),
            oblique_angle=self._dxf_value(dxf, 'oblique_angle', 0.0),
            text_generation_flag=self._dxf_value(dxf, 'text_generation_flag', 0),
        )

    def _create_text_entity_info(self, entity, entity_type: str, location_type: str,
                                 layout_name: Optional[str],
                                 parent_block: Optional[str] = None) -> Optional["ComprehensiveTextEntity"]:
        """Build a record for a TEXT, MTEXT or stand-alone ATTRIB entity.

        Returns:
            The record, or None when the text is blank or the entity could
            not be read (the latter is logged as a warning).
        """
        try:
            if entity_type == 'MTEXT':
                # Prefer the entity-level ``text`` attribute, fall back to dxf.text.
                text_content = getattr(entity, 'text', '') or getattr(entity.dxf, 'text', '')
            else:
                text_content = getattr(entity.dxf, 'text', '')
            text_content = text_content or ''
            if not text_content.strip():
                return None
            return self._build_entity_info(
                entity, entity_type, text_content, location_type, layout_name, parent_block
            )
        except Exception as e:
            self.logger.warning(f"텍스트 엔티티 정보 생성 실패: {e}")
            return None

    def _create_attrib_entity_info(self, attrib: "Attrib", location_type: str,
                                   layout_name: Optional[str], parent_block: str,
                                   is_title_block: bool) -> Optional["ComprehensiveTextEntity"]:
        """Build a record for an ATTRIB attached to a block reference.

        Unlike ``_create_text_entity_info`` this also stores the attribute
        tag and whether the owning reference was judged a title block.

        Returns:
            The record, or None when the value is blank or the attribute
            could not be read (the latter is logged as a warning).
        """
        try:
            text_content = getattr(attrib.dxf, 'text', '') or ''
            if not text_content.strip():
                return None
            return self._build_entity_info(
                attrib, 'ATTRIB', text_content, location_type, layout_name, parent_block,
                attribute_tag=self._dxf_value(attrib.dxf, 'tag', ''),
                is_title_block=is_title_block,
            )
        except Exception as e:
            self.logger.warning(f"속성 엔티티 정보 생성 실패: {e}")
            return None

    def _calculate_entity_bbox(self, entity) -> Optional[Tuple[float, float, float, float]]:
        """Return ``(min_x, min_y, max_x, max_y)`` for ``entity``.

        ezdxf's bounding-box computation is tried first.  When it raises or
        yields a box without data, a rough estimate based on the insertion
        point, text height and character count is used instead.  Returns
        None only when the estimate fails as well.
        """
        try:
            entity_bbox = bbox.extents([entity])
            # ``has_data`` guards against an empty box whose limits are not finite.
            if entity_bbox and getattr(entity_bbox, 'has_data', True):
                return (entity_bbox.extmin.x, entity_bbox.extmin.y,
                        entity_bbox.extmax.x, entity_bbox.extmax.y)
        except Exception:
            pass  # fall through to the estimate below
        return self._estimate_entity_bbox(entity)

    def _estimate_entity_bbox(self, entity) -> Optional[Tuple[float, float, float, float]]:
        """Estimate a bounding box as an axis-aligned box anchored at the insertion point."""
        try:
            insert_point = getattr(entity.dxf, 'insert', (0, 0, 0))
            height = getattr(entity.dxf, 'height', 1.0)
            if hasattr(entity, 'text'):
                text_content = entity.text
            elif hasattr(entity.dxf, 'text'):
                text_content = entity.dxf.text
            else:
                text_content = ""
            estimated_width = len(text_content) * height * self._ESTIMATED_CHAR_ASPECT
            x, y = insert_point[0], insert_point[1]
            return (x, y, x + estimated_width, y + height)
        except Exception:
            return None

    # ------------------------------------------------------------------
    # Statistics and export
    # ------------------------------------------------------------------
    def _classify_and_analyze(self, result: "ComprehensiveExtractionResult"):
        """Fill ``total_count`` and the per-type / location / layer counters.

        The counters are rebuilt from scratch, so calling this more than once
        on the same result does not inflate them.
        """
        by_type: Dict[str, int] = {}
        by_location: Dict[str, int] = {}
        by_layer: Dict[str, int] = {}
        for entity in result.all_text_entities:
            by_type[entity.entity_type] = by_type.get(entity.entity_type, 0) + 1
            by_location[entity.location_type] = by_location.get(entity.location_type, 0) + 1
            by_layer[entity.layer] = by_layer.get(entity.layer, 0) + 1

        result.total_count = len(result.all_text_entities)
        result.by_type_count = by_type
        result.by_location_count = by_location
        result.by_layer_count = by_layer

    @staticmethod
    def _ensure_parent_dir(output_path: str) -> None:
        """Create the directory part of ``output_path`` if there is one.

        A bare filename has an empty directory part, and ``os.makedirs('')``
        raises, so that case must be skipped.
        """
        parent = os.path.dirname(output_path)
        if parent:
            os.makedirs(parent, exist_ok=True)

    def save_to_csv(self, result: "ComprehensiveExtractionResult", output_path: str) -> bool:
        """Write ``result.all_text_entities`` to a CSV file.

        The file is written as UTF-8 with BOM and one row per entity, using
        the columns in ``CSV_FIELDNAMES``.

        Returns:
            True on success, False on any failure (which is logged).
        """
        try:
            self._ensure_parent_dir(output_path)
            with open(output_path, 'w', newline='', encoding='utf-8-sig') as csvfile:
                writer = csv.DictWriter(csvfile, fieldnames=list(self.CSV_FIELDNAMES))
                writer.writeheader()
                for entity in result.all_text_entities:
                    # Column 'Entity_Type' is filled from entity.entity_type, etc.
                    writer.writerow({
                        column: getattr(entity, column.lower())
                        for column in self.CSV_FIELDNAMES
                    })
            self.logger.info(f"CSV 저장 완료: {output_path}")
            return True
        except Exception as e:
            self.logger.error(f"CSV 저장 실패: {e}")
            return False

    def save_to_json(self, result: "ComprehensiveExtractionResult", output_path: str) -> bool:
        """Write the whole result (entities and statistics) to a JSON file.

        The result is converted with ``dataclasses.asdict``; values JSON
        cannot encode natively are stringified.

        Returns:
            True on success, False on any failure (which is logged).
        """
        try:
            self._ensure_parent_dir(output_path)
            with open(output_path, 'w', encoding='utf-8') as jsonfile:
                json.dump(asdict(result), jsonfile, ensure_ascii=False, indent=2, default=str)
            self.logger.info(f"JSON 저장 완료: {output_path}")
            return True
        except Exception as e:
            self.logger.error(f"JSON 저장 실패: {e}")
            return False
def main():
    """Manual smoke test.

    Extracts all text from ``test_drawing.dxf`` in the current directory,
    prints a summary of what was found and writes the entities to
    ``test_comprehensive_texts.csv``.  Does nothing beyond printing a message
    when ezdxf or the test drawing is missing.
    """
    logging.basicConfig(level=logging.INFO)

    if not EZDXF_AVAILABLE:
        print("ezdxf 라이브러리가 설치되지 않았습니다.")
        return

    extractor = ComprehensiveTextExtractor()
    test_file = "test_drawing.dxf"

    if not os.path.exists(test_file):
        print(f"테스트 파일을 찾을 수 없습니다: {test_file}")
        return

    try:
        outcome = extractor.extract_all_texts_comprehensive(test_file)

        # Overall totals.
        print("포괄적 텍스트 추출 결과:")
        print(f"총 텍스트 엔티티: {outcome.total_count}")
        print(f"모델스페이스: {len(outcome.modelspace_texts)}")
        print(f"페이퍼스페이스: {len(outcome.paperspace_texts)}")
        print(f"블록 내부: {len(outcome.block_texts)}")
        print(f"비도곽 속성: {len(outcome.non_title_block_attributes)}")

        # Per-type and per-location breakdowns share one printing loop.
        breakdowns = (
            ("\n타입별 개수:", outcome.by_type_count),
            ("\n위치별 개수:", outcome.by_location_count),
        )
        for heading, counts in breakdowns:
            print(heading)
            for key, count in counts.items():
                print(f" {key}: {count}")

        # CSV export check.
        csv_path = "test_comprehensive_texts.csv"
        saved = extractor.save_to_csv(outcome, csv_path)
        if saved:
            print(f"\nCSV 저장 성공: {csv_path}")
    except Exception as e:
        print(f"추출 실패: {e}")
# Run the manual smoke test only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()