Files
_Geulbeot/03.Code/업로드용/converters/style_analyzer.py
2026-03-19 09:02:27 +09:00

995 lines
31 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
HTML ㅽ
瑜 遺
ㅼ (Role)
v3.0 蹂寃쎌ы :
- 湲踰 HTML 援ъ“
踰 吏щ━留踰 嫄
- 媛 ν 以蹂 肄
- 紐 怨
痢 援ъ“
"""
COVER = "cover" # 吏
TOC = "toc" # 紐⑹감
CONTENT = "content" # 蹂몃Ц
@dataclass
class StyledElement:
    """One analyzed document element together with its inferred style role."""
    role: str     # style role ("H1", "BODY", "TH", ...)
    text: str     # plain text content
    tag: str      # source HTML tag name
    html: str     # truncated source HTML snippet
    section: str  # document section the element belongs to
    # Extra payload (table grid, image src, element attrs). The field
    # declaration was lost to encoding corruption, but every constructor
    # call in this file passes `attributes=`, so it is restored here.
    attributes: dict = field(default_factory=dict)

    def __repr__(self):
        # Show at most the first 30 characters of the text.
        preview = self.text[:30] + "..." if len(self.text) > 30 else self.text
        return f"<{self.role}> {preview}"
class StyleAnalyzer:
    """Analyzes an HTML document and classifies each element into a
    style role (heading levels, body text, tables, figures, ...).

    NOTE(review): the Korean text inside the original comments and
    regular expressions was destroyed by an encoding round-trip.  The
    Hangul character classes below are reconstructed from the mojibake
    (e.g. 媛 -> 가, 洹몃┝ -> 그림) and MUST be confirmed against the
    original file.
    """

    # Numbering patterns used to infer heading levels from plain text.
    PATTERNS = {
        # Chapter heading: "제 1 장" -- reconstructed, TODO confirm
        "chapter": re.compile(r'^제\s*\d+\s*장'),
        # H1: "1 <Hangul word>" (number, space, Hangul) -- TODO confirm
        "h1_num": re.compile(r'^(\d+)\s+[가-힣]'),
        # H2: "1.", "2."
        "h2_num": re.compile(r'^(\d+)\.\s'),
        # H3: "1.1 ", "1.2 "
        "h3_num": re.compile(r'^(\d+)\.\d+\s'),
        # H4: "1.1.1"
        "h4_num": re.compile(r'^(\d+)\.(\d+)\.(\d+)'),
        # H5: "1)", "2)"
        "h5_paren": re.compile(r'^(\d+)\)\s*'),
        # H6: "(1)", "(2)"
        "h6_paren": re.compile(r'^\((\d+)\)\s*'),
        # "가." style Hangul marker -- reconstructed, TODO confirm
        "h4_korean": re.compile(r'^[가-힣]\.\s'),
        # "가)" style Hangul marker -- reconstructed, TODO confirm
        "h5_korean": re.compile(r'^[가-힣]\)\s'),
        # Circled/symbol bullets -- original class unrecoverable, TODO confirm
        "h6_circle": re.compile(r'^[△○●◦▶■□①②③④⑤]'),
        # List bullets: "-", "·" -- TODO confirm bullet set
        "list_bullet": re.compile(r'^[·\-]\s'),
        # Page number: "- 1 -", "- 12 -"
        "page_number": re.compile(r'^-\s*\d+\s*-$'),
        # Running footer: "<doc title> - 1 -"
        "footer_pattern": re.compile(r'.+[- ]\s*\d+\s*[- ]$'),
    }

    # Text patterns that mark an element as removable boilerplate.
    REMOVE_PATTERNS = [
        re.compile(r'^-\s*\d+\s*-$'),           # "- 1 -"
        re.compile(r'[- ]\s*\d+\s*[- ]\s*$'),   # "<title> - 1 -"
        re.compile(r'^\d+\s* \s*\d+$'),         # "643 236" (image size) -- separator char lost, TODO confirm
        re.compile(r'^\[출처:.*\]$'),            # "[출처: xxx]" -- keyword reconstructed, TODO confirm
        re.compile(r'^\[그림\s*\d+-\d+\]$'),     # "[그림 1-1]"
    ]

    def __init__(self):
        self.elements: List[StyledElement] = []          # collected styled elements
        self.current_section = DocumentSection.CONTENT   # section currently being parsed
        self.seen_texts: Set[str] = set()                # first-50-char keys of seen texts (dedup)
        self.document_title = ""                         # cover title, used to strip running footers
def analyze(self, html: str) -> List[StyledElement]:
"""HTML 臾몄
由ъㅽ
:
self._preprocess(soup)
# 2. 臾몄
異異 (瑗щ留⑦
)
self._extract_document_title(soup)
# 3.
self._detect_and_process_sections(soup)
# 4.
: 以蹂
self._postprocess()
return self.elements
def _preprocess(self, soup: BeautifulSoup):
"""HTML
泥 - 遺
"""
print(" HTML
...")
# 1. ㅽщ┰ / ㅽ 洹 嫄
removed_count = 0
for tag in soup(['script', 'style', 'noscript', 'meta', 'link', 'head']):
tag.decompose()
removed_count += 1
if removed_count > 0:
print(f" - script/style {removed_count}媛 嫄")
# 2. 癒몃━留щ━留 嫄 (湲踰 HTML 援ъ“)
header_footer_count = 0
for selector in ['.page-header', '.page-footer', '.header', '.footer',
'[class*="header"]', '[class*="footer"]',
'.running-header', '.running-footer']:
for elem in soup.select(selector):
# ㅼ 肄
ㅻ媛
ㅻ留
text = elem.get_text(strip=True)
if self._is_header_footer_text(text):
elem.decompose()
header_footer_count += 1
if header_footer_count > 0:
print(f" - 癒몃━留щ━留 嫄")
# 3. ④꺼吏
hidden_count = 0
for elem in soup.select('[style*="display:none"], [style*="display: none"]'):
elem.decompose()
hidden_count += 1
for elem in soup.select('[style*="visibility:hidden"], [style*="visibility: hidden"]'):
elem.decompose()
hidden_count += 1
# 4. #raw-container 몃 嫄 (湲踰 援ъ“)
raw_container = soup.find(id='raw-container')
if raw_container:
print(" - 湲踰 援ъ“ 媛: #raw-container 곗
ъ ")
# raw-container 몃 ⑤ .sheet 嫄
for sheet in soup.select('.sheet'):
if not self._is_descendant_of(sheet, raw_container):
sheet.decompose()
def _extract_document_title(self, soup: BeautifulSoup):
"""臾몄
紐 異異 (瑗щ━留⑦
媛 )"""
# 吏
李얘린
cover = soup.find(id='box-cover') or soup.find(class_='box-cover')
if cover:
h1 = cover.find('h1')
if h1:
self.document_title = h1.get_text(strip=True)
print(f" - 臾몄
: {self.document_title[:30]}...")
def _is_header_footer_text(self, text: str) -> bool:
"""癒몃━留щ━留
ㅽ몄 """
if not text:
return False
# 踰 ⑦
if self.PATTERNS['page_number'].match(text):
return True
# "臾몄
- 1 -"
if self.PATTERNS['footer_pattern'].match(text):
return True
# 臾몄
+ 議고
if self.document_title and self.document_title in text:
if re.search(r'[- ]\s*\d+\s*[- ]', text):
return True
return False
def _should_skip_text(self, text: str) -> bool:
"""嫄대
ㅽ몄 """
if not text:
return True
# 嫄 ⑦
泥댄
for pattern in self.REMOVE_PATTERNS:
if pattern.match(text):
return True
# 癒몃━留щ━留댄
if self._is_header_footer_text(text):
return True
# 臾몄
紐⑸
(瑗щ
)
if self.document_title and text.strip() == self.document_title:
# 吏
泥ы쇰
if any(e.role == 'COVER_TITLE' and self.document_title in e.text
for e in self.elements):
return True
return False
def _is_descendant_of(self, element: Tag, ancestor: Tag) -> bool:
"""element媛 ancestor
諛 泥 """
# 湲踰 援ъ“ (#raw-container) 곗
raw = soup.find(id='raw-container')
if raw:
self._process_geulbeot_structure(raw)
return
# .sheet 援ъ“ 泥 ( 留 )
sheets = soup.select('.sheet')
if sheets:
self._process_sheet_structure(sheets)
return
# ъ“ 泥
self._process_generic_html(soup)
def _process_geulbeot_structure(self, raw: Tag):
"""湲踰 HTML #raw-container 援ъ“ 泥 """
print("
湲踰 #raw-container 援ъ“ 泥 以...")
# 吏
cover = raw.find(id='box-cover')
if cover:
print(" - 吏
toc = raw.find(id='box-toc')
if toc:
print(" - 紐⑹감
summary = raw.find(id='box-summary')
if summary:
print(" -
몃Ц
content = raw.find(id='box-content')
if content:
print(" - 蹂몃Ц
.sheet 援ъ """
print(f"
.sheet 援ъ“ 泥 以... ({len(sheets)} )")
for i, sheet in enumerate(sheets):
# body-content留 異異
body_content = sheet.select_one('.body-content')
if body_content:
self._process_content_element(body_content)
else:
# body-content媛 쇰㈃ 癒몃━留щ━留명 泥
for child in sheet.children:
if isinstance(child, Tag):
classes = child.get('class', [])
class_str = ' '.join(classes) if classes else ''
# 癒몃━留щ━留ㅽ
if any(x in class_str.lower() for x in ['header', 'footer']):
continue
self._process_content_element(child)
def _process_generic_html(self, soup: BeautifulSoup):
""" ъ """
print("
ъ“ 泥 以...")
# 吏
cover = soup.find(class_=re.compile(r'cover|title-page|box-cover'))
if cover:
self.current_section = DocumentSection.COVER
self._process_cover(cover)
# 紐⑹감
toc = soup.find(class_=re.compile(r'toc|table-of-contents'))
if toc:
self.current_section = DocumentSection.TOC
self._process_toc(toc)
# 蹂몃Ц
self.current_section = DocumentSection.CONTENT
main_content = soup.find('main') or soup.find('article') or soup.find('body') or soup
for child in main_content.children:
if isinstance(child, Tag):
self._process_content_element(child)
    def _process_cover(self, cover: Tag):
        """Extract cover-page elements (title, subtitle, info lines)."""
        # H1 = cover title
        h1 = cover.find('h1')
        if h1:
            text = h1.get_text(strip=True)
            if text and not self._is_duplicate(text):
                self.elements.append(StyledElement(
                    role="COVER_TITLE",
                    text=text,
                    tag="h1",
                    html=str(h1)[:200],
                    section="cover",
                    attributes={}
                ))
        # H2 = subtitle
        h2 = cover.find('h2')
        if h2:
            text = h2.get_text(strip=True)
            if text and not self._is_duplicate(text):
                self.elements.append(StyledElement(
                    role="COVER_SUBTITLE",
                    text=text,
                    tag="h2",
                    html=str(h2)[:200],
                    section="cover",
                    attributes={}
                ))
        # P = cover info lines (date, department, ...)
        for p in cover.find_all('p'):
            text = p.get_text(strip=True)
            if text and not self._is_duplicate(text):
                self.elements.append(StyledElement(
                    role="COVER_INFO",
                    text=text,
                    tag="p",
                    html=str(p)[:200],
                    section="cover",
                    attributes={}
                ))
def _process_toc(self, toc: Tag):
"""紐⑹감 """
# UL/OL 湲 紐⑹감
for li in toc.find_all('li'):
text = li.get_text(strip=True)
if not text or self._is_duplicate(text):
continue
classes = li.get('class', [])
class_str = ' '.join(classes) if classes else ''
# 踰 (援ъ껜
!)
if 'lvl-1' in class_str or 'toc-lvl-1' in class_str:
role = "TOC_H1"
elif 'lvl-2' in class_str or 'toc-lvl-2' in class_str:
role = "TOC_H2"
elif 'lvl-3' in class_str or 'toc-lvl-3' in class_str:
role = "TOC_H3"
elif self.PATTERNS['h4_num'].match(text): # 1.1.1 癒쇱 !
role = "TOC_H3"
elif self.PATTERNS['h3_num'].match(text): # 1.1 洹몃
role = "TOC_H2"
elif self.PATTERNS['h2_num'].match(text): # 1. 洹몃
role = "TOC_H1"
else:
role = "TOC_H1"
self.elements.append(StyledElement(
role=role,
text=text,
tag="li",
html=str(li)[:200],
section="toc",
attributes={}
))
def _process_content_element(self, element: Tag):
"""蹂몃Ц """
if not isinstance(element, Tag):
return
tag_name = element.name.lower() if element.name else ""
classes = element.get('class', [])
class_str = ' '.join(classes) if classes else ''
# 癒몃━留щ━留대 ㅽ
if any(x in class_str.lower() for x in ['header', 'footer', 'page-num']):
return
#
ㅽ 뱀
if tag_name == 'table':
self._process_table(element)
return
# 洹몃┝ 뱀
if tag_name in ['figure', 'img']:
self._process_figure(element)
return
#
ㅽ 異異
text = self._get_direct_text(element)
if text:
# 嫄대
ㅽ 泥댄
if self._should_skip_text(text):
pass # 怨
elif not self._is_duplicate(text):
role = self._classify_role(element, tag_name, classes, text)
if role:
self.elements.append(StyledElement(
role=role,
text=text,
tag=tag_name,
html=str(element)[:200],
section=self.current_section.value,
attributes=dict(element.attrs) if element.attrs else {}
))
# 泥 (而⑦
洹 )
if tag_name in ['div', 'section', 'article', 'aside', 'main', 'body',
'ul', 'ol', 'dl', 'blockquote']:
for child in element.children:
if isinstance(child, Tag):
self._process_content_element(child)
def _get_direct_text(self, element: Tag) -> str:
""" ㅽ몃 異異 (
)"""
# 紐 洹
ㅽ몃
if element.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'li', 'td', 'th', 'caption']:
return element.get_text(strip=True)
# 而⑦
洹 吏
ㅽ몃
texts = []
for child in element.children:
if isinstance(child, NavigableString):
t = str(child).strip()
if t:
texts.append(t)
return ' '.join(texts)
def _is_duplicate(self, text: str) -> bool:
"""以蹂
ㅽ몄
normalized = re.sub(r'\s+', ' ', text.strip())
# 吏㏃
ㅽ몃 以蹂 ( )
if len(normalized) < 10:
return False
# 泥 50 泥댄
key = normalized[:50]
if key in self.seen_texts:
return True
self.seen_texts.add(key)
return False
def _classify_role(self, element: Tag, tag: str, classes: List[str], text: str) -> Optional[str]:
"""
瑜 截 以 : ⑦
留㼼 諛 援ъ껜
濡 !
1.1.1 1.1 1. 1
(1) 1)
媛) 媛.
"""
class_str = ' '.join(classes) if classes else ''
# ============ 紐 洹 (HTML 洹 곗
) ============
if tag == 'h1':
return "H1"
if tag == 'h2':
return "H2"
if tag == 'h3':
return "H3"
if tag == 'h4':
return "H4"
if tag == 'h5':
return "H5"
if tag == 'h6':
return "H6"
# ============ 蹂몃Ц (p, div ) - 踰 ⑦
댁쇰
(援ъ껜
!) ------
# "1.1.1" ⑦
( 援ъ껜 - 癒쇱 泥댄 !)
if self.PATTERNS['h4_num'].match(text):
if len(text) < 100:
return "H3"
return "BODY"
# "1.1 " ⑦
if self.PATTERNS['h3_num'].match(text):
if len(text) < 100:
return "H2"
return "BODY"
# "1." ⑦
if self.PATTERNS['h2_num'].match(text):
if len(text) < 100:
return "H1"
return "BODY"
# "1 媛..." ⑦
( 듬갚+ )
if self.PATTERNS['h1_num'].match(text):
return "H1"
# ------ 愿
(援ъ껜
!) ------
# "(1)" ⑦
(
援ъ껜 - 癒쇱 泥댄 !)
if self.PATTERNS['h6_paren'].match(text):
if element.find('strong') or len(text) < 80:
return "H5"
return "BODY"
# "1)" ⑦
if self.PATTERNS['h5_paren'].match(text):
if element.find('strong') or len(text) < 80:
return "H4"
return "BODY"
# ------ 湲 ⑦
(援ъ껜
!) ------
# "媛)" ⑦
(
援ъ껜 - 癒쇱 泥댄 !)
if self.PATTERNS['h5_korean'].match(text):
return "H5"
# "媛." ⑦
if self.PATTERNS['h4_korean'].match(text):
return "H4"
# ------ 뱀고 ⑦
------
# "△" ⑦
if self.PATTERNS['h6_circle'].match(text):
return "H6"
# ------ 湲고 ------
# 媛議 諛
if any(x in class_str for x in ['highlight', 'box', 'note', 'tip']):
return "HIGHLIGHT_BOX"
# 몃Ц
return "BODY"
# ============ 紐⑹감 ⑸
- 援ъ
ы """
# 罹≪
援ъ“ 곗
table_data = {'rows': [], 'caption': caption_text}
for tr in table.find_all('tr'):
row = []
for cell in tr.find_all(['th', 'td']):
cell_info = {
'text': cell.get_text(strip=True),
'is_header': cell.name == 'th',
'colspan': int(cell.get('colspan', 1)),
'rowspan': int(cell.get('rowspan', 1)),
'bg_color': self._extract_bg_color(cell),
}
row.append(cell_info)
if row:
table_data['rows'].append(row)
# TABLE 濡 異媛 (媛蹂
TH/TD )
if table_data['rows']:
self.elements.append(StyledElement(
role="TABLE",
text=f"[ : {len(table_data['rows'])} ]",
tag="table",
html=str(table)[:200],
section=self.current_section.value,
attributes={'table_data': table_data}
))
def _extract_bg_color(self, element: Tag) -> str:
"""
諛곌꼍 異異
style = element.get('style', '')
# background-color 異異
match = re.search(r'background-color:\s*([^;]+)', style)
if match:
return self._normalize_color(match.group(1))
# bgcolor
bgcolor = element.get('bgcolor', '')
if bgcolor:
return self._normalize_color(bgcolor)
return ''
def _process_figure(self, element: Tag):
"""洹몃┝ 泥 """
img = element.find('img') if element.name == 'figure' else element
if img and img.name == 'img':
src = img.get('src', '')
alt = img.get('alt', '')
if src: # src媛
異媛
self.elements.append(StyledElement(
role="FIGURE",
text=alt or " ",
tag="img",
html=str(img)[:100],
section=self.current_section.value,
attributes={"src": src, "alt": alt}
))
# 罹≪
:
"""
print(f" 泥 以... (泥
: {len(self.elements)}媛)")
filtered = []
for elem in self.elements:
# 鍮
ㅽ 嫄
if not elem.text or not elem.text.strip():
continue
# 癒몃━留щ━留
ㅽ 嫄
if self._is_header_footer_text(elem.text):
continue
# 嫄 ⑦
泥댄
skip = False
for pattern in self.REMOVE_PATTERNS:
if pattern.match(elem.text.strip()):
skip = True
break
if not skip:
filtered.append(elem)
self.elements = filtered
print(f" - 泥
: {len(self.elements)}媛")
    def get_role_summary(self) -> Dict[str, int]:
        """Return {role: count} over the analyzed elements, sorted by role."""
        summary = {}
        for elem in self.elements:
            summary[elem.role] = summary.get(elem.role, 0) + 1
        return dict(sorted(summary.items()))
def extract_css_styles(self, html: str) -> Dict[str, Dict]:
"""
HTML
CSS
Returns: { : {font_size, color, bold, ...}}
"""
soup = BeautifulSoup(html, 'html.parser')
role_styles = {}
# <style> 洹몄
CSS
style_tag = soup.find('style')
if style_tag:
css_text = style_tag.string or ''
role_styles.update(self._parse_css_rules(css_text))
# ㅽ
異異 ( 蹂
)
for elem in self.elements:
if elem.role not in role_styles:
role_styles[elem.role] = self._extract_inline_style(elem.html)
return role_styles
def _parse_css_rules(self, css_text: str) -> Dict[str, Dict]:
"""CSS
ㅽ몄
洹移 """
import re
rules = {}
# h1, h2, .section-title 깆 ⑦
pattern = r'([^{]+)\{([^}]+)\}'
for match in re.finditer(pattern, css_text):
selector = match.group(1).strip()
properties = match.group(2)
style = {}
for prop in properties.split(';'):
if ':' in prop:
key, value = prop.split(':', 1)
key = key.strip().lower()
value = value.strip()
if key == 'font-size':
style['font_size'] = self._parse_font_size(value)
elif key == 'color':
style['color'] = self._normalize_color(value)
elif key == 'font-weight':
style['bold'] = value in ['bold', '700', '800', '900']
elif key == 'text-align':
style['align'] = value
#
留㼼
role = self._selector_to_role(selector)
if role:
rules[role] = style
return rules
    def _selector_to_role(self, selector: str) -> Optional[str]:
        """Map a CSS selector to a style role (None when unmapped)."""
        selector = selector.lower().strip()
        mapping = {
            'h1': 'H1', 'h2': 'H2', 'h3': 'H3', 'h4': 'H4',
            '.cover-title': 'COVER_TITLE',
            '.section-title': 'H1',
            'th': 'TH', 'td': 'TD',
            'p': 'BODY',
        }
        # substring match: the first mapping key contained in the selector wins
        for key, role in mapping.items():
            if key in selector:
                return role
        return None
def _parse_font_size(self, value: str) -> float:
""" ш린 (pt
pt
elif unit in ['em', 'rem']:
size = size * 11 # 湲곕낯 11pt 湲곗
return size
return 11.0
def _normalize_color(self, value: str) -> str:
"""媛 洹 (#RRGGBB)"""
import re
value = value.strip().lower()
# #rrggbb
if re.match(r'^#[0-9a-f]{3}$', value):
return f'#{value[1]*2}{value[2]*2}{value[3]*2}'.upper()
# rgb(r, g, b)
match = re.search(r'rgb\s*\(\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)', value)
if match:
r, g, b = int(match.group(1)), int(match.group(2)), int(match.group(3))
return f'#{r:02X}{g:02X}{b:02X}'
#
color_names = {
'black': '#000000', 'white': '#FFFFFF',
'red': '#FF0000', 'green': '#008000', 'blue': '#0000FF',
'navy': '#1A365D',
}
return color_names.get(value, '#000000')
def _extract_inline_style(self, html: str) -> Dict:
"""HTML
ㅽ 異
style = {}
# style
李얘린
match = re.search(r'style\s*=\s*["\']([^"\']+)["\']', html)
if match:
style_str = match.group(1)
for prop in style_str.split(';'):
if ':' in prop:
key, value = prop.split(':', 1)
key = key.strip().lower()
value = value.strip()
if key == 'font-size':
style['font_size'] = self._parse_font_size(value)
elif key == 'color':
style['color'] = self._normalize_color(value)
elif key == 'font-weight':
style['bold'] = value in ['bold', '700', '800', '900']
elif key == 'text-align':
style['align'] = value
elif key == 'background-color':
style['bg_color'] = self._normalize_color(value)
return style
def _extract_bg_color(self, element) -> str:
"""
諛곌꼍 異異
if not hasattr(element, 'get'):
return ''
style = element.get('style', '')
# background-color 異異
match = re.search(r'background-color:\s*([^;]+)', style)
if match:
return self._normalize_color(match.group(1))
# bgcolor
bgcolor = element.get('bgcolor', '')
if bgcolor:
return self._normalize_color(bgcolor)
return ''
    def export_for_hwp(self) -> List[Dict]:
        """Serialize the analyzed elements as plain dicts for the HWP converter."""
        return [
            {
                "role": e.role,
                "text": e.text,
                "tag": e.tag,
                "section": e.section,
                "attributes": e.attributes
            }
            for e in self.elements
        ]
if __name__ == "__main__":
#
test_html = """
<html>
<head>
<script>var x = 1;</script>
<style>.test { color: red; }</style>
</head>
<body>
<div class="sheet">
<div class="page-header">嫄댁
ㅒ룻 紐 痢〓 DX ㅻТ吏移 </div>
<div class="body-content">
<h1>1 DX 媛 湲곕낯 媛
DX
</h2>
<h3>1.1.1 痢〓 DX 諛
</h3>
<p>1) <strong>Digitization 몃Ц 댁
. 異⑸
ㅽ몃 蹂몃Ц쇰 댁 ⑸ .</p>
<p>(1) ④
</p>
<p>痢〓 湲곗
깃낵臾쇱⑥곕
.</p>
</div>
<div class="page-footer">嫄댁
ㅒ룻 紐 痢〓 DX ㅻТ吏移 - 1 -</div>
</div>
<div class="sheet">
<div class="page-header">嫄댁
ㅒ룻 紐 痢〓 DX ㅻТ吏移 </div>
<div class="body-content">
<p> 泥 踰吏 ぉ</p>
<table>
<caption> 1. 곗
</th><th> 댁 </th></tr>
<tr><td> ぉ1</td><td>
1</td></tr>
</table>
</div>
<div class="page-footer">嫄댁
ㅒ룻 紐 痢〓 DX ㅻТ吏移 - 2 -</div>
</div>
</body>
</html>
"""
analyzer = StyleAnalyzer()
elements = analyzer.analyze(test_html)
print("\n" + "="*60)
print("
")
print("="*60)
for role, count in analyzer.get_role_summary().items():
print(f" {role}: {count}")