diff --git a/03.Code/업로드용/converters/style_analyzer.py b/03.Code/업로드용/converters/style_analyzer.py new file mode 100644 index 0000000..e22b7a1 --- /dev/null +++ b/03.Code/업로드용/converters/style_analyzer.py @@ -0,0 +1,994 @@ +""" +HTML ㅽ + + 瑜 遺 + + ㅼ (Role) + 遺 + 瑜 + v3.0 蹂寃쎌ы : +- 湲踰 HTML 援ъ“ + 踰 吏щ━留踰 嫄 +- 媛 ν 以蹂 肄 + + + 곕 +- 紐 怨 + 痢 援ъ“ + + 뱀 +""" + COVER = "cover" # 吏 + TOC = "toc" # 紐⑹감 + CONTENT = "content" # 蹂몃Ц + + +@dataclass +class StyledElement: + """ㅽ """ + role: str # (H1, BODY, TH ) + text: str # +ㅽ 댁 + tag: str # HTML 洹 + html: str # HTML + section: str # + 뱀 +媛 + ( src ) + + def __repr__(self): + preview = self.text[:30] + "..." if len(self.text) > 30 else self.text + return f"<{self.role}> {preview}" + + +class StyleAnalyzer: + """HTML 臾몄 + 瑜 遺 + + ㅼ 遺 + 瑜 ⑦ + 踰 : " 1 ", " 2 " + "chapter": re.compile(r'^ \s*\d+\s* '), + # 1 ④ + 紐 : "1 ", "2 " ( 듬갚, ) + "h1_num": re.compile(r'^(\d+)\s+[媛- ]'), + # ぉ: "1.", "2." + "h2_num": re.compile(r'^(\d+)\.\s'), + # 以ぉ: "1.1 ", "1.2 " + "h3_num": re.compile(r'^(\d+)\.\d+\s'), + # ぉ: "1.1.1" + "h4_num": re.compile(r'^(\d+)\.(\d+)\.(\d+)'), + # + 몃 : "1)", "2)" + "h5_paren": re.compile(r'^(\d+)\)\s*'), + # + 몄 + 몃 : "(1)", "(2)" + "h6_paren": re.compile(r'^\((\d+)\)\s*'), + # 媛: "媛.", " - 愿 + : "媛)", " -몄 ", " " + "h6_circle": re.compile(r'^[△™bㅲβ╈㎮ⓥ]'), + # 紐⑹감 ", "-", " " + "list_bullet": re.compile(r'^[\-]\s'), + # 踰 ⑦ + : "- 1 -", "- 12 -" + "page_number": re.compile(r'^-\s*\d+\s*-$'), + # 瑗щ━留⑦ + : "臾몄 + 紐 - 1 -" + "footer_pattern": re.compile(r'.+[- ]\s*\d+\s*[- ]$'), + } + + # 嫄고 +ㅽ ⑦ + 대 + REMOVE_PATTERNS = [ + re.compile(r'^-\s*\d+\s*-$'), # "- 1 -" + re.compile(r'[- ]\s*\d+\s*[- ]\s*$'), # "臾몄 + 紐 - 1 -" + re.compile(r'^\d+\s* \s*\d+$'), # "643 236" ( ш린) + re.compile(r'^\[ :.*\]$'), # "[ : xxx]" + re.compile(r'^\[洹몃┝\s*\d+-\d+\]$'), # "[洹몃┝ 1-1]" + ] + + def __init__(self): + self.elements: List[StyledElement] = [] + self.current_section = DocumentSection.CONTENT + self.seen_texts: Set[str] = set() # 以蹂 諛⑹ + self.document_title = "" # 臾몄 + 紐 (瑗щ━留嫄곗 ) + + def analyze(self, html: str) -> List[StyledElement]: + """HTML 臾몄 + 遺 + + ㅼ 遺 + 瑜 由ъㅽ 諛 + 泥 : 遺 + 嫄 + self._preprocess(soup) + + # 2. 臾몄 + 紐 異異 (瑗щ━留⑦ + 媛 ) + self._extract_document_title(soup) + + # 3. + 뱀 + 諛 + self._detect_and_process_sections(soup) + + # 4. + 泥 : 以蹂 諛 遺 + 嫄 + self._postprocess() + + return self.elements + + def _preprocess(self, soup: BeautifulSoup): + """HTML + 泥 - 遺 + 嫄""" + print(" HTML + 泥 以...") + + # 1. ㅽщ┰ / ㅽ 洹 嫄 + removed_count = 0 + for tag in soup(['script', 'style', 'noscript', 'meta', 'link', 'head']): + tag.decompose() + removed_count += 1 + + if removed_count > 0: + print(f" - script/style {removed_count}媛 嫄") + + # 2. 癒몃━留щ━留 嫄 (湲踰 HTML 援ъ“) + header_footer_count = 0 + for selector in ['.page-header', '.page-footer', '.header', '.footer', + '[class*="header"]', '[class*="footer"]', + '.running-header', '.running-footer']: + for elem in soup.select(selector): + # ㅼ 肄 + ㅻ媛 + ㅻ留 嫄 + text = elem.get_text(strip=True) + if self._is_header_footer_text(text): + elem.decompose() + header_footer_count += 1 + + if header_footer_count > 0: + print(f" - 癒몃━留щ━留 嫄") + + # 3. ④꺼吏 + 嫄 + hidden_count = 0 + for elem in soup.select('[style*="display:none"], [style*="display: none"]'): + elem.decompose() + hidden_count += 1 + for elem in soup.select('[style*="visibility:hidden"], [style*="visibility: hidden"]'): + elem.decompose() + hidden_count += 1 + + # 4. #raw-container 몃 嫄 (湲踰 援ъ“) + raw_container = soup.find(id='raw-container') + if raw_container: + print(" - 湲踰 援ъ“ 媛: #raw-container 곗 + ъ ") + # raw-container 몃 ⑤ .sheet 嫄 + for sheet in soup.select('.sheet'): + if not self._is_descendant_of(sheet, raw_container): + sheet.decompose() + + def _extract_document_title(self, soup: BeautifulSoup): + """臾몄 + 紐 異異 (瑗щ━留⑦ + 媛 )""" + # 吏 + 紐 李얘린 + cover = soup.find(id='box-cover') or soup.find(class_='box-cover') + if cover: + h1 = cover.find('h1') + if h1: + self.document_title = h1.get_text(strip=True) + print(f" - 臾몄 + 紐 媛: {self.document_title[:30]}...") + + def _is_header_footer_text(self, text: str) -> bool: + """癒몃━留щ━留 + ㅽ몄 """ + if not text: + return False + + # 踰 ⑦ + + if self.PATTERNS['page_number'].match(text): + return True + + # "臾몄 + 紐 - 1 -" ⑦ + + if self.PATTERNS['footer_pattern'].match(text): + return True + + # 臾몄 + 紐 + 踰 議고 + if self.document_title and self.document_title in text: + if re.search(r'[- ]\s*\d+\s*[- ]', text): + return True + + return False + + def _should_skip_text(self, text: str) -> bool: + """嫄대 + ㅽ몄 """ + if not text: + return True + + # 嫄 ⑦ + 泥댄 + for pattern in self.REMOVE_PATTERNS: + if pattern.match(text): + return True + + # 癒몃━留щ━留댄 + if self._is_header_footer_text(text): + return True + + # 臾몄 + 紐⑸ 以 + (瑗щ━留 + 寃) + if self.document_title and text.strip() == self.document_title: + # 吏 + 泥ы쇰㈃ ㅽ + if any(e.role == 'COVER_TITLE' and self.document_title in e.text + for e in self.elements): + return True + + return False + + def _is_descendant_of(self, element: Tag, ancestor: Tag) -> bool: + """element媛 ancestor + 뱀 + 諛 泥 """ + + # 湲踰 援ъ“ (#raw-container) 곗 + 泥 + raw = soup.find(id='raw-container') + if raw: + self._process_geulbeot_structure(raw) + return + + # .sheet 援ъ“ 泥 ( 留 ) + sheets = soup.select('.sheet') + if sheets: + self._process_sheet_structure(sheets) + return + + # ъ“ 泥 + self._process_generic_html(soup) + + def _process_geulbeot_structure(self, raw: Tag): + """湲踰 HTML #raw-container 援ъ“ 泥 """ + print(" + 湲踰 #raw-container 援ъ“ 泥 以...") + + # 吏 + cover = raw.find(id='box-cover') + if cover: + print(" - 吏 + 뱀 +⑹감 + toc = raw.find(id='box-toc') + if toc: + print(" - 紐⑹감 + 뱀 + + summary = raw.find(id='box-summary') + if summary: + print(" - + 뱀 +몃Ц + content = raw.find(id='box-content') + if content: + print(" - 蹂몃Ц + 뱀 +踰 .sheet 援ъ“ 泥 """ + print(f" + .sheet 援ъ“ 泥 以... ({len(sheets)} )") + + for i, sheet in enumerate(sheets): + # body-content留 異異 + body_content = sheet.select_one('.body-content') + if body_content: + self._process_content_element(body_content) + else: + # body-content媛 쇰㈃ 癒몃━留щ━留명 泥 + for child in sheet.children: + if isinstance(child, Tag): + classes = child.get('class', []) + class_str = ' '.join(classes) if classes else '' + + # 癒몃━留щ━留ㅽ + if any(x in class_str.lower() for x in ['header', 'footer']): + continue + + self._process_content_element(child) + + def _process_generic_html(self, soup: BeautifulSoup): + """ ъ“ 泥 """ + print(" + ъ“ 泥 以...") + + # 吏 + cover = soup.find(class_=re.compile(r'cover|title-page|box-cover')) + if cover: + self.current_section = DocumentSection.COVER + self._process_cover(cover) + + # 紐⑹감 + toc = soup.find(class_=re.compile(r'toc|table-of-contents')) + if toc: + self.current_section = DocumentSection.TOC + self._process_toc(toc) + + # 蹂몃Ц + self.current_section = DocumentSection.CONTENT + main_content = soup.find('main') or soup.find('article') or soup.find('body') or soup + + for child in main_content.children: + if isinstance(child, Tag): + self._process_content_element(child) + + def _process_cover(self, cover: Tag): + """ 吏 泥 """ + # H1 = 紐 + h1 = cover.find('h1') + if h1: + text = h1.get_text(strip=True) + if text and not self._is_duplicate(text): + self.elements.append(StyledElement( + role="COVER_TITLE", + text=text, + tag="h1", + html=str(h1)[:200], + section="cover", + attributes={} + )) + + # H2 = 遺 紐 + h2 = cover.find('h2') + if h2: + text = h2.get_text(strip=True) + if text and not self._is_duplicate(text): + self.elements.append(StyledElement( + role="COVER_SUBTITLE", + text=text, + tag="h2", + html=str(h2)[:200], + section="cover", + attributes={} + )) + + # P = 蹂 + for p in cover.find_all('p'): + text = p.get_text(strip=True) + if text and not self._is_duplicate(text): + self.elements.append(StyledElement( + role="COVER_INFO", + text=text, + tag="p", + html=str(p)[:200], + section="cover", + attributes={} + )) + + def _process_toc(self, toc: Tag): + """紐⑹감 泥 """ + # UL/OL 湲 紐⑹감 + for li in toc.find_all('li'): + text = li.get_text(strip=True) + if not text or self._is_duplicate(text): + continue + + classes = li.get('class', []) + class_str = ' '.join(classes) if classes else '' + + # 踰 (援ъ껜 + !) + if 'lvl-1' in class_str or 'toc-lvl-1' in class_str: + role = "TOC_H1" + elif 'lvl-2' in class_str or 'toc-lvl-2' in class_str: + role = "TOC_H2" + elif 'lvl-3' in class_str or 'toc-lvl-3' in class_str: + role = "TOC_H3" + elif self.PATTERNS['h4_num'].match(text): # 1.1.1 癒쇱 ! + role = "TOC_H3" + elif self.PATTERNS['h3_num'].match(text): # 1.1 洹몃 + role = "TOC_H2" + elif self.PATTERNS['h2_num'].match(text): # 1. 洹몃 + role = "TOC_H1" + else: + role = "TOC_H1" + + self.elements.append(StyledElement( + role=role, + text=text, + tag="li", + html=str(li)[:200], + section="toc", + attributes={} + )) + + def _process_content_element(self, element: Tag): + """蹂몃Ц 泥 """ + if not isinstance(element, Tag): + return + + tag_name = element.name.lower() if element.name else "" + classes = element.get('class', []) + class_str = ' '.join(classes) if classes else '' + + # 癒몃━留щ━留대 ㅽ + if any(x in class_str.lower() for x in ['header', 'footer', 'page-num']): + return + + # + ㅽ 뱀 + if tag_name == 'table': + self._process_table(element) + return + + # 洹몃┝ 뱀 + if tag_name in ['figure', 'img']: + self._process_figure(element) + return + + # + ㅽ 異異 + text = self._get_direct_text(element) + + if text: + # 嫄대 + ㅽ 泥댄 + if self._should_skip_text(text): + pass # 怨 + 泥 + elif not self._is_duplicate(text): + role = self._classify_role(element, tag_name, classes, text) + if role: + self.elements.append(StyledElement( + role=role, + text=text, + tag=tag_name, + html=str(element)[:200], + section=self.current_section.value, + attributes=dict(element.attrs) if element.attrs else {} + )) + + # 泥 (而⑦ + + 洹 ) + if tag_name in ['div', 'section', 'article', 'aside', 'main', 'body', + 'ul', 'ol', 'dl', 'blockquote']: + for child in element.children: + if isinstance(child, Tag): + self._process_content_element(child) + + def _get_direct_text(self, element: Tag) -> str: + """ ㅽ몃 異異 ( + + )""" + # 紐 洹 + 泥 + ㅽ몃 + if element.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'li', 'td', 'th', 'caption']: + return element.get_text(strip=True) + + # 而⑦ + + 洹 吏 + ㅽ몃 + texts = [] + for child in element.children: + if isinstance(child, NavigableString): + t = str(child).strip() + if t: + texts.append(t) + + return ' '.join(texts) + + def _is_duplicate(self, text: str) -> bool: + """以蹂 + ㅽ몄 洹 + normalized = re.sub(r'\s+', ' ', text.strip()) + + # 吏㏃ + ㅽ몃 以蹂 (踰 ) + if len(normalized) < 10: + return False + + # 泥 50 泥댄 + key = normalized[:50] + + if key in self.seen_texts: + return True + + self.seen_texts.add(key) + return False + + def _classify_role(self, element: Tag, tag: str, classes: List[str], text: str) -> Optional[str]: + """ 遺 + 瑜 截 以 : ⑦ + 留㼼 諛 援ъ껜 + 濡 ! + 1.1.1 1.1 1. 1 + (1) 1) + 媛) 媛. + """ + + class_str = ' '.join(classes) if classes else '' + + # ============ 紐 洹 (HTML 洹 곗 + ) ============ + if tag == 'h1': + return "H1" + if tag == 'h2': + return "H2" + if tag == 'h3': + return "H3" + if tag == 'h4': + return "H4" + if tag == 'h5': + return "H5" + if tag == 'h6': + return "H6" + + # ============ 蹂몃Ц (p, div ) - 踰 ⑦ + 댁쇰 遺 + 瑜 ⑦ + (援ъ껜 + !) ------ + + # "1.1.1" ⑦ + (媛 援ъ껜 - 癒쇱 泥댄 !) + if self.PATTERNS['h4_num'].match(text): + if len(text) < 100: + return "H3" + return "BODY" + + # "1.1 " ⑦ + + if self.PATTERNS['h3_num'].match(text): + if len(text) < 100: + return "H2" + return "BODY" + + # "1." ⑦ + + if self.PATTERNS['h2_num'].match(text): + if len(text) < 100: + return "H1" + return "BODY" + + # "1 媛..." ⑦ + ( 듬갚+ 湲) + if self.PATTERNS['h1_num'].match(text): + return "H1" + + # ------ 愿 + ⑦ + (援ъ껜 + !) ------ + + # "(1)" ⑦ + (愿 + 媛 寃 援ъ껜 - 癒쇱 泥댄 !) + if self.PATTERNS['h6_paren'].match(text): + if element.find('strong') or len(text) < 80: + return "H5" + return "BODY" + + # "1)" ⑦ + + if self.PATTERNS['h5_paren'].match(text): + if element.find('strong') or len(text) < 80: + return "H4" + return "BODY" + + # ------ 湲 ⑦ + (援ъ껜 + !) ------ + + # "媛)" ⑦ + (愿 + 媛 寃 援ъ껜 - 癒쇱 泥댄 !) + if self.PATTERNS['h5_korean'].match(text): + return "H5" + + # "媛." ⑦ + + if self.PATTERNS['h4_korean'].match(text): + return "H4" + + # ------ 뱀고 ⑦ + ------ + + # "△" ⑦ + + if self.PATTERNS['h6_circle'].match(text): + return "H6" + + # ------ 湲고 ------ + + # 媛議 諛 + if any(x in class_str for x in ['highlight', 'box', 'note', 'tip']): + return "HIGHLIGHT_BOX" + + # 몃Ц + return "BODY" + + # ============ 紐⑹감 ⑸ + 泥 - 援ъ“ 곗 + ы """ + + # 罹≪ + 援ъ“ 곗 + + table_data = {'rows': [], 'caption': caption_text} + + for tr in table.find_all('tr'): + row = [] + for cell in tr.find_all(['th', 'td']): + cell_info = { + 'text': cell.get_text(strip=True), + 'is_header': cell.name == 'th', + 'colspan': int(cell.get('colspan', 1)), + 'rowspan': int(cell.get('rowspan', 1)), + 'bg_color': self._extract_bg_color(cell), + } + row.append(cell_info) + if row: + table_data['rows'].append(row) + + # TABLE 濡 異媛 (媛蹂 + TH/TD ) + if table_data['rows']: + self.elements.append(StyledElement( + role="TABLE", + text=f"[ : {len(table_data['rows'])} ]", + tag="table", + html=str(table)[:200], + section=self.current_section.value, + attributes={'table_data': table_data} + )) + + def _extract_bg_color(self, element: Tag) -> str: + """ + 諛곌꼍 異異 + style = element.get('style', '') + + # background-color 異異 + match = re.search(r'background-color:\s*([^;]+)', style) + if match: + return self._normalize_color(match.group(1)) + + # bgcolor + + bgcolor = element.get('bgcolor', '') + if bgcolor: + return self._normalize_color(bgcolor) + + return '' + + def _process_figure(self, element: Tag): + """洹몃┝ 泥 """ + img = element.find('img') if element.name == 'figure' else element + + if img and img.name == 'img': + src = img.get('src', '') + alt = img.get('alt', '') + + if src: # src媛 + 留 異媛 + self.elements.append(StyledElement( + role="FIGURE", + text=alt or " ", + tag="img", + html=str(img)[:100], + section=self.current_section.value, + attributes={"src": src, "alt": alt} + )) + + # 罹≪ + +泥 : 遺 + 嫄""" + print(f" 泥 以... (泥 + : {len(self.elements)}媛)") + + filtered = [] + for elem in self.elements: + # 鍮 + ㅽ 嫄 + if not elem.text or not elem.text.strip(): + continue + + # 癒몃━留щ━留 + ㅽ 嫄 + if self._is_header_footer_text(elem.text): + continue + + # 嫄 ⑦ + 泥댄 + skip = False + for pattern in self.REMOVE_PATTERNS: + if pattern.match(elem.text.strip()): + skip = True + break + + if not skip: + filtered.append(elem) + + self.elements = filtered + print(f" - 泥 + : {len(self.elements)}媛") + + def get_role_summary(self) -> Dict[str, int]: + """ 蹂 + """ + summary = {} + for elem in self.elements: + summary[elem.role] = summary.get(elem.role, 0) + 1 + return dict(sorted(summary.items())) + + + def extract_css_styles(self, html: str) -> Dict[str, Dict]: + """ + HTML + 蹂 + CSS ㅽ 異 + Returns: { : {font_size, color, bold, ...}} + """ + soup = BeautifulSoup(html, 'html.parser') + role_styles = {} + + # + + +
+ +
+

1 DX 媛 湲곕낯 媛 + DX + +

+

1.1.1 痢〓 DX 諛 + ④ +

+

1) Digitization 몃Ц 댁 +. 異⑸ + 湲 + ㅽ몃 蹂몃Ц쇰 댁 ⑸ .

+

(1) ④ + 蹂 + 吏 +

+

痢〓 湲곗 + + 깃낵臾쇱⑥곕 + .

+
+ +
+ +
+ +
+

泥 踰吏 ぉ

+ + + +
1. 곗 + 鍮 + 援 +
ぉ1 + ㅻ +1
+
+ +
+ + + """ + + analyzer = StyleAnalyzer() + elements = analyzer.analyze(test_html) + + print("\n" + "="*60) + print("遺 + + ") + print("="*60) + for role, count in analyzer.get_role_summary().items(): + print(f" {role}: {count}")