""" HTML ㅽ 瑜 遺 ㅼ (Role) 遺 瑜 v3.0 蹂寃쎌ы : - 湲踰 HTML 援ъ“ 踰 吏щ━留踰 嫄 - 媛 ν 以蹂 肄 곕 - 紐 怨 痢 援ъ“ 뱀 """ COVER = "cover" # 吏 TOC = "toc" # 紐⑹감 CONTENT = "content" # 蹂몃Ц @dataclass class StyledElement: """ㅽ """ role: str # (H1, BODY, TH ) text: str # ㅽ 댁 tag: str # HTML 洹 html: str # HTML section: str # 뱀 媛 ( src ) def __repr__(self): preview = self.text[:30] + "..." if len(self.text) > 30 else self.text return f"<{self.role}> {preview}" class StyleAnalyzer: """HTML 臾몄 瑜 遺 ㅼ 遺 瑜 ⑦ 踰 : " 1 ", " 2 " "chapter": re.compile(r'^ \s*\d+\s* '), # 1 ④ 紐 : "1 ", "2 " ( 듬갚, ) "h1_num": re.compile(r'^(\d+)\s+[媛- ]'), # ぉ: "1.", "2." "h2_num": re.compile(r'^(\d+)\.\s'), # 以ぉ: "1.1 ", "1.2 " "h3_num": re.compile(r'^(\d+)\.\d+\s'), # ぉ: "1.1.1" "h4_num": re.compile(r'^(\d+)\.(\d+)\.(\d+)'), # 몃 : "1)", "2)" "h5_paren": re.compile(r'^(\d+)\)\s*'), # 몄 몃 : "(1)", "(2)" "h6_paren": re.compile(r'^\((\d+)\)\s*'), # 媛: "媛.", " - 愿 : "媛)", " -몄 ", " " "h6_circle": re.compile(r'^[△™bㅲβ╈㎮ⓥ]'), # 紐⑹감 ", "-", " " "list_bullet": re.compile(r'^[\-]\s'), # 踰 ⑦ : "- 1 -", "- 12 -" "page_number": re.compile(r'^-\s*\d+\s*-$'), # 瑗щ━留⑦ : "臾몄 紐 - 1 -" "footer_pattern": re.compile(r'.+[- ]\s*\d+\s*[- ]$'), } # 嫄고 ㅽ ⑦ 대 REMOVE_PATTERNS = [ re.compile(r'^-\s*\d+\s*-$'), # "- 1 -" re.compile(r'[- ]\s*\d+\s*[- ]\s*$'), # "臾몄 紐 - 1 -" re.compile(r'^\d+\s* \s*\d+$'), # "643 236" ( ш린) re.compile(r'^\[ :.*\]$'), # "[ : xxx]" re.compile(r'^\[洹몃┝\s*\d+-\d+\]$'), # "[洹몃┝ 1-1]" ] def __init__(self): self.elements: List[StyledElement] = [] self.current_section = DocumentSection.CONTENT self.seen_texts: Set[str] = set() # 以蹂 諛⑹ self.document_title = "" # 臾몄 紐 (瑗щ━留嫄곗 ) def analyze(self, html: str) -> List[StyledElement]: """HTML 臾몄 遺 ㅼ 遺 瑜 由ъㅽ 諛 泥 : 遺 嫄 self._preprocess(soup) # 2. 臾몄 紐 異異 (瑗щ━留⑦ 媛 ) self._extract_document_title(soup) # 3. 뱀 諛 self._detect_and_process_sections(soup) # 4. 泥 : 以蹂 諛 遺 嫄 self._postprocess() return self.elements def _preprocess(self, soup: BeautifulSoup): """HTML 泥 - 遺 嫄""" print(" HTML 泥 以...") # 1. ㅽщ┰ / ㅽ 洹 嫄 removed_count = 0 for tag in soup(['script', 'style', 'noscript', 'meta', 'link', 'head']): tag.decompose() removed_count += 1 if removed_count > 0: print(f" - script/style {removed_count}媛 嫄") # 2. 癒몃━留щ━留 嫄 (湲踰 HTML 援ъ“) header_footer_count = 0 for selector in ['.page-header', '.page-footer', '.header', '.footer', '[class*="header"]', '[class*="footer"]', '.running-header', '.running-footer']: for elem in soup.select(selector): # ㅼ 肄 ㅻ媛 ㅻ留 嫄 text = elem.get_text(strip=True) if self._is_header_footer_text(text): elem.decompose() header_footer_count += 1 if header_footer_count > 0: print(f" - 癒몃━留щ━留 嫄") # 3. ④꺼吏 嫄 hidden_count = 0 for elem in soup.select('[style*="display:none"], [style*="display: none"]'): elem.decompose() hidden_count += 1 for elem in soup.select('[style*="visibility:hidden"], [style*="visibility: hidden"]'): elem.decompose() hidden_count += 1 # 4. #raw-container 몃 嫄 (湲踰 援ъ“) raw_container = soup.find(id='raw-container') if raw_container: print(" - 湲踰 援ъ“ 媛: #raw-container 곗 ъ ") # raw-container 몃 ⑤ .sheet 嫄 for sheet in soup.select('.sheet'): if not self._is_descendant_of(sheet, raw_container): sheet.decompose() def _extract_document_title(self, soup: BeautifulSoup): """臾몄 紐 異異 (瑗щ━留⑦ 媛 )""" # 吏 紐 李얘린 cover = soup.find(id='box-cover') or soup.find(class_='box-cover') if cover: h1 = cover.find('h1') if h1: self.document_title = h1.get_text(strip=True) print(f" - 臾몄 紐 媛: {self.document_title[:30]}...") def _is_header_footer_text(self, text: str) -> bool: """癒몃━留щ━留 ㅽ몄 """ if not text: return False # 踰 ⑦ if self.PATTERNS['page_number'].match(text): return True # "臾몄 紐 - 1 -" ⑦ if self.PATTERNS['footer_pattern'].match(text): return True # 臾몄 紐 + 踰 議고 if self.document_title and self.document_title in text: if re.search(r'[- ]\s*\d+\s*[- ]', text): return True return False def _should_skip_text(self, text: str) -> bool: """嫄대 ㅽ몄 """ if not text: return True # 嫄 ⑦ 泥댄 for pattern in self.REMOVE_PATTERNS: if pattern.match(text): return True # 癒몃━留щ━留댄 if self._is_header_footer_text(text): return True # 臾몄 紐⑸ 以 (瑗щ━留 寃) if self.document_title and text.strip() == self.document_title: # 吏 泥ы쇰㈃ ㅽ if any(e.role == 'COVER_TITLE' and self.document_title in e.text for e in self.elements): return True return False def _is_descendant_of(self, element: Tag, ancestor: Tag) -> bool: """element媛 ancestor 뱀 諛 泥 """ # 湲踰 援ъ“ (#raw-container) 곗 泥 raw = soup.find(id='raw-container') if raw: self._process_geulbeot_structure(raw) return # .sheet 援ъ“ 泥 ( 留 ) sheets = soup.select('.sheet') if sheets: self._process_sheet_structure(sheets) return # ъ“ 泥 self._process_generic_html(soup) def _process_geulbeot_structure(self, raw: Tag): """湲踰 HTML #raw-container 援ъ“ 泥 """ print(" 湲踰 #raw-container 援ъ“ 泥 以...") # 吏 cover = raw.find(id='box-cover') if cover: print(" - 吏 뱀 ⑹감 toc = raw.find(id='box-toc') if toc: print(" - 紐⑹감 뱀 summary = raw.find(id='box-summary') if summary: print(" - 뱀 몃Ц content = raw.find(id='box-content') if content: print(" - 蹂몃Ц 뱀 踰 .sheet 援ъ“ 泥 """ print(f" .sheet 援ъ“ 泥 以... ({len(sheets)} )") for i, sheet in enumerate(sheets): # body-content留 異異 body_content = sheet.select_one('.body-content') if body_content: self._process_content_element(body_content) else: # body-content媛 쇰㈃ 癒몃━留щ━留명 泥 for child in sheet.children: if isinstance(child, Tag): classes = child.get('class', []) class_str = ' '.join(classes) if classes else '' # 癒몃━留щ━留ㅽ if any(x in class_str.lower() for x in ['header', 'footer']): continue self._process_content_element(child) def _process_generic_html(self, soup: BeautifulSoup): """ ъ“ 泥 """ print(" ъ“ 泥 以...") # 吏 cover = soup.find(class_=re.compile(r'cover|title-page|box-cover')) if cover: self.current_section = DocumentSection.COVER self._process_cover(cover) # 紐⑹감 toc = soup.find(class_=re.compile(r'toc|table-of-contents')) if toc: self.current_section = DocumentSection.TOC self._process_toc(toc) # 蹂몃Ц self.current_section = DocumentSection.CONTENT main_content = soup.find('main') or soup.find('article') or soup.find('body') or soup for child in main_content.children: if isinstance(child, Tag): self._process_content_element(child) def _process_cover(self, cover: Tag): """ 吏 泥 """ # H1 = 紐 h1 = cover.find('h1') if h1: text = h1.get_text(strip=True) if text and not self._is_duplicate(text): self.elements.append(StyledElement( role="COVER_TITLE", text=text, tag="h1", html=str(h1)[:200], section="cover", attributes={} )) # H2 = 遺 紐 h2 = cover.find('h2') if h2: text = h2.get_text(strip=True) if text and not self._is_duplicate(text): self.elements.append(StyledElement( role="COVER_SUBTITLE", text=text, tag="h2", html=str(h2)[:200], section="cover", attributes={} )) # P = 蹂 for p in cover.find_all('p'): text = p.get_text(strip=True) if text and not self._is_duplicate(text): self.elements.append(StyledElement( role="COVER_INFO", text=text, tag="p", html=str(p)[:200], section="cover", attributes={} )) def _process_toc(self, toc: Tag): """紐⑹감 泥 """ # UL/OL 湲 紐⑹감 for li in toc.find_all('li'): text = li.get_text(strip=True) if not text or self._is_duplicate(text): continue classes = li.get('class', []) class_str = ' '.join(classes) if classes else '' # 踰 (援ъ껜 !) if 'lvl-1' in class_str or 'toc-lvl-1' in class_str: role = "TOC_H1" elif 'lvl-2' in class_str or 'toc-lvl-2' in class_str: role = "TOC_H2" elif 'lvl-3' in class_str or 'toc-lvl-3' in class_str: role = "TOC_H3" elif self.PATTERNS['h4_num'].match(text): # 1.1.1 癒쇱 ! role = "TOC_H3" elif self.PATTERNS['h3_num'].match(text): # 1.1 洹몃 role = "TOC_H2" elif self.PATTERNS['h2_num'].match(text): # 1. 洹몃 role = "TOC_H1" else: role = "TOC_H1" self.elements.append(StyledElement( role=role, text=text, tag="li", html=str(li)[:200], section="toc", attributes={} )) def _process_content_element(self, element: Tag): """蹂몃Ц 泥 """ if not isinstance(element, Tag): return tag_name = element.name.lower() if element.name else "" classes = element.get('class', []) class_str = ' '.join(classes) if classes else '' # 癒몃━留щ━留대 ㅽ if any(x in class_str.lower() for x in ['header', 'footer', 'page-num']): return # ㅽ 뱀 if tag_name == 'table': self._process_table(element) return # 洹몃┝ 뱀 if tag_name in ['figure', 'img']: self._process_figure(element) return # ㅽ 異異 text = self._get_direct_text(element) if text: # 嫄대 ㅽ 泥댄 if self._should_skip_text(text): pass # 怨 泥 elif not self._is_duplicate(text): role = self._classify_role(element, tag_name, classes, text) if role: self.elements.append(StyledElement( role=role, text=text, tag=tag_name, html=str(element)[:200], section=self.current_section.value, attributes=dict(element.attrs) if element.attrs else {} )) # 泥 (而⑦ 洹 ) if tag_name in ['div', 'section', 'article', 'aside', 'main', 'body', 'ul', 'ol', 'dl', 'blockquote']: for child in element.children: if isinstance(child, Tag): self._process_content_element(child) def _get_direct_text(self, element: Tag) -> str: """ ㅽ몃 異異 ( )""" # 紐 洹 泥 ㅽ몃 if element.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'li', 'td', 'th', 'caption']: return element.get_text(strip=True) # 而⑦ 洹 吏 ㅽ몃 texts = [] for child in element.children: if isinstance(child, NavigableString): t = str(child).strip() if t: texts.append(t) return ' '.join(texts) def _is_duplicate(self, text: str) -> bool: """以蹂 ㅽ몄 洹 normalized = re.sub(r'\s+', ' ', text.strip()) # 吏㏃ ㅽ몃 以蹂 (踰 ) if len(normalized) < 10: return False # 泥 50 泥댄 key = normalized[:50] if key in self.seen_texts: return True self.seen_texts.add(key) return False def _classify_role(self, element: Tag, tag: str, classes: List[str], text: str) -> Optional[str]: """ 遺 瑜 截 以 : ⑦ 留㼼 諛 援ъ껜 濡 ! 1.1.1 1.1 1. 1 (1) 1) 媛) 媛. """ class_str = ' '.join(classes) if classes else '' # ============ 紐 洹 (HTML 洹 곗 ) ============ if tag == 'h1': return "H1" if tag == 'h2': return "H2" if tag == 'h3': return "H3" if tag == 'h4': return "H4" if tag == 'h5': return "H5" if tag == 'h6': return "H6" # ============ 蹂몃Ц (p, div ) - 踰 ⑦ 댁쇰 遺 瑜 ⑦ (援ъ껜 !) ------ # "1.1.1" ⑦ (媛 援ъ껜 - 癒쇱 泥댄 !) if self.PATTERNS['h4_num'].match(text): if len(text) < 100: return "H3" return "BODY" # "1.1 " ⑦ if self.PATTERNS['h3_num'].match(text): if len(text) < 100: return "H2" return "BODY" # "1." ⑦ if self.PATTERNS['h2_num'].match(text): if len(text) < 100: return "H1" return "BODY" # "1 媛..." ⑦ ( 듬갚+ 湲) if self.PATTERNS['h1_num'].match(text): return "H1" # ------ 愿 ⑦ (援ъ껜 !) ------ # "(1)" ⑦ (愿 媛 寃 援ъ껜 - 癒쇱 泥댄 !) if self.PATTERNS['h6_paren'].match(text): if element.find('strong') or len(text) < 80: return "H5" return "BODY" # "1)" ⑦ if self.PATTERNS['h5_paren'].match(text): if element.find('strong') or len(text) < 80: return "H4" return "BODY" # ------ 湲 ⑦ (援ъ껜 !) ------ # "媛)" ⑦ (愿 媛 寃 援ъ껜 - 癒쇱 泥댄 !) if self.PATTERNS['h5_korean'].match(text): return "H5" # "媛." ⑦ if self.PATTERNS['h4_korean'].match(text): return "H4" # ------ 뱀고 ⑦ ------ # "△" ⑦ if self.PATTERNS['h6_circle'].match(text): return "H6" # ------ 湲고 ------ # 媛議 諛 if any(x in class_str for x in ['highlight', 'box', 'note', 'tip']): return "HIGHLIGHT_BOX" # 몃Ц return "BODY" # ============ 紐⑹감 ⑸ 泥 - 援ъ“ 곗 ы """ # 罹≪ 援ъ“ 곗 table_data = {'rows': [], 'caption': caption_text} for tr in table.find_all('tr'): row = [] for cell in tr.find_all(['th', 'td']): cell_info = { 'text': cell.get_text(strip=True), 'is_header': cell.name == 'th', 'colspan': int(cell.get('colspan', 1)), 'rowspan': int(cell.get('rowspan', 1)), 'bg_color': self._extract_bg_color(cell), } row.append(cell_info) if row: table_data['rows'].append(row) # TABLE 濡 異媛 (媛蹂 TH/TD ) if table_data['rows']: self.elements.append(StyledElement( role="TABLE", text=f"[ : {len(table_data['rows'])} ]", tag="table", html=str(table)[:200], section=self.current_section.value, attributes={'table_data': table_data} )) def _extract_bg_color(self, element: Tag) -> str: """ 諛곌꼍 異異 style = element.get('style', '') # background-color 異異 match = re.search(r'background-color:\s*([^;]+)', style) if match: return self._normalize_color(match.group(1)) # bgcolor bgcolor = element.get('bgcolor', '') if bgcolor: return self._normalize_color(bgcolor) return '' def _process_figure(self, element: Tag): """洹몃┝ 泥 """ img = element.find('img') if element.name == 'figure' else element if img and img.name == 'img': src = img.get('src', '') alt = img.get('alt', '') if src: # src媛 留 異媛 self.elements.append(StyledElement( role="FIGURE", text=alt or " ", tag="img", html=str(img)[:100], section=self.current_section.value, attributes={"src": src, "alt": alt} )) # 罹≪ 泥 : 遺 嫄""" print(f" 泥 以... (泥 : {len(self.elements)}媛)") filtered = [] for elem in self.elements: # 鍮 ㅽ 嫄 if not elem.text or not elem.text.strip(): continue # 癒몃━留щ━留 ㅽ 嫄 if self._is_header_footer_text(elem.text): continue # 嫄 ⑦ 泥댄 skip = False for pattern in self.REMOVE_PATTERNS: if pattern.match(elem.text.strip()): skip = True break if not skip: filtered.append(elem) self.elements = filtered print(f" - 泥 : {len(self.elements)}媛") def get_role_summary(self) -> Dict[str, int]: """ 蹂 """ summary = {} for elem in self.elements: summary[elem.role] = summary.get(elem.role, 0) + 1 return dict(sorted(summary.items())) def extract_css_styles(self, html: str) -> Dict[str, Dict]: """ HTML 蹂 CSS ㅽ 異 Returns: { : {font_size, color, bold, ...}} """ soup = BeautifulSoup(html, 'html.parser') role_styles = {} #
1) Digitization 몃Ц 댁 . 異⑸ 湲 ㅽ몃 蹂몃Ц쇰 댁 ⑸ .
(1) ④ 蹂 吏
痢〓 湲곗 깃낵臾쇱⑥곕 .
泥 踰吏 ぉ
| 댁 | |
|---|---|
| ぉ1 | ㅻ 1 |