diff --git a/03.Code/업로드용/converters/style_analyzer.py b/03.Code/업로드용/converters/style_analyzer.py deleted file mode 100644 index e22b7a1..0000000 --- a/03.Code/업로드용/converters/style_analyzer.py +++ /dev/null @@ -1,994 +0,0 @@ -""" -HTML ㅽ - - 瑜 遺 - - ㅼ (Role) - 遺 - 瑜 - v3.0 蹂寃쎌ы : -- 湲踰 HTML 援ъ“ - 踰 吏щ━留踰 嫄 -- 媛 ν 以蹂 肄 - - - 곕 -- 紐 怨 - 痢 援ъ“ - - 뱀 -""" - COVER = "cover" # 吏 - TOC = "toc" # 紐⑹감 - CONTENT = "content" # 蹂몃Ц - - -@dataclass -class StyledElement: - """ㅽ """ - role: str # (H1, BODY, TH ) - text: str # -ㅽ 댁 - tag: str # HTML 洹 - html: str # HTML - section: str # - 뱀 -媛 - ( src ) - - def __repr__(self): - preview = self.text[:30] + "..." if len(self.text) > 30 else self.text - return f"<{self.role}> {preview}" - - -class StyleAnalyzer: - """HTML 臾몄 - 瑜 遺 - - ㅼ 遺 - 瑜 ⑦ - 踰 : " 1 ", " 2 " - "chapter": re.compile(r'^ \s*\d+\s* '), - # 1 ④ - 紐 : "1 ", "2 " ( 듬갚, ) - "h1_num": re.compile(r'^(\d+)\s+[媛- ]'), - # ぉ: "1.", "2." - "h2_num": re.compile(r'^(\d+)\.\s'), - # 以ぉ: "1.1 ", "1.2 " - "h3_num": re.compile(r'^(\d+)\.\d+\s'), - # ぉ: "1.1.1" - "h4_num": re.compile(r'^(\d+)\.(\d+)\.(\d+)'), - # - 몃 : "1)", "2)" - "h5_paren": re.compile(r'^(\d+)\)\s*'), - # - 몄 - 몃 : "(1)", "(2)" - "h6_paren": re.compile(r'^\((\d+)\)\s*'), - # 媛: "媛.", " - 愿 - : "媛)", " -몄 ", " " - "h6_circle": re.compile(r'^[△™bㅲβ╈㎮ⓥ]'), - # 紐⑹감 ", "-", " " - "list_bullet": re.compile(r'^[\-]\s'), - # 踰 ⑦ - : "- 1 -", "- 12 -" - "page_number": re.compile(r'^-\s*\d+\s*-$'), - # 瑗щ━留⑦ - : "臾몄 - 紐 - 1 -" - "footer_pattern": re.compile(r'.+[- ]\s*\d+\s*[- ]$'), - } - - # 嫄고 -ㅽ ⑦ - 대 - REMOVE_PATTERNS = [ - re.compile(r'^-\s*\d+\s*-$'), # "- 1 -" - re.compile(r'[- ]\s*\d+\s*[- ]\s*$'), # "臾몄 - 紐 - 1 -" - re.compile(r'^\d+\s* \s*\d+$'), # "643 236" ( ш린) - re.compile(r'^\[ :.*\]$'), # "[ : xxx]" - re.compile(r'^\[洹몃┝\s*\d+-\d+\]$'), # "[洹몃┝ 1-1]" - ] - - def __init__(self): - self.elements: List[StyledElement] = [] - self.current_section = DocumentSection.CONTENT - self.seen_texts: Set[str] = set() # 以蹂 諛⑹ - self.document_title = "" # 臾몄 - 紐 (瑗щ━留嫄곗 ) - - def analyze(self, html: str) -> List[StyledElement]: - """HTML 臾몄 - 遺 - - ㅼ 遺 - 瑜 由ъㅽ 諛 - 泥 : 遺 - 嫄 - self._preprocess(soup) - - # 2. 臾몄 - 紐 異異 (瑗щ━留⑦ - 媛 ) - self._extract_document_title(soup) - - # 3. - 뱀 - 諛 - self._detect_and_process_sections(soup) - - # 4. - 泥 : 以蹂 諛 遺 - 嫄 - self._postprocess() - - return self.elements - - def _preprocess(self, soup: BeautifulSoup): - """HTML - 泥 - 遺 - 嫄""" - print(" HTML - 泥 以...") - - # 1. ㅽщ┰ / ㅽ 洹 嫄 - removed_count = 0 - for tag in soup(['script', 'style', 'noscript', 'meta', 'link', 'head']): - tag.decompose() - removed_count += 1 - - if removed_count > 0: - print(f" - script/style {removed_count}媛 嫄") - - # 2. 癒몃━留щ━留 嫄 (湲踰 HTML 援ъ“) - header_footer_count = 0 - for selector in ['.page-header', '.page-footer', '.header', '.footer', - '[class*="header"]', '[class*="footer"]', - '.running-header', '.running-footer']: - for elem in soup.select(selector): - # ㅼ 肄 - ㅻ媛 - ㅻ留 嫄 - text = elem.get_text(strip=True) - if self._is_header_footer_text(text): - elem.decompose() - header_footer_count += 1 - - if header_footer_count > 0: - print(f" - 癒몃━留щ━留 嫄") - - # 3. ④꺼吏 - 嫄 - hidden_count = 0 - for elem in soup.select('[style*="display:none"], [style*="display: none"]'): - elem.decompose() - hidden_count += 1 - for elem in soup.select('[style*="visibility:hidden"], [style*="visibility: hidden"]'): - elem.decompose() - hidden_count += 1 - - # 4. #raw-container 몃 嫄 (湲踰 援ъ“) - raw_container = soup.find(id='raw-container') - if raw_container: - print(" - 湲踰 援ъ“ 媛: #raw-container 곗 - ъ ") - # raw-container 몃 ⑤ .sheet 嫄 - for sheet in soup.select('.sheet'): - if not self._is_descendant_of(sheet, raw_container): - sheet.decompose() - - def _extract_document_title(self, soup: BeautifulSoup): - """臾몄 - 紐 異異 (瑗щ━留⑦ - 媛 )""" - # 吏 - 紐 李얘린 - cover = soup.find(id='box-cover') or soup.find(class_='box-cover') - if cover: - h1 = cover.find('h1') - if h1: - self.document_title = h1.get_text(strip=True) - print(f" - 臾몄 - 紐 媛: {self.document_title[:30]}...") - - def _is_header_footer_text(self, text: str) -> bool: - """癒몃━留щ━留 - ㅽ몄 """ - if not text: - return False - - # 踰 ⑦ - - if self.PATTERNS['page_number'].match(text): - return True - - # "臾몄 - 紐 - 1 -" ⑦ - - if self.PATTERNS['footer_pattern'].match(text): - return True - - # 臾몄 - 紐 + 踰 議고 - if self.document_title and self.document_title in text: - if re.search(r'[- ]\s*\d+\s*[- ]', text): - return True - - return False - - def _should_skip_text(self, text: str) -> bool: - """嫄대 - ㅽ몄 """ - if not text: - return True - - # 嫄 ⑦ - 泥댄 - for pattern in self.REMOVE_PATTERNS: - if pattern.match(text): - return True - - # 癒몃━留щ━留댄 - if self._is_header_footer_text(text): - return True - - # 臾몄 - 紐⑸ 以 - (瑗щ━留 - 寃) - if self.document_title and text.strip() == self.document_title: - # 吏 - 泥ы쇰㈃ ㅽ - if any(e.role == 'COVER_TITLE' and self.document_title in e.text - for e in self.elements): - return True - - return False - - def _is_descendant_of(self, element: Tag, ancestor: Tag) -> bool: - """element媛 ancestor - 뱀 - 諛 泥 """ - - # 湲踰 援ъ“ (#raw-container) 곗 - 泥 - raw = soup.find(id='raw-container') - if raw: - self._process_geulbeot_structure(raw) - return - - # .sheet 援ъ“ 泥 ( 留 ) - sheets = soup.select('.sheet') - if sheets: - self._process_sheet_structure(sheets) - return - - # ъ“ 泥 - self._process_generic_html(soup) - - def _process_geulbeot_structure(self, raw: Tag): - """湲踰 HTML #raw-container 援ъ“ 泥 """ - print(" - 湲踰 #raw-container 援ъ“ 泥 以...") - - # 吏 - cover = raw.find(id='box-cover') - if cover: - print(" - 吏 - 뱀 -⑹감 - toc = raw.find(id='box-toc') - if toc: - print(" - 紐⑹감 - 뱀 - - summary = raw.find(id='box-summary') - if summary: - print(" - - 뱀 -몃Ц - content = raw.find(id='box-content') - if content: - print(" - 蹂몃Ц - 뱀 -踰 .sheet 援ъ“ 泥 """ - print(f" - .sheet 援ъ“ 泥 以... ({len(sheets)} )") - - for i, sheet in enumerate(sheets): - # body-content留 異異 - body_content = sheet.select_one('.body-content') - if body_content: - self._process_content_element(body_content) - else: - # body-content媛 쇰㈃ 癒몃━留щ━留명 泥 - for child in sheet.children: - if isinstance(child, Tag): - classes = child.get('class', []) - class_str = ' '.join(classes) if classes else '' - - # 癒몃━留щ━留ㅽ - if any(x in class_str.lower() for x in ['header', 'footer']): - continue - - self._process_content_element(child) - - def _process_generic_html(self, soup: BeautifulSoup): - """ ъ“ 泥 """ - print(" - ъ“ 泥 以...") - - # 吏 - cover = soup.find(class_=re.compile(r'cover|title-page|box-cover')) - if cover: - self.current_section = DocumentSection.COVER - self._process_cover(cover) - - # 紐⑹감 - toc = soup.find(class_=re.compile(r'toc|table-of-contents')) - if toc: - self.current_section = DocumentSection.TOC - self._process_toc(toc) - - # 蹂몃Ц - self.current_section = DocumentSection.CONTENT - main_content = soup.find('main') or soup.find('article') or soup.find('body') or soup - - for child in main_content.children: - if isinstance(child, Tag): - self._process_content_element(child) - - def _process_cover(self, cover: Tag): - """ 吏 泥 """ - # H1 = 紐 - h1 = cover.find('h1') - if h1: - text = h1.get_text(strip=True) - if text and not self._is_duplicate(text): - self.elements.append(StyledElement( - role="COVER_TITLE", - text=text, - tag="h1", - html=str(h1)[:200], - section="cover", - attributes={} - )) - - # H2 = 遺 紐 - h2 = cover.find('h2') - if h2: - text = h2.get_text(strip=True) - if text and not self._is_duplicate(text): - self.elements.append(StyledElement( - role="COVER_SUBTITLE", - text=text, - tag="h2", - html=str(h2)[:200], - section="cover", - attributes={} - )) - - # P = 蹂 - for p in cover.find_all('p'): - text = p.get_text(strip=True) - if text and not self._is_duplicate(text): - self.elements.append(StyledElement( - role="COVER_INFO", - text=text, - tag="p", - html=str(p)[:200], - section="cover", - attributes={} - )) - - def _process_toc(self, toc: Tag): - """紐⑹감 泥 """ - # UL/OL 湲 紐⑹감 - for li in toc.find_all('li'): - text = li.get_text(strip=True) - if not text or self._is_duplicate(text): - continue - - classes = li.get('class', []) - class_str = ' '.join(classes) if classes else '' - - # 踰 (援ъ껜 - !) - if 'lvl-1' in class_str or 'toc-lvl-1' in class_str: - role = "TOC_H1" - elif 'lvl-2' in class_str or 'toc-lvl-2' in class_str: - role = "TOC_H2" - elif 'lvl-3' in class_str or 'toc-lvl-3' in class_str: - role = "TOC_H3" - elif self.PATTERNS['h4_num'].match(text): # 1.1.1 癒쇱 ! - role = "TOC_H3" - elif self.PATTERNS['h3_num'].match(text): # 1.1 洹몃 - role = "TOC_H2" - elif self.PATTERNS['h2_num'].match(text): # 1. 洹몃 - role = "TOC_H1" - else: - role = "TOC_H1" - - self.elements.append(StyledElement( - role=role, - text=text, - tag="li", - html=str(li)[:200], - section="toc", - attributes={} - )) - - def _process_content_element(self, element: Tag): - """蹂몃Ц 泥 """ - if not isinstance(element, Tag): - return - - tag_name = element.name.lower() if element.name else "" - classes = element.get('class', []) - class_str = ' '.join(classes) if classes else '' - - # 癒몃━留щ━留대 ㅽ - if any(x in class_str.lower() for x in ['header', 'footer', 'page-num']): - return - - # - ㅽ 뱀 - if tag_name == 'table': - self._process_table(element) - return - - # 洹몃┝ 뱀 - if tag_name in ['figure', 'img']: - self._process_figure(element) - return - - # - ㅽ 異異 - text = self._get_direct_text(element) - - if text: - # 嫄대 - ㅽ 泥댄 - if self._should_skip_text(text): - pass # 怨 - 泥 - elif not self._is_duplicate(text): - role = self._classify_role(element, tag_name, classes, text) - if role: - self.elements.append(StyledElement( - role=role, - text=text, - tag=tag_name, - html=str(element)[:200], - section=self.current_section.value, - attributes=dict(element.attrs) if element.attrs else {} - )) - - # 泥 (而⑦ - - 洹 ) - if tag_name in ['div', 'section', 'article', 'aside', 'main', 'body', - 'ul', 'ol', 'dl', 'blockquote']: - for child in element.children: - if isinstance(child, Tag): - self._process_content_element(child) - - def _get_direct_text(self, element: Tag) -> str: - """ ㅽ몃 異異 ( - - )""" - # 紐 洹 - 泥 - ㅽ몃 - if element.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'li', 'td', 'th', 'caption']: - return element.get_text(strip=True) - - # 而⑦ - - 洹 吏 - ㅽ몃 - texts = [] - for child in element.children: - if isinstance(child, NavigableString): - t = str(child).strip() - if t: - texts.append(t) - - return ' '.join(texts) - - def _is_duplicate(self, text: str) -> bool: - """以蹂 - ㅽ몄 洹 - normalized = re.sub(r'\s+', ' ', text.strip()) - - # 吏㏃ - ㅽ몃 以蹂 (踰 ) - if len(normalized) < 10: - return False - - # 泥 50 泥댄 - key = normalized[:50] - - if key in self.seen_texts: - return True - - self.seen_texts.add(key) - return False - - def _classify_role(self, element: Tag, tag: str, classes: List[str], text: str) -> Optional[str]: - """ 遺 - 瑜 截 以 : ⑦ - 留㼼 諛 援ъ껜 - 濡 ! - 1.1.1 1.1 1. 1 - (1) 1) - 媛) 媛. - """ - - class_str = ' '.join(classes) if classes else '' - - # ============ 紐 洹 (HTML 洹 곗 - ) ============ - if tag == 'h1': - return "H1" - if tag == 'h2': - return "H2" - if tag == 'h3': - return "H3" - if tag == 'h4': - return "H4" - if tag == 'h5': - return "H5" - if tag == 'h6': - return "H6" - - # ============ 蹂몃Ц (p, div ) - 踰 ⑦ - 댁쇰 遺 - 瑜 ⑦ - (援ъ껜 - !) ------ - - # "1.1.1" ⑦ - (媛 援ъ껜 - 癒쇱 泥댄 !) - if self.PATTERNS['h4_num'].match(text): - if len(text) < 100: - return "H3" - return "BODY" - - # "1.1 " ⑦ - - if self.PATTERNS['h3_num'].match(text): - if len(text) < 100: - return "H2" - return "BODY" - - # "1." ⑦ - - if self.PATTERNS['h2_num'].match(text): - if len(text) < 100: - return "H1" - return "BODY" - - # "1 媛..." ⑦ - ( 듬갚+ 湲) - if self.PATTERNS['h1_num'].match(text): - return "H1" - - # ------ 愿 - ⑦ - (援ъ껜 - !) ------ - - # "(1)" ⑦ - (愿 - 媛 寃 援ъ껜 - 癒쇱 泥댄 !) - if self.PATTERNS['h6_paren'].match(text): - if element.find('strong') or len(text) < 80: - return "H5" - return "BODY" - - # "1)" ⑦ - - if self.PATTERNS['h5_paren'].match(text): - if element.find('strong') or len(text) < 80: - return "H4" - return "BODY" - - # ------ 湲 ⑦ - (援ъ껜 - !) ------ - - # "媛)" ⑦ - (愿 - 媛 寃 援ъ껜 - 癒쇱 泥댄 !) - if self.PATTERNS['h5_korean'].match(text): - return "H5" - - # "媛." ⑦ - - if self.PATTERNS['h4_korean'].match(text): - return "H4" - - # ------ 뱀고 ⑦ - ------ - - # "△" ⑦ - - if self.PATTERNS['h6_circle'].match(text): - return "H6" - - # ------ 湲고 ------ - - # 媛議 諛 - if any(x in class_str for x in ['highlight', 'box', 'note', 'tip']): - return "HIGHLIGHT_BOX" - - # 몃Ц - return "BODY" - - # ============ 紐⑹감 ⑸ - 泥 - 援ъ“ 곗 - ы """ - - # 罹≪ - 援ъ“ 곗 - - table_data = {'rows': [], 'caption': caption_text} - - for tr in table.find_all('tr'): - row = [] - for cell in tr.find_all(['th', 'td']): - cell_info = { - 'text': cell.get_text(strip=True), - 'is_header': cell.name == 'th', - 'colspan': int(cell.get('colspan', 1)), - 'rowspan': int(cell.get('rowspan', 1)), - 'bg_color': self._extract_bg_color(cell), - } - row.append(cell_info) - if row: - table_data['rows'].append(row) - - # TABLE 濡 異媛 (媛蹂 - TH/TD ) - if table_data['rows']: - self.elements.append(StyledElement( - role="TABLE", - text=f"[ : {len(table_data['rows'])} ]", - tag="table", - html=str(table)[:200], - section=self.current_section.value, - attributes={'table_data': table_data} - )) - - def _extract_bg_color(self, element: Tag) -> str: - """ - 諛곌꼍 異異 - style = element.get('style', '') - - # background-color 異異 - match = re.search(r'background-color:\s*([^;]+)', style) - if match: - return self._normalize_color(match.group(1)) - - # bgcolor - - bgcolor = element.get('bgcolor', '') - if bgcolor: - return self._normalize_color(bgcolor) - - return '' - - def _process_figure(self, element: Tag): - """洹몃┝ 泥 """ - img = element.find('img') if element.name == 'figure' else element - - if img and img.name == 'img': - src = img.get('src', '') - alt = img.get('alt', '') - - if src: # src媛 - 留 異媛 - self.elements.append(StyledElement( - role="FIGURE", - text=alt or " ", - tag="img", - html=str(img)[:100], - section=self.current_section.value, - attributes={"src": src, "alt": alt} - )) - - # 罹≪ - -泥 : 遺 - 嫄""" - print(f" 泥 以... (泥 - : {len(self.elements)}媛)") - - filtered = [] - for elem in self.elements: - # 鍮 - ㅽ 嫄 - if not elem.text or not elem.text.strip(): - continue - - # 癒몃━留щ━留 - ㅽ 嫄 - if self._is_header_footer_text(elem.text): - continue - - # 嫄 ⑦ - 泥댄 - skip = False - for pattern in self.REMOVE_PATTERNS: - if pattern.match(elem.text.strip()): - skip = True - break - - if not skip: - filtered.append(elem) - - self.elements = filtered - print(f" - 泥 - : {len(self.elements)}媛") - - def get_role_summary(self) -> Dict[str, int]: - """ 蹂 - """ - summary = {} - for elem in self.elements: - summary[elem.role] = summary.get(elem.role, 0) + 1 - return dict(sorted(summary.items())) - - - def extract_css_styles(self, html: str) -> Dict[str, Dict]: - """ - HTML - 蹂 - CSS ㅽ 異 - Returns: { : {font_size, color, bold, ...}} - """ - soup = BeautifulSoup(html, 'html.parser') - role_styles = {} - - # - -
-1) Digitization 몃Ц 댁 -. 異⑸ - 湲 - ㅽ몃 蹂몃Ц쇰 댁 ⑸ .
-(1) ④ - 蹂 - 吏 -
-痢〓 湲곗 - - 깃낵臾쇱⑥곕 - .
-泥 踰吏 ぉ
-| 댁 | -|
|---|---|
| ぉ1 | - ㅻ -1 |