Upload style_analyzer.py

2026-03-19 09:02:27 +09:00
parent 3b8ff63a93
commit 0eebebf26f
1 changed files with 994 additions and 0 deletions
--- a/03.Code/업로드용/converters/style_analyzer.py
+++ b/03.Code/업로드용/converters/style_analyzer.py
@@ -0,0 +1,994 @@
+"""
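+HTML style analyzer v3.0
+
+Analyzes the elements of an HTML document and classifies each one into a role.
+
+v3.0 changes:
+- Supports the Geulbeot HTML structure (#raw-container)
+- Removes page numbers, running headers and footers
+- Stronger duplicate-content filtering
+- More accurate recognition of the heading hierarchy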
+"""
+    COVER = "cover"      # 吏
+    TOC = "toc"          # 紐⑹감
+    CONTENT = "content"  # 蹂몃Ц
+
+
+@dataclass
+class StyledElement:
+    """ㅽ   """
+    role: str           #    (H1, BODY, TH  )
+    text: str           # 
+ㅽ  댁
+    tag: str            #   HTML  洹
+    html: str           #   HTML
+    section: str        # 
+                           뱀
+媛  
+      ( src  )
+
+    def __repr__(self):
+        preview = self.text[:30] + "..." if len(self.text) > 30 else self.text
+        return f"<{self.role}> {preview}"
+
+
+class StyleAnalyzer:
+    """HTML 臾몄
+                瑜 遺
+                      
+                       ㅼ  遺
+                                瑜  ⑦
+                                      踰 : " 1 ", " 2 "
+        "chapter": re.compile(r'^ \s*\d+\s* '),
+        # 1 ④
+               紐 : "1 ", "2 " ( 듬갚,  )
+        "h1_num": re.compile(r'^(\d+)\s+[媛- ]'),
+        # ぉ: "1.", "2."
+        "h2_num": re.compile(r'^(\d+)\.\s'),
+        # 以ぉ: "1.1 ", "1.2 "
+        "h3_num": re.compile(r'^(\d+)\.\d+\s'),
+        # ぉ: "1.1.1"
+        "h4_num": re.compile(r'^(\d+)\.(\d+)\.(\d+)'),
+        # 
+            몃 : "1)", "2)"
+        "h5_paren": re.compile(r'^(\d+)\)\s*'),
+        # 
+           몄
+             몃 : "(1)", "(2)"
+        "h6_paren": re.compile(r'^\((\d+)\)\s*'),
+        # 媛: "媛.", " - 愿
+                                 : "媛)", " -몄 ", "  "
+        "h6_circle": re.compile(r'^[△™ｂㅲβ╈㎮ⓥ]'),
+        # 紐⑹감 ", "-", " "
+        "list_bullet": re.compile(r'^[\-]\s'),
+        #   踰  ⑦
+                   : "- 1 -", "- 12 -"
+        "page_number": re.compile(r'^-\s*\d+\s*-$'),
+        # 瑗щ━留⑦
+                 : "臾몄
+                         紐 - 1 -"
+        "footer_pattern": re.compile(r'.+[- ]\s*\d+\s*[- ]$'),
+    }
+
+    #  嫄고  
+ㅽ  ⑦
+       대 
+    REMOVE_PATTERNS = [
+        re.compile(r'^-\s*\d+\s*-$'),                    # "- 1 -"
+        re.compile(r'[- ]\s*\d+\s*[- ]\s*$'),           # "臾몄
+                                                               紐 - 1 -"
+        re.compile(r'^\d+\s* \s*\d+$'),                  # "643   236" (  ш린)
+        re.compile(r'^\[  :.*\]$'),              # "[  : xxx]"
+        re.compile(r'^\[洹몃┝\s*\d+-\d+\]$'),              # "[洹몃┝ 1-1]"
+    ]
+
+    def __init__(self):
+        self.elements: List[StyledElement] = []
+        self.current_section = DocumentSection.CONTENT
+        self.seen_texts: Set[str] = set()  # 以蹂 諛⑹ 
+        self.document_title = ""  # 臾몄
+                                         紐 (瑗щ━留嫄곗 )
+
+    def analyze(self, html: str) -> List[StyledElement]:
+        """HTML 臾몄
+                     遺
+                       
+                        ㅼ  遺
+                                瑜   由ъㅽ 諛
+                                               泥 : 遺
+                                                          嫄
+        self._preprocess(soup)
+
+        # 2. 臾몄
+                  紐 異異 (瑗щ━留⑦
+                                    媛 )
+        self._extract_document_title(soup)
+
+        # 3. 
+              뱀
+ 諛  
+        self._detect_and_process_sections(soup)
+
+        # 4. 
+              泥 : 以蹂 諛  遺
+                                   嫄
+        self._postprocess()
+
+        return self.elements
+
+    def _preprocess(self, soup: BeautifulSoup):
+        """Strip non-content nodes from ``soup`` in place before analysis.
+
+        Removes script/style/metadata tags, elements whose text looks like a
+        running header or footer, inline-hidden elements, and ``.sheet``
+        elements lying outside ``#raw-container`` when that container exists.
+
+        Args:
+            soup: Parsed document; it is mutated by this method.
+        """
+        # NOTE(review): the literal below is encoding-damaged and split across
+        # two physical lines in this view; it is left untouched here.
+        print("   HTML 
                          泥 以...")
+
+        # 1. Remove script/style and other non-content tags.
+        removed_count = 0
+        for tag in soup(['script', 'style', 'noscript', 'meta', 'link', 'head']):
+            tag.decompose()
+            removed_count += 1
+
+        if removed_count > 0:
+            print(f"     - script/style   {removed_count}媛  嫄")
+
+        # 2. Remove header/footer elements (Geulbeot HTML structure).
+        header_footer_count = 0
+        for selector in ['.page-header', '.page-footer', '.header', '.footer',
+                        '[class*="header"]', '[class*="footer"]',
+                        '.running-header', '.running-footer']:
+            for elem in soup.select(selector):
+                # Only drop the element when its text really looks like a
+                # header/footer, so ordinary content with such a class survives.
+                text = elem.get_text(strip=True)
+                if self._is_header_footer_text(text):
+                    elem.decompose()
+                    header_footer_count += 1
+
+        if header_footer_count > 0:
+            print(f"     - 癒몃━留щ━留   嫄")
+
+        # 3. Remove elements hidden through inline styles.
+        # NOTE(review): hidden_count is incremented but never read in this method.
+        hidden_count = 0
+        for elem in soup.select('[style*="display:none"], [style*="display: none"]'):
+            elem.decompose()
+            hidden_count += 1
+        for elem in soup.select('[style*="visibility:hidden"], [style*="visibility: hidden"]'):
+            elem.decompose()
+            hidden_count += 1
+
+        # 4. Remove .sheet elements outside #raw-container (Geulbeot structure).
+        raw_container = soup.find(id='raw-container')
+        if raw_container:
+            # NOTE(review): damaged, line-split literal; left untouched.
+            print("     - 湲踰 援ъ“ 媛: #raw-container 곗
                                                             ъ ")
+            # Any .sheet that is not a descendant of raw-container is dropped.
+            for sheet in soup.select('.sheet'):
+                if not self._is_descendant_of(sheet, raw_container):
+                    sheet.decompose()
+
+    def _extract_document_title(self, soup: BeautifulSoup):
+        """Record the text of the cover's first <h1> as ``self.document_title``.
+
+        The cover is looked up by id ``box-cover`` first and then by class
+        ``box-cover``. Nothing changes when no cover or no <h1> is found.
+        """
+        # Look for the title inside the cover element.
+        cover = soup.find(id='box-cover') or soup.find(class_='box-cover')
+        if cover:
+            h1 = cover.find('h1')
+            if h1:
+                self.document_title = h1.get_text(strip=True)
+                # NOTE(review): damaged, line-split literal; left untouched.
+                print(f"     - 臾몄
                                    紐 媛: {self.document_title[:30]}...")
+
+    def _is_header_footer_text(self, text: str) -> bool:
+        """癒몃━留щ━留
+ ㅽ몄  """
+        if not text:
+            return False
+
+        #   踰  ⑦
+                   
+        if self.PATTERNS['page_number'].match(text):
+            return True
+
+        # "臾몄
+               紐 - 1 -"  ⑦
+                            
+        if self.PATTERNS['footer_pattern'].match(text):
+            return True
+
+        # 臾몄
+               紐 +  踰 議고 
+        if self.document_title and self.document_title in text:
+            if re.search(r'[- ]\s*\d+\s*[- ]', text):
+                return True
+
+        return False
+
+    def _should_skip_text(self, text: str) -> bool:
+        """嫄대
+                 ㅽ몄  """
+        if not text:
+            return True
+
+        #  嫄   ⑦
+                  泥댄 
+        for pattern in self.REMOVE_PATTERNS:
+            if pattern.match(text):
+                return True
+
+        # 癒몃━留щ━留댄 
+        if self._is_header_footer_text(text):
+            return True
+
+        # 臾몄
+               紐⑸   以
+                           (瑗щ━留
+                                    寃)
+        if self.document_title and text.strip() == self.document_title:
+            # 吏
+                   泥ы쇰㈃  ㅽ 
+            if any(e.role == 'COVER_TITLE' and self.document_title in e.text
+                   for e in self.elements):
+                return True
+
+        return False
+
+    def _is_descendant_of(self, element: Tag, ancestor: Tag) -> bool:
+        """element媛 ancestor   
+                                 뱀
+ 諛  泥 """
+
+        # 湲踰 援ъ“ (#raw-container) 곗
+                                         泥 
+        raw = soup.find(id='raw-container')
+        if raw:
+            self._process_geulbeot_structure(raw)
+            return
+
+        # .sheet 援ъ“ 泥  ( 留  )
+        sheets = soup.select('.sheet')
+        if sheets:
+            self._process_sheet_structure(sheets)
+            return
+
+        #  ъ“ 泥 
+        self._process_generic_html(soup)
+
+    def _process_geulbeot_structure(self, raw: Tag):
+        """湲踰 HTML #raw-container 援ъ“ 泥 """
+        print("  
+                   湲踰 #raw-container 援ъ“ 泥 以...")
+
+        # 吏
+        cover = raw.find(id='box-cover')
+        if cover:
+            print("     - 吏  
+                               뱀
+⑹감
+        toc = raw.find(id='box-toc')
+        if toc:
+            print("     - 紐⑹감  
+                                 뱀
+ 
+        summary = raw.find(id='box-summary')
+        if summary:
+            print("     -     
+                               뱀
+몃Ц
+        content = raw.find(id='box-content')
+        if content:
+            print("     - 蹂몃Ц  
+                                 뱀
+踰 .sheet   援ъ“ 泥 """
+        print(f"  
+                    .sheet   援ъ“ 泥 以... ({len(sheets)} )")
+
+        for i, sheet in enumerate(sheets):
+            #   body-content留 異異
+            body_content = sheet.select_one('.body-content')
+            if body_content:
+                self._process_content_element(body_content)
+            else:
+                # body-content媛  쇰㈃ 癒몃━留щ━留명 泥 
+                for child in sheet.children:
+                    if isinstance(child, Tag):
+                        classes = child.get('class', [])
+                        class_str = ' '.join(classes) if classes else ''
+
+                        # 癒몃━留щ━留ㅽ 
+                        if any(x in class_str.lower() for x in ['header', 'footer']):
+                            continue
+
+                        self._process_content_element(child)
+
+    def _process_generic_html(self, soup: BeautifulSoup):
+        """Process a document that uses neither #raw-container nor .sheet.
+
+        Handles the first cover-like element, then the first TOC-like element,
+        then walks the direct children of <main>, <article>, <body> or the
+        soup itself (first one found) as body content.
+        """
+        # NOTE(review): damaged, line-split literal; left untouched.
+        print("  
                    ъ“ 泥 以...")
+
+        # Cover.
+        # NOTE(review): the regex is unanchored, so class names that merely
+        # contain "cover" (e.g. "discover") also match.
+        cover = soup.find(class_=re.compile(r'cover|title-page|box-cover'))
+        if cover:
+            self.current_section = DocumentSection.COVER
+            self._process_cover(cover)
+
+        # Table of contents.
+        toc = soup.find(class_=re.compile(r'toc|table-of-contents'))
+        if toc:
+            self.current_section = DocumentSection.TOC
+            self._process_toc(toc)
+
+        # Body: everything from here on is attributed to the CONTENT section.
+        self.current_section = DocumentSection.CONTENT
+        main_content = soup.find('main') or soup.find('article') or soup.find('body') or soup
+
+        for child in main_content.children:
+            if isinstance(child, Tag):
+                self._process_content_element(child)
+
+    def _process_cover(self, cover: Tag):
+        """ 吏 泥 """
+        # H1 =  紐 
+        h1 = cover.find('h1')
+        if h1:
+            text = h1.get_text(strip=True)
+            if text and not self._is_duplicate(text):
+                self.elements.append(StyledElement(
+                    role="COVER_TITLE",
+                    text=text,
+                    tag="h1",
+                    html=str(h1)[:200],
+                    section="cover",
+                    attributes={}
+                ))
+
+        # H2 = 遺 紐 
+        h2 = cover.find('h2')
+        if h2:
+            text = h2.get_text(strip=True)
+            if text and not self._is_duplicate(text):
+                self.elements.append(StyledElement(
+                    role="COVER_SUBTITLE",
+                    text=text,
+                    tag="h2",
+                    html=str(h2)[:200],
+                    section="cover",
+                    attributes={}
+                ))
+
+        # P =  蹂 
+        for p in cover.find_all('p'):
+            text = p.get_text(strip=True)
+            if text and not self._is_duplicate(text):
+                self.elements.append(StyledElement(
+                    role="COVER_INFO",
+                    text=text,
+                    tag="p",
+                    html=str(p)[:200],
+                    section="cover",
+                    attributes={}
+                ))
+
+    def _process_toc(self, toc: Tag):
+        """紐⑹감 泥 """
+        # UL/OL 湲 紐⑹감
+        for li in toc.find_all('li'):
+            text = li.get_text(strip=True)
+            if not text or self._is_duplicate(text):
+                continue
+
+            classes = li.get('class', [])
+            class_str = ' '.join(classes) if classes else ''
+
+            #  踰    (援ъ껜   
+                                  !)
+            if 'lvl-1' in class_str or 'toc-lvl-1' in class_str:
+                role = "TOC_H1"
+            elif 'lvl-2' in class_str or 'toc-lvl-2' in class_str:
+                role = "TOC_H2"
+            elif 'lvl-3' in class_str or 'toc-lvl-3' in class_str:
+                role = "TOC_H3"
+            elif self.PATTERNS['h4_num'].match(text):   # 1.1.1 癒쇱 !
+                role = "TOC_H3"
+            elif self.PATTERNS['h3_num'].match(text):   # 1.1 洹몃 
+                role = "TOC_H2"
+            elif self.PATTERNS['h2_num'].match(text):   # 1. 洹몃 
+                role = "TOC_H1"
+            else:
+                role = "TOC_H1"
+
+            self.elements.append(StyledElement(
+                role=role,
+                text=text,
+                tag="li",
+                html=str(li)[:200],
+                section="toc",
+                attributes={}
+            ))
+
+    def _process_content_element(self, element: Tag):
+        """蹂몃Ц    泥 """
+        if not isinstance(element, Tag):
+            return
+
+        tag_name = element.name.lower() if element.name else ""
+        classes = element.get('class', [])
+        class_str = ' '.join(classes) if classes else ''
+
+        # 癒몃━留щ━留대   ㅽ 
+        if any(x in class_str.lower() for x in ['header', 'footer', 'page-num']):
+            return
+
+        #  
+  ㅽ 뱀 
+        if tag_name == 'table':
+            self._process_table(element)
+            return
+
+        # 洹몃┝ 뱀 
+        if tag_name in ['figure', 'img']:
+            self._process_figure(element)
+            return
+
+        #  
+ ㅽ 異異
+        text = self._get_direct_text(element)
+
+        if text:
+            # 嫄대
+                   ㅽ 泥댄 
+            if self._should_skip_text(text):
+                pass  #    怨
+                            泥 
+            elif not self._is_duplicate(text):
+                role = self._classify_role(element, tag_name, classes, text)
+                if role:
+                    self.elements.append(StyledElement(
+                        role=role,
+                        text=text,
+                        tag=tag_name,
+                        html=str(element)[:200],
+                        section=self.current_section.value,
+                        attributes=dict(element.attrs) if element.attrs else {}
+                    ))
+
+        #    泥 (而⑦
+ 
+   洹 )
+        if tag_name in ['div', 'section', 'article', 'aside', 'main', 'body',
+                        'ul', 'ol', 'dl', 'blockquote']:
+            for child in element.children:
+                if isinstance(child, Tag):
+                    self._process_content_element(child)
+
+    def _get_direct_text(self, element: Tag) -> str:
+        """     ㅽ몃 異異 ( 
+ 
+   )"""
+        #  紐    洹 
+                      泥  
+ ㅽ몃 
+        if element.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'li', 'td', 'th', 'caption']:
+            return element.get_text(strip=True)
+
+        # 而⑦
+ 
+   洹  吏  
+ ㅽ몃 
+        texts = []
+        for child in element.children:
+            if isinstance(child, NavigableString):
+                t = str(child).strip()
+                if t:
+                    texts.append(t)
+
+        return ' '.join(texts)
+
+    def _is_duplicate(self, text: str) -> bool:
+        """以蹂   
+ ㅽ몄  洹
+        normalized = re.sub(r'\s+', ' ', text.strip())
+
+        # 吏㏃  
+ ㅽ몃 以蹂  (踰  )
+        if len(normalized) < 10:
+            return False
+
+        # 泥 50  泥댄 
+        key = normalized[:50]
+
+        if key in self.seen_texts:
+            return True
+
+        self.seen_texts.add(key)
+        return False
+
+    def _classify_role(self, element: Tag, tag: str, classes: List[str], text: str) -> Optional[str]:
+        """      遺
+                   瑜 截 以 :  ⑦
+                                 留㼼 諛  援ъ껜   
+                                                              濡 !
+           1.1.1   1.1   1.   1
+           (1)   1)
+           媛)   媛.
+        """
+
+        class_str = ' '.join(classes) if classes else ''
+
+        # ============  紐    洹 (HTML  洹  곗
+                                                ) ============
+        if tag == 'h1':
+            return "H1"
+        if tag == 'h2':
+            return "H2"
+        if tag == 'h3':
+            return "H3"
+        if tag == 'h4':
+            return "H4"
+        if tag == 'h5':
+            return "H5"
+        if tag == 'h6':
+            return "H6"
+
+        # ============ 蹂몃Ц (p, div  ) - 踰  ⑦
+                                                 댁쇰 遺
+                                                         瑜 ⑦
+                                                                (援ъ껜   
+                                                                              !) ------
+
+            # "1.1.1"  ⑦
+                         (媛 援ъ껜 - 癒쇱 泥댄 !)
+            if self.PATTERNS['h4_num'].match(text):
+                if len(text) < 100:
+                    return "H3"
+                return "BODY"
+
+            # "1.1 "  ⑦
+                       
+            if self.PATTERNS['h3_num'].match(text):
+                if len(text) < 100:
+                    return "H2"
+                return "BODY"
+
+            # "1."  ⑦
+                     
+            if self.PATTERNS['h2_num'].match(text):
+                if len(text) < 100:
+                    return "H1"
+                return "BODY"
+
+            # "1 媛..."  ⑦
+                            ( 듬갚+ 湲)
+            if self.PATTERNS['h1_num'].match(text):
+                return "H1"
+
+            # ------ 愿
+                        ⑦
+                            (援ъ껜   
+                                         !) ------
+
+            # "(1)"  ⑦
+                       (愿
+                            媛 寃   援ъ껜 - 癒쇱 泥댄 !)
+            if self.PATTERNS['h6_paren'].match(text):
+                if element.find('strong') or len(text) < 80:
+                    return "H5"
+                return "BODY"
+
+            # "1)"  ⑦
+                     
+            if self.PATTERNS['h5_paren'].match(text):
+                if element.find('strong') or len(text) < 80:
+                    return "H4"
+                return "BODY"
+
+            # ------  湲  ⑦
+                            (援ъ껜   
+                                         !) ------
+
+            # "媛)"  ⑦
+                       (愿
+                             媛 寃   援ъ껜 - 癒쇱 泥댄 !)
+            if self.PATTERNS['h5_korean'].match(text):
+                return "H5"
+
+            # "媛."  ⑦
+                      
+            if self.PATTERNS['h4_korean'].match(text):
+                return "H4"
+
+            # ------  뱀고  ⑦
+                             ------
+
+            # "△"  ⑦
+                        
+            if self.PATTERNS['h6_circle'].match(text):
+                return "H6"
+
+            # ------ 湲고 ------
+
+            # 媛議  諛 
+            if any(x in class_str for x in ['highlight', 'box', 'note', 'tip']):
+                return "HIGHLIGHT_BOX"
+
+            #  몃Ц
+            return "BODY"
+
+        # ============ 紐⑹감 ⑸ 
+  泥 - 援ъ“  곗
+                 ы """
+
+        # 罹≪
+   援ъ“  곗
+             
+        table_data = {'rows': [], 'caption': caption_text}
+
+        for tr in table.find_all('tr'):
+            row = []
+            for cell in tr.find_all(['th', 'td']):
+                cell_info = {
+                    'text': cell.get_text(strip=True),
+                    'is_header': cell.name == 'th',
+                    'colspan': int(cell.get('colspan', 1)),
+                    'rowspan': int(cell.get('rowspan', 1)),
+                    'bg_color': self._extract_bg_color(cell),
+                }
+                row.append(cell_info)
+            if row:
+                table_data['rows'].append(row)
+
+        #   TABLE  濡 異媛 (媛蹂
+                                  TH/TD  )
+        if table_data['rows']:
+            self.elements.append(StyledElement(
+                role="TABLE",
+                text=f"[ : {len(table_data['rows'])} ]",
+                tag="table",
+                html=str(table)[:200],
+                section=self.current_section.value,
+                attributes={'table_data': table_data}
+            ))
+
+    def _extract_bg_color(self, element: Tag) -> str:
+        """ 
+               諛곌꼍 異異
+        style = element.get('style', '')
+
+        # background-color 異異
+        match = re.search(r'background-color:\s*([^;]+)', style)
+        if match:
+            return self._normalize_color(match.group(1))
+
+        # bgcolor  
+                    
+        bgcolor = element.get('bgcolor', '')
+        if bgcolor:
+            return self._normalize_color(bgcolor)
+
+        return ''
+
+    def _process_figure(self, element: Tag):
+        """洹몃┝ 泥 """
+        img = element.find('img') if element.name == 'figure' else element
+
+        if img and img.name == 'img':
+            src = img.get('src', '')
+            alt = img.get('alt', '')
+
+            if src:  # src媛  
+                                留 異媛
+                self.elements.append(StyledElement(
+                    role="FIGURE",
+                    text=alt or " ",
+                    tag="img",
+                    html=str(img)[:100],
+                    section=self.current_section.value,
+                    attributes={"src": src, "alt": alt}
+                ))
+
+        # 罹≪
+
+泥 : 遺
+          嫄"""
+        print(f"    泥  以... (泥   
+                                     : {len(self.elements)}媛)")
+
+        filtered = []
+        for elem in self.elements:
+            # 鍮  
+ ㅽ  嫄
+            if not elem.text or not elem.text.strip():
+                continue
+
+            # 癒몃━留щ━留
+ ㅽ  嫄
+            if self._is_header_footer_text(elem.text):
+                continue
+
+            #  嫄   ⑦
+                     泥댄 
+            skip = False
+            for pattern in self.REMOVE_PATTERNS:
+                if pattern.match(elem.text.strip()):
+                    skip = True
+                    break
+
+            if not skip:
+                filtered.append(elem)
+
+        self.elements = filtered
+        print(f"     - 泥   
+                            : {len(self.elements)}媛")
+
+    def get_role_summary(self) -> Dict[str, int]:
+        """   蹂
+                  """
+        summary = {}
+        for elem in self.elements:
+            summary[elem.role] = summary.get(elem.role, 0) + 1
+        return dict(sorted(summary.items()))
+
+
+    def extract_css_styles(self, html: str) -> Dict[str, Dict]:
+        """
+        HTML 
+               蹂
+                    CSS  ㅽ 異
+        Returns: {   : {font_size, color, bold, ...}}
+        """
+        soup = BeautifulSoup(html, 'html.parser')
+        role_styles = {}
+
+        # <style>  洹몄
+                        CSS  
+        style_tag = soup.find('style')
+        if style_tag:
+            css_text = style_tag.string or ''
+            role_styles.update(self._parse_css_rules(css_text))
+
+        #  ㅽ
+                異異 (  蹂
+                         )
+        for elem in self.elements:
+            if elem.role not in role_styles:
+                role_styles[elem.role] = self._extract_inline_style(elem.html)
+
+        return role_styles
+
+    def _parse_css_rules(self, css_text: str) -> Dict[str, Dict]:
+        """CSS  
+ ㅽ몄
+      洹移  """
+        import re
+        rules = {}
+
+        # h1, h2, .section-title  깆 ⑦
+                                     
+        pattern = r'([^{]+)\{([^}]+)\}'
+        for match in re.finditer(pattern, css_text):
+            selector = match.group(1).strip()
+            properties = match.group(2)
+
+            style = {}
+            for prop in properties.split(';'):
+                if ':' in prop:
+                    key, value = prop.split(':', 1)
+                    key = key.strip().lower()
+                    value = value.strip()
+
+                    if key == 'font-size':
+                        style['font_size'] = self._parse_font_size(value)
+                    elif key == 'color':
+                        style['color'] = self._normalize_color(value)
+                    elif key == 'font-weight':
+                        style['bold'] = value in ['bold', '700', '800', '900']
+                    elif key == 'text-align':
+                        style['align'] = value
+
+            #  
+  
+    留㼼
+            role = self._selector_to_role(selector)
+            if role:
+                rules[role] = style
+
+        return rules
+
+    def _selector_to_role(self, selector: str) -> str:
+        """CSS  
+  
+    留㼼"""
+        selector = selector.lower().strip()
+        mapping = {
+            'h1': 'H1', 'h2': 'H2', 'h3': 'H3', 'h4': 'H4',
+            '.cover-title': 'COVER_TITLE',
+            '.section-title': 'H1',
+            'th': 'TH', 'td': 'TD',
+            'p': 'BODY',
+        }
+        for key, role in mapping.items():
+            if key in selector:
+                return role
+        return None
+
+    def _parse_font_size(self, value: str) -> float:
+        """고   ш린   (pt  ⑥
+                                濡 蹂  pt
+            elif unit in ['em', 'rem']:
+                size = size * 11  # 湲곕낯 11pt 湲곗 
+            return size
+        return 11.0
+
+    def _normalize_color(self, value: str) -> str:
+        """媛  洹 (#RRGGBB)"""
+        import re
+        value = value.strip().lower()
+
+        #  #rrggbb
+        if re.match(r'^#[0-9a-f]{3}$', value):
+            return f'#{value[1]*2}{value[2]*2}{value[3]*2}'.upper()
+
+        # rgb(r, g, b)
+        match = re.search(r'rgb\s*\(\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)', value)
+        if match:
+            r, g, b = int(match.group(1)), int(match.group(2)), int(match.group(3))
+            return f'#{r:02X}{g:02X}{b:02X}'
+
+        #   
+
+        color_names = {
+            'black': '#000000', 'white': '#FFFFFF',
+            'red': '#FF0000', 'green': '#008000', 'blue': '#0000FF',
+            'navy': '#1A365D',
+        }
+        return color_names.get(value, '#000000')
+
+    def _extract_inline_style(self, html: str) -> Dict:
+        """HTML  
+                     ㅽ 異
+        style = {}
+
+        # style  
+                   李얘린
+        match = re.search(r'style\s*=\s*["\']([^"\']+)["\']', html)
+        if match:
+            style_str = match.group(1)
+            for prop in style_str.split(';'):
+                if ':' in prop:
+                    key, value = prop.split(':', 1)
+                    key = key.strip().lower()
+                    value = value.strip()
+
+                    if key == 'font-size':
+                        style['font_size'] = self._parse_font_size(value)
+                    elif key == 'color':
+                        style['color'] = self._normalize_color(value)
+                    elif key == 'font-weight':
+                        style['bold'] = value in ['bold', '700', '800', '900']
+                    elif key == 'text-align':
+                        style['align'] = value
+                    elif key == 'background-color':
+                        style['bg_color'] = self._normalize_color(value)
+
+        return style
+
+    def _extract_bg_color(self, element) -> str:
+        """ 
+               諛곌꼍 異異
+        if not hasattr(element, 'get'):
+            return ''
+
+        style = element.get('style', '')
+
+        # background-color 異異
+        match = re.search(r'background-color:\s*([^;]+)', style)
+        if match:
+            return self._normalize_color(match.group(1))
+
+        # bgcolor  
+                    
+        bgcolor = element.get('bgcolor', '')
+        if bgcolor:
+            return self._normalize_color(bgcolor)
+
+        return ''
+
+
+    def export_for_hwp(self) -> List[Dict]:
+            """HWP 蹂  곗
+                             대낫닿린"""
+            return [
+                {
+                    "role": e.role,
+                    "text": e.text,
+                    "tag": e.tag,
+                    "section": e.section,
+                    "attributes": e.attributes
+                }
+                for e in self.elements
+            ]
+
+
+if __name__ == "__main__":
+    # Test
+    test_html = """
+    <html>
+    <head>
+        <script>var x = 1;</script>
+        <style>.test { color: red; }</style>
+    </head>
+    <body>
+        <div class="sheet">
+            <div class="page-header">嫄댁
+                                         ㅒ룻 紐  痢〓 DX  ㅻТ吏移 </div>
+            <div class="body-content">
+                <h1>1 DX 媛  湲곕낯 媛 
+ DX  
+     
+       </h2>
+                <h3>1.1.1 痢〓 DX 諛  
+                                       ④ 
+                                         </h3>
+                <p>1) <strong>Digitization  몃Ц  댁 
+.   異⑸ 
+             湲   
+ ㅽ몃 蹂몃Ц쇰   댁  ⑸ .</p>
+                <p>(1)  ④ 
+                          蹂 
+                                  吏 
+                                   </p>
+                <p>痢〓 湲곗  
+                                  
+                                    깃낵臾쇱⑥곕
+                                                   .</p>
+            </div>
+            <div class="page-footer">嫄댁
+                                         ㅒ룻 紐  痢〓 DX  ㅻТ吏移 - 1 -</div>
+        </div>
+
+        <div class="sheet">
+            <div class="page-header">嫄댁
+                                         ㅒ룻 紐  痢〓 DX  ㅻТ吏移 </div>
+            <div class="body-content">
+                <p>   泥  踰吏   ぉ</p>
+                <table>
+                    <caption>  1. 곗
+                                      鍮
+                                         援
+                                           </th><th> 댁 </th></tr>
+                    <tr><td> ぉ1</td><td> 
+                                            ㅻ 
+1</td></tr>
+                </table>
+            </div>
+            <div class="page-footer">嫄댁
+                                         ㅒ룻 紐  痢〓 DX  ㅻТ吏移 - 2 -</div>
+        </div>
+    </body>
+    </html>
+    """
+
+    analyzer = StyleAnalyzer()
+    elements = analyzer.analyze(test_html)
+
+    print("\n" + "="*60)
+    print("遺
+             
+              ")
+    print("="*60)
+    for role, count in analyzer.get_role_summary().items():
+        print(f"  {role}: {count}")