From 900cb210fb98795846993dd875c75ab416ee341b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EC=9D=B4=EA=B2=BD=EB=AF=BC?= Date: Thu, 19 Mar 2026 09:02:26 +0900 Subject: [PATCH] Upload hwpx_style_injector.py --- .../converters/hwpx_style_injector.py | 803 ++++++++++++++++++ 1 file changed, 803 insertions(+) create mode 100644 03.Code/업로드용/converters/hwpx_style_injector.py diff --git a/03.Code/업로드용/converters/hwpx_style_injector.py b/03.Code/업로드용/converters/hwpx_style_injector.py new file mode 100644 index 0000000..34d9431 --- /dev/null +++ b/03.Code/업로드용/converters/hwpx_style_injector.py @@ -0,0 +1,803 @@ +""" +HWPX ㅽ +湲 +pyhwpx濡 +ы濡: 깅 HWPX ㅼㅽ +1. HWPX 異 댁 +2. header.xmlㅼㅽ + ㅽ媛 щ 二쇱 +3. section*.xml + 蹂 + styleIDRef 留ㅽ +import os +import re +import zipfile +import shutil +import tempfile +from pathlib import Path +from typing import Dict, List, Optional +from dataclasses import dataclass + + +@dataclass +class StyleDefinition: + """ㅽ 媛 (-1=, 0=1, 1=2, ...) + + + + + + + + + + + + + + +# ㅽ ㅽ +ROLE_STYLES: Dict[str, StyleDefinition] = { + # 媛 臾몃 ( 踰 留ㅺ린湲!) + 'H1': StyleDefinition( + id=101, name='1 紐', font_size=2200, font_bold=True, + font_color='#006400', align='CENTER', line_spacing=200, + indent_left=0, indent_first=0, space_before=400, space_after=200, + outline_level=0 # ^1 + ), + 'H2': StyleDefinition( + id=102, name='1.1 紐', font_size=1500, font_bold=True, + font_color='#03581d', align='LEFT', line_spacing=200, + indent_left=0, indent_first=0, space_before=300, space_after=100, + outline_level=1 # ^1.^2 + ), + 'H3': StyleDefinition( + id=103, name='1.1.1 紐', font_size=1400, font_bold=True, + font_color='#228B22', align='LEFT', line_spacing=200, + indent_left=500, indent_first=0, space_before=200, space_after=100, + outline_level=2 # ^1.^2.^3 + ), + 'H4': StyleDefinition( + id=104, name='媛. 紐', font_size=1300, font_bold=True, + font_color='#000000', align='LEFT', line_spacing=200, + indent_left=1000, indent_first=0, space_before=150, space_after=50, + outline_level=3 # ^4. + ), + 'H5': StyleDefinition( + id=105, name='1) 紐', font_size=1200, font_bold=True, + font_color='#000000', align='LEFT', line_spacing=200, + indent_left=1500, indent_first=0, space_before=100, space_after=50, + outline_level=4 # ^5) + ), + 'H6': StyleDefinition( + id=106, name='媛) 紐', font_size=1150, font_bold=True, + font_color='#000000', align='LEFT', line_spacing=200, + indent_left=2000, indent_first=0, space_before=100, space_after=50, + outline_level=5 # ^6) + ), + 'H7': StyleDefinition( + id=115, name=' 紐', font_size=1100, font_bold=True, + font_color='#000000', align='LEFT', line_spacing=200, + indent_left=2300, indent_first=0, space_before=100, space_after=50, + outline_level=6 # ^7 (몄몃Ц ㅽ + 蹂몃Ц', font_size=1100, font_bold=False, + font_color='#000000', align='JUSTIFY', line_spacing=200, + indent_left=1500, indent_first=0, space_before=0, space_after=0 + ), + 'LIST_ITEM': StyleDefinition( + id=108, name='蹂몃Ц', font_size=1050, font_bold=False, + font_color='#000000', align='JUSTIFY', line_spacing=200, + indent_left=2500, indent_first=0, space_before=0, space_after=0 + ), + 'TABLE_CAPTION': StyleDefinition( + id=109, name='< 紐>', font_size=1100, font_bold=True, + font_color='#000000', align='LEFT', line_spacing=130, + indent_left=0, indent_first=0, space_before=200, space_after=100 + ), + 'FIGURE_CAPTION': StyleDefinition( + id=110, name='<洹몃┝ 紐>', font_size=1100, font_bold=True, + font_color='#000000', align='CENTER', line_spacing=130, + indent_left=0, indent_first=0, space_before=100, space_after=200 + ), + 'COVER_TITLE': StyleDefinition( + id=111, name='吏紐', font_size=2800, font_bold=True, + font_color='#1a365d', align='CENTER', line_spacing=150, + indent_left=0, indent_first=0, space_before=0, space_after=200 + ), + 'COVER_SUBTITLE': StyleDefinition( + id=112, name='吏遺', font_size=1800, font_bold=False, + font_color='#2d3748', align='CENTER', line_spacing=150, + indent_left=0, indent_first=0, space_before=0, space_after=100 + ), + 'TOC_1': StyleDefinition( + id=113, name='紐⑹감1', font_size=1200, font_bold=True, + font_color='#000000', align='LEFT', line_spacing=180, + indent_left=0, indent_first=0, space_before=100, space_after=50 + ), + 'TOC_2': StyleDefinition( + id=114, name='紐⑹감2', font_size=1100, font_bold=False, + font_color='#000000', align='LEFT', line_spacing=180, + indent_left=500, indent_first=0, space_before=0, space_after=0 + ), +} + +# 截 媛 踰 湲곕 + 깊! +# idRef="0" numbering id=1 + 李몄“濡, 대 ⑦ + 댁 + 援댄 + + +class HwpxStyleInjector: + """HWPX ㅽ +湲""" + + def __init__(self): + self.temp_dir: Optional[Path] = None + self.role_to_style_id: Dict[str, int] = {} + self.role_to_para_id: Dict[str, int] = {} # + self.role_to_char_id: Dict[str, int] = {} # + self.next_char_id = 0 + self.next_para_id = 0 + self.next_style_id = 0 + + def _find_max_ids(self): + """ ㅽ : 諛湲(id=0)留 吏, 몄 곕━ ㅽ 援""" + header_path = self.temp_dir / "Contents" / "header.xml" + if not header_path.exists(): + self.next_char_id = 1 + self.next_para_id = 1 + self.next_style_id = 1 + return + + content = header_path.read_text(encoding='utf-8') + + # 湲곗〈 "蹂몃Ц", "媛 1~10" ㅽ嫄 (id=1~22) + # 諛湲(id=0)留 吏! + + # style id=1~30 嫄 (諛湲 ) + content = re.sub(r'\s*', '', content) + + # itemCnt + + + # + header_path.write_text(content, encoding='utf-8') + print(f" [INFO] 湲곗〈 ㅽ 1~10 ) 嫄 + 猷") + + # charPr, paraPr 湲곗〈 寃 ㅼ遺 + (李몄“ 源⑥吏 + 濡ㅽ 1遺 + ! (Ctrl+2 = id=1, Ctrl+3 = id=2, ...) + self.next_style_id = 1 + + def inject(self, hwpx_path: str, role_positions: Dict[str, List[tuple]]) -> str: + """ + HWPX ㅼㅽ + ㅽ + + + Args: + hwpx_path: HWPX + role_positions: 蹂 + + 移蹂 {role: [(section_idx, para_idx), ...]} + + Returns: + HWPX + """ + print(f"\n HWPX ㅽ + ...") + print(f" +: {hwpx_path}") + + # 1. + 대 異 댁 + self.temp_dir = Path(tempfile.mkdtemp(prefix='hwpx_inject_')) + print(f" 대 : {self.temp_dir}") + + try: + with zipfile.ZipFile(hwpx_path, 'r') as zf: + zf.extractall(self.temp_dir) + + # 異 댁 吏 + section ш린 湲곗〈 理 ID 李얘린 ( ID 뱀 + + ) + self._find_max_ids() + print(f" [DEBUG] Starting IDs: char={self.next_char_id}, para={self.next_para_id}, style={self.next_style_id}") + + # 2. header.xmlㅽ媛 + used_roles = set(role_positions.keys()) + self._inject_header_styles(used_roles) + + # 3. section*.xml + self._inject_section_styles(role_positions) + + # 4. ㅼ 異 + output_path = hwpx_path # 댁곌린 + self._repack_hwpx(output_path) + + print(f" ㅽ + 猷: {output_path}") + return output_path + + finally: + # + 대 由 + if self.temp_dir and self.temp_dir.exists(): + shutil.rmtree(self.temp_dir) + + def _inject_header_styles(self, used_roles: set): + """header.xmlㅽ媛 (紐⑤ ROLE_STYLES 二쇱 +)""" + header_path = self.temp_dir / "Contents" / "header.xml" + if not header_path.exists(): + print(" [寃쎄 ] header.xml ") + return + + content = header_path.read_text(encoding='utf-8') + + # 紐⑤ ROLE_STYLES 二쇱 + (used_roles 臾댁) + char_props = [] + para_props = [] + styles = [] + + for role, style_def in ROLE_STYLES.items(): + char_id = self.next_char_id + para_id = self.next_para_id + style_id = self.next_style_id + + self.role_to_style_id[role] = style_id + self.role_to_para_id[role] = para_id # + self.role_to_char_id[role] = char_id # + + # charPr + + char_props.append(self._make_char_pr(char_id, style_def)) + + # paraPr + + para_props.append(self._make_para_pr(para_id, style_def)) + + # style + + styles.append(self._make_style(style_id, style_def.name, para_id, char_id)) + + self.next_char_id += 1 + self.next_para_id += 1 + self.next_style_id += 1 + + if not styles: + print(" [ 蹂 ] 二쇱 + ㅽ ") + return + + # charProperties媛 + content = self._insert_before_tag( + content, '', '\n'.join(char_props) + '\n' + ) + + # paraProperties媛 + content = self._insert_before_tag( + content, '', '\n'.join(para_props) + '\n' + ) + + # styles媛 + content = self._insert_before_tag( + content, '', '\n'.join(styles) + '\n' + ) + + # numbering id=1 ⑦ + 援 (idRef="0" 湲곕낯 踰 紐⑥) + # 대 媛 踰 1, 1.1, 1.1.1... ! + content = self._replace_default_numbering(content) + + # itemCnt + + content = self._update_item_counts(content) + + header_path.write_text(content, encoding='utf-8') + print(f" header.xml + 猷 ({len(styles)}媛 ㅽ媛)") + + def _make_char_pr(self, id: int, style: StyleDefinition) -> str: + """charPr XML + ( 以 + !)""" + color = style.font_color.lstrip('#') + font_id = "1" if style.font_bold else "0" + + return f'' + + def _make_para_pr(self, id: int, style: StyleDefinition) -> str: + """paraPr XML + ( 以 + !)""" + # 媛 臾몃 + 硫 type="NONE" + # idRef="0" numbering id=1 (湲곕낯 踰 紐⑥) + 李몄“ + if style.outline_level >= 0: + heading = f'' + else: + heading = '' + + return f'{heading}' + + def _make_style(self, id: int, name: str, para_id: int, char_id: int) -> str: + """style XML + """ + safe_name = name.replace('<', '<').replace('>', '>') + return f'' + + def _insert_before_tag(self, content: str, tag: str, insert_text: str) -> str: + """ ㅽ 쎌 +""" + return content.replace(tag, insert_text + tag) + + def _update_item_counts(self, content: str) -> str: + """itemCnt + + """ + # charProperties itemCnt + char_count = content.count(' str: + """numbering id=1 ⑦ + 댁 + 곕━ ⑦ + 댁쇰 援""" + # 곕━媛 媛 踰 ⑦ + + new_patterns = [ + {'level': '1', 'format': 'DIGIT', 'pattern': '1'}, + {'level': '2', 'format': 'DIGIT', 'pattern': '^1.^2'}, + {'level': '3', 'format': 'DIGIT', 'pattern': '^1.^2.^3'}, + {'level': '4', 'format': 'HANGUL_SYLLABLE', 'pattern': '^4.'}, + {'level': '5', 'format': 'DIGIT', 'pattern': '^5)'}, + {'level': '6', 'format': 'HANGUL_SYLLABLE', 'pattern': '^6)'}, + {'level': '7', 'format': 'CIRCLED_DIGIT', 'pattern': '^7'}, + ] + + # numbering id="1" 李얘린 + match = re.search(r'(]*>)(.*?)()', content, re.DOTALL) + if not match: + print(" [寃쎄 ] numbering id=1 , 援 嫄대 + ") + return content + + numbering_content = match.group(2) + + for np in new_patterns: + level = np['level'] + fmt = np['format'] + pattern = np['pattern'] + + # 대 level쇰 + + 援 + def replace_parahead(m): + tag = m.group(0) + # numFormat 蹂寃 + tag = re.sub(r'numFormat="[^"]*"', f'numFormat="{fmt}"', tag) + # ⑦ + ( ㅽ 댁 ) 蹂寃 + tag = re.sub(r'>([^<]*)', f'>{pattern}', tag) + return tag + + numbering_content = re.sub( + rf']*level="{level}"[^>]*>.*?', + replace_parahead, + numbering_content + ) + + new_content = match.group(1) + numbering_content + match.group(3) + print(" [INFO] numbering id=1 ⑦ + 援 猷 (1, ^1.^2, ^1.^2.^3...)") + return content.replace(match.group(0), new_content) + + def _adjust_tables(self, content: str) -> str: + """ + ш린 議곗 + + 1. 800 hwpunit ( 댁 諛⑹ ) + 2. + 鍮 + : + 泥 + 鍮 + 瑜 媛 洹 遺 + 諛 ( 泥 醫寃) + """ + + def adjust_table(match): + tbl = match.group(0) + + # + 泥 + 鍮 + 異異 + sz_match = re.search(r' 1 else table_width + + # 媛 + ㅼ + min_height = 800 # 8mm + + # + ш린 議곗 + col_idx = [0] # closure + + def adjust_cell_sz(cell_match): + width = int(cell_match.group(1)) + height = int(cell_match.group(2)) + + # + new_height = max(height, min_height) + + return f'' + + tbl = re.sub( + r'', + adjust_cell_sz, + tbl + ) + + return tbl + + return re.sub(r']*>.*?', adjust_table, content, flags=re.DOTALL) + + def _inject_section_styles(self, role_positions: Dict[str, List[tuple]]): + """section*.xml ( ㅽ 留㼼 諛⑹ 踰 + 洹 : role_to_style_id 李얘린 + section_files = sorted(contents_dir.glob("section*.xml")) + print(f" [DEBUG] section files: {[f.name for f in section_files]}") + + total_modified = 0 + + for section_file in section_files: + print(f" [DEBUG] Processing: {section_file.name}") + original_content = section_file.read_text(encoding='utf-8') + print(f" [DEBUG] File size: {len(original_content)} bytes") + + content = original_content # + 蹂듭щ낯 + + # 癒몃━留щ━留 蹂댁〈 (placeholder濡 援 ) + header_footer_map = {} + placeholder_idx = 0 + + def save_header_footer(match): + nonlocal placeholder_idx + key = f"__HF_PLACEHOLDER_{placeholder_idx}__" + header_footer_map[key] = match.group(0) + placeholder_idx += 1 + return key + + # 癒몃━留щ━留 + 援 + content = re.sub(r']*>.*?', save_header_footer, content, flags=re.DOTALL) + content = re.sub(r']*>.*?', save_header_footer, content, flags=re.DOTALL) + + # 紐⑤ ㅼ + ㅽ 異異 + para_pattern = r'(]*>)(.*?)()' + + section_modified = 0 + + def replace_style(match): + nonlocal total_modified, section_modified + open_tag = match.group(1) + inner = match.group(2) + close_tag = match.group(3) + + # ㅽ 異異 ( 嫄 ) + text = re.sub(r'<[^>]+>', '', inner).strip() + if not text: + return match.group(0) + + # ㅽ 遺 + 쇰 + text_start = text[:50] # 泥 50 + + matched_role = None + matched_style_id = None + matched_para_id = None + matched_char_id = None + + # 紐 ⑦ + 留㼼 (뱀몄 ) + # Unicode: \u25a0 \u25b8 \u25c6 \u25b6 \u25cf \u25cb \u25aa \u25ba +\u2605 \u203b 쨌\u00b7 + prefix = r'^[\u25a0\u25b8\u25c6\u25b6\u25cf\u25cb\u25aa\u25ba\u261e\u2605\u203b\u00b7\s]*' + + # FIGURE_CAPTION: "[洹몃┝ 1-1]", "[洹몃┝ 1-2]" (媛 癒쇱 泥댄 !) + # 洹몃┝ = \uadf8\ub9bc + if re.match(r'^\[\uadf8\ub9bc\s*[\d-]+\]', text_start): + matched_role = 'FIGURE_CAPTION' + # TABLE_CAPTION: "< 1-1>", "[ 1-1]" + # = \ud45c + elif re.match(r'^[<\[]\ud45c\s*[\d-]+[>\]]', text_start): + matched_role = 'TABLE_CAPTION' + # H1: "1", "1 媛 " + elif re.match(prefix + r'\uc81c?\s*\d+\uc7a5?\s', text_start) or re.match(prefix + r'[1-9]\s+[\uac00-\ud7a3]', text_start): + matched_role = 'H1' + # H3: "1.1.1 " (H2蹂대 癒쇱 泥댄 !) + elif re.match(prefix + r'\d+\.\d+\.\d+\s', text_start): + matched_role = 'H3' + # H2: "1.1 " + elif re.match(prefix + r'\d+\.\d+\s', text_start): + matched_role = 'H2' + # H4: "媛. " + elif re.match(prefix + r'[\uac00-\ud7a3]\.\s', text_start): + matched_role = 'H4' + # H5: "1) " + elif re.match(prefix + r'\d+\)\s', text_start): + matched_role = 'H5' + # H6: "(1) " "媛) " + elif re.match(prefix + r'\(\d+\)\s', text_start): + matched_role = 'H6' + elif re.match(prefix + r'[\uac00-\ud7a3]\)\s', text_start): + matched_role = 'H6' + # LIST_ITEM: " ", " ", " " + elif re.match(r'^[\u25cb\u25cf\u25e6\u2022\u2023\u25b8]\s', text_start): + matched_role = 'LIST_ITEM' + elif re.match(r'^[-\u2013\u2014]\s', text_start): + matched_role = 'LIST_ITEM' + + # 留㼼 怨 ㅽ 쇰㈃ + if matched_role and matched_role in self.role_to_style_id: + matched_style_id = self.role_to_style_id[matched_role] + matched_para_id = self.role_to_para_id[matched_role] + matched_char_id = self.role_to_char_id[matched_role] + elif 'BODY' in self.role_to_style_id and len(text) > 20: + # 湲 + ㅽ몃 蹂몃Ц쇰 媛 + 二 + matched_role = 'BODY' + matched_style_id = self.role_to_style_id['BODY'] + matched_para_id = self.role_to_para_id['BODY'] + matched_char_id = self.role_to_char_id['BODY'] + + if matched_style_id: + # 1. hp:p 媛 + if 'styleIDRef="' in open_tag: + new_open = re.sub(r'styleIDRef="[^"]*"', f'styleIDRef="{matched_style_id}"', open_tag) + else: + new_open = open_tag.replace('= 0: + new_inner = self._remove_manual_numbering(new_inner, matched_role) + + total_modified += 1 + section_modified += 1 + return new_open + new_inner + close_tag + + return match.group(0) + + new_content = re.sub(para_pattern, replace_style, content, flags=re.DOTALL) + + # + ш린 議곗 + new_content = self._adjust_tables(new_content) + + # outlineShapeIDRef瑜 1濡 蹂寃 (곕━媛 援댄 numbering id=1 ъ ) + new_content = re.sub( + r'outlineShapeIDRef="[^"]*"', + 'outlineShapeIDRef="1"', + new_content + ) + + + # 癒몃━留щ━留듭뱀 깆ㅼ몃 styleIDRef 蹂寃""" + # 李얘린 + pattern = r']*>' + matches = list(re.finditer(pattern, content)) + + if para_idx >= len(matches): + return content + + match = matches[para_idx] + old_tag = match.group(0) + + # styleIDRef + 蹂寃 異媛 + if 'styleIDRef=' in old_tag: + new_tag = re.sub(r'styleIDRef="[^"]*"', f'styleIDRef="{style_id}"', old_tag) + else: + # + 異媛 + new_tag = old_tag.replace(' str: + """ 媛 臾몃 + 踰 嫄 ( 踰 遺쇰源!) + + HTML + "1 DX 媛 " "DX 媛 " (쇰 "1" 遺 ) + HTML + "1.1 痢〓 DX" "痢〓 DX" (쇰 "1.1" 遺 ) + """ + # + 踰 ⑦ + + patterns = { + 'H1': r'^( \s*\d+\s* \s*)', # "1 " 嫄 + 'H2': r'^(\d+\.\d+\s+)', # "1.1 " 嫄 + 'H3': r'^(\d+\.\d+\.\d+\s+)', # "1.1.1 " 嫄 + 'H4': r'^([媛- ]\.\s+)', # "媛. " 嫄 + 'H5': r'^(\d+\)\s+)', # "1) " 嫄 + 'H6': r'^([媛- ]\)\s+|\(\d+\)\s+)', # "媛) " "(1) " 嫄 + 'H7': r'^([△™bㅲβ╈㎮ⓥ]+\s*)', # " " 嫄 + } + + if role not in patterns: + return inner + + pattern = patterns[role] + + # ㅼ + 踰 嫄 + def remove_number(match): + text = match.group(1) + # 泥 踰吏 + 留 踰 嫄 + new_text = re.sub(pattern, '', text, count=1) + return f'{new_text}' + + # 泥 踰吏 hp:t 泥 + new_inner = re.sub(r'([^<]*)', remove_number, inner, count=1) + + return new_inner + + def _repack_hwpx(self, output_path: str): + """HWPX 異""" + print(f" [DEBUG] Repacking to: {output_path}") + print(f" [DEBUG] Source dir: {self.temp_dir}") + + # 異 + section ш린 + + + temp_output = output_path + ".tmp" + + with zipfile.ZipFile(temp_output, 'w', zipfile.ZIP_DEFLATED) as zf: + # mimetype 異 吏몃 + mimetype_path = self.temp_dir / "mimetype" + if mimetype_path.exists(): + zf.write(mimetype_path, "mimetype", compress_type=zipfile.ZIP_STORED) + + # 몄 + file_count = 0 + for root, dirs, files in os.walk(self.temp_dir): + for file in files: + if file == "mimetype": + continue + file_path = Path(root) / file + arcname = file_path.relative_to(self.temp_dir) + zf.write(file_path, arcname) + file_count += 1 + + print(f" [DEBUG] Total files zipped: {file_count}") + + # + + + 쇰 蹂寃 + import time + for attempt in range(3): + try: + if os.path.exists(output_path): + os.remove(output_path) + os.rename(temp_output, output_path) + break + except PermissionError: + print(f" [DEBUG] 湲 湲 以... ({attempt + 1}/3)") + time.sleep(0.5) + else: + # 3踰 + ㅽ + + 쇰 吏 + print(f" [寃쎄 ] 댁곌린 ㅽ, + ъ : {temp_output}") + output_path = temp_output + + # 異 + 寃곌낵 + 移異 + ㅽ + + + Args: + hwpx_path: HWPX + elements: StyleAnalyzerъ ㅽ + + Returns: + HWPX + """ + # + + 移 + # 李멸 : section 0, para + 濡 媛 + role_positions: Dict[str, List[tuple]] = {} + + for idx, elem in enumerate(elements): + role = elem.role + if role not in role_positions: + role_positions[role] = [] + # (section_idx, para_idx) - + section 0 媛 + role_positions[role].append((0, idx)) + + injector = HwpxStyleInjector() + return injector.inject(hwpx_path, role_positions) + + +# +ㅽ +if __name__ == "__main__": + # +ㅽ몄 + test_positions = { + 'H1': [(0, 0), (0, 5)], + 'H2': [(0, 1), (0, 6)], + 'BODY': [(0, 2), (0, 3), (0, 4)], + } + + # injector = HwpxStyleInjector() + # injector.inject("test.hwpx", test_positions) + print("HwpxStyleInjector 紐⑤ 濡 + 猷")