From 555a954458d289a86a63f561f8a3dfb9a6a3868b Mon Sep 17 00:00:00 2001 From: kyeongmin Date: Thu, 5 Mar 2026 11:32:29 +0900 Subject: [PATCH] =?UTF-8?q?=F0=9F=93=A6=20Initialize=20Geulbeot=20structur?= =?UTF-8?q?e=20and=20merge=20Prompts=20&=20test=20projects?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../codedomain/국토일보_한건신문_Python_v01.py | 13 + .../codedomain/날짜_형식을_Python_v01.py | 11 + .../codedomain/다음_로우데이터를_Python_v01.py | 13 + .../codedomain/단위일_가능성_Python_v01.py | 20 + .../codedomain/단일_기사_Python_v01.py | 12 + .../codedomain/당신은_보고서_Python_v01.py | 8 + .../codedomain/로그_전체_Python_v01.py | 13 + .../codedomain/로그_파일을_Python_v01.py | 17 + .../codedomain/리스트_페이지_Python_v01.py | 18 + .../codedomain/멀티라인_대응_Python_v01.py | 11 + .../codedomain/미분류_과업_Python_v01.py | 14 + .../codedomain/법령_지침_Python_v01.py | 11 + .../codedomain/보고서_섹션에_Python_v01.py | 16 + .../codedomain/사이트별_함수_Python_v01.py | 13 + .../codedomain/설명이_없습니다_Python_v01.py | 7 + .../codedomain/설정_브라우저가_Python_v01.py | 11 + .../codedomain/수식_자체가_Python_v01.py | 17 + .../codedomain/수식_주소를_Python_v01.py | 11 + .../codedomain/수식을_가져오기_Python_v01.py | 14 + .../codedomain/아래_디자인_Python_v01.py | 11 + .../엔지니어링데일리_기자_Python_v01.py | 13 + .../codedomain/엔티티_불필요한_Python_v01.py | 9 + .../codedomain/인증서_검증_Python_v01.py | 11 + .../codedomain/카테고리_내용_Python_v01.py | 14 + .../codedomain/커버_슬라이드_Python_v01.py | 14 + .../codedomain/합계_기준_Python_v01.py | 15 + .../domain/도메인_문서생성_가계_고금리_v01.md | 7 + .../domain/도메인_문서생성_경제_진단_v01.md | 10 + .../domain/도메인_문서생성_구조적_요인_v01.md | 7 + .../domain/도메인_문서생성_굵게_기울임_v01.md | 5 + .../domain/도메인_문서생성_그림_캡션_v01.md | 6 + .../도메인_문서생성_그림은_전체폭이며_v01.md | 12 + .../domain/도메인_문서생성_글로벌_경제_v01.md | 8 + .../domain/도메인_문서생성_날짜_호수_v01.md | 6 + .../domain/도메인_문서생성_대략_색감만_v01.md | 5 + .../domain/도메인_문서생성_대제목_전체폭_v01.md | 9 + .../domain/도메인_문서생성_리드문_리드문_v01.md | 3 + .../domain/도메인_문서생성_리스크_요인_v01.md | 5 + .../도메인_문서생성_마지막으로_로드한_v01.md | 11 + 
.../domain/도메인_문서생성_메시지_처리_v01.md | 3 + .../domain/도메인_문서생성_목차_목차_v01.md | 10 + .../domain/도메인_문서생성_문서_편집기_v01.md | 10 + .../domain/도메인_문서생성_물가와_금리_v01.md | 3 + .../domain/도메인_문서생성_미분류_과업_v01.md | 6 + .../domain/도메인_문서생성_본고딕_맑은_v01.md | 7 + .../domain/도메인_문서생성_본문_다단_v01.md | 5 + .../domain/도메인_문서생성_본문_대제목_v01.md | 11 + .../domain/도메인_문서생성_본문_레이아웃_v01.md | 13 + .../domain/도메인_문서생성_본문_본문_v01.md | 8 + .../domain/도메인_문서생성_소제목은_내부_v01.md | 12 + .../domain/도메인_문서생성_수출과_산업_v01.md | 3 + .../domain/도메인_문서생성_양식_목록_v01.md | 6 + .../domain/도메인_문서생성_양식_선택_v01.md | 9 + .../domain/도메인_문서생성_양식_추가는_v01.md | 3 + .../domain/도메인_문서생성_에서_검증된_v01.md | 12 + .../domain/도메인_문서생성_요약_박스_v01.md | 9 + .../domain/도메인_문서생성_요약_요약_v01.md | 9 + .../domain/도메인_문서생성_요즘_경제_v01.md | 9 + .../domain/도메인_문서생성_인쇄_모드_v01.md | 6 + .../domain/도메인_문서생성_읽기_전용_v01.md | 9 + .../domain/도메인_문서생성_자료_출처_v01.md | 3 + .../domain/도메인_문서생성_전문적인_네이비_v01.md | 9 + .../domain/도메인_문서생성_정리_위기는_v01.md | 4 + .../domain/도메인_문서생성_제목_강조_v01.md | 3 + .../domain/도메인_문서생성_제목_뒤에서_v01.md | 10 + .../domain/도메인_문서생성_제목만_덩그러니_v01.md | 4 + .../domain/도메인_문서생성_줄바꿈_다음줄_v01.md | 13 + .../domain/도메인_문서생성_지금_경제_v01.md | 9 + .../도메인_문서생성_타이포그래피_설정_v01.md | 9 + .../domain/도메인_문서생성_투자자_불확실성_v01.md | 7 + .../domain/도메인_문서생성_파일을_놓으세요_v01.md | 9 + .../domain/도메인_문서생성_파일을_먼저_v01.md | 5 + .../domain/도메인_문서생성_페이지_끝에_v01.md | 9 + .../domain/도메인_문서생성_페이지_분할_v01.md | 4 + .../domain/도메인_문서생성_표지_날짜_v01.md | 12 + .../domain/도메인_문서생성_표지_목차_v01.md | 16 + .../domain/도메인_문서생성_표지_특집_v01.md | 7 + .../domain/도메인_문서생성_표지나_특정_v01.md | 6 + .../domain/도메인_문서생성_푸터_출처_v01.md | 8 + .../domain/도메인_문서생성_한국_경제_v01.md | 5 + .../domain/도메인_문서생성_화면_확인용_v01.md | 10 + .../domain/도메인_문서생성_화면에서_처럼_v01.md | 11 + 02. Prompts/문서생성/exclude/unknown_v01.txt | 1 + .../prompt/GPT_문서생성_다음을_한국어로_v01.md | 5 + .../prompt/GPT_문서생성_미분류_과업_v01.md | 9 + .../prompt/GPT_문서생성_아래_내용을_v01.md | 15 + 03. Code/geulbeot_10th/.env.sample | 7 + 03. 
Code/geulbeot_10th/.gitignore | 11 + .../hwpx/out/out/context/domain_prompt.txt | 909 ++++ 03. Code/geulbeot_10th/Procfile | 1 + 03. Code/geulbeot_10th/README.md | 453 ++ 03. Code/geulbeot_10th/api_config.py | 30 + 03. Code/geulbeot_10th/app.py | 684 +++ 03. Code/geulbeot_10th/converters/__init__.py | 0 .../geulbeot_10th/converters/html_to_hwp.py | 1115 ++++ .../converters/html_to_hwp_briefing.py | 616 +++ .../converters/hwp_style_mapping.py | 434 ++ .../converters/hwpx_generator.py | 431 ++ .../converters/hwpx_style_injector.py | 750 +++ .../converters/hwpx_table_injector.py | 174 + .../converters/pipeline/__init__.py | 1 + .../converters/pipeline/router.py | 176 + .../converters/pipeline/step1_convert.py | 784 +++ .../converters/pipeline/step2_extract.py | 789 +++ .../converters/pipeline/step3_domain.py | 265 + .../converters/pipeline/step4_chunk.py | 357 ++ .../converters/pipeline/step5_rag.py | 141 + .../converters/pipeline/step6_corpus.py | 232 + .../converters/pipeline/step7_index.py | 504 ++ .../converters/pipeline/step8_content.py | 1021 ++++ .../converters/pipeline/step9_html.py | 1249 +++++ .../converters/style_analyzer.py | 935 ++++ 03. Code/geulbeot_10th/domain/__init__.py | 0 03. 
Code/geulbeot_10th/domain/civil/dx.txt | 0 .../geulbeot_10th/domain/civil/general.txt | 1 + .../domain/civil/specialties/anlysis.txt | 0 .../domain/civil/specialties/bim.txt | 27 + .../domain/civil/specialties/bridge.txt | 27 + .../civil/specialties/communication.txt | 27 + .../domain/civil/specialties/construction.txt | 27 + .../domain/civil/specialties/geotechnical.txt | 27 + .../domain/civil/specialties/planning.txt | 27 + .../domain/civil/specialties/quality_env.txt | 27 + .../domain/civil/specialties/road.txt | 27 + .../domain/civil/specialties/safety.txt | 27 + .../civil/specialties/schedule_cost.txt | 27 + .../domain/civil/specialties/structure.txt | 27 + .../domain/civil/specialties/survey.txt | 27 + .../domain/civil/specialties/tunnel.txt | 27 + .../geulbeot_10th/domain/hwpx/__init__.py | 0 .../domain/hwpx/hwpx_domain_guide.md | 769 +++ .../geulbeot_10th/domain/hwpx/hwpx_utils.py | 323 ++ .../domain/report_guide/domain_prompt.txt | 22 + .../report_guide/outline_issue_report.txt | 20 + .../domain/report_guide/report_guide.txt | 848 +++ 03. Code/geulbeot_10th/domain_api.py | 457 ++ 03. Code/geulbeot_10th/domain_config.json | 46 + 03. Code/geulbeot_10th/handlers/__init__.py | 8 + .../handlers/briefing/__init__.py | 5 + .../handlers/briefing/processor.py | 279 + .../briefing/prompts/step1_5_plan.txt | 104 + .../briefing/prompts/step1_extract.txt | 122 + .../briefing/prompts/step2_generate.txt | 440 ++ 03. 
Code/geulbeot_10th/handlers/common.py | 84 + .../geulbeot_10th/handlers/doc/__init__.py | 2 + .../handlers/doc/content_analyzer.py | 640 +++ .../handlers/doc/custom_doc_type.py | 555 ++ .../handlers/doc/doc_type_analyzer.py | 1058 ++++ .../geulbeot_10th/handlers/report/__init__.py | 5 + .../handlers/report/processor.py | 161 + .../report/prompts/refine_selection.txt | 104 + .../handlers/template/__init__.py | 3 + .../template/doc_template_analyzer.py | 150 + .../template/html_table_template_css.txt | 1442 +++++ .../handlers/template/processor.py | 625 +++ .../template/prompts/analyze_template.txt | 28 + .../handlers/template/semantic_mapper.py | 382 ++ .../handlers/template/style_generator.py | 824 +++ .../handlers/template/template_manager.py | 1010 ++++ .../handlers/template/tools/__init__.py | 51 + .../handlers/template/tools/border_fill.py | 127 + .../handlers/template/tools/char_style.py | 133 + .../handlers/template/tools/content_order.py | 550 ++ .../handlers/template/tools/font.py | 82 + .../handlers/template/tools/header_footer.py | 200 + .../handlers/template/tools/image.py | 98 + .../handlers/template/tools/numbering.py | 136 + .../handlers/template/tools/page_setup.py | 110 + .../handlers/template/tools/para_style.py | 185 + .../handlers/template/tools/section.py | 120 + .../handlers/template/tools/style_def.py | 68 + .../handlers/template/tools/table.py | 328 ++ 03. Code/geulbeot_10th/static/css/editor.css | 297 ++ 03. Code/geulbeot_10th/static/css/main.css | 1826 +++++++ 03. Code/geulbeot_10th/static/js/ai_edit.js | 143 + 03. Code/geulbeot_10th/static/js/demo_mode.js | 371 ++ 03. Code/geulbeot_10th/static/js/doc_type.js | 587 +++ .../static/js/domain_selector.js | 288 + 03. Code/geulbeot_10th/static/js/editor.js | 1208 +++++ 03. Code/geulbeot_10th/static/js/export.js | 72 + 03. Code/geulbeot_10th/static/js/generator.js | 484 ++ 03. Code/geulbeot_10th/static/js/modals.js | 135 + 03. Code/geulbeot_10th/static/js/template.js | 189 + 03. 
Code/geulbeot_10th/static/js/ui.js | 91 + .../geulbeot_10th/static/result/brief_1.html | 315 ++ .../geulbeot_10th/static/result/brief_2.html | 427 ++ .../geulbeot_10th/static/result/report.html | 1097 ++++ .../geulbeot_10th/static/result/slide.html | 513 ++ .../default/doc_types/briefing/config.json | 26 + .../doc_types/presentation/config.json | 27 + .../default/doc_types/report/config.json | 26 + 03. Code/geulbeot_10th/templates/hwp_guide.md | 302 ++ .../templates/hwp_html_defaults.json | 116 + 03. Code/geulbeot_10th/templates/index.html | 782 +++ 03. Code/geulbeot_10th/샘플 예시.html | 1097 ++++ 03. Code/geulbeot_1st/.gitignore | 29 + 03. Code/geulbeot_1st/Procfile | 1 + 03. Code/geulbeot_1st/README.md | 82 + 03. Code/geulbeot_1st/app.py | 538 ++ .../geulbeot_1st/prompts/step1_5_plan.txt | 104 + .../geulbeot_1st/prompts/step1_extract.txt | 122 + .../geulbeot_1st/prompts/step2_generate.txt | 440 ++ .../geulbeot_1st/prompts/system_prompt.txt | 605 +++ 03. Code/geulbeot_1st/railway.json | 13 + 03. Code/geulbeot_1st/requirements.txt | 5 + .../geulbeot_1st/templates/hwp_guide.html | 343 ++ 03. Code/geulbeot_1st/templates/index.html | 340 ++ 03. Code/geulbeot_2nd/.gitignore | 32 + 03. Code/geulbeot_2nd/Procfile | 1 + 03. Code/geulbeot_2nd/README.md | 82 + 03. Code/geulbeot_2nd/api_config.py | 17 + 03. Code/geulbeot_2nd/app.py | 492 ++ .../geulbeot_2nd/prompts/step1_5_plan.txt | 104 + .../geulbeot_2nd/prompts/step1_extract.txt | 122 + .../geulbeot_2nd/prompts/step2_generate.txt | 440 ++ 03. Code/geulbeot_2nd/railway.json | 13 + 03. Code/geulbeot_2nd/requirements.txt | 5 + 03. Code/geulbeot_2nd/static/css/editor.css | 205 + 03. Code/geulbeot_2nd/static/js/editor.js | 554 ++ .../geulbeot_2nd/templates/hwp_guide.html | 343 ++ 03. Code/geulbeot_2nd/templates/index.html | 2099 ++++++++ 03. Code/geulbeot_3rd/.gitignore | 32 + 03. Code/geulbeot_3rd/Procfile | 1 + 03. Code/geulbeot_3rd/README.md | 146 + 03. Code/geulbeot_3rd/api_config.py | 17 + 03. 
Code/geulbeot_3rd/app.py | 579 ++ 03. Code/geulbeot_3rd/converters/__init__.py | 0 .../geulbeot_3rd/converters/html_to_hwp.py | 573 ++ .../converters/html_to_hwp_briefing.py | 573 ++ .../converters/pipeline/__init__.py | 1 + .../converters/pipeline/router.py | 139 + .../converters/pipeline/step1_convert.py | 784 +++ .../converters/pipeline/step2_extract.py | 789 +++ .../converters/pipeline/step3_domain.py | 265 + .../converters/pipeline/step4_chunk.py | 357 ++ .../converters/pipeline/step5_rag.py | 141 + .../converters/pipeline/step6_corpus.py | 232 + .../converters/pipeline/step7_index.py | 504 ++ .../converters/pipeline/step8_content.py | 1021 ++++ .../converters/pipeline/step9_html.py | 1249 +++++ .../output/assets/1_1_1_img01.png | Bin 0 -> 120519 bytes .../output/assets/1_1_1_img02.png | Bin 0 -> 36607 bytes .../output/assets/1_1_1_img03.png | Bin 0 -> 98658 bytes .../output/assets/1_1_2_img01.png | Bin 0 -> 120519 bytes .../output/assets/1_1_2_img02.png | Bin 0 -> 36607 bytes .../output/assets/1_1_2_img03.png | Bin 0 -> 170732 bytes .../output/assets/1_1_3_img01.png | Bin 0 -> 261943 bytes .../output/assets/1_1_3_img02.png | Bin 0 -> 85636 bytes .../output/assets/1_2_1_img03.png | Bin 0 -> 72039 bytes .../output/assets/1_2_2_img01.png | Bin 0 -> 4970 bytes .../output/assets/1_2_2_img02.png | Bin 0 -> 5161 bytes .../output/assets/1_2_2_img03.png | Bin 0 -> 172819 bytes .../geulbeot_3rd/prompts/step1_5_plan.txt | 104 + .../geulbeot_3rd/prompts/step1_extract.txt | 122 + .../geulbeot_3rd/prompts/step2_generate.txt | 440 ++ 03. Code/geulbeot_3rd/railway.json | 13 + 03. Code/geulbeot_3rd/requirements.txt | 5 + 03. Code/geulbeot_3rd/static/css/editor.css | 205 + 03. Code/geulbeot_3rd/static/js/editor.js | 554 ++ .../geulbeot_3rd/templates/hwp_guide.html | 343 ++ 03. Code/geulbeot_3rd/templates/index.html | 2247 ++++++++ 03. Code/geulbeot_4th/.env.sample | 7 + 03. Code/geulbeot_4th/.gitignore | 32 + 03. Code/geulbeot_4th/Procfile | 1 + 03. 
Code/geulbeot_4th/README.md | 309 ++ 03. Code/geulbeot_4th/api_config.py | 30 + 03. Code/geulbeot_4th/app.py | 292 + 03. Code/geulbeot_4th/converters/__init__.py | 0 03. Code/geulbeot_4th/converters/dkdl.py | 37 + .../geulbeot_4th/converters/html_to_hwp.py | 1013 ++++ .../converters/html_to_hwp_briefing.py | 616 +++ .../converters/hwp_style_mapping.py | 434 ++ .../geulbeot_4th/converters/hwpx_generator.py | 431 ++ .../converters/pipeline/__init__.py | 1 + .../converters/pipeline/router.py | 139 + .../converters/pipeline/step1_convert.py | 784 +++ .../converters/pipeline/step2_extract.py | 789 +++ .../converters/pipeline/step3_domain.py | 265 + .../converters/pipeline/step4_chunk.py | 357 ++ .../converters/pipeline/step5_rag.py | 141 + .../converters/pipeline/step6_corpus.py | 232 + .../converters/pipeline/step7_index.py | 504 ++ .../converters/pipeline/step8_content.py | 1021 ++++ .../converters/pipeline/step9_html.py | 1249 +++++ .../geulbeot_4th/converters/style_analyzer.py | 935 ++++ 03. Code/geulbeot_4th/handlers/__init__.py | 5 + .../handlers/briefing/__init__.py | 5 + .../handlers/briefing/processor.py | 279 + .../briefing/prompts/step1_5_plan.txt | 104 + .../briefing/prompts/step1_extract.txt | 122 + .../briefing/prompts/step2_generate.txt | 440 ++ 03. 
Code/geulbeot_4th/handlers/common.py | 84 + .../geulbeot_4th/handlers/report/__init__.py | 5 + .../geulbeot_4th/handlers/report/processor.py | 152 + .../report/prompts/refine_selection.txt | 104 + .../output/assets/1_1_1_img01.png | Bin 0 -> 120519 bytes .../output/assets/1_1_1_img02.png | Bin 0 -> 36607 bytes .../output/assets/1_1_1_img03.png | Bin 0 -> 98658 bytes .../output/assets/1_1_2_img01.png | Bin 0 -> 120519 bytes .../output/assets/1_1_2_img02.png | Bin 0 -> 36607 bytes .../output/assets/1_1_2_img03.png | Bin 0 -> 170732 bytes .../output/assets/1_1_3_img01.png | Bin 0 -> 261943 bytes .../output/assets/1_1_3_img02.png | Bin 0 -> 85636 bytes .../output/assets/1_2_1_img03.png | Bin 0 -> 72039 bytes .../output/assets/1_2_2_img01.png | Bin 0 -> 4970 bytes .../output/assets/1_2_2_img02.png | Bin 0 -> 5161 bytes .../output/assets/1_2_2_img03.png | Bin 0 -> 172819 bytes .../geulbeot_4th/prompts/step1_5_plan.txt | 104 + .../geulbeot_4th/prompts/step1_extract.txt | 122 + .../geulbeot_4th/prompts/step2_generate.txt | 440 ++ 03. Code/geulbeot_4th/requirements.txt | 5 + 03. Code/geulbeot_4th/static/css/editor.css | 297 ++ 03. Code/geulbeot_4th/static/js/editor.js | 1208 +++++ .../geulbeot_4th/templates/hwp_guide.html | 343 ++ 03. Code/geulbeot_4th/templates/index.html | 2356 +++++++++ 03. Code/geulbeot_5th/.env.sample | 7 + 03. Code/geulbeot_5th/.gitignore | 32 + 03. Code/geulbeot_5th/Procfile | 1 + 03. Code/geulbeot_5th/README.md | 338 ++ 03. Code/geulbeot_5th/api_config.py | 30 + 03. Code/geulbeot_5th/app.py | 298 ++ 03. 
Code/geulbeot_5th/converters/__init__.py | 0 .../geulbeot_5th/converters/html_to_hwp.py | 1123 ++++ .../converters/html_to_hwp_briefing.py | 616 +++ .../converters/hwp_style_mapping.py | 434 ++ .../geulbeot_5th/converters/hwpx_generator.py | 431 ++ .../converters/hwpx_style_injector.py | 750 +++ .../converters/hwpx_table_injector.py | 174 + .../converters/pipeline/__init__.py | 1 + .../converters/pipeline/router.py | 139 + .../converters/pipeline/step1_convert.py | 784 +++ .../converters/pipeline/step2_extract.py | 789 +++ .../converters/pipeline/step3_domain.py | 265 + .../converters/pipeline/step4_chunk.py | 357 ++ .../converters/pipeline/step5_rag.py | 141 + .../converters/pipeline/step6_corpus.py | 232 + .../converters/pipeline/step7_index.py | 504 ++ .../converters/pipeline/step8_content.py | 1021 ++++ .../converters/pipeline/step9_html.py | 1249 +++++ .../geulbeot_5th/converters/style_analyzer.py | 935 ++++ 03. Code/geulbeot_5th/handlers/__init__.py | 5 + .../handlers/briefing/__init__.py | 5 + .../handlers/briefing/processor.py | 279 + .../briefing/prompts/step1_5_plan.txt | 104 + .../briefing/prompts/step1_extract.txt | 122 + .../briefing/prompts/step2_generate.txt | 440 ++ 03. 
Code/geulbeot_5th/handlers/common.py | 84 + .../geulbeot_5th/handlers/report/__init__.py | 5 + .../geulbeot_5th/handlers/report/processor.py | 152 + .../report/prompts/refine_selection.txt | 104 + .../output/assets/1_1_1_img01.png | Bin 0 -> 120519 bytes .../output/assets/1_1_1_img02.png | Bin 0 -> 36607 bytes .../output/assets/1_1_1_img03.png | Bin 0 -> 98658 bytes .../output/assets/1_1_2_img01.png | Bin 0 -> 120519 bytes .../output/assets/1_1_2_img02.png | Bin 0 -> 36607 bytes .../output/assets/1_1_2_img03.png | Bin 0 -> 170732 bytes .../output/assets/1_1_3_img01.png | Bin 0 -> 261943 bytes .../output/assets/1_1_3_img02.png | Bin 0 -> 85636 bytes .../output/assets/1_2_1_img03.png | Bin 0 -> 72039 bytes .../output/assets/1_2_2_img01.png | Bin 0 -> 4970 bytes .../output/assets/1_2_2_img02.png | Bin 0 -> 5161 bytes .../output/assets/1_2_2_img03.png | Bin 0 -> 172819 bytes .../geulbeot_5th/prompts/step1_5_plan.txt | 104 + .../geulbeot_5th/prompts/step1_extract.txt | 122 + .../geulbeot_5th/prompts/step2_generate.txt | 440 ++ 03. Code/geulbeot_5th/requirements.txt | 5 + 03. Code/geulbeot_5th/static/css/editor.css | 297 ++ 03. Code/geulbeot_5th/static/js/editor.js | 1208 +++++ .../geulbeot_5th/templates/hwp_guide.html | 343 ++ 03. Code/geulbeot_5th/templates/index.html | 2356 +++++++++ 03. Code/geulbeot_6th/.env.sample | 7 + 03. Code/geulbeot_6th/.gitignore | 32 + 03. Code/geulbeot_6th/Procfile | 1 + 03. Code/geulbeot_6th/README.md | 359 ++ 03. Code/geulbeot_6th/api_config.py | 30 + 03. Code/geulbeot_6th/app.py | 355 ++ 03. 
Code/geulbeot_6th/converters/__init__.py | 0 .../geulbeot_6th/converters/html_to_hwp.py | 1123 ++++ .../converters/html_to_hwp_briefing.py | 616 +++ .../converters/hwp_style_mapping.py | 434 ++ .../geulbeot_6th/converters/hwpx_generator.py | 431 ++ .../converters/hwpx_style_injector.py | 750 +++ .../converters/hwpx_table_injector.py | 174 + .../converters/pipeline/__init__.py | 1 + .../converters/pipeline/router.py | 165 + .../converters/pipeline/step1_convert.py | 784 +++ .../converters/pipeline/step2_extract.py | 789 +++ .../converters/pipeline/step3_domain.py | 265 + .../converters/pipeline/step4_chunk.py | 357 ++ .../converters/pipeline/step5_rag.py | 141 + .../converters/pipeline/step6_corpus.py | 232 + .../converters/pipeline/step7_index.py | 504 ++ .../converters/pipeline/step8_content.py | 1021 ++++ .../converters/pipeline/step9_html.py | 1249 +++++ .../geulbeot_6th/converters/style_analyzer.py | 935 ++++ 03. Code/geulbeot_6th/handlers/__init__.py | 5 + .../handlers/briefing/__init__.py | 5 + .../handlers/briefing/processor.py | 279 + .../briefing/prompts/step1_5_plan.txt | 104 + .../briefing/prompts/step1_extract.txt | 122 + .../briefing/prompts/step2_generate.txt | 440 ++ 03. 
Code/geulbeot_6th/handlers/common.py | 84 + .../geulbeot_6th/handlers/report/__init__.py | 5 + .../geulbeot_6th/handlers/report/processor.py | 161 + .../report/prompts/refine_selection.txt | 104 + .../handlers/template/__init__.py | 3 + .../handlers/template/processor.py | 625 +++ .../template/prompts/analyze_template.txt | 28 + .../output/assets/1_1_1_img01.png | Bin 0 -> 120519 bytes .../output/assets/1_1_1_img02.png | Bin 0 -> 36607 bytes .../output/assets/1_1_1_img03.png | Bin 0 -> 98658 bytes .../output/assets/1_1_2_img01.png | Bin 0 -> 120519 bytes .../output/assets/1_1_2_img02.png | Bin 0 -> 36607 bytes .../output/assets/1_1_2_img03.png | Bin 0 -> 170732 bytes .../output/assets/1_1_3_img01.png | Bin 0 -> 261943 bytes .../output/assets/1_1_3_img02.png | Bin 0 -> 85636 bytes .../output/assets/1_2_1_img03.png | Bin 0 -> 72039 bytes .../output/assets/1_2_2_img01.png | Bin 0 -> 4970 bytes .../output/assets/1_2_2_img02.png | Bin 0 -> 5161 bytes .../output/assets/1_2_2_img03.png | Bin 0 -> 172819 bytes .../geulbeot_6th/prompts/step1_5_plan.txt | 104 + .../geulbeot_6th/prompts/step1_extract.txt | 122 + .../geulbeot_6th/prompts/step2_generate.txt | 440 ++ 03. Code/geulbeot_6th/requirements.txt | 5 + 03. Code/geulbeot_6th/static/css/editor.css | 297 ++ 03. Code/geulbeot_6th/static/js/editor.js | 1208 +++++ .../geulbeot_6th/templates/hwp_guide.html | 343 ++ 03. Code/geulbeot_6th/templates/index.html | 2975 +++++++++++ 03. Code/geulbeot_7th/.env.sample | 7 + 03. Code/geulbeot_7th/.gitignore | 32 + 03. Code/geulbeot_7th/7th.zip | Bin 0 -> 1465968 bytes 03. Code/geulbeot_7th/Procfile | 1 + 03. Code/geulbeot_7th/README.md | 291 + 03. Code/geulbeot_7th/api_config.py | 30 + 03. Code/geulbeot_7th/app.py | 355 ++ 03. 
Code/geulbeot_7th/converters/__init__.py | 0 .../geulbeot_7th/converters/html_to_hwp.py | 1123 ++++ .../converters/html_to_hwp_briefing.py | 616 +++ .../converters/hwp_style_mapping.py | 434 ++ .../geulbeot_7th/converters/hwpx_generator.py | 431 ++ .../converters/hwpx_style_injector.py | 750 +++ .../converters/hwpx_table_injector.py | 174 + .../converters/pipeline/__init__.py | 1 + .../converters/pipeline/router.py | 165 + .../converters/pipeline/step1_convert.py | 784 +++ .../converters/pipeline/step2_extract.py | 789 +++ .../converters/pipeline/step3_domain.py | 265 + .../converters/pipeline/step4_chunk.py | 357 ++ .../converters/pipeline/step5_rag.py | 141 + .../converters/pipeline/step6_corpus.py | 232 + .../converters/pipeline/step7_index.py | 504 ++ .../converters/pipeline/step8_content.py | 1021 ++++ .../converters/pipeline/step9_html.py | 1249 +++++ .../geulbeot_7th/converters/style_analyzer.py | 935 ++++ 03. Code/geulbeot_7th/handlers/__init__.py | 5 + .../handlers/briefing/__init__.py | 5 + .../handlers/briefing/processor.py | 279 + .../briefing/prompts/step1_5_plan.txt | 104 + .../briefing/prompts/step1_extract.txt | 122 + .../briefing/prompts/step2_generate.txt | 440 ++ 03. 
Code/geulbeot_7th/handlers/common.py | 84 + .../geulbeot_7th/handlers/report/__init__.py | 5 + .../geulbeot_7th/handlers/report/processor.py | 161 + .../report/prompts/refine_selection.txt | 104 + .../handlers/template/__init__.py | 3 + .../handlers/template/processor.py | 625 +++ .../template/prompts/analyze_template.txt | 28 + .../output/assets/1_1_1_img01.png | Bin 0 -> 120519 bytes .../output/assets/1_1_1_img02.png | Bin 0 -> 36607 bytes .../output/assets/1_1_1_img03.png | Bin 0 -> 98658 bytes .../output/assets/1_1_2_img01.png | Bin 0 -> 120519 bytes .../output/assets/1_1_2_img02.png | Bin 0 -> 36607 bytes .../output/assets/1_1_2_img03.png | Bin 0 -> 170732 bytes .../output/assets/1_1_3_img01.png | Bin 0 -> 261943 bytes .../output/assets/1_1_3_img02.png | Bin 0 -> 85636 bytes .../output/assets/1_2_1_img03.png | Bin 0 -> 72039 bytes .../output/assets/1_2_2_img01.png | Bin 0 -> 4970 bytes .../output/assets/1_2_2_img02.png | Bin 0 -> 5161 bytes .../output/assets/1_2_2_img03.png | Bin 0 -> 172819 bytes .../geulbeot_7th/prompts/step1_5_plan.txt | 104 + .../geulbeot_7th/prompts/step1_extract.txt | 122 + .../geulbeot_7th/prompts/step2_generate.txt | 440 ++ 03. Code/geulbeot_7th/requirements.txt | 5 + 03. Code/geulbeot_7th/static/css/editor.css | 297 ++ 03. Code/geulbeot_7th/static/js/editor.js | 1208 +++++ .../geulbeot_7th/templates/hwp_guide.html | 343 ++ 03. Code/geulbeot_7th/templates/index.html | 3401 ++++++++++++ 03. Code/geulbeot_8th/.env.sample | 7 + 03. Code/geulbeot_8th/.gitignore | 32 + 03. Code/geulbeot_8th/Procfile | 1 + 03. Code/geulbeot_8th/README.md | 446 ++ 03. Code/geulbeot_8th/api_config.py | 30 + 03. Code/geulbeot_8th/app.py | 683 +++ 03. 
Code/geulbeot_8th/converters/__init__.py | 0 .../geulbeot_8th/converters/html_to_hwp.py | 1123 ++++ .../converters/html_to_hwp_briefing.py | 616 +++ .../converters/hwp_style_mapping.py | 434 ++ .../geulbeot_8th/converters/hwpx_generator.py | 431 ++ .../converters/hwpx_style_injector.py | 750 +++ .../converters/hwpx_table_injector.py | 174 + .../converters/pipeline/__init__.py | 1 + .../converters/pipeline/router.py | 165 + .../converters/pipeline/step1_convert.py | 784 +++ .../converters/pipeline/step2_extract.py | 789 +++ .../converters/pipeline/step3_domain.py | 265 + .../converters/pipeline/step4_chunk.py | 357 ++ .../converters/pipeline/step5_rag.py | 141 + .../converters/pipeline/step6_corpus.py | 232 + .../converters/pipeline/step7_index.py | 504 ++ .../converters/pipeline/step8_content.py | 1021 ++++ .../converters/pipeline/step9_html.py | 1249 +++++ .../geulbeot_8th/converters/style_analyzer.py | 935 ++++ 03. Code/geulbeot_8th/domain/__init__.py | 0 03. Code/geulbeot_8th/domain/hwpx/__init__.py | 0 .../domain/hwpx/hwpx_domain_guide.md | 769 +++ .../geulbeot_8th/domain/hwpx/hwpx_utils.py | 323 ++ 03. Code/geulbeot_8th/handlers/__init__.py | 7 + .../handlers/briefing/__init__.py | 5 + .../handlers/briefing/processor.py | 279 + .../briefing/prompts/step1_5_plan.txt | 104 + .../briefing/prompts/step1_extract.txt | 122 + .../briefing/prompts/step2_generate.txt | 440 ++ 03. 
Code/geulbeot_8th/handlers/common.py | 84 + .../geulbeot_8th/handlers/content_analyzer.py | 640 +++ .../geulbeot_8th/handlers/custom_doc_type.py | 555 ++ .../handlers/doc_template_analyzer.py | 150 + .../handlers/doc_type_analyzer.py | 1058 ++++ .../geulbeot_8th/handlers/report/__init__.py | 5 + .../geulbeot_8th/handlers/report/processor.py | 161 + .../report/prompts/refine_selection.txt | 104 + .../geulbeot_8th/handlers/semantic_mapper.py | 382 ++ .../geulbeot_8th/handlers/style_generator.py | 824 +++ .../handlers/template/__init__.py | 3 + .../template/html_table_template_css.txt | 1442 +++++ .../handlers/template/processor.py | 625 +++ .../template/prompts/analyze_template.txt | 28 + .../geulbeot_8th/handlers/template_manager.py | 1008 ++++ .../geulbeot_8th/handlers/tools/__init__.py | 51 + .../handlers/tools/border_fill.py | 127 + .../geulbeot_8th/handlers/tools/char_style.py | 133 + .../handlers/tools/content_order.py | 529 ++ 03. Code/geulbeot_8th/handlers/tools/font.py | 82 + .../handlers/tools/header_footer.py | 200 + 03. Code/geulbeot_8th/handlers/tools/image.py | 98 + .../geulbeot_8th/handlers/tools/numbering.py | 136 + .../geulbeot_8th/handlers/tools/page_setup.py | 110 + .../geulbeot_8th/handlers/tools/para_style.py | 185 + .../geulbeot_8th/handlers/tools/section.py | 120 + .../geulbeot_8th/handlers/tools/style_def.py | 68 + 03. 
Code/geulbeot_8th/handlers/tools/table.py | 328 ++ .../output/assets/1_1_1_img01.png | Bin 0 -> 120519 bytes .../output/assets/1_1_1_img02.png | Bin 0 -> 36607 bytes .../output/assets/1_1_1_img03.png | Bin 0 -> 98658 bytes .../output/assets/1_1_2_img01.png | Bin 0 -> 120519 bytes .../output/assets/1_1_2_img02.png | Bin 0 -> 36607 bytes .../output/assets/1_1_2_img03.png | Bin 0 -> 170732 bytes .../output/assets/1_1_3_img01.png | Bin 0 -> 261943 bytes .../output/assets/1_1_3_img02.png | Bin 0 -> 85636 bytes .../output/assets/1_2_1_img03.png | Bin 0 -> 72039 bytes .../output/assets/1_2_2_img01.png | Bin 0 -> 4970 bytes .../output/assets/1_2_2_img02.png | Bin 0 -> 5161 bytes .../output/assets/1_2_2_img03.png | Bin 0 -> 172819 bytes .../geulbeot_8th/prompts/step1_5_plan.txt | 104 + .../geulbeot_8th/prompts/step1_extract.txt | 122 + .../geulbeot_8th/prompts/step2_generate.txt | 440 ++ 03. Code/geulbeot_8th/requirements.txt | 5 + 03. Code/geulbeot_8th/static/css/editor.css | 297 ++ 03. Code/geulbeot_8th/static/js/editor.js | 1208 +++++ .../default/doc_types/briefing/config.json | 26 + .../doc_types/presentation/config.json | 27 + .../default/doc_types/report/config.json | 26 + 03. Code/geulbeot_8th/templates/hwp_guide.md | 302 ++ .../templates/hwp_html_defaults.json | 116 + 03. Code/geulbeot_8th/templates/index.html | 3766 +++++++++++++ .../doc_types/user_1770300969/config.json | 165 + .../user_1770300969/content_prompt.json | 267 + .../doc_types/user_1770301063/config.json | 184 + .../user_1770301063/content_prompt.json | 295 ++ .../user/templates/tpl_1770300969/meta.json | 15 + .../tpl_1770300969/semantic_map.json | 222 + .../user/templates/tpl_1770300969/style.json | 4688 +++++++++++++++++ .../templates/tpl_1770300969/template.html | 590 +++ .../user/templates/tpl_1770301063/meta.json | 13 + .../tpl_1770301063/semantic_map.json | 94 + .../user/templates/tpl_1770301063/style.json | 3355 ++++++++++++ .../templates/tpl_1770301063/template.html | 507 ++ 03. 
Code/geulbeot_9th/.env.sample | 7 + 03. Code/geulbeot_9th/.gitignore | 32 + .../0206용/report_2026-02-05 (10).html | 588 +++ .../0206용/report_2026-02-06.html | 250 + .../geulbeot_9th/0206용/report_2026-02-06.hwp | Bin 0 -> 888749 bytes 03. Code/geulbeot_9th/Procfile | 1 + 03. Code/geulbeot_9th/README.md | 389 ++ 03. Code/geulbeot_9th/api_config.py | 30 + 03. Code/geulbeot_9th/app.py | 683 +++ 03. Code/geulbeot_9th/converters/__init__.py | 0 .../geulbeot_9th/converters/html_to_hwp.py | 1123 ++++ .../converters/html_to_hwp_briefing.py | 616 +++ .../converters/hwp_style_mapping.py | 434 ++ .../geulbeot_9th/converters/hwpx_generator.py | 431 ++ .../converters/hwpx_style_injector.py | 750 +++ .../converters/hwpx_table_injector.py | 174 + .../converters/pipeline/__init__.py | 1 + .../converters/pipeline/router.py | 165 + .../converters/pipeline/step1_convert.py | 784 +++ .../converters/pipeline/step2_extract.py | 789 +++ .../converters/pipeline/step3_domain.py | 265 + .../converters/pipeline/step4_chunk.py | 357 ++ .../converters/pipeline/step5_rag.py | 141 + .../converters/pipeline/step6_corpus.py | 232 + .../converters/pipeline/step7_index.py | 504 ++ .../converters/pipeline/step8_content.py | 1021 ++++ .../converters/pipeline/step9_html.py | 1249 +++++ .../geulbeot_9th/converters/style_analyzer.py | 935 ++++ 03. Code/geulbeot_9th/domain/__init__.py | 0 03. Code/geulbeot_9th/domain/hwpx/__init__.py | 0 .../domain/hwpx/hwpx_domain_guide.md | 769 +++ .../geulbeot_9th/domain/hwpx/hwpx_utils.py | 323 ++ 03. Code/geulbeot_9th/handlers/__init__.py | 7 + .../handlers/briefing/__init__.py | 5 + .../handlers/briefing/processor.py | 279 + .../briefing/prompts/step1_5_plan.txt | 104 + .../briefing/prompts/step1_extract.txt | 122 + .../briefing/prompts/step2_generate.txt | 440 ++ 03. 
Code/geulbeot_9th/handlers/common.py | 84 + .../geulbeot_9th/handlers/content_analyzer.py | 640 +++ .../geulbeot_9th/handlers/custom_doc_type.py | 555 ++ .../handlers/doc_template_analyzer.py | 150 + .../handlers/doc_type_analyzer.py | 1058 ++++ .../geulbeot_9th/handlers/report/__init__.py | 5 + .../geulbeot_9th/handlers/report/processor.py | 161 + .../report/prompts/refine_selection.txt | 104 + .../geulbeot_9th/handlers/semantic_mapper.py | 382 ++ .../geulbeot_9th/handlers/style_generator.py | 824 +++ .../handlers/template/__init__.py | 3 + .../template/html_table_template_css.txt | 1442 +++++ .../handlers/template/processor.py | 625 +++ .../template/prompts/analyze_template.txt | 28 + .../geulbeot_9th/handlers/template_manager.py | 1010 ++++ .../geulbeot_9th/handlers/tools/__init__.py | 51 + .../handlers/tools/border_fill.py | 127 + .../geulbeot_9th/handlers/tools/char_style.py | 133 + .../handlers/tools/content_order.py | 550 ++ 03. Code/geulbeot_9th/handlers/tools/font.py | 82 + .../handlers/tools/header_footer.py | 200 + 03. Code/geulbeot_9th/handlers/tools/image.py | 98 + .../geulbeot_9th/handlers/tools/numbering.py | 136 + .../geulbeot_9th/handlers/tools/page_setup.py | 110 + .../geulbeot_9th/handlers/tools/para_style.py | 185 + .../geulbeot_9th/handlers/tools/section.py | 120 + .../geulbeot_9th/handlers/tools/style_def.py | 68 + 03. 
Code/geulbeot_9th/handlers/tools/table.py | 328 ++ .../output/assets/1_1_1_img01.png | Bin 0 -> 120519 bytes .../output/assets/1_1_1_img02.png | Bin 0 -> 36607 bytes .../output/assets/1_1_1_img03.png | Bin 0 -> 98658 bytes .../output/assets/1_1_2_img01.png | Bin 0 -> 120519 bytes .../output/assets/1_1_2_img02.png | Bin 0 -> 36607 bytes .../output/assets/1_1_2_img03.png | Bin 0 -> 170732 bytes .../output/assets/1_1_3_img01.png | Bin 0 -> 261943 bytes .../output/assets/1_1_3_img02.png | Bin 0 -> 85636 bytes .../output/assets/1_2_1_img03.png | Bin 0 -> 72039 bytes .../output/assets/1_2_2_img01.png | Bin 0 -> 4970 bytes .../output/assets/1_2_2_img02.png | Bin 0 -> 5161 bytes .../output/assets/1_2_2_img03.png | Bin 0 -> 172819 bytes .../geulbeot_9th/prompts/step1_5_plan.txt | 104 + .../geulbeot_9th/prompts/step1_extract.txt | 122 + .../geulbeot_9th/prompts/step2_generate.txt | 440 ++ 03. Code/geulbeot_9th/requirements.txt | 5 + 03. Code/geulbeot_9th/static/css/editor.css | 297 ++ 03. Code/geulbeot_9th/static/js/editor.js | 1208 +++++ .../default/doc_types/briefing/config.json | 26 + .../doc_types/presentation/config.json | 27 + .../default/doc_types/report/config.json | 26 + 03. Code/geulbeot_9th/templates/hwp_guide.md | 302 ++ .../templates/hwp_html_defaults.json | 116 + 03. Code/geulbeot_9th/templates/index.html | 3764 +++++++++++++ .../doc_types/user_1770335603/config.json | 165 + .../user_1770335603/content_prompt.json | 267 + .../user/templates/tpl_1770333144/meta.json | 15 + .../tpl_1770333144/semantic_map.json | 222 + .../user/templates/tpl_1770333144/style.json | 4688 +++++++++++++++++ .../templates/tpl_1770333144/template.html | 590 +++ .../user/templates/tpl_1770335603/meta.json | 15 + .../tpl_1770335603/semantic_map.json | 222 + .../user/templates/tpl_1770335603/style.json | 4688 +++++++++++++++++ .../templates/tpl_1770335603/template.html | 590 +++ .../geulbeot_9th/기준 프롬프트(0206_0706).txt | 486 ++ 687 files changed, 205247 insertions(+) create mode 100644 02. 
Prompts/문서생성/codedomain/국토일보_한건신문_Python_v01.py create mode 100644 02. Prompts/문서생성/codedomain/날짜_형식을_Python_v01.py create mode 100644 02. Prompts/문서생성/codedomain/다음_로우데이터를_Python_v01.py create mode 100644 02. Prompts/문서생성/codedomain/단위일_가능성_Python_v01.py create mode 100644 02. Prompts/문서생성/codedomain/단일_기사_Python_v01.py create mode 100644 02. Prompts/문서생성/codedomain/당신은_보고서_Python_v01.py create mode 100644 02. Prompts/문서생성/codedomain/로그_전체_Python_v01.py create mode 100644 02. Prompts/문서생성/codedomain/로그_파일을_Python_v01.py create mode 100644 02. Prompts/문서생성/codedomain/리스트_페이지_Python_v01.py create mode 100644 02. Prompts/문서생성/codedomain/멀티라인_대응_Python_v01.py create mode 100644 02. Prompts/문서생성/codedomain/미분류_과업_Python_v01.py create mode 100644 02. Prompts/문서생성/codedomain/법령_지침_Python_v01.py create mode 100644 02. Prompts/문서생성/codedomain/보고서_섹션에_Python_v01.py create mode 100644 02. Prompts/문서생성/codedomain/사이트별_함수_Python_v01.py create mode 100644 02. Prompts/문서생성/codedomain/설명이_없습니다_Python_v01.py create mode 100644 02. Prompts/문서생성/codedomain/설정_브라우저가_Python_v01.py create mode 100644 02. Prompts/문서생성/codedomain/수식_자체가_Python_v01.py create mode 100644 02. Prompts/문서생성/codedomain/수식_주소를_Python_v01.py create mode 100644 02. Prompts/문서생성/codedomain/수식을_가져오기_Python_v01.py create mode 100644 02. Prompts/문서생성/codedomain/아래_디자인_Python_v01.py create mode 100644 02. Prompts/문서생성/codedomain/엔지니어링데일리_기자_Python_v01.py create mode 100644 02. Prompts/문서생성/codedomain/엔티티_불필요한_Python_v01.py create mode 100644 02. Prompts/문서생성/codedomain/인증서_검증_Python_v01.py create mode 100644 02. Prompts/문서생성/codedomain/카테고리_내용_Python_v01.py create mode 100644 02. Prompts/문서생성/codedomain/커버_슬라이드_Python_v01.py create mode 100644 02. Prompts/문서생성/codedomain/합계_기준_Python_v01.py create mode 100644 02. Prompts/문서생성/domain/도메인_문서생성_가계_고금리_v01.md create mode 100644 02. Prompts/문서생성/domain/도메인_문서생성_경제_진단_v01.md create mode 100644 02. Prompts/문서생성/domain/도메인_문서생성_구조적_요인_v01.md create mode 100644 02. 
Prompts/문서생성/domain/도메인_문서생성_굵게_기울임_v01.md create mode 100644 02. Prompts/문서생성/domain/도메인_문서생성_그림_캡션_v01.md create mode 100644 02. Prompts/문서생성/domain/도메인_문서생성_그림은_전체폭이며_v01.md create mode 100644 02. Prompts/문서생성/domain/도메인_문서생성_글로벌_경제_v01.md create mode 100644 02. Prompts/문서생성/domain/도메인_문서생성_날짜_호수_v01.md create mode 100644 02. Prompts/문서생성/domain/도메인_문서생성_대략_색감만_v01.md create mode 100644 02. Prompts/문서생성/domain/도메인_문서생성_대제목_전체폭_v01.md create mode 100644 02. Prompts/문서생성/domain/도메인_문서생성_리드문_리드문_v01.md create mode 100644 02. Prompts/문서생성/domain/도메인_문서생성_리스크_요인_v01.md create mode 100644 02. Prompts/문서생성/domain/도메인_문서생성_마지막으로_로드한_v01.md create mode 100644 02. Prompts/문서생성/domain/도메인_문서생성_메시지_처리_v01.md create mode 100644 02. Prompts/문서생성/domain/도메인_문서생성_목차_목차_v01.md create mode 100644 02. Prompts/문서생성/domain/도메인_문서생성_문서_편집기_v01.md create mode 100644 02. Prompts/문서생성/domain/도메인_문서생성_물가와_금리_v01.md create mode 100644 02. Prompts/문서생성/domain/도메인_문서생성_미분류_과업_v01.md create mode 100644 02. Prompts/문서생성/domain/도메인_문서생성_본고딕_맑은_v01.md create mode 100644 02. Prompts/문서생성/domain/도메인_문서생성_본문_다단_v01.md create mode 100644 02. Prompts/문서생성/domain/도메인_문서생성_본문_대제목_v01.md create mode 100644 02. Prompts/문서생성/domain/도메인_문서생성_본문_레이아웃_v01.md create mode 100644 02. Prompts/문서생성/domain/도메인_문서생성_본문_본문_v01.md create mode 100644 02. Prompts/문서생성/domain/도메인_문서생성_소제목은_내부_v01.md create mode 100644 02. Prompts/문서생성/domain/도메인_문서생성_수출과_산업_v01.md create mode 100644 02. Prompts/문서생성/domain/도메인_문서생성_양식_목록_v01.md create mode 100644 02. Prompts/문서생성/domain/도메인_문서생성_양식_선택_v01.md create mode 100644 02. Prompts/문서생성/domain/도메인_문서생성_양식_추가는_v01.md create mode 100644 02. Prompts/문서생성/domain/도메인_문서생성_에서_검증된_v01.md create mode 100644 02. Prompts/문서생성/domain/도메인_문서생성_요약_박스_v01.md create mode 100644 02. Prompts/문서생성/domain/도메인_문서생성_요약_요약_v01.md create mode 100644 02. Prompts/문서생성/domain/도메인_문서생성_요즘_경제_v01.md create mode 100644 02. Prompts/문서생성/domain/도메인_문서생성_인쇄_모드_v01.md create mode 100644 02. 
Prompts/문서생성/domain/도메인_문서생성_읽기_전용_v01.md create mode 100644 02. Prompts/문서생성/domain/도메인_문서생성_자료_출처_v01.md create mode 100644 02. Prompts/문서생성/domain/도메인_문서생성_전문적인_네이비_v01.md create mode 100644 02. Prompts/문서생성/domain/도메인_문서생성_정리_위기는_v01.md create mode 100644 02. Prompts/문서생성/domain/도메인_문서생성_제목_강조_v01.md create mode 100644 02. Prompts/문서생성/domain/도메인_문서생성_제목_뒤에서_v01.md create mode 100644 02. Prompts/문서생성/domain/도메인_문서생성_제목만_덩그러니_v01.md create mode 100644 02. Prompts/문서생성/domain/도메인_문서생성_줄바꿈_다음줄_v01.md create mode 100644 02. Prompts/문서생성/domain/도메인_문서생성_지금_경제_v01.md create mode 100644 02. Prompts/문서생성/domain/도메인_문서생성_타이포그래피_설정_v01.md create mode 100644 02. Prompts/문서생성/domain/도메인_문서생성_투자자_불확실성_v01.md create mode 100644 02. Prompts/문서생성/domain/도메인_문서생성_파일을_놓으세요_v01.md create mode 100644 02. Prompts/문서생성/domain/도메인_문서생성_파일을_먼저_v01.md create mode 100644 02. Prompts/문서생성/domain/도메인_문서생성_페이지_끝에_v01.md create mode 100644 02. Prompts/문서생성/domain/도메인_문서생성_페이지_분할_v01.md create mode 100644 02. Prompts/문서생성/domain/도메인_문서생성_표지_날짜_v01.md create mode 100644 02. Prompts/문서생성/domain/도메인_문서생성_표지_목차_v01.md create mode 100644 02. Prompts/문서생성/domain/도메인_문서생성_표지_특집_v01.md create mode 100644 02. Prompts/문서생성/domain/도메인_문서생성_표지나_특정_v01.md create mode 100644 02. Prompts/문서생성/domain/도메인_문서생성_푸터_출처_v01.md create mode 100644 02. Prompts/문서생성/domain/도메인_문서생성_한국_경제_v01.md create mode 100644 02. Prompts/문서생성/domain/도메인_문서생성_화면_확인용_v01.md create mode 100644 02. Prompts/문서생성/domain/도메인_문서생성_화면에서_처럼_v01.md create mode 100644 02. Prompts/문서생성/exclude/unknown_v01.txt create mode 100644 02. Prompts/문서생성/prompt/GPT_문서생성_다음을_한국어로_v01.md create mode 100644 02. Prompts/문서생성/prompt/GPT_문서생성_미분류_과업_v01.md create mode 100644 02. Prompts/문서생성/prompt/GPT_문서생성_아래_내용을_v01.md create mode 100644 03. Code/geulbeot_10th/.env.sample create mode 100644 03. Code/geulbeot_10th/.gitignore create mode 100644 03. Code/geulbeot_10th/00.test/hwpx/out/out/context/domain_prompt.txt create mode 100644 03. 
Code/geulbeot_10th/Procfile create mode 100644 03. Code/geulbeot_10th/README.md create mode 100644 03. Code/geulbeot_10th/api_config.py create mode 100644 03. Code/geulbeot_10th/app.py create mode 100644 03. Code/geulbeot_10th/converters/__init__.py create mode 100644 03. Code/geulbeot_10th/converters/html_to_hwp.py create mode 100644 03. Code/geulbeot_10th/converters/html_to_hwp_briefing.py create mode 100644 03. Code/geulbeot_10th/converters/hwp_style_mapping.py create mode 100644 03. Code/geulbeot_10th/converters/hwpx_generator.py create mode 100644 03. Code/geulbeot_10th/converters/hwpx_style_injector.py create mode 100644 03. Code/geulbeot_10th/converters/hwpx_table_injector.py create mode 100644 03. Code/geulbeot_10th/converters/pipeline/__init__.py create mode 100644 03. Code/geulbeot_10th/converters/pipeline/router.py create mode 100644 03. Code/geulbeot_10th/converters/pipeline/step1_convert.py create mode 100644 03. Code/geulbeot_10th/converters/pipeline/step2_extract.py create mode 100644 03. Code/geulbeot_10th/converters/pipeline/step3_domain.py create mode 100644 03. Code/geulbeot_10th/converters/pipeline/step4_chunk.py create mode 100644 03. Code/geulbeot_10th/converters/pipeline/step5_rag.py create mode 100644 03. Code/geulbeot_10th/converters/pipeline/step6_corpus.py create mode 100644 03. Code/geulbeot_10th/converters/pipeline/step7_index.py create mode 100644 03. Code/geulbeot_10th/converters/pipeline/step8_content.py create mode 100644 03. Code/geulbeot_10th/converters/pipeline/step9_html.py create mode 100644 03. Code/geulbeot_10th/converters/style_analyzer.py create mode 100644 03. Code/geulbeot_10th/domain/__init__.py create mode 100644 03. Code/geulbeot_10th/domain/civil/dx.txt create mode 100644 03. Code/geulbeot_10th/domain/civil/general.txt create mode 100644 03. Code/geulbeot_10th/domain/civil/specialties/anlysis.txt create mode 100644 03. Code/geulbeot_10th/domain/civil/specialties/bim.txt create mode 100644 03. 
Code/geulbeot_10th/domain/civil/specialties/bridge.txt create mode 100644 03. Code/geulbeot_10th/domain/civil/specialties/communication.txt create mode 100644 03. Code/geulbeot_10th/domain/civil/specialties/construction.txt create mode 100644 03. Code/geulbeot_10th/domain/civil/specialties/geotechnical.txt create mode 100644 03. Code/geulbeot_10th/domain/civil/specialties/planning.txt create mode 100644 03. Code/geulbeot_10th/domain/civil/specialties/quality_env.txt create mode 100644 03. Code/geulbeot_10th/domain/civil/specialties/road.txt create mode 100644 03. Code/geulbeot_10th/domain/civil/specialties/safety.txt create mode 100644 03. Code/geulbeot_10th/domain/civil/specialties/schedule_cost.txt create mode 100644 03. Code/geulbeot_10th/domain/civil/specialties/structure.txt create mode 100644 03. Code/geulbeot_10th/domain/civil/specialties/survey.txt create mode 100644 03. Code/geulbeot_10th/domain/civil/specialties/tunnel.txt create mode 100644 03. Code/geulbeot_10th/domain/hwpx/__init__.py create mode 100644 03. Code/geulbeot_10th/domain/hwpx/hwpx_domain_guide.md create mode 100644 03. Code/geulbeot_10th/domain/hwpx/hwpx_utils.py create mode 100644 03. Code/geulbeot_10th/domain/report_guide/domain_prompt.txt create mode 100644 03. Code/geulbeot_10th/domain/report_guide/outline_issue_report.txt create mode 100644 03. Code/geulbeot_10th/domain/report_guide/report_guide.txt create mode 100644 03. Code/geulbeot_10th/domain_api.py create mode 100644 03. Code/geulbeot_10th/domain_config.json create mode 100644 03. Code/geulbeot_10th/handlers/__init__.py create mode 100644 03. Code/geulbeot_10th/handlers/briefing/__init__.py create mode 100644 03. Code/geulbeot_10th/handlers/briefing/processor.py create mode 100644 03. Code/geulbeot_10th/handlers/briefing/prompts/step1_5_plan.txt create mode 100644 03. Code/geulbeot_10th/handlers/briefing/prompts/step1_extract.txt create mode 100644 03. 
Code/geulbeot_10th/handlers/briefing/prompts/step2_generate.txt create mode 100644 03. Code/geulbeot_10th/handlers/common.py create mode 100644 03. Code/geulbeot_10th/handlers/doc/__init__.py create mode 100644 03. Code/geulbeot_10th/handlers/doc/content_analyzer.py create mode 100644 03. Code/geulbeot_10th/handlers/doc/custom_doc_type.py create mode 100644 03. Code/geulbeot_10th/handlers/doc/doc_type_analyzer.py create mode 100644 03. Code/geulbeot_10th/handlers/report/__init__.py create mode 100644 03. Code/geulbeot_10th/handlers/report/processor.py create mode 100644 03. Code/geulbeot_10th/handlers/report/prompts/refine_selection.txt create mode 100644 03. Code/geulbeot_10th/handlers/template/__init__.py create mode 100644 03. Code/geulbeot_10th/handlers/template/doc_template_analyzer.py create mode 100644 03. Code/geulbeot_10th/handlers/template/html_table_template_css.txt create mode 100644 03. Code/geulbeot_10th/handlers/template/processor.py create mode 100644 03. Code/geulbeot_10th/handlers/template/prompts/analyze_template.txt create mode 100644 03. Code/geulbeot_10th/handlers/template/semantic_mapper.py create mode 100644 03. Code/geulbeot_10th/handlers/template/style_generator.py create mode 100644 03. Code/geulbeot_10th/handlers/template/template_manager.py create mode 100644 03. Code/geulbeot_10th/handlers/template/tools/__init__.py create mode 100644 03. Code/geulbeot_10th/handlers/template/tools/border_fill.py create mode 100644 03. Code/geulbeot_10th/handlers/template/tools/char_style.py create mode 100644 03. Code/geulbeot_10th/handlers/template/tools/content_order.py create mode 100644 03. Code/geulbeot_10th/handlers/template/tools/font.py create mode 100644 03. Code/geulbeot_10th/handlers/template/tools/header_footer.py create mode 100644 03. Code/geulbeot_10th/handlers/template/tools/image.py create mode 100644 03. Code/geulbeot_10th/handlers/template/tools/numbering.py create mode 100644 03. 
Code/geulbeot_10th/handlers/template/tools/page_setup.py create mode 100644 03. Code/geulbeot_10th/handlers/template/tools/para_style.py create mode 100644 03. Code/geulbeot_10th/handlers/template/tools/section.py create mode 100644 03. Code/geulbeot_10th/handlers/template/tools/style_def.py create mode 100644 03. Code/geulbeot_10th/handlers/template/tools/table.py create mode 100644 03. Code/geulbeot_10th/static/css/editor.css create mode 100644 03. Code/geulbeot_10th/static/css/main.css create mode 100644 03. Code/geulbeot_10th/static/js/ai_edit.js create mode 100644 03. Code/geulbeot_10th/static/js/demo_mode.js create mode 100644 03. Code/geulbeot_10th/static/js/doc_type.js create mode 100644 03. Code/geulbeot_10th/static/js/domain_selector.js create mode 100644 03. Code/geulbeot_10th/static/js/editor.js create mode 100644 03. Code/geulbeot_10th/static/js/export.js create mode 100644 03. Code/geulbeot_10th/static/js/generator.js create mode 100644 03. Code/geulbeot_10th/static/js/modals.js create mode 100644 03. Code/geulbeot_10th/static/js/template.js create mode 100644 03. Code/geulbeot_10th/static/js/ui.js create mode 100644 03. Code/geulbeot_10th/static/result/brief_1.html create mode 100644 03. Code/geulbeot_10th/static/result/brief_2.html create mode 100644 03. Code/geulbeot_10th/static/result/report.html create mode 100644 03. Code/geulbeot_10th/static/result/slide.html create mode 100644 03. Code/geulbeot_10th/templates/default/doc_types/briefing/config.json create mode 100644 03. Code/geulbeot_10th/templates/default/doc_types/presentation/config.json create mode 100644 03. Code/geulbeot_10th/templates/default/doc_types/report/config.json create mode 100644 03. Code/geulbeot_10th/templates/hwp_guide.md create mode 100644 03. Code/geulbeot_10th/templates/hwp_html_defaults.json create mode 100644 03. Code/geulbeot_10th/templates/index.html create mode 100644 03. Code/geulbeot_10th/샘플 예시.html create mode 100644 03. 
Code/geulbeot_1st/.gitignore create mode 100644 03. Code/geulbeot_1st/Procfile create mode 100644 03. Code/geulbeot_1st/README.md create mode 100644 03. Code/geulbeot_1st/app.py create mode 100644 03. Code/geulbeot_1st/prompts/step1_5_plan.txt create mode 100644 03. Code/geulbeot_1st/prompts/step1_extract.txt create mode 100644 03. Code/geulbeot_1st/prompts/step2_generate.txt create mode 100644 03. Code/geulbeot_1st/prompts/system_prompt.txt create mode 100644 03. Code/geulbeot_1st/railway.json create mode 100644 03. Code/geulbeot_1st/requirements.txt create mode 100644 03. Code/geulbeot_1st/templates/hwp_guide.html create mode 100644 03. Code/geulbeot_1st/templates/index.html create mode 100644 03. Code/geulbeot_2nd/.gitignore create mode 100644 03. Code/geulbeot_2nd/Procfile create mode 100644 03. Code/geulbeot_2nd/README.md create mode 100644 03. Code/geulbeot_2nd/api_config.py create mode 100644 03. Code/geulbeot_2nd/app.py create mode 100644 03. Code/geulbeot_2nd/prompts/step1_5_plan.txt create mode 100644 03. Code/geulbeot_2nd/prompts/step1_extract.txt create mode 100644 03. Code/geulbeot_2nd/prompts/step2_generate.txt create mode 100644 03. Code/geulbeot_2nd/railway.json create mode 100644 03. Code/geulbeot_2nd/requirements.txt create mode 100644 03. Code/geulbeot_2nd/static/css/editor.css create mode 100644 03. Code/geulbeot_2nd/static/js/editor.js create mode 100644 03. Code/geulbeot_2nd/templates/hwp_guide.html create mode 100644 03. Code/geulbeot_2nd/templates/index.html create mode 100644 03. Code/geulbeot_3rd/.gitignore create mode 100644 03. Code/geulbeot_3rd/Procfile create mode 100644 03. Code/geulbeot_3rd/README.md create mode 100644 03. Code/geulbeot_3rd/api_config.py create mode 100644 03. Code/geulbeot_3rd/app.py create mode 100644 03. Code/geulbeot_3rd/converters/__init__.py create mode 100644 03. Code/geulbeot_3rd/converters/html_to_hwp.py create mode 100644 03. Code/geulbeot_3rd/converters/html_to_hwp_briefing.py create mode 100644 03. 
Code/geulbeot_3rd/converters/pipeline/__init__.py create mode 100644 03. Code/geulbeot_3rd/converters/pipeline/router.py create mode 100644 03. Code/geulbeot_3rd/converters/pipeline/step1_convert.py create mode 100644 03. Code/geulbeot_3rd/converters/pipeline/step2_extract.py create mode 100644 03. Code/geulbeot_3rd/converters/pipeline/step3_domain.py create mode 100644 03. Code/geulbeot_3rd/converters/pipeline/step4_chunk.py create mode 100644 03. Code/geulbeot_3rd/converters/pipeline/step5_rag.py create mode 100644 03. Code/geulbeot_3rd/converters/pipeline/step6_corpus.py create mode 100644 03. Code/geulbeot_3rd/converters/pipeline/step7_index.py create mode 100644 03. Code/geulbeot_3rd/converters/pipeline/step8_content.py create mode 100644 03. Code/geulbeot_3rd/converters/pipeline/step9_html.py create mode 100644 03. Code/geulbeot_3rd/output/assets/1_1_1_img01.png create mode 100644 03. Code/geulbeot_3rd/output/assets/1_1_1_img02.png create mode 100644 03. Code/geulbeot_3rd/output/assets/1_1_1_img03.png create mode 100644 03. Code/geulbeot_3rd/output/assets/1_1_2_img01.png create mode 100644 03. Code/geulbeot_3rd/output/assets/1_1_2_img02.png create mode 100644 03. Code/geulbeot_3rd/output/assets/1_1_2_img03.png create mode 100644 03. Code/geulbeot_3rd/output/assets/1_1_3_img01.png create mode 100644 03. Code/geulbeot_3rd/output/assets/1_1_3_img02.png create mode 100644 03. Code/geulbeot_3rd/output/assets/1_2_1_img03.png create mode 100644 03. Code/geulbeot_3rd/output/assets/1_2_2_img01.png create mode 100644 03. Code/geulbeot_3rd/output/assets/1_2_2_img02.png create mode 100644 03. Code/geulbeot_3rd/output/assets/1_2_2_img03.png create mode 100644 03. Code/geulbeot_3rd/prompts/step1_5_plan.txt create mode 100644 03. Code/geulbeot_3rd/prompts/step1_extract.txt create mode 100644 03. Code/geulbeot_3rd/prompts/step2_generate.txt create mode 100644 03. Code/geulbeot_3rd/railway.json create mode 100644 03. Code/geulbeot_3rd/requirements.txt create mode 100644 03. 
Code/geulbeot_3rd/static/css/editor.css create mode 100644 03. Code/geulbeot_3rd/static/js/editor.js create mode 100644 03. Code/geulbeot_3rd/templates/hwp_guide.html create mode 100644 03. Code/geulbeot_3rd/templates/index.html create mode 100644 03. Code/geulbeot_4th/.env.sample create mode 100644 03. Code/geulbeot_4th/.gitignore create mode 100644 03. Code/geulbeot_4th/Procfile create mode 100644 03. Code/geulbeot_4th/README.md create mode 100644 03. Code/geulbeot_4th/api_config.py create mode 100644 03. Code/geulbeot_4th/app.py create mode 100644 03. Code/geulbeot_4th/converters/__init__.py create mode 100644 03. Code/geulbeot_4th/converters/dkdl.py create mode 100644 03. Code/geulbeot_4th/converters/html_to_hwp.py create mode 100644 03. Code/geulbeot_4th/converters/html_to_hwp_briefing.py create mode 100644 03. Code/geulbeot_4th/converters/hwp_style_mapping.py create mode 100644 03. Code/geulbeot_4th/converters/hwpx_generator.py create mode 100644 03. Code/geulbeot_4th/converters/pipeline/__init__.py create mode 100644 03. Code/geulbeot_4th/converters/pipeline/router.py create mode 100644 03. Code/geulbeot_4th/converters/pipeline/step1_convert.py create mode 100644 03. Code/geulbeot_4th/converters/pipeline/step2_extract.py create mode 100644 03. Code/geulbeot_4th/converters/pipeline/step3_domain.py create mode 100644 03. Code/geulbeot_4th/converters/pipeline/step4_chunk.py create mode 100644 03. Code/geulbeot_4th/converters/pipeline/step5_rag.py create mode 100644 03. Code/geulbeot_4th/converters/pipeline/step6_corpus.py create mode 100644 03. Code/geulbeot_4th/converters/pipeline/step7_index.py create mode 100644 03. Code/geulbeot_4th/converters/pipeline/step8_content.py create mode 100644 03. Code/geulbeot_4th/converters/pipeline/step9_html.py create mode 100644 03. Code/geulbeot_4th/converters/style_analyzer.py create mode 100644 03. Code/geulbeot_4th/handlers/__init__.py create mode 100644 03. 
Code/geulbeot_4th/handlers/briefing/__init__.py create mode 100644 03. Code/geulbeot_4th/handlers/briefing/processor.py create mode 100644 03. Code/geulbeot_4th/handlers/briefing/prompts/step1_5_plan.txt create mode 100644 03. Code/geulbeot_4th/handlers/briefing/prompts/step1_extract.txt create mode 100644 03. Code/geulbeot_4th/handlers/briefing/prompts/step2_generate.txt create mode 100644 03. Code/geulbeot_4th/handlers/common.py create mode 100644 03. Code/geulbeot_4th/handlers/report/__init__.py create mode 100644 03. Code/geulbeot_4th/handlers/report/processor.py create mode 100644 03. Code/geulbeot_4th/handlers/report/prompts/refine_selection.txt create mode 100644 03. Code/geulbeot_4th/output/assets/1_1_1_img01.png create mode 100644 03. Code/geulbeot_4th/output/assets/1_1_1_img02.png create mode 100644 03. Code/geulbeot_4th/output/assets/1_1_1_img03.png create mode 100644 03. Code/geulbeot_4th/output/assets/1_1_2_img01.png create mode 100644 03. Code/geulbeot_4th/output/assets/1_1_2_img02.png create mode 100644 03. Code/geulbeot_4th/output/assets/1_1_2_img03.png create mode 100644 03. Code/geulbeot_4th/output/assets/1_1_3_img01.png create mode 100644 03. Code/geulbeot_4th/output/assets/1_1_3_img02.png create mode 100644 03. Code/geulbeot_4th/output/assets/1_2_1_img03.png create mode 100644 03. Code/geulbeot_4th/output/assets/1_2_2_img01.png create mode 100644 03. Code/geulbeot_4th/output/assets/1_2_2_img02.png create mode 100644 03. Code/geulbeot_4th/output/assets/1_2_2_img03.png create mode 100644 03. Code/geulbeot_4th/prompts/step1_5_plan.txt create mode 100644 03. Code/geulbeot_4th/prompts/step1_extract.txt create mode 100644 03. Code/geulbeot_4th/prompts/step2_generate.txt create mode 100644 03. Code/geulbeot_4th/requirements.txt create mode 100644 03. Code/geulbeot_4th/static/css/editor.css create mode 100644 03. Code/geulbeot_4th/static/js/editor.js create mode 100644 03. Code/geulbeot_4th/templates/hwp_guide.html create mode 100644 03. 
Code/geulbeot_4th/templates/index.html create mode 100644 03. Code/geulbeot_5th/.env.sample create mode 100644 03. Code/geulbeot_5th/.gitignore create mode 100644 03. Code/geulbeot_5th/Procfile create mode 100644 03. Code/geulbeot_5th/README.md create mode 100644 03. Code/geulbeot_5th/api_config.py create mode 100644 03. Code/geulbeot_5th/app.py create mode 100644 03. Code/geulbeot_5th/converters/__init__.py create mode 100644 03. Code/geulbeot_5th/converters/html_to_hwp.py create mode 100644 03. Code/geulbeot_5th/converters/html_to_hwp_briefing.py create mode 100644 03. Code/geulbeot_5th/converters/hwp_style_mapping.py create mode 100644 03. Code/geulbeot_5th/converters/hwpx_generator.py create mode 100644 03. Code/geulbeot_5th/converters/hwpx_style_injector.py create mode 100644 03. Code/geulbeot_5th/converters/hwpx_table_injector.py create mode 100644 03. Code/geulbeot_5th/converters/pipeline/__init__.py create mode 100644 03. Code/geulbeot_5th/converters/pipeline/router.py create mode 100644 03. Code/geulbeot_5th/converters/pipeline/step1_convert.py create mode 100644 03. Code/geulbeot_5th/converters/pipeline/step2_extract.py create mode 100644 03. Code/geulbeot_5th/converters/pipeline/step3_domain.py create mode 100644 03. Code/geulbeot_5th/converters/pipeline/step4_chunk.py create mode 100644 03. Code/geulbeot_5th/converters/pipeline/step5_rag.py create mode 100644 03. Code/geulbeot_5th/converters/pipeline/step6_corpus.py create mode 100644 03. Code/geulbeot_5th/converters/pipeline/step7_index.py create mode 100644 03. Code/geulbeot_5th/converters/pipeline/step8_content.py create mode 100644 03. Code/geulbeot_5th/converters/pipeline/step9_html.py create mode 100644 03. Code/geulbeot_5th/converters/style_analyzer.py create mode 100644 03. Code/geulbeot_5th/handlers/__init__.py create mode 100644 03. Code/geulbeot_5th/handlers/briefing/__init__.py create mode 100644 03. Code/geulbeot_5th/handlers/briefing/processor.py create mode 100644 03. 
Code/geulbeot_5th/handlers/briefing/prompts/step1_5_plan.txt create mode 100644 03. Code/geulbeot_5th/handlers/briefing/prompts/step1_extract.txt create mode 100644 03. Code/geulbeot_5th/handlers/briefing/prompts/step2_generate.txt create mode 100644 03. Code/geulbeot_5th/handlers/common.py create mode 100644 03. Code/geulbeot_5th/handlers/report/__init__.py create mode 100644 03. Code/geulbeot_5th/handlers/report/processor.py create mode 100644 03. Code/geulbeot_5th/handlers/report/prompts/refine_selection.txt create mode 100644 03. Code/geulbeot_5th/output/assets/1_1_1_img01.png create mode 100644 03. Code/geulbeot_5th/output/assets/1_1_1_img02.png create mode 100644 03. Code/geulbeot_5th/output/assets/1_1_1_img03.png create mode 100644 03. Code/geulbeot_5th/output/assets/1_1_2_img01.png create mode 100644 03. Code/geulbeot_5th/output/assets/1_1_2_img02.png create mode 100644 03. Code/geulbeot_5th/output/assets/1_1_2_img03.png create mode 100644 03. Code/geulbeot_5th/output/assets/1_1_3_img01.png create mode 100644 03. Code/geulbeot_5th/output/assets/1_1_3_img02.png create mode 100644 03. Code/geulbeot_5th/output/assets/1_2_1_img03.png create mode 100644 03. Code/geulbeot_5th/output/assets/1_2_2_img01.png create mode 100644 03. Code/geulbeot_5th/output/assets/1_2_2_img02.png create mode 100644 03. Code/geulbeot_5th/output/assets/1_2_2_img03.png create mode 100644 03. Code/geulbeot_5th/prompts/step1_5_plan.txt create mode 100644 03. Code/geulbeot_5th/prompts/step1_extract.txt create mode 100644 03. Code/geulbeot_5th/prompts/step2_generate.txt create mode 100644 03. Code/geulbeot_5th/requirements.txt create mode 100644 03. Code/geulbeot_5th/static/css/editor.css create mode 100644 03. Code/geulbeot_5th/static/js/editor.js create mode 100644 03. Code/geulbeot_5th/templates/hwp_guide.html create mode 100644 03. Code/geulbeot_5th/templates/index.html create mode 100644 03. Code/geulbeot_6th/.env.sample create mode 100644 03. 
Code/geulbeot_6th/.gitignore create mode 100644 03. Code/geulbeot_6th/Procfile create mode 100644 03. Code/geulbeot_6th/README.md create mode 100644 03. Code/geulbeot_6th/api_config.py create mode 100644 03. Code/geulbeot_6th/app.py create mode 100644 03. Code/geulbeot_6th/converters/__init__.py create mode 100644 03. Code/geulbeot_6th/converters/html_to_hwp.py create mode 100644 03. Code/geulbeot_6th/converters/html_to_hwp_briefing.py create mode 100644 03. Code/geulbeot_6th/converters/hwp_style_mapping.py create mode 100644 03. Code/geulbeot_6th/converters/hwpx_generator.py create mode 100644 03. Code/geulbeot_6th/converters/hwpx_style_injector.py create mode 100644 03. Code/geulbeot_6th/converters/hwpx_table_injector.py create mode 100644 03. Code/geulbeot_6th/converters/pipeline/__init__.py create mode 100644 03. Code/geulbeot_6th/converters/pipeline/router.py create mode 100644 03. Code/geulbeot_6th/converters/pipeline/step1_convert.py create mode 100644 03. Code/geulbeot_6th/converters/pipeline/step2_extract.py create mode 100644 03. Code/geulbeot_6th/converters/pipeline/step3_domain.py create mode 100644 03. Code/geulbeot_6th/converters/pipeline/step4_chunk.py create mode 100644 03. Code/geulbeot_6th/converters/pipeline/step5_rag.py create mode 100644 03. Code/geulbeot_6th/converters/pipeline/step6_corpus.py create mode 100644 03. Code/geulbeot_6th/converters/pipeline/step7_index.py create mode 100644 03. Code/geulbeot_6th/converters/pipeline/step8_content.py create mode 100644 03. Code/geulbeot_6th/converters/pipeline/step9_html.py create mode 100644 03. Code/geulbeot_6th/converters/style_analyzer.py create mode 100644 03. Code/geulbeot_6th/handlers/__init__.py create mode 100644 03. Code/geulbeot_6th/handlers/briefing/__init__.py create mode 100644 03. Code/geulbeot_6th/handlers/briefing/processor.py create mode 100644 03. Code/geulbeot_6th/handlers/briefing/prompts/step1_5_plan.txt create mode 100644 03. 
Code/geulbeot_6th/handlers/briefing/prompts/step1_extract.txt create mode 100644 03. Code/geulbeot_6th/handlers/briefing/prompts/step2_generate.txt create mode 100644 03. Code/geulbeot_6th/handlers/common.py create mode 100644 03. Code/geulbeot_6th/handlers/report/__init__.py create mode 100644 03. Code/geulbeot_6th/handlers/report/processor.py create mode 100644 03. Code/geulbeot_6th/handlers/report/prompts/refine_selection.txt create mode 100644 03. Code/geulbeot_6th/handlers/template/__init__.py create mode 100644 03. Code/geulbeot_6th/handlers/template/processor.py create mode 100644 03. Code/geulbeot_6th/handlers/template/prompts/analyze_template.txt create mode 100644 03. Code/geulbeot_6th/output/assets/1_1_1_img01.png create mode 100644 03. Code/geulbeot_6th/output/assets/1_1_1_img02.png create mode 100644 03. Code/geulbeot_6th/output/assets/1_1_1_img03.png create mode 100644 03. Code/geulbeot_6th/output/assets/1_1_2_img01.png create mode 100644 03. Code/geulbeot_6th/output/assets/1_1_2_img02.png create mode 100644 03. Code/geulbeot_6th/output/assets/1_1_2_img03.png create mode 100644 03. Code/geulbeot_6th/output/assets/1_1_3_img01.png create mode 100644 03. Code/geulbeot_6th/output/assets/1_1_3_img02.png create mode 100644 03. Code/geulbeot_6th/output/assets/1_2_1_img03.png create mode 100644 03. Code/geulbeot_6th/output/assets/1_2_2_img01.png create mode 100644 03. Code/geulbeot_6th/output/assets/1_2_2_img02.png create mode 100644 03. Code/geulbeot_6th/output/assets/1_2_2_img03.png create mode 100644 03. Code/geulbeot_6th/prompts/step1_5_plan.txt create mode 100644 03. Code/geulbeot_6th/prompts/step1_extract.txt create mode 100644 03. Code/geulbeot_6th/prompts/step2_generate.txt create mode 100644 03. Code/geulbeot_6th/requirements.txt create mode 100644 03. Code/geulbeot_6th/static/css/editor.css create mode 100644 03. Code/geulbeot_6th/static/js/editor.js create mode 100644 03. Code/geulbeot_6th/templates/hwp_guide.html create mode 100644 03. 
Code/geulbeot_6th/templates/index.html create mode 100644 03. Code/geulbeot_7th/.env.sample create mode 100644 03. Code/geulbeot_7th/.gitignore create mode 100644 03. Code/geulbeot_7th/7th.zip create mode 100644 03. Code/geulbeot_7th/Procfile create mode 100644 03. Code/geulbeot_7th/README.md create mode 100644 03. Code/geulbeot_7th/api_config.py create mode 100644 03. Code/geulbeot_7th/app.py create mode 100644 03. Code/geulbeot_7th/converters/__init__.py create mode 100644 03. Code/geulbeot_7th/converters/html_to_hwp.py create mode 100644 03. Code/geulbeot_7th/converters/html_to_hwp_briefing.py create mode 100644 03. Code/geulbeot_7th/converters/hwp_style_mapping.py create mode 100644 03. Code/geulbeot_7th/converters/hwpx_generator.py create mode 100644 03. Code/geulbeot_7th/converters/hwpx_style_injector.py create mode 100644 03. Code/geulbeot_7th/converters/hwpx_table_injector.py create mode 100644 03. Code/geulbeot_7th/converters/pipeline/__init__.py create mode 100644 03. Code/geulbeot_7th/converters/pipeline/router.py create mode 100644 03. Code/geulbeot_7th/converters/pipeline/step1_convert.py create mode 100644 03. Code/geulbeot_7th/converters/pipeline/step2_extract.py create mode 100644 03. Code/geulbeot_7th/converters/pipeline/step3_domain.py create mode 100644 03. Code/geulbeot_7th/converters/pipeline/step4_chunk.py create mode 100644 03. Code/geulbeot_7th/converters/pipeline/step5_rag.py create mode 100644 03. Code/geulbeot_7th/converters/pipeline/step6_corpus.py create mode 100644 03. Code/geulbeot_7th/converters/pipeline/step7_index.py create mode 100644 03. Code/geulbeot_7th/converters/pipeline/step8_content.py create mode 100644 03. Code/geulbeot_7th/converters/pipeline/step9_html.py create mode 100644 03. Code/geulbeot_7th/converters/style_analyzer.py create mode 100644 03. Code/geulbeot_7th/handlers/__init__.py create mode 100644 03. Code/geulbeot_7th/handlers/briefing/__init__.py create mode 100644 03. 
Code/geulbeot_7th/handlers/briefing/processor.py create mode 100644 03. Code/geulbeot_7th/handlers/briefing/prompts/step1_5_plan.txt create mode 100644 03. Code/geulbeot_7th/handlers/briefing/prompts/step1_extract.txt create mode 100644 03. Code/geulbeot_7th/handlers/briefing/prompts/step2_generate.txt create mode 100644 03. Code/geulbeot_7th/handlers/common.py create mode 100644 03. Code/geulbeot_7th/handlers/report/__init__.py create mode 100644 03. Code/geulbeot_7th/handlers/report/processor.py create mode 100644 03. Code/geulbeot_7th/handlers/report/prompts/refine_selection.txt create mode 100644 03. Code/geulbeot_7th/handlers/template/__init__.py create mode 100644 03. Code/geulbeot_7th/handlers/template/processor.py create mode 100644 03. Code/geulbeot_7th/handlers/template/prompts/analyze_template.txt create mode 100644 03. Code/geulbeot_7th/output/assets/1_1_1_img01.png create mode 100644 03. Code/geulbeot_7th/output/assets/1_1_1_img02.png create mode 100644 03. Code/geulbeot_7th/output/assets/1_1_1_img03.png create mode 100644 03. Code/geulbeot_7th/output/assets/1_1_2_img01.png create mode 100644 03. Code/geulbeot_7th/output/assets/1_1_2_img02.png create mode 100644 03. Code/geulbeot_7th/output/assets/1_1_2_img03.png create mode 100644 03. Code/geulbeot_7th/output/assets/1_1_3_img01.png create mode 100644 03. Code/geulbeot_7th/output/assets/1_1_3_img02.png create mode 100644 03. Code/geulbeot_7th/output/assets/1_2_1_img03.png create mode 100644 03. Code/geulbeot_7th/output/assets/1_2_2_img01.png create mode 100644 03. Code/geulbeot_7th/output/assets/1_2_2_img02.png create mode 100644 03. Code/geulbeot_7th/output/assets/1_2_2_img03.png create mode 100644 03. Code/geulbeot_7th/prompts/step1_5_plan.txt create mode 100644 03. Code/geulbeot_7th/prompts/step1_extract.txt create mode 100644 03. Code/geulbeot_7th/prompts/step2_generate.txt create mode 100644 03. Code/geulbeot_7th/requirements.txt create mode 100644 03. 
Code/geulbeot_7th/static/css/editor.css create mode 100644 03. Code/geulbeot_7th/static/js/editor.js create mode 100644 03. Code/geulbeot_7th/templates/hwp_guide.html create mode 100644 03. Code/geulbeot_7th/templates/index.html create mode 100644 03. Code/geulbeot_8th/.env.sample create mode 100644 03. Code/geulbeot_8th/.gitignore create mode 100644 03. Code/geulbeot_8th/Procfile create mode 100644 03. Code/geulbeot_8th/README.md create mode 100644 03. Code/geulbeot_8th/api_config.py create mode 100644 03. Code/geulbeot_8th/app.py create mode 100644 03. Code/geulbeot_8th/converters/__init__.py create mode 100644 03. Code/geulbeot_8th/converters/html_to_hwp.py create mode 100644 03. Code/geulbeot_8th/converters/html_to_hwp_briefing.py create mode 100644 03. Code/geulbeot_8th/converters/hwp_style_mapping.py create mode 100644 03. Code/geulbeot_8th/converters/hwpx_generator.py create mode 100644 03. Code/geulbeot_8th/converters/hwpx_style_injector.py create mode 100644 03. Code/geulbeot_8th/converters/hwpx_table_injector.py create mode 100644 03. Code/geulbeot_8th/converters/pipeline/__init__.py create mode 100644 03. Code/geulbeot_8th/converters/pipeline/router.py create mode 100644 03. Code/geulbeot_8th/converters/pipeline/step1_convert.py create mode 100644 03. Code/geulbeot_8th/converters/pipeline/step2_extract.py create mode 100644 03. Code/geulbeot_8th/converters/pipeline/step3_domain.py create mode 100644 03. Code/geulbeot_8th/converters/pipeline/step4_chunk.py create mode 100644 03. Code/geulbeot_8th/converters/pipeline/step5_rag.py create mode 100644 03. Code/geulbeot_8th/converters/pipeline/step6_corpus.py create mode 100644 03. Code/geulbeot_8th/converters/pipeline/step7_index.py create mode 100644 03. Code/geulbeot_8th/converters/pipeline/step8_content.py create mode 100644 03. Code/geulbeot_8th/converters/pipeline/step9_html.py create mode 100644 03. Code/geulbeot_8th/converters/style_analyzer.py create mode 100644 03. 
Code/geulbeot_8th/domain/__init__.py create mode 100644 03. Code/geulbeot_8th/domain/hwpx/__init__.py create mode 100644 03. Code/geulbeot_8th/domain/hwpx/hwpx_domain_guide.md create mode 100644 03. Code/geulbeot_8th/domain/hwpx/hwpx_utils.py create mode 100644 03. Code/geulbeot_8th/handlers/__init__.py create mode 100644 03. Code/geulbeot_8th/handlers/briefing/__init__.py create mode 100644 03. Code/geulbeot_8th/handlers/briefing/processor.py create mode 100644 03. Code/geulbeot_8th/handlers/briefing/prompts/step1_5_plan.txt create mode 100644 03. Code/geulbeot_8th/handlers/briefing/prompts/step1_extract.txt create mode 100644 03. Code/geulbeot_8th/handlers/briefing/prompts/step2_generate.txt create mode 100644 03. Code/geulbeot_8th/handlers/common.py create mode 100644 03. Code/geulbeot_8th/handlers/content_analyzer.py create mode 100644 03. Code/geulbeot_8th/handlers/custom_doc_type.py create mode 100644 03. Code/geulbeot_8th/handlers/doc_template_analyzer.py create mode 100644 03. Code/geulbeot_8th/handlers/doc_type_analyzer.py create mode 100644 03. Code/geulbeot_8th/handlers/report/__init__.py create mode 100644 03. Code/geulbeot_8th/handlers/report/processor.py create mode 100644 03. Code/geulbeot_8th/handlers/report/prompts/refine_selection.txt create mode 100644 03. Code/geulbeot_8th/handlers/semantic_mapper.py create mode 100644 03. Code/geulbeot_8th/handlers/style_generator.py create mode 100644 03. Code/geulbeot_8th/handlers/template/__init__.py create mode 100644 03. Code/geulbeot_8th/handlers/template/html_table_template_css.txt create mode 100644 03. Code/geulbeot_8th/handlers/template/processor.py create mode 100644 03. Code/geulbeot_8th/handlers/template/prompts/analyze_template.txt create mode 100644 03. Code/geulbeot_8th/handlers/template_manager.py create mode 100644 03. Code/geulbeot_8th/handlers/tools/__init__.py create mode 100644 03. Code/geulbeot_8th/handlers/tools/border_fill.py create mode 100644 03. 
Code/geulbeot_8th/handlers/tools/char_style.py create mode 100644 03. Code/geulbeot_8th/handlers/tools/content_order.py create mode 100644 03. Code/geulbeot_8th/handlers/tools/font.py create mode 100644 03. Code/geulbeot_8th/handlers/tools/header_footer.py create mode 100644 03. Code/geulbeot_8th/handlers/tools/image.py create mode 100644 03. Code/geulbeot_8th/handlers/tools/numbering.py create mode 100644 03. Code/geulbeot_8th/handlers/tools/page_setup.py create mode 100644 03. Code/geulbeot_8th/handlers/tools/para_style.py create mode 100644 03. Code/geulbeot_8th/handlers/tools/section.py create mode 100644 03. Code/geulbeot_8th/handlers/tools/style_def.py create mode 100644 03. Code/geulbeot_8th/handlers/tools/table.py create mode 100644 03. Code/geulbeot_8th/output/assets/1_1_1_img01.png create mode 100644 03. Code/geulbeot_8th/output/assets/1_1_1_img02.png create mode 100644 03. Code/geulbeot_8th/output/assets/1_1_1_img03.png create mode 100644 03. Code/geulbeot_8th/output/assets/1_1_2_img01.png create mode 100644 03. Code/geulbeot_8th/output/assets/1_1_2_img02.png create mode 100644 03. Code/geulbeot_8th/output/assets/1_1_2_img03.png create mode 100644 03. Code/geulbeot_8th/output/assets/1_1_3_img01.png create mode 100644 03. Code/geulbeot_8th/output/assets/1_1_3_img02.png create mode 100644 03. Code/geulbeot_8th/output/assets/1_2_1_img03.png create mode 100644 03. Code/geulbeot_8th/output/assets/1_2_2_img01.png create mode 100644 03. Code/geulbeot_8th/output/assets/1_2_2_img02.png create mode 100644 03. Code/geulbeot_8th/output/assets/1_2_2_img03.png create mode 100644 03. Code/geulbeot_8th/prompts/step1_5_plan.txt create mode 100644 03. Code/geulbeot_8th/prompts/step1_extract.txt create mode 100644 03. Code/geulbeot_8th/prompts/step2_generate.txt create mode 100644 03. Code/geulbeot_8th/requirements.txt create mode 100644 03. Code/geulbeot_8th/static/css/editor.css create mode 100644 03. Code/geulbeot_8th/static/js/editor.js create mode 100644 03. 
Code/geulbeot_8th/templates/default/doc_types/briefing/config.json create mode 100644 03. Code/geulbeot_8th/templates/default/doc_types/presentation/config.json create mode 100644 03. Code/geulbeot_8th/templates/default/doc_types/report/config.json create mode 100644 03. Code/geulbeot_8th/templates/hwp_guide.md create mode 100644 03. Code/geulbeot_8th/templates/hwp_html_defaults.json create mode 100644 03. Code/geulbeot_8th/templates/index.html create mode 100644 03. Code/geulbeot_8th/templates/user/doc_types/user_1770300969/config.json create mode 100644 03. Code/geulbeot_8th/templates/user/doc_types/user_1770300969/content_prompt.json create mode 100644 03. Code/geulbeot_8th/templates/user/doc_types/user_1770301063/config.json create mode 100644 03. Code/geulbeot_8th/templates/user/doc_types/user_1770301063/content_prompt.json create mode 100644 03. Code/geulbeot_8th/templates/user/templates/tpl_1770300969/meta.json create mode 100644 03. Code/geulbeot_8th/templates/user/templates/tpl_1770300969/semantic_map.json create mode 100644 03. Code/geulbeot_8th/templates/user/templates/tpl_1770300969/style.json create mode 100644 03. Code/geulbeot_8th/templates/user/templates/tpl_1770300969/template.html create mode 100644 03. Code/geulbeot_8th/templates/user/templates/tpl_1770301063/meta.json create mode 100644 03. Code/geulbeot_8th/templates/user/templates/tpl_1770301063/semantic_map.json create mode 100644 03. Code/geulbeot_8th/templates/user/templates/tpl_1770301063/style.json create mode 100644 03. Code/geulbeot_8th/templates/user/templates/tpl_1770301063/template.html create mode 100644 03. Code/geulbeot_9th/.env.sample create mode 100644 03. Code/geulbeot_9th/.gitignore create mode 100644 03. Code/geulbeot_9th/0206용/report_2026-02-05 (10).html create mode 100644 03. Code/geulbeot_9th/0206용/report_2026-02-06.html create mode 100644 03. Code/geulbeot_9th/0206용/report_2026-02-06.hwp create mode 100644 03. Code/geulbeot_9th/Procfile create mode 100644 03. 
Code/geulbeot_9th/README.md create mode 100644 03. Code/geulbeot_9th/api_config.py create mode 100644 03. Code/geulbeot_9th/app.py create mode 100644 03. Code/geulbeot_9th/converters/__init__.py create mode 100644 03. Code/geulbeot_9th/converters/html_to_hwp.py create mode 100644 03. Code/geulbeot_9th/converters/html_to_hwp_briefing.py create mode 100644 03. Code/geulbeot_9th/converters/hwp_style_mapping.py create mode 100644 03. Code/geulbeot_9th/converters/hwpx_generator.py create mode 100644 03. Code/geulbeot_9th/converters/hwpx_style_injector.py create mode 100644 03. Code/geulbeot_9th/converters/hwpx_table_injector.py create mode 100644 03. Code/geulbeot_9th/converters/pipeline/__init__.py create mode 100644 03. Code/geulbeot_9th/converters/pipeline/router.py create mode 100644 03. Code/geulbeot_9th/converters/pipeline/step1_convert.py create mode 100644 03. Code/geulbeot_9th/converters/pipeline/step2_extract.py create mode 100644 03. Code/geulbeot_9th/converters/pipeline/step3_domain.py create mode 100644 03. Code/geulbeot_9th/converters/pipeline/step4_chunk.py create mode 100644 03. Code/geulbeot_9th/converters/pipeline/step5_rag.py create mode 100644 03. Code/geulbeot_9th/converters/pipeline/step6_corpus.py create mode 100644 03. Code/geulbeot_9th/converters/pipeline/step7_index.py create mode 100644 03. Code/geulbeot_9th/converters/pipeline/step8_content.py create mode 100644 03. Code/geulbeot_9th/converters/pipeline/step9_html.py create mode 100644 03. Code/geulbeot_9th/converters/style_analyzer.py create mode 100644 03. Code/geulbeot_9th/domain/__init__.py create mode 100644 03. Code/geulbeot_9th/domain/hwpx/__init__.py create mode 100644 03. Code/geulbeot_9th/domain/hwpx/hwpx_domain_guide.md create mode 100644 03. Code/geulbeot_9th/domain/hwpx/hwpx_utils.py create mode 100644 03. Code/geulbeot_9th/handlers/__init__.py create mode 100644 03. Code/geulbeot_9th/handlers/briefing/__init__.py create mode 100644 03. 
Code/geulbeot_9th/handlers/briefing/processor.py create mode 100644 03. Code/geulbeot_9th/handlers/briefing/prompts/step1_5_plan.txt create mode 100644 03. Code/geulbeot_9th/handlers/briefing/prompts/step1_extract.txt create mode 100644 03. Code/geulbeot_9th/handlers/briefing/prompts/step2_generate.txt create mode 100644 03. Code/geulbeot_9th/handlers/common.py create mode 100644 03. Code/geulbeot_9th/handlers/content_analyzer.py create mode 100644 03. Code/geulbeot_9th/handlers/custom_doc_type.py create mode 100644 03. Code/geulbeot_9th/handlers/doc_template_analyzer.py create mode 100644 03. Code/geulbeot_9th/handlers/doc_type_analyzer.py create mode 100644 03. Code/geulbeot_9th/handlers/report/__init__.py create mode 100644 03. Code/geulbeot_9th/handlers/report/processor.py create mode 100644 03. Code/geulbeot_9th/handlers/report/prompts/refine_selection.txt create mode 100644 03. Code/geulbeot_9th/handlers/semantic_mapper.py create mode 100644 03. Code/geulbeot_9th/handlers/style_generator.py create mode 100644 03. Code/geulbeot_9th/handlers/template/__init__.py create mode 100644 03. Code/geulbeot_9th/handlers/template/html_table_template_css.txt create mode 100644 03. Code/geulbeot_9th/handlers/template/processor.py create mode 100644 03. Code/geulbeot_9th/handlers/template/prompts/analyze_template.txt create mode 100644 03. Code/geulbeot_9th/handlers/template_manager.py create mode 100644 03. Code/geulbeot_9th/handlers/tools/__init__.py create mode 100644 03. Code/geulbeot_9th/handlers/tools/border_fill.py create mode 100644 03. Code/geulbeot_9th/handlers/tools/char_style.py create mode 100644 03. Code/geulbeot_9th/handlers/tools/content_order.py create mode 100644 03. Code/geulbeot_9th/handlers/tools/font.py create mode 100644 03. Code/geulbeot_9th/handlers/tools/header_footer.py create mode 100644 03. Code/geulbeot_9th/handlers/tools/image.py create mode 100644 03. Code/geulbeot_9th/handlers/tools/numbering.py create mode 100644 03. 
Code/geulbeot_9th/handlers/tools/page_setup.py create mode 100644 03. Code/geulbeot_9th/handlers/tools/para_style.py create mode 100644 03. Code/geulbeot_9th/handlers/tools/section.py create mode 100644 03. Code/geulbeot_9th/handlers/tools/style_def.py create mode 100644 03. Code/geulbeot_9th/handlers/tools/table.py create mode 100644 03. Code/geulbeot_9th/output/assets/1_1_1_img01.png create mode 100644 03. Code/geulbeot_9th/output/assets/1_1_1_img02.png create mode 100644 03. Code/geulbeot_9th/output/assets/1_1_1_img03.png create mode 100644 03. Code/geulbeot_9th/output/assets/1_1_2_img01.png create mode 100644 03. Code/geulbeot_9th/output/assets/1_1_2_img02.png create mode 100644 03. Code/geulbeot_9th/output/assets/1_1_2_img03.png create mode 100644 03. Code/geulbeot_9th/output/assets/1_1_3_img01.png create mode 100644 03. Code/geulbeot_9th/output/assets/1_1_3_img02.png create mode 100644 03. Code/geulbeot_9th/output/assets/1_2_1_img03.png create mode 100644 03. Code/geulbeot_9th/output/assets/1_2_2_img01.png create mode 100644 03. Code/geulbeot_9th/output/assets/1_2_2_img02.png create mode 100644 03. Code/geulbeot_9th/output/assets/1_2_2_img03.png create mode 100644 03. Code/geulbeot_9th/prompts/step1_5_plan.txt create mode 100644 03. Code/geulbeot_9th/prompts/step1_extract.txt create mode 100644 03. Code/geulbeot_9th/prompts/step2_generate.txt create mode 100644 03. Code/geulbeot_9th/requirements.txt create mode 100644 03. Code/geulbeot_9th/static/css/editor.css create mode 100644 03. Code/geulbeot_9th/static/js/editor.js create mode 100644 03. Code/geulbeot_9th/templates/default/doc_types/briefing/config.json create mode 100644 03. Code/geulbeot_9th/templates/default/doc_types/presentation/config.json create mode 100644 03. Code/geulbeot_9th/templates/default/doc_types/report/config.json create mode 100644 03. Code/geulbeot_9th/templates/hwp_guide.md create mode 100644 03. Code/geulbeot_9th/templates/hwp_html_defaults.json create mode 100644 03. 
Code/geulbeot_9th/templates/index.html create mode 100644 03. Code/geulbeot_9th/templates/user/doc_types/user_1770335603/config.json create mode 100644 03. Code/geulbeot_9th/templates/user/doc_types/user_1770335603/content_prompt.json create mode 100644 03. Code/geulbeot_9th/templates/user/templates/tpl_1770333144/meta.json create mode 100644 03. Code/geulbeot_9th/templates/user/templates/tpl_1770333144/semantic_map.json create mode 100644 03. Code/geulbeot_9th/templates/user/templates/tpl_1770333144/style.json create mode 100644 03. Code/geulbeot_9th/templates/user/templates/tpl_1770333144/template.html create mode 100644 03. Code/geulbeot_9th/templates/user/templates/tpl_1770335603/meta.json create mode 100644 03. Code/geulbeot_9th/templates/user/templates/tpl_1770335603/semantic_map.json create mode 100644 03. Code/geulbeot_9th/templates/user/templates/tpl_1770335603/style.json create mode 100644 03. Code/geulbeot_9th/templates/user/templates/tpl_1770335603/template.html create mode 100644 03. Code/geulbeot_9th/기준 프롬프트(0206_0706).txt diff --git a/02. Prompts/문서생성/codedomain/국토일보_한건신문_Python_v01.py b/02. Prompts/문서생성/codedomain/국토일보_한건신문_Python_v01.py new file mode 100644 index 0000000..5c1a7e6 --- /dev/null +++ b/02. Prompts/문서생성/codedomain/국토일보_한건신문_Python_v01.py @@ -0,0 +1,13 @@ +def format_date(date_str, source): + try: + if source in ["국토일보", "한건신문"]: + # 기자 이름과 함께 있는 날짜 형식 처리 + date_obj = re.search(r'\d{4}-\d{2}-\d{2}', date_str) + if date_obj: + return date_obj.group(0) + elif source in ["엔지니어링데일리", "건설이코노미뉴스", "공학저널"]: + # 기자 이름과 함께 있는 날짜 형식 처리 + date_obj = re.search(r'\d{4}-\d{2}-\d{2}', date_str) + if date_obj: + return date_obj.group(0) + elif source == "연합 \ No newline at end of file diff --git a/02. Prompts/문서생성/codedomain/날짜_형식을_Python_v01.py b/02. Prompts/문서생성/codedomain/날짜_형식을_Python_v01.py new file mode 100644 index 0000000..769400d --- /dev/null +++ b/02. 
Prompts/문서생성/codedomain/날짜_형식을_Python_v01.py @@ -0,0 +1,11 @@ +def format_date(date_str: str, source: str) -> str: + """날짜 형식을 YYYY-MM-DD 로 변환""" + try: + match = re.search(r'\d{4}-\d{2}-\d{2}', date_str) + if match: + return match.group(0) + if source == '연합뉴스': + return datetime.strptime(date_str, '%m-%d %H:%M').strftime('2024-%m-%d') + return date_str + except Exception: + return date_str \ No newline at end of file diff --git a/02. Prompts/문서생성/codedomain/다음_로우데이터를_Python_v01.py b/02. Prompts/문서생성/codedomain/다음_로우데이터를_Python_v01.py new file mode 100644 index 0000000..548be38 --- /dev/null +++ b/02. Prompts/문서생성/codedomain/다음_로우데이터를_Python_v01.py @@ -0,0 +1,13 @@ +def summarize_data_for(section: str): + texts = [] + for path in sorted(os.listdir(DATA_DIR)): + with open(path, encoding="utf-8", errors="ignore") as f: + texts.append(f.read()) + prompt = ( + f"다음 로우데이터를 바탕으로 ‘{section}’ 섹션에 들어갈 핵심 사실과 수치를 200~300자로 요약해주세요.\n\n" + + "\n\n".join(texts) + ) + return call_claude(prompt) + + +# ─── 4) 이미지 자동 매핑 ───────────────────────── \ No newline at end of file diff --git a/02. Prompts/문서생성/codedomain/단위일_가능성_Python_v01.py b/02. Prompts/문서생성/codedomain/단위일_가능성_Python_v01.py new file mode 100644 index 0000000..90815dd --- /dev/null +++ b/02. Prompts/문서생성/codedomain/단위일_가능성_Python_v01.py @@ -0,0 +1,20 @@ +def is_likely_unit(cell_val): + """단위일 가능성 판별 (사용자 제안 로직)""" + if not cell_val: + return False + val = str(cell_val).strip() + + # 1. 빈 값 또는 너무 긴 텍스트 (단위는 보통 6자 이내) + if not val or len(val) > 6: + return False + + # 2. 순수 숫자는 제외 + cleaned = val.replace('.', '').replace(',', '').replace('-', '').replace(' ', '') + if cleaned.isdigit(): + return False + + # 3. 수식은 제외 + if val.startswith('='): + return False + + # 4. 일반적인 계산 기호 및 정크 기호 제외 \ No newline at end of file diff --git a/02. Prompts/문서생성/codedomain/단일_기사_Python_v01.py b/02. Prompts/문서생성/codedomain/단일_기사_Python_v01.py new file mode 100644 index 0000000..9bc490d --- /dev/null +++ b/02. 
Prompts/문서생성/codedomain/단일_기사_Python_v01.py @@ -0,0 +1,12 @@ +def fetch_article_content(url: str, source: str) -> str: + """단일 기사 본문 추출""" + try: + resp = requests.get(url, verify=False, timeout=10) + resp.encoding = 'utf-8' + resp.raise_for_status() + soup = BeautifulSoup(resp.text, 'html.parser') + paragraphs = soup.find_all('p') + content = ' '.join(clean_text(p.get_text()) for p in paragraphs) + content = content.replace('\n', ' ') + if not content.strip(): + logging.warning(f'No content for \ No newline at end of file diff --git a/02. Prompts/문서생성/codedomain/당신은_보고서_Python_v01.py b/02. Prompts/문서생성/codedomain/당신은_보고서_Python_v01.py new file mode 100644 index 0000000..3de070d --- /dev/null +++ b/02. Prompts/문서생성/codedomain/당신은_보고서_Python_v01.py @@ -0,0 +1,8 @@ +def analyze_references(): + files = sorted(os.listdir(REF_DIR)) + sys = "당신은 보고서 전문가입니다. 아래 파일명들을 보고, 이 프로젝트에 어울리는 보고서 스타일과 목차 구조를 요약해 주세요." + usr = "파일 목록:\n" + "\n".join(files) + return call_gpt(sys, usr) + + +# ─── 2) 가이드라인에서 필수 섹션 추출 ─────────── \ No newline at end of file diff --git a/02. Prompts/문서생성/codedomain/로그_전체_Python_v01.py b/02. Prompts/문서생성/codedomain/로그_전체_Python_v01.py new file mode 100644 index 0000000..17a4389 --- /dev/null +++ b/02. Prompts/문서생성/codedomain/로그_전체_Python_v01.py @@ -0,0 +1,13 @@ +def run_global_reconstruction(input_file): + print("로그: 전체 시트 통합 데이터를 분석 중입니다...") + df = pd.read_excel(input_file) + + # 1. 전역 주소록 생성: (시트명, 셀위치) -> 값 + # 예: { ('A1', 'G105'): 30.901, ('철근집계', 'C47'): 159.263 } + global_map = {} + for _, row in df.iterrows(): + global_map[(str(row['시트명']), str(row['셀위치']))] = row['현재값'] + + def trace_logic(formula, current_sheet): + if not isinstance(formula, str) or not formula.startswith("'="): + return formula \ No newline at end of file diff --git a/02. Prompts/문서생성/codedomain/로그_파일을_Python_v01.py b/02. Prompts/문서생성/codedomain/로그_파일을_Python_v01.py new file mode 100644 index 0000000..66d945d --- /dev/null +++ b/02. 
Prompts/문서생성/codedomain/로그_파일을_Python_v01.py @@ -0,0 +1,17 @@ +def extract_all_contents(file_path): + print(f"로그: 파일을 읽는 중입니다 (전체 내용 모드)...") + # 수식과 값을 동시에 비교하기 위해 data_only=False로 로드 + wb = openpyxl.load_workbook(file_path, data_only=False) + + all_content_data = [] + + for sheet_name in wb.sheetnames: + ws = wb[sheet_name] + print(f"\n" + "="*60) + print(f"▶ 시트 탐색 중: [ {sheet_name} ]") + print("="*60) + + # 시트의 모든 셀을 하나하나 검사 + for row in ws.iter_rows(): + for cell in row: + value = ce \ No newline at end of file diff --git a/02. Prompts/문서생성/codedomain/리스트_페이지_Python_v01.py b/02. Prompts/문서생성/codedomain/리스트_페이지_Python_v01.py new file mode 100644 index 0000000..0506475 --- /dev/null +++ b/02. Prompts/문서생성/codedomain/리스트_페이지_Python_v01.py @@ -0,0 +1,18 @@ +def fetch_articles( + base_url: str, + article_sel: str, + title_sel: str, + date_sel: str, + start_page: int, + end_page: int, + source: str, + url_prefix: str = '', + date_fmt_func=None +) -> list: + """리스트 페이지 순회하며 메타데이터 및 본문 수집""" + results = [] + for page in range(start_page, end_page + 1): + try: + page_url = f"{base_url}{page}" + resp = requests.get(page_url, verify=False, timeout=10) + soup = BeautifulSoup(resp.text, 'html.parser \ No newline at end of file diff --git a/02. Prompts/문서생성/codedomain/멀티라인_대응_Python_v01.py b/02. Prompts/문서생성/codedomain/멀티라인_대응_Python_v01.py new file mode 100644 index 0000000..656f615 --- /dev/null +++ b/02. Prompts/문서생성/codedomain/멀티라인_대응_Python_v01.py @@ -0,0 +1,11 @@ +def get_item_id_with_lookback(ws, row, col, section_start_row): + """멀티라인 대응 상향 번호 탐색 - 섹션 경계 존중""" + for r in range(row, section_start_row - 1, -1): + # 새로운 섹션을 만나면 탐색 중단 + f_val_check = str(ws.cell(row=r, column=6).value or "").strip() + if r != row and re.match(r'^\(.*\)$|^\[.*\]$', f_val_check): + break + + # F열에서 번호 탐색 + if re.search(ID_MARKER_PATTERN, f_val_check): + return re.search(ID_MARKER_PATTERN, f_val_check).group() \ No newline at end of file diff --git a/02. 
Prompts/문서생성/codedomain/미분류_과업_Python_v01.py b/02. Prompts/문서생성/codedomain/미분류_과업_Python_v01.py new file mode 100644 index 0000000..5a3abed --- /dev/null +++ b/02. Prompts/문서생성/codedomain/미분류_과업_Python_v01.py @@ -0,0 +1,14 @@ +def collect_app_usage(days_back): + server = 'localhost' + log_type = 'Security' + hand = win32evtlog.OpenEventLog(server, log_type) + flags = win32evtlog.EVENTLOG_BACKWARDS_READ | win32evtlog.EVENTLOG_SEQUENTIAL_READ + + usage_records = [] + cutoff_date = datetime.datetime.now() - datetime.timedelta(days=days_back) + + events = True + while events: + events = win32evtlog.ReadEventLog(hand, flags, 0) + for ev_obj in events: + event_time = ev_obj.TimeGenerated \ No newline at end of file diff --git a/02. Prompts/문서생성/codedomain/법령_지침_Python_v01.py b/02. Prompts/문서생성/codedomain/법령_지침_Python_v01.py new file mode 100644 index 0000000..73b9429 --- /dev/null +++ b/02. Prompts/문서생성/codedomain/법령_지침_Python_v01.py @@ -0,0 +1,11 @@ +def extract_must_have_sections(): + texts = [] + for path in sorted(os.listdir(GUIDELINE_DIR)): + with open(path, encoding="utf-8", errors="ignore") as f: + texts.append(f.read()) + sys = "법령·지침 문서를 바탕으로, 보고서에 반드시 들어가야 할 섹션(목차)을 순서대로 나열해 주세요." + usr = "\n\n---\n\n".join(texts) + return call_gpt(sys, usr).splitlines() + + +# ─── 3) 로우데이터에서 섹션별 내용 뽑기 ─────────── \ No newline at end of file diff --git a/02. Prompts/문서생성/codedomain/보고서_섹션에_Python_v01.py b/02. Prompts/문서생성/codedomain/보고서_섹션에_Python_v01.py new file mode 100644 index 0000000..551ccbb --- /dev/null +++ b/02. 
Prompts/문서생성/codedomain/보고서_섹션에_Python_v01.py @@ -0,0 +1,16 @@ +def pick_images_for(section: str): + names = sorted(os.listdir(IMAGE_DIR)) + prompt = ( + f"보고서 ‘{section}’ 섹션에 적합한 이미지를 아래 목록에서 1~2개 추천해 파일명만 리턴하세요:\n" + + "\n".join(names) + ) + resp = call_gpt("당신은 디자인 어시스턴트입니다.", prompt) + picked = [] + for line in resp.splitlines(): + fn = line.strip() + if fn in names: + picked.append(fn) + return picked + + +# ─── 5) 디자인 템플릿 선택 ─────────────────────── \ No newline at end of file diff --git a/02. Prompts/문서생성/codedomain/사이트별_함수_Python_v01.py b/02. Prompts/문서생성/codedomain/사이트별_함수_Python_v01.py new file mode 100644 index 0000000..dda7016 --- /dev/null +++ b/02. Prompts/문서생성/codedomain/사이트별_함수_Python_v01.py @@ -0,0 +1,13 @@ +class SslAdapter(HTTPAdapter): + def init_poolmanager(self, *args, **kwargs): + ctx = ssl.create_default_context() + ctx.set_ciphers('DEFAULT:@SECLEVEL=1') + self.poolmanager = PoolManager(*args, ssl_context=ctx, **kwargs) + +session = requests.Session() +session.mount('https://', SslAdapter()) +headers = {'User-Agent': 'Mozilla/5.0', 'Accept-Language': 'ko-KR,ko;q=0.9'} + +# ------------------------------------------------- +# 사이트별 함수 (대한경제 제외) +# ----------------------------------- \ No newline at end of file diff --git a/02. Prompts/문서생성/codedomain/설명이_없습니다_Python_v01.py b/02. Prompts/문서생성/codedomain/설명이_없습니다_Python_v01.py new file mode 100644 index 0000000..71a0a16 --- /dev/null +++ b/02. Prompts/문서생성/codedomain/설명이_없습니다_Python_v01.py @@ -0,0 +1,7 @@ +def get_detail_content(detail_url): + res = requests.get(detail_url) + soup = BeautifulSoup(res.text, 'html.parser') + div = soup.find('div', {'data-v-5cb2d9fe': True}) + if div and div.find('h2'): + return div.find('h2').text.strip() + return "설명이 없습니다." \ No newline at end of file diff --git a/02. Prompts/문서생성/codedomain/설정_브라우저가_Python_v01.py b/02. Prompts/문서생성/codedomain/설정_브라우저가_Python_v01.py new file mode 100644 index 0000000..40ee214 --- /dev/null +++ b/02. 
Prompts/문서생성/codedomain/설정_브라우저가_Python_v01.py @@ -0,0 +1,11 @@ +def fetch_dnews_articles(base_url, start_page, end_page): + # Selenium WebDriver 설정 + options = webdriver.ChromeOptions() + options.add_argument('--headless') # 브라우저가 뜨지 않게 설정 + options.add_argument('--no-sandbox') + options.add_argument('--disable-dev-shm-usage') + + # ChromeDriver 경로 설정 + chromedriver_path = 'D:/python_for crawling/webdriver/chromedriver-win64/chromedriver.exe' # ChromeDriver 경로 설정 + service = ChromeService(executable_path=chromedriver_path) + driver = webdr \ No newline at end of file diff --git a/02. Prompts/문서생성/codedomain/수식_자체가_Python_v01.py b/02. Prompts/문서생성/codedomain/수식_자체가_Python_v01.py new file mode 100644 index 0000000..fdc14e4 --- /dev/null +++ b/02. Prompts/문서생성/codedomain/수식_자체가_Python_v01.py @@ -0,0 +1,17 @@ +def extract_raw_constants(file_path): + # 수식 자체가 아닌 입력된 값을 확인하기 위해 로드 + print(f"로그: 파일을 읽는 중입니다...") + wb = openpyxl.load_workbook(file_path, data_only=False) + + raw_data = [] + + for sheet_name in wb.sheetnames: + ws = wb[sheet_name] + print(f"\n" + "="*50) + print(f"▶ [ {sheet_name} ] 시트의 원천 데이터(상수) 추출 시작") + print("="*50) + + for row in ws.iter_rows(): + for cell in row: + value = cell.value + coord = cell.coordin \ No newline at end of file diff --git a/02. Prompts/문서생성/codedomain/수식_주소를_Python_v01.py b/02. Prompts/문서생성/codedomain/수식_주소를_Python_v01.py new file mode 100644 index 0000000..e9ad0c6 --- /dev/null +++ b/02. 
Prompts/문서생성/codedomain/수식_주소를_Python_v01.py @@ -0,0 +1,11 @@ +def reconstruct_formula(formula, wb_v, sheet_name): + """수식 내 셀 주소를 실제 값으로 치환 및 기호 가독화""" + if not formula or not str(formula).startswith('='): return str(formula) + ref_pattern = r"(?:'([^']+)'|([a-zA-Z0-9가-힣]+))?!([A-Z]+\d+)|([A-Z]+\d+)" + + def replace_with_value(match): + s_name = match.group(1) or match.group(2) or sheet_name + coord = match.group(3) or match.group(4) + try: + val = wb_v[s_name][coord].value + if val is None: return "0" \ No newline at end of file diff --git a/02. Prompts/문서생성/codedomain/수식을_가져오기_Python_v01.py b/02. Prompts/문서생성/codedomain/수식을_가져오기_Python_v01.py new file mode 100644 index 0000000..b8a76ed --- /dev/null +++ b/02. Prompts/문서생성/codedomain/수식을_가져오기_Python_v01.py @@ -0,0 +1,14 @@ +def extract_excel_logic(file_path): + # 1. 수식을 가져오기 위한 로드 (data_only=False) + print(f"로그: 파일을 읽는 중입니다 (수식 모드)...") + wb_formula = openpyxl.load_workbook(file_path, data_only=False) + + # 2. 결과값을 가져오기 위한 로드 (data_only=True) + print(f"로그: 파일을 읽는 중입니다 (데이터 모드)...") + wb_value = openpyxl.load_workbook(file_path, data_only=True) + + extraction_data = [] + + for sheet_name in wb_formula.sheetnames: + ws_f = wb_formula[sheet_name] + ws_v = wb_value[sheet_name] \ No newline at end of file diff --git a/02. Prompts/문서생성/codedomain/아래_디자인_Python_v01.py b/02. Prompts/문서생성/codedomain/아래_디자인_Python_v01.py new file mode 100644 index 0000000..394d537 --- /dev/null +++ b/02. Prompts/문서생성/codedomain/아래_디자인_Python_v01.py @@ -0,0 +1,11 @@ +def choose_design_template(): + samples = sorted(os.listdir(DESIGN_DIR)) + prompt = ( + "아래 디자인 샘플 파일들 중 이 보고서에 어울리는 상위 3안(1안,2안,3안)을 " + "순서대로 파일명만으로 알려주세요:\n" + "\n".join(samples) + ) + lines = call_gpt("디자인 전문가입니다.", prompt).splitlines() + return [ln.strip() for ln in lines if ln.strip() in samples][:3] + + +# ─── PPT 생성 ──────────────────────────────────── \ No newline at end of file diff --git a/02. Prompts/문서생성/codedomain/엔지니어링데일리_기자_Python_v01.py b/02. 
Prompts/문서생성/codedomain/엔지니어링데일리_기자_Python_v01.py new file mode 100644 index 0000000..5a11762 --- /dev/null +++ b/02. Prompts/문서생성/codedomain/엔지니어링데일리_기자_Python_v01.py @@ -0,0 +1,13 @@ +def clean_text(text): + replacements = { + ' ': ' ', '‘': "'", '’': "'", '“': '"', '”': '"', + '&': '&', '<': '<', '>': '>', ''': "'", + '"' : "'", '·': "'" + } + + for entity, replacement in replacements.items(): + text = text.replace(entity, replacement) + + text = re.sub(r'<[^>]+>', '', text) + text = re.sub(r'\(엔지니어링데일리\).*?기자=', '', text) # (엔지니어링데일리) *** 기자= 패턴 삭제 + text = re.sub(r'\[국토일보\s.*? \ No newline at end of file diff --git a/02. Prompts/문서생성/codedomain/엔티티_불필요한_Python_v01.py b/02. Prompts/문서생성/codedomain/엔티티_불필요한_Python_v01.py new file mode 100644 index 0000000..54ba222 --- /dev/null +++ b/02. Prompts/문서생성/codedomain/엔티티_불필요한_Python_v01.py @@ -0,0 +1,9 @@ +def clean_text(text: str) -> str: + """HTML 엔티티 및 불필요한 태그 제거""" + reps = { + ' ': ' ', '‘': "'", '’': "'", '“': '"', '”': '"', + '&': '&', '<': '<', '>': '>', ''': "'", '"': "'", '·': "'" + } + for key, val in reps.items(): + text = text.replace(key, val) + return re.sub(r'<[^>]+>', '', text).strip() \ No newline at end of file diff --git a/02. Prompts/문서생성/codedomain/인증서_검증_Python_v01.py b/02. Prompts/문서생성/codedomain/인증서_검증_Python_v01.py new file mode 100644 index 0000000..adf0c8a --- /dev/null +++ b/02. Prompts/문서생성/codedomain/인증서_검증_Python_v01.py @@ -0,0 +1,11 @@ +def fetch_article_content(article_url, source): + try: + response = requests.get(article_url, verify=False, timeout=10) # SSL 인증서 검증 비활성화 및 타임아웃 설정 + response.encoding = 'utf-8' # 인코딩 설정 + response.raise_for_status() + soup = BeautifulSoup(response.text, 'html.parser') + paragraphs = soup.find_all('p') + content = ' '.join([clean_text(p.get_text()) for p in paragraphs]) + + # 텍스트 내의 엔터키를 스페이스로 대체 + content = content.replace('\n', ' ') \ No newline at end of file diff --git a/02. Prompts/문서생성/codedomain/카테고리_내용_Python_v01.py b/02. 
Prompts/문서생성/codedomain/카테고리_내용_Python_v01.py new file mode 100644 index 0000000..f9b44cc --- /dev/null +++ b/02. Prompts/문서생성/codedomain/카테고리_내용_Python_v01.py @@ -0,0 +1,14 @@ +def get_category_and_content(detail_url): + res = requests.get(detail_url) + soup = BeautifulSoup(res.text, 'html.parser') + + # 카테고리 + category_tags = soup.select('ul.flex.flex-row.flex-wrap.gap-2 li a') + categories = [tag['href'].split('/')[-2] for tag in category_tags] + + # 내용 + content_div = soup.select_one('div.content-base.workflow-description.text-md') + if content_div: + content_text = content_div.get_text(separator=' ', strip=True) + else: + content_text = \ No newline at end of file diff --git a/02. Prompts/문서생성/codedomain/커버_슬라이드_Python_v01.py b/02. Prompts/문서생성/codedomain/커버_슬라이드_Python_v01.py new file mode 100644 index 0000000..4304c84 --- /dev/null +++ b/02. Prompts/문서생성/codedomain/커버_슬라이드_Python_v01.py @@ -0,0 +1,14 @@ +def build_ppt(sections, images_map, templates): + prs = Presentation() + prs.slide_width, prs.slide_height = Inches(8.27), Inches(11.69) # A4 + + # 커버 슬라이드 + slide = prs.slides.add_slide(prs.slide_layouts[6]) + tb = slide.shapes.add_textbox(Inches(1), Inches(2), Inches(6.27), Inches(2)) + p = tb.text_frame.paragraphs[0] + p.text = "🚀 자동 보고서" + p.font.size = Pt(26); p.font.bold = True + + # 본문 슬라이드 + for sec in sections: + slide = prs.slides.add_slide(prs.slide_layouts[6] \ No newline at end of file diff --git a/02. Prompts/문서생성/codedomain/합계_기준_Python_v01.py b/02. Prompts/문서생성/codedomain/합계_기준_Python_v01.py new file mode 100644 index 0000000..de4a318 --- /dev/null +++ b/02. 
def find_unit_from_sum_cell(ws, sum_row, max_col):
    """
    Look for a unit string, starting from the totals row.

    - Columns are visited from ``max_col`` down to 1 (rightmost first).
    - Within each column, rows are visited from ``sum_row`` up to row 1.
    - Category boundaries are ignored; only the totals row anchors the scan.

    Returns the first cell value accepted by ``is_likely_unit`` (as a stripped
    string), or an empty string when no cell qualifies.
    """
    for col_idx in reversed(range(1, max_col + 1)):
        for row_idx in reversed(range(1, sum_row + 1)):
            value = ws.cell(row=row_idx, column=col_idx).value
            if not is_likely_unit(value):
                continue
            return str(value).strip()

    return ""

3-2. 가계: 고금리 피크아웃, 하지만 체감 회복은 느리다

+

기준금리는 정점을 지나 완만히 낮아지는 방향이지만, 과거 초저금리 시대와 비교하면 여전히 높은 수준이다.

+ \ No newline at end of file diff --git a/02. Prompts/문서생성/domain/도메인_문서생성_경제_진단_v01.md b/02. Prompts/문서생성/domain/도메인_문서생성_경제_진단_v01.md new file mode 100644 index 0000000..9eab505 --- /dev/null +++ b/02. Prompts/문서생성/domain/도메인_문서생성_경제_진단_v01.md @@ -0,0 +1,10 @@ + + + + + + + 경제 진단 보고서 + + + \ No newline at end of file diff --git a/02. Prompts/문서생성/domain/도메인_문서생성_대제목_전체폭_v01.md b/02. Prompts/문서생성/domain/도메인_문서생성_대제목_전체폭_v01.md new file mode 100644 index 0000000..ed3f7c8 --- /dev/null +++ b/02. Prompts/문서생성/domain/도메인_문서생성_대제목_전체폭_v01.md @@ -0,0 +1,9 @@ + /* 대제목(H1)은 전체폭 */ + .sheet .body-content .b-top h1, + .sheet .body-content .b-col h1{ + color: var(--b-primary) !important; + border-bottom: 2px solid var(--b-primary) !important; + margin: 0 0 10px 0; + font-size: 18pt; + font-weight: 900; + } \ No newline at end of file diff --git a/02. Prompts/문서생성/domain/도메인_문서생성_리드문_리드문_v01.md b/02. Prompts/문서생성/domain/도메인_문서생성_리드문_리드문_v01.md new file mode 100644 index 0000000..7da0686 --- /dev/null +++ b/02. Prompts/문서생성/domain/도메인_문서생성_리드문_리드문_v01.md @@ -0,0 +1,3 @@ +
+ [리드문] [리드문] [리드문] +
\ No newline at end of file diff --git a/02. Prompts/문서생성/domain/도메인_문서생성_리스크_요인_v01.md b/02. Prompts/문서생성/domain/도메인_문서생성_리스크_요인_v01.md new file mode 100644 index 0000000..a9162c7 --- /dev/null +++ b/02. Prompts/문서생성/domain/도메인_문서생성_리스크_요인_v01.md @@ -0,0 +1,5 @@ +

1-3. 리스크 요인: 무역갈등, 지정학, 고부채

+

글로벌 전망에서 반복적으로 등장하는 키워드는 “정책 불확실성”이다. IMF는 2025년 4·10월 보고서에서, 관세 인상과 공급망 재편, 지정학적 긴장 고조가 향후 성장률을 추가로 깎아먹을 수 있는 하방 리스크라고 지적한다.

+

두 번째 리스크는 고부채다. 코로나 위기 대응 과정에서 확대한 정부지출과 이후의 고금리 환경이 결합되면서 많은 국가의 재정 상태가 빠르게 악화되었다.

+

마지막으로, 디지털 전환과 에너지 전환(탈탄소화)은 장기적으로는 성장 잠재력을 키우는 요인이지만, 단기적으로는 막대한 투자 비용과 산업 구조조정을 수반한다.

+ \ No newline at end of file diff --git a/02. Prompts/문서생성/domain/도메인_문서생성_마지막으로_로드한_v01.md b/02. Prompts/문서생성/domain/도메인_문서생성_마지막으로_로드한_v01.md new file mode 100644 index 0000000..6d21d40 --- /dev/null +++ b/02. Prompts/문서생성/domain/도메인_문서생성_마지막으로_로드한_v01.md @@ -0,0 +1,11 @@ + + + + + +⚠️ [최종 경고 - 출력 직전 필수 확인] +1. 원본의 모든 텍스트가 100% 포함되었는가? +2. "..." 또는 요약된 문장이 없는가? +3. 생략된 문단이 단 하나도 없는가? + +위 3가지 중 하나라도 위반 시, 출력을 중단하고 처음부터 다시 작성하십시오. +원본 텍스트 글자 수와 출력 텍스트 글자 수가 동일해야 합니다. \ No newline at end of file diff --git a/03. Code/geulbeot_10th/Procfile b/03. Code/geulbeot_10th/Procfile new file mode 100644 index 0000000..ca6e941 --- /dev/null +++ b/03. Code/geulbeot_10th/Procfile @@ -0,0 +1 @@ +web: gunicorn app:app diff --git a/03. Code/geulbeot_10th/README.md b/03. Code/geulbeot_10th/README.md new file mode 100644 index 0000000..6460825 --- /dev/null +++ b/03. Code/geulbeot_10th/README.md @@ -0,0 +1,453 @@ +# 글벗 (Geulbeot) v10.0 + +**백엔드 재구조화 + 프론트 모듈화 + 도메인 지식 시스템 + 데모 모드** + +다양한 형식의 자료(PDF·HWP·이미지·Excel 등)를 입력하면, AI가 RAG 파이프라인으로 분석한 뒤 +선택한 문서 유형(기획서·보고서·발표자료 등)에 맞는 표준 HTML 문서를 자동 생성합니다. +생성된 문서는 웹 편집기에서 수정하고, HTML / PDF / HWP로 출력합니다. + +v10에서는 코드베이스를 전면 재구조화했습니다. +백엔드는 handlers를 doc·template 서브패키지로 분리하고, +프론트엔드는 3,700줄짜리 index.html을 781줄로 축소하며 JS 9개 모듈로 분리했습니다. +토목 14개 세부분야 도메인 지식 시스템과 시연용 데모 모드를 추가했습니다. + +--- + +## 🏗 아키텍처 (Architecture) + +### 핵심 흐름 + +``` +자료 입력 (파일/폴더) + │ + ▼ +도메인 지식 선택 (v10 신규) ─── 토목 14개 분야 + DX + 보고서 가이드 + │ + ▼ +작성 방식 선택 ─── 형식만 변경 / 내용 재구성 / 신규 작성 + │ + ▼ +RAG 파이프라인 (9단계) ─── 공통 처리 + 도메인 프롬프트 + │ + ▼ +문서 유형 선택 + ├─ 기획서 (기본) + ├─ 보고서 (기본) + ├─ 발표자료 (기본) + └─ 사용자 등록 (HWPX 분석 → 자동 등록) + │ + ▼ +글벗 표준 HTML 생성 ◀── 템플릿 스타일 + 시맨틱 맵 참조 + │ + ▼ +웹 편집기 (수기 편집 / AI 편집) + │ + ▼ +출력 (HTML / PDF / HWP) +``` + +### 1. 
Backend (Python Flask) + +- **Language**: Python 3.13 +- **Web Framework**: Flask 3.0 — 웹 서버 엔진, API 라우팅 +- **AI**: + - Claude API (Anthropic) — 기획서 생성, AI 편집, 문서 유형 맥락 분석 + - OpenAI API — RAG 임베딩, 인덱싱, 텍스트 추출 + - Gemini API — 보고서 콘텐츠·HTML 생성 +- **Features**: + - 자료 입력 → 9단계 RAG 파이프라인 + 도메인 프롬프트 + - 문서 유형별 생성: 기획서 (Claude), 보고서 (Gemini), 사용자 정의 유형 + - AI 편집: 전체 수정 (`/refine`), 부분 수정 (`/refine-selection`) + - 문서 유형 분석·등록: HWPX → 12종 도구 추출 → 시맨틱 매핑 → 스타일 생성 → 유형 CRUD + - 도메인 지식 관리 (v10 신규): 토목 14분야 + DX + 보고서 가이드 + - HWP/PDF 변환 + +### 2. Frontend (v10 모듈화) + +- **index.html**: 3,763줄 → **781줄** — HTML 셸만 유지 +- **main.css**: 1,825줄 — 인라인 CSS 전부 외부 분리 +- **JS 9개 모듈**: + +| 모듈 | 줄 수 | 역할 | +|------|-------|------| +| editor.js | 1,208 | 웹 WYSIWYG 편집기 | +| doc_type.js | 587 | 문서 유형 선택·CRUD | +| generator.js | 483 | 기획서·보고서 생성 호출 | +| demo_mode.js | 370 | 시연용 데모 모드 | +| domain_selector.js | 287 | 도메인 지식 선택 모달 | +| template.js | 188 | 템플릿 관리 UI | +| ai_edit.js | 142 | AI 편집 (전체·부분) | +| modals.js | 134 | 공통 모달 컴포넌트 | +| ui.js | 91 | UI 유틸리티 | +| export.js | 71 | HTML/PDF/HWP 다운로드 | + +### 3. 백엔드 패키지 구조 (v10 재구조화) + +``` +handlers/ +├── briefing/ 기획서 생성 +├── report/ 보고서 생성 +├── doc/ ★ v10 — 문서 유형 서브패키지 +│ ├── doc_type_analyzer.py AI 맥락·구조 분석 +│ ├── content_analyzer.py placeholder 분석 +│ └── custom_doc_type.py 사용자 유형 문서 생성 +└── template/ ★ v10 — 템플릿 서브패키지 + ├── processor.py 기본 관리 + ├── doc_template_analyzer.py 12종 도구 오케스트레이터 + ├── semantic_mapper.py 요소 의미 판별 + ├── style_generator.py CSS 생성 + ├── template_manager.py CRUD + template.html 조립 + └── tools/ HWPX 추출 도구 12종 +``` + +### 4. 도메인 지식 시스템 (v10 신규) + +- **domain_api.py** (456줄): 도메인 지식 관리 API + 파이프라인 래퍼 +- **domain_config.json**: 카테고리 구조 정의 (계층형 선택) +- **토목 14개 세부분야**: 측량·해석·교량·터널·도로·구조·지반·시공·공정원가·품질환경·안전·통신·BIM·기획 +- **DX (디지털 전환)**: 스마트 건설, AI/IoT +- **보고서 가이드**: 현안보고서 구조, 작성 가이드 +- **도메인 선택 UI**: 체크박스 모달 → 선택된 .txt 합쳐서 RAG 파이프라인에 도메인 프롬프트로 전달 + +### 5. 
데모 모드 (v10 신규) + +- **demo_mode.js**: `DEMO_MODE = true` 시 실제 API 호출 없이 샘플 문서 표시 +- **샘플 HTML 4종**: 기획서 1p·2p, 보고서, 발표자료 +- 시연·발표용 — 목차 애니메이션 + 가짜 생성 프로세스 + +### 6. 주요 시나리오 (Core Scenarios) + +1. **기획서 생성**: RAG 분석 후 Claude API가 글벗 표준 HTML 생성 +2. **보고서 생성**: RAG 파이프라인 → Gemini API가 다페이지 HTML 보고서 생성 +3. **사용자 정의 문서 생성**: 등록된 유형의 template.html 기반 정리·재구성 +4. **문서 유형 등록**: HWPX 업로드 → 자동 분석 → 유형 CRUD +5. **도메인 지식 적용 (v10 신규)**: 분야 선택 → RAG 파이프라인에 전문 용어·기준 주입 +6. **데모 시연 (v10 신규)**: API 없이 샘플 문서로 전체 워크플로우 시연 +7. **AI 편집 / HWP 내보내기** + +### 프로세스 플로우 + +#### RAG 파이프라인 (공통) + +```mermaid +flowchart TD + classDef process fill:#e8f4fd,stroke:#1a365d,stroke-width:1.5px,color:#1a365d + classDef decision fill:#fffde7,stroke:#f9a825,stroke-width:2px,color:#333 + classDef aiGpt fill:#d4edda,stroke:#10a37f,stroke-width:2px,color:#155724 + classDef startEnd fill:#1a365d,stroke:#1a365d,color:#fff,stroke-width:2px + + A[/"📂 자료 입력 (파일/폴더)"/]:::process + B["step1: 파일 변환\n모든 형식 → PDF 통일"]:::process + C["step2: 텍스트·이미지 추출\n⚡ GPT API"]:::aiGpt + D{"분량 판단\n5,000자 기준"}:::decision + + E["step3: 도메인 분석"]:::process + F["step4: 의미 단위 청킹"]:::process + G["step5: RAG 임베딩 ⚡ GPT"]:::aiGpt + H["step6: 코퍼스 생성"]:::process + + I["step7: FAISS 인덱싱 + 목차 ⚡ GPT"]:::aiGpt + J(["📋 분석 완료 → 문서 유형 선택"]):::startEnd + + A --> B --> C --> D + D -->|"≥ 5,000자"| E --> F --> G --> H --> I + D -->|"< 5,000자"| I + I --> J +``` + +#### 전체 워크플로우 (v10 시점) + +```mermaid +flowchart TD + classDef decision fill:#fffde7,stroke:#f9a825,stroke-width:2px,color:#333 + classDef aiClaude fill:#fff3cd,stroke:#d97706,stroke-width:2px,color:#856404 + classDef aiGemini fill:#d6eaf8,stroke:#4285f4,stroke-width:2px,color:#1a4d8f + classDef editStyle fill:#fff3e0,stroke:#ef6c00,stroke-width:1.5px,color:#e65100 + classDef exportStyle fill:#f3e5f5,stroke:#7b1fa2,stroke-width:1.5px,color:#4a148c + classDef startEnd fill:#1a365d,stroke:#1a365d,color:#fff,stroke-width:2px + classDef planned 
fill:#f5f5f5,stroke:#999,stroke-width:1px,stroke-dasharray: 5 5,color:#999 + classDef newModule fill:#e0f2f1,stroke:#00695c,stroke-width:2px,color:#004d40 + classDef uiNew fill:#e8eaf6,stroke:#3949ab,stroke-width:2px,color:#1a237e + classDef domainStyle fill:#fce4ec,stroke:#c62828,stroke-width:2px,color:#b71c1c + + A(["📂 자료 입력"]):::startEnd + + DOM["🏗️ 도메인 지식 선택\n토목 14분야 + DX\n(v10 신규)"]:::domainStyle + + W{"작성 방식 선택"}:::uiNew + W1["📄 형식만 변경"]:::uiNew + W2["🔄 내용 재구성"]:::uiNew + W3["✨ 신규 작성"]:::uiNew + + R["RAG 파이프라인\n9단계 + 도메인 프롬프트"]:::startEnd + + B{"문서 유형 선택"}:::decision + + C["기획서 생성\n⚡ Claude API"]:::aiClaude + D["보고서 생성\n⚡ Gemini API"]:::aiGemini + E["발표자료\n예정"]:::planned + U["사용자 정의 유형\ntemplate.html 기반"]:::newModule + + T["📋 템플릿 + 시맨틱 맵"]:::newModule + + G["글벗 표준 HTML"]:::startEnd + + H{"편집 방식"}:::decision + I["웹 편집기\n수기 편집"]:::editStyle + J["AI 편집\n전체·부분 수정\n⚡ Claude API"]:::aiClaude + + K{"출력 형식"}:::decision + L["HTML / PDF"]:::exportStyle + M["HWP 변환\n하이브리드"]:::exportStyle + N["PPT\n예정"]:::planned + O(["✅ 최종 산출물"]):::startEnd + + A --> DOM --> W + W --> W1 & W2 & W3 + W1 & W2 & W3 --> R + + DOM -.->|"도메인 프롬프트"| R + + R --> B + + B -->|"기획서"| C --> G + B -->|"보고서"| D --> G + B -->|"발표자료"| E -.-> G + B -->|"사용자 유형"| U --> G + + T -.->|"스타일·구조 참조"| U + + G --> H + H -->|"수기"| I --> K + H -->|"AI"| J --> K + K -->|"웹/인쇄"| L --> O + K -->|"HWP"| M --> O + K -->|"PPT"| N -.-> O +``` + +#### 문서 유형 등록 + +```mermaid +flowchart TD + classDef process fill:#e8f4fd,stroke:#1a365d,stroke-width:1.5px,color:#1a365d + classDef newModule fill:#fff3e0,stroke:#ef6c00,stroke-width:2px,color:#e65100 + classDef aiNode fill:#d4edda,stroke:#10a37f,stroke-width:2px,color:#155724 + classDef dataStore fill:#e0f2f1,stroke:#00695c,stroke-width:1.5px,color:#004d40 + classDef startEnd fill:#1a365d,stroke:#1a365d,color:#fff,stroke-width:2px + + A(["📄 HWPX 업로드"]):::startEnd + B["DocTemplateAnalyzer\n12종 tools 코드 추출"]:::newModule + C["SemanticMapper\n요소 의미 
판별\n헤더표/푸터표/제목블록/데이터표"]:::newModule + D["StyleGenerator\n추출값 → CSS 생성\ncharPr·paraPr·폰트 매핑"]:::newModule + E["ContentAnalyzer\nplaceholder 의미·유형\ncontent_prompt.json"]:::newModule + F["DocTypeAnalyzer\n⚡ AI 맥락·구조 분석\nconfig.json"]:::aiNode + G["TemplateManager\ntemplate.html 조립"]:::newModule + + H[("📋 templates/user/\ntemplates/{tpl_id}/\ndoc_types/{type_id}/")]:::dataStore + + A --> B --> C --> D --> E + B --> F + C & D & E & F --> G --> H +``` + +--- + +## 🔄 v9 → v10 변경사항 + +| 영역 | v9 | v10 | +|------|------|------| +| handlers 구조 | 평탄 (루트에 7개 모듈) | **handlers/doc/ + handlers/template/** 서브패키지로 분리 | +| 프론트 index.html | 3,763줄 (인라인 CSS·JS) | **781줄** — HTML 셸만 유지 | +| CSS | 인라인 | **static/css/main.css** (1,825줄) 외부 분리 | +| JS | editor.js 단일 | **9개 모듈** 분리 (doc_type·generator·demo_mode 등) | +| 도메인 지식 | 없음 | **domain_api.py** + domain_config.json + 토목 14분야 txt | +| 도메인 선택 UI | 없음 | **domain_selector.js** — 체크박스 모달 | +| 데모 모드 | 없음 | **demo_mode.js** + 샘플 HTML 4종 | +| 보고서 가이드 | 없음 | **domain/report_guide/** — 현안보고서 구조·작성법 | +| 레거시 정리 | prompts/ 잔존 | **prompts/ 삭제** | + +--- + +## 🗺 상태 및 로드맵 (Status & Roadmap) + +- **Phase 1**: RAG 파이프라인 — 9단계 파이프라인, 도메인 분석, 분량 자동 판단 (🔧 기본 구현) +- **Phase 2**: 문서 생성 — 기획서·보고서·사용자 정의 유형 AI 생성 (🔧 기본 구현) +- **Phase 3**: 출력 — HTML/PDF 다운로드, HWP 변환 (🔧 기본 구현) +- **Phase 4**: HWP/HWPX/HTML 매핑 — 스타일 분석·HWPX 생성·스타일 주입·표 주입 (🔧 기본 구현) +- **Phase 5**: 문서 유형 분석·등록 — HWPX → 12종 도구 추출 → 시맨틱 매핑 → 유형 CRUD (🔧 기본 구현) +- **Phase 6**: HWPX 템플릿 관리 — template_manager, content_order, 독립 저장 (🔧 기본 구현) +- **Phase 7**: UI 고도화 — 프론트 모듈화, 데모 모드, 도메인 선택기 (🔧 기본 구현 · 현재 버전) +- **Phase 8**: 백엔드 재구조화 — handlers 서브패키지 분리, 레거시 정리 (🔧 기본 구현 · 현재 버전) + +--- + +## 🚀 시작하기 (Getting Started) + +### 사전 요구사항 + +- Python 3.10+ +- Claude API 키 (Anthropic) — 기획서 생성, AI 편집, 문서 유형 분석 +- OpenAI API 키 — RAG 파이프라인 +- Gemini API 키 — 보고서 콘텐츠·HTML 생성 +- pyhwpx — HWP 변환 시 (Windows + 한글 프로그램 필수) + +### 환경 설정 + +```bash +git clone http://[Gitea주소]/kei/geulbeot-v10.git +cd geulbeot-v10 + +python 
-m venv venv +venv\Scripts\activate # Windows + +pip install -r requirements.txt + +cp .env.sample .env +# .env 파일을 열어 실제 API 키 입력 +``` + +### .env 작성 + +```env +CLAUDE_API_KEY=sk-ant-your-key-here # 기획서 생성, AI 편집, 유형 분석 +GPT_API_KEY=sk-proj-your-key-here # RAG 파이프라인 +GEMINI_API_KEY=AIzaSy-your-key-here # 보고서 콘텐츠 생성 +``` + +### 실행 + +```bash +python app.py +# → http://localhost:5000 접속 +``` + +### 데모 모드 + +API 키 없이 시연하려면 `static/js/demo_mode.js`에서 `DEMO_MODE = true` 확인 후 실행. +샘플 문서(기획서 2종 + 보고서 + 발표자료)가 자동 표시됩니다. + +--- + +## 📂 프로젝트 구조 + +``` +geulbeot_10th/ +├── app.py # Flask 웹 서버 — API 라우팅 +├── api_config.py # .env 환경변수 로더 +├── domain_api.py # ★ v10 — 도메인 지식 관리 API +├── domain_config.json # ★ v10 — 도메인 카테고리 구조 +│ +├── domain/ # 도메인 지식 +│ ├── hwpx/ # HWPX 명세서 + 유틸 +│ ├── civil/ # ★ v10 — 토목 분야 +│ │ ├── general.txt # 토목 일반 +│ │ ├── dx.txt # DX (디지털 전환) +│ │ └── specialties/ # 14개 세부분야 +│ │ ├── survey.txt · road.txt · bridge.txt · tunnel.txt +│ │ ├── structure.txt · geotechnical.txt · construction.txt +│ │ ├── schedule_cost.txt · quality_env.txt · safety.txt +│ │ ├── communication.txt · bim.txt · planning.txt +│ │ └── anlysis.txt +│ └── report_guide/ # ★ v10 — 보고서 작성 가이드 +│ +├── handlers/ # 비즈니스 로직 (★ v10 재구조화) +│ ├── common.py +│ ├── briefing/ # 기획서 처리 +│ ├── report/ # 보고서 처리 +│ ├── doc/ # ★ v10 서브패키지 — 문서 유형 +│ │ ├── doc_type_analyzer.py +│ │ ├── content_analyzer.py +│ │ └── custom_doc_type.py +│ └── template/ # ★ v10 서브패키지 — 템플릿 +│ ├── processor.py · template_manager.py +│ ├── doc_template_analyzer.py · semantic_mapper.py +│ ├── style_generator.py +│ └── tools/ # HWPX 추출 도구 12종 +│ +├── converters/ # 변환 엔진 +│ ├── pipeline/ # 9단계 RAG 파이프라인 +│ └── (style_analyzer, hwpx_*, html_to_hwp*) +│ +├── templates/ +│ ├── default/doc_types/ # 기본 유형 (briefing·report·presentation) +│ ├── user/ # 사용자 등록 데이터 +│ └── index.html # ★ v10 — 781줄 (HTML 셸만) +│ +├── static/ # ★ v10 프론트 모듈화 +│ ├── css/ +│ │ ├── main.css # 1,825줄 (인라인 CSS 분리) +│ │ └── editor.css # 편집기 스타일 +│ ├── js/ +│ │ 
├── editor.js # WYSIWYG 편집기 +│ │ ├── doc_type.js # 문서 유형 선택·CRUD +│ │ ├── generator.js # 문서 생성 호출 +│ │ ├── demo_mode.js # 시연용 데모 +│ │ ├── domain_selector.js # 도메인 지식 선택 +│ │ ├── template.js # 템플릿 관리 +│ │ ├── ai_edit.js # AI 편집 +│ │ ├── modals.js # 공통 모달 +│ │ ├── ui.js # UI 유틸리티 +│ │ └── export.js # 다운로드 +│ └── result/ # ★ v10 — 데모 샘플 HTML 4종 +│ +├── .env / .env.sample +├── .gitignore +├── Procfile +└── README.md +``` + +--- + +## 🎨 글벗 표준 HTML 양식 + +| 항목 | 사양 | +|------|------| +| 용지 | A4 인쇄 최적화 (210mm × 297mm) | +| 폰트 | Noto Sans KR (Google Fonts) | +| 색상 | Navy 계열 (#1a365d 기본) | +| 구성 | page-header → lead-box → section → data-table → bottom-box → page-footer | +| 인쇄 | `@media print` 대응, `break-after: page` 페이지 분리 | + +--- + +## ⚠️ 알려진 제한사항 + +- API 키 분산: 파이프라인 각 step에 개별 정의 (공통화 미완) +- HWP 변환: Windows + pyhwpx + 한글 프로그램 필수 +- 발표자료: config.json만 존재, 실제 생성 미구현 +- 도메인 지식: 토목 분야만 구축 (타 분야 확장 가능) +- 도메인 → RAG 연동: 선택된 도메인 프롬프트 주입 경로 완성 중 + +--- + +## 📊 코드 규모 + +| 영역 | 줄 수 | +|------|-------| +| Python 전체 | 19,402 (+462) | +| 프론트엔드 (JS + CSS + HTML) | 6,463 (+1,196) | +| 도메인 지식 (txt) | 1,225 | +| **합계** | **~27,100** | + +--- + +## 📝 버전 이력 + +| 버전 | 핵심 변경 | +|------|----------| +| v1 | Flask + Claude API 기획서 생성기 | +| v2 | 웹 편집기 추가 | +| v3 | 9단계 RAG 파이프라인 + HWP 변환 | +| v4 | 코드 모듈화 (handlers 패키지) + 스타일 분석기·HWPX 생성기 | +| v5 | HWPX 스타일 주입 + 표 열 너비 정밀 변환 | +| v6 | HWPX 템플릿 분석·저장·관리 | +| v7 | UI 고도화 — 작성 방식·문서 유형·템플릿 관리 UI | +| v8 | 문서 유형 분석·등록 + HWPX 추출 도구 12종 + 템플릿 고도화 | +| v9 | 표 매칭 안정화 + 인라인 아이콘 감지 + 프론트 외부 참조 | +| **v10** | **백엔드 재구조화 + 프론트 모듈화 + 도메인 지식 + 데모 모드** | + +--- + +## 📝 라이선스 + +Private — GPD 내부 사용 \ No newline at end of file diff --git a/03. Code/geulbeot_10th/api_config.py b/03. Code/geulbeot_10th/api_config.py new file mode 100644 index 0000000..e2b3524 --- /dev/null +++ b/03. 
def _parse_env_line(line):
    """Parse one ``KEY=VALUE`` line of a .env file; return ``(key, value)`` or ``None``."""
    line = line.strip()
    if not line or line.startswith('#') or '=' not in line:
        return None

    key, _, value = line.partition('=')
    key = key.strip()
    value = value.strip()
    if not key:
        return None

    # Quoted value: keep only what is between the matching quotes.
    if value[:1] in ('"', "'"):
        closing = value.find(value[0], 1)
        if closing != -1:
            return key, value[1:closing]

    # Unquoted value: whitespace followed by '#' starts an inline comment,
    # e.g. ``GPT_API_KEY=sk-xxx   # RAG pipeline``.
    for idx, ch in enumerate(value):
        if ch == '#' and idx > 0 and value[idx - 1].isspace():
            value = value[:idx].rstrip()
            break
    return key, value


def load_api_keys():
    """Load API keys from the ``.env`` file next to this module.

    python-dotenv is used when it is installed.  Otherwise the file is parsed
    by hand; the fallback strips inline ``# comments`` and surrounding quotes
    so both paths yield the same values.  Variables already present in the
    process environment are never overwritten.

    Returns:
        dict: ``CLAUDE_API_KEY``, ``GPT_API_KEY``, ``GEMINI_API_KEY`` and
        ``PERPLEXITY_API_KEY`` mapped to their values ('' when unset).
    """
    env_path = Path(__file__).resolve().parent / '.env'

    try:
        from dotenv import load_dotenv
    except ImportError:
        # python-dotenv is not installed: parse the file manually.
        if env_path.exists():
            with open(env_path, 'r', encoding='utf-8') as f:
                for line in f:
                    parsed = _parse_env_line(line)
                    if parsed is not None:
                        os.environ.setdefault(*parsed)
    else:
        load_dotenv(env_path)

    return {
        'CLAUDE_API_KEY': os.getenv('CLAUDE_API_KEY', ''),
        'GPT_API_KEY': os.getenv('GPT_API_KEY', ''),
        'GEMINI_API_KEY': os.getenv('GEMINI_API_KEY', ''),
        'PERPLEXITY_API_KEY': os.getenv('PERPLEXITY_API_KEY', ''),
    }
Code/geulbeot_10th/app.py @@ -0,0 +1,684 @@ +# -*- coding: utf-8 -*- +""" +글벗 Light v2.0 +Flask 라우팅 + 공통 기능 +""" + +import os +import io +import tempfile +import json +import shutil +from datetime import datetime +from flask import Flask, render_template, request, jsonify, Response, session, send_file +import queue +import threading +from handlers.template.template_manager import TemplateManager +from pathlib import Path +from domain_api import register_domain_routes + +# 문서 유형별 프로세서 +from handlers.template import TemplateProcessor +from handlers.briefing import BriefingProcessor +from handlers.report import ReportProcessor +from handlers.doc.custom_doc_type import CustomDocTypeProcessor +from handlers.doc.doc_type_analyzer import DocTypeAnalyzer + +app = Flask(__name__) +app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024 # 16MB max +app.config['SECRET_KEY'] = os.environ.get('SECRET_KEY', 'geulbeot-light-secret-key-v2') +register_domain_routes(app) + +# processors 딕셔너리에 추가 +template_mgr = TemplateManager() +processors = { + 'briefing': BriefingProcessor(), + 'report': ReportProcessor(), + 'template': TemplateProcessor(), + 'custom': CustomDocTypeProcessor() +} + +DOC_TYPES_DEFAULT = Path('templates/default/doc_types') +DOC_TYPES_USER = Path('templates/user/doc_types') + + +# ============== 메인 페이지 ============== +@app.route('/') +def index(): + """메인 페이지""" + return render_template('index.html') + + +@app.route('/api/doc-types', methods=['GET']) +def get_doc_types(): + """문서 유형 목록 조회""" + try: + doc_types = [] + + # default 폴더 스캔 + if DOC_TYPES_DEFAULT.exists(): + for folder in DOC_TYPES_DEFAULT.iterdir(): + if folder.is_dir(): + config_file = folder / 'config.json' + if config_file.exists(): + with open(config_file, 'r', encoding='utf-8') as f: + doc_types.append(json.load(f)) + + # user 폴더 스캔 + if DOC_TYPES_USER.exists(): + for folder in DOC_TYPES_USER.iterdir(): + if folder.is_dir(): + config_file = folder / 'config.json' + if config_file.exists(): + with 
def _is_safe_doc_type_id(type_id):
    """Return True when *type_id* is a single plain folder name (no separators, not '.'/'..')."""
    return (
        isinstance(type_id, str)
        and type_id not in ('', '.', '..')
        and Path(type_id).name == type_id
    )


@app.route('/api/doc-types', methods=['POST'])
def add_doc_type():
    """Register a user document type by saving the posted analysis result.

    The JSON body is written to ``DOC_TYPES_USER/<id>/config.json``.  When the
    body has no ``id`` a ``user_<unix-timestamp>`` id is generated.

    Returns:
        The stored config as JSON; 400 for a missing body or an id that is not
        a plain folder name; 500 (with traceback) on unexpected errors.
    """
    try:
        data = request.get_json()

        if not data:
            return jsonify({'error': 'JSON 데이터가 필요합니다'}), 400

        type_id = data.get('id')
        if not type_id:
            import time
            type_id = f"user_{int(time.time())}"
            data['id'] = type_id
        elif not _is_safe_doc_type_id(type_id):
            # A client-supplied id such as "../../x" would otherwise make us
            # create folders and write config.json outside DOC_TYPES_USER.
            return jsonify({'error': '유효하지 않은 문서 유형 ID입니다'}), 400

        # Make sure the user folder exists before creating the type folder.
        DOC_TYPES_USER.mkdir(parents=True, exist_ok=True)

        folder_path = DOC_TYPES_USER / type_id
        folder_path.mkdir(parents=True, exist_ok=True)

        # Persist the config.
        with open(folder_path / 'config.json', 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

        return jsonify(data)

    except Exception as e:
        # NOTE(review): the traceback is returned to the client, as before;
        # consider logging it server-side instead.
        import traceback
        return jsonify({'error': str(e), 'trace': traceback.format_exc()}), 500


@app.route('/api/doc-types/<type_id>', methods=['DELETE'])
def delete_doc_type(type_id):
    """Delete a user document type folder.

    Args:
        type_id: Folder name of the document type under ``DOC_TYPES_USER``.

    Returns:
        ``{'success': True, 'deleted': type_id}`` on success; 404 when the id
        is unknown or is not a plain folder name; 500 on unexpected errors.
    """
    try:
        # Reject ids like ".." before touching the filesystem: rmtree on
        # DOC_TYPES_USER / ".." would wipe the parent directory.
        if not _is_safe_doc_type_id(type_id):
            return jsonify({'error': '문서 유형을 찾을 수 없습니다'}), 404

        folder_path = DOC_TYPES_USER / type_id

        if not folder_path.exists():
            return jsonify({'error': '문서 유형을 찾을 수 없습니다'}), 404

        shutil.rmtree(folder_path)
        return jsonify({'success': True, 'deleted': type_id})

    except Exception as e:
        import traceback
        return jsonify({'error': str(e), 'trace': traceback.format_exc()}), 500
@app.route('/generate-report', methods=['POST'])
def generate_report():
    """Report generation endpoint.

    Reads ``content`` plus the report options from the JSON body, hands them
    to the report processor and returns its result as JSON (HTTP 500 when the
    result carries an ``error`` key or an exception is raised).
    """
    try:
        payload = request.get_json() or {}

        # (option name, default used when the key is absent from the body)
        option_defaults = (
            ('folder_path', ''),
            ('cover', False),
            ('toc', False),
            ('divider', False),
            ('instruction', ''),
            ('template_id', None),
        )
        options = {key: payload.get(key, default) for key, default in option_defaults}

        result = processors['report'].generate(payload.get('content', ''), options)

        status = 500 if 'error' in result else 200
        return jsonify(result), status

    except Exception as e:
        import traceback
        return jsonify({'error': str(e), 'trace': traceback.format_exc()}), 500
@app.route('/assets/<path:filename>')
def serve_assets(filename):
    """Serve a file from the local assets folder.

    Args:
        filename: Path of the requested file, relative to the assets folder.

    Returns:
        The file response.  Flask answers 404 when the file is missing or the
        requested path resolves outside the assets folder.
    """
    # Local import so this fix does not depend on the module-level import line.
    from flask import send_from_directory

    # NOTE(review): hard-coded developer-machine path; it does not exist on a
    # non-Windows deployment (the repo ships a gunicorn Procfile) - consider
    # making it configurable.
    assets_dir = r"D:\for python\geulbeot-light\geulbeot-light\output\assets"
    # send_from_directory joins the path safely; os.path.join + send_file let
    # a request such as "../../secret" read files outside assets_dir.
    return send_from_directory(assets_dir, filename)
jsonify({"error": "파일이 필요합니다"}), 400 + + file = request.files['file'] + doc_name = request.form.get('name', '새 문서 유형') + + # 임시 저장 + import tempfile + temp_path = os.path.join(tempfile.gettempdir(), file.filename) + file.save(temp_path) + + try: + analyzer = DocTypeAnalyzer() + result = analyzer.analyze(temp_path, doc_name) + + return jsonify({ + "success": True, + "config": result["config"], + "summary": { + "pageCount": result["structure"]["pageCount"], + "sections": len(result["toc"]), + "style": result["style"] + } + }) + except Exception as e: + return jsonify({"error": str(e)}), 500 + finally: + os.remove(temp_path) + + +@app.route('/analyze-styles', methods=['POST']) +def analyze_styles(): + """HTML 스타일 분석 미리보기""" + try: + data = request.get_json() + html_content = data.get('html', '') + + if not html_content: + return jsonify({'error': 'HTML 내용이 없습니다'}), 400 + + from converters.style_analyzer import StyleAnalyzer + from converters.hwp_style_mapping import ROLE_TO_STYLE_NAME + + analyzer = StyleAnalyzer() + elements = analyzer.analyze(html_content) + + # 요약 정보 + summary = analyzer.get_role_summary() + + # 상세 정보 (처음 50개만) + details = [] + for elem in elements[:50]: + details.append({ + 'role': elem.role, + 'hwp_style': ROLE_TO_STYLE_NAME.get(elem.role, '바탕글'), + 'text': elem.text[:50] + ('...' 
@app.route('/analyze-template', methods=['POST'])
def analyze_template():
    """Extract a template from an uploaded HWPX file and save it.

    Flow: uploaded file -> temporary file -> HWPX parse -> template_manager
    extract + save.

    Form fields:
        file: The uploaded HWPX document.
        name: Template name (required).

    Returns:
        The result of ``template_mgr.extract_and_save`` as JSON; 400 for a
        missing file or name; 500 (with traceback) on unexpected errors.
    """
    try:
        if 'file' not in request.files:
            return jsonify({'error': '파일이 없습니다'}), 400

        file = request.files['file']
        name = request.form.get('name', '').strip()

        if not name:
            return jsonify({'error': '템플릿 이름을 입력해주세요'}), 400

        if not file.filename:
            return jsonify({'error': '파일을 선택해주세요'}), 400

        # Save under a unique, server-chosen temp name.  Using the client's
        # file.filename directly allowed path traversal ("../x") and let two
        # concurrent uploads with the same name overwrite each other.
        # Only the extension of the uploaded name is kept.
        suffix = os.path.splitext(os.path.basename(file.filename))[1]
        fd, temp_path = tempfile.mkstemp(prefix='geulbeot_tpl_', suffix=suffix)
        os.close(fd)

        try:
            file.save(temp_path)

            # Reuse the v3 parser (HWPX -> parsed dict).
            from handlers.doc.doc_type_analyzer import DocTypeAnalyzer
            parser = DocTypeAnalyzer()
            parsed = parser._parse_hwpx(temp_path)

            # Extract and persist through template_manager.
            result = template_mgr.extract_and_save(
                parsed, name,
                source_file=file.filename
            )

            return jsonify(result)
        finally:
            # Best-effort cleanup; only filesystem errors are ignored.
            try:
                os.remove(temp_path)
            except OSError:
                pass

    except Exception as e:
        import traceback
        return jsonify({'error': str(e), 'trace': traceback.format_exc()}), 500
analyze_doc_type_stream(): + """ + 문서 유형 분석 (SSE 스트리밍) + 실시간으로 각 단계의 진행 상황을 전달 + """ + import tempfile + + # 파일 및 데이터 검증 + if 'file' not in request.files: + return jsonify({'error': '파일이 없습니다'}), 400 + + file = request.files['file'] + name = request.form.get('name', '').strip() + description = request.form.get('description', '').strip() + + if not name: + return jsonify({'error': '문서 유형 이름을 입력해주세요'}), 400 + + if not file.filename: + return jsonify({'error': '파일을 선택해주세요'}), 400 + + # 임시 파일 저장 + temp_dir = tempfile.gettempdir() + temp_path = os.path.join(temp_dir, file.filename) + file.save(temp_path) + + # 메시지 큐 생성 + message_queue = queue.Queue() + analysis_result = {"data": None, "error": None} + + def progress_callback(step_id, status, message): + """진행 상황 콜백 - 메시지 큐에 추가""" + message_queue.put({ + "type": "progress", + "step": step_id, + "status": status, + "message": message + }) + + def run_analysis(): + """분석 실행 (별도 스레드)""" + try: + + analyzer = DocTypeAnalyzer(progress_callback=progress_callback) + result = analyzer.analyze(temp_path, name, description) + + # 저장 + save_path = analyzer.save_doc_type(result["config"], result.get("template", "") ) + + analysis_result["data"] = { + "success": True, + "config": result["config"], + "layout": result.get("layout", {}), + "context": result.get("context", {}), + "structure": result.get("structure", {}), + "template_generated": bool(result.get("template_id") or result.get("template")), + "template_id": result.get("template_id"), # ★ 추가 + "saved_path": save_path + } + + except Exception as e: + import traceback + analysis_result["error"] = { + "message": str(e), + "trace": traceback.format_exc() + } + finally: + # 완료 신호 + message_queue.put({"type": "complete"}) + # 임시 파일 삭제 + try: + os.remove(temp_path) + except: + pass + + def generate_events(): + """SSE 이벤트 생성기""" + # 분석 시작 + analysis_thread = threading.Thread(target=run_analysis) + analysis_thread.start() + + # 이벤트 스트리밍 + while True: + try: + msg = 
@app.route('/api/doc-types/<type_id>/template', methods=['PUT'])
def change_doc_type_template(type_id):
    """Replace the template linked to a document type.

    Args:
        type_id: document-type identifier taken from the URL.

    Request body:
        JSON object with a ``template_id`` key.

    Returns:
        JSON of the ``template_mgr.change_template`` result. Status is 400
        when the body is missing, is not a JSON object, lacks
        ``template_id``, or when the result contains an ``error`` key;
        500 on any unexpected exception.
    """
    try:
        # silent=True: a missing or malformed JSON body yields None instead of
        # raising, so the client gets the 400 below rather than a 500.
        payload = request.get_json(silent=True)
        new_tpl_id = payload.get('template_id') if isinstance(payload, dict) else None

        if not new_tpl_id:
            return jsonify({'error': 'template_id가 필요합니다'}), 400

        result = template_mgr.change_template(type_id, new_tpl_id)
        if 'error' in result:
            return jsonify(result), 400
        return jsonify(result)
    except Exception as e:
        return jsonify({'error': str(e)}), 500
class Config:
    """Static layout settings read by the HTML -> HWP converter."""

    # Page margins; the converter passes these through its mm -> HWP-unit helper.
    MARGIN_LEFT = 20
    MARGIN_RIGHT = 20
    MARGIN_TOP = 20
    MARGIN_BOTTOM = 15

    # Header / footer lengths, converted the same way as the margins.
    HEADER_LEN = 10
    FOOTER_LEN = 10

    # Maximum image width in mm.
    MAX_IMAGE_WIDTH = 150

    # Folder searched first when resolving an image by file name.
    ASSETS_PATH = r"D:\for python\geulbeot-light\geulbeot-light\output\assets"
style.update(self.class_styles[cls]) + return style + + def parse_size(self, s): + m = re.search(r'([\d.]+)', str(s)) if s else None + return float(m.group(1)) if m else 11 + + def parse_color(self, c): + if not c: return '#000000' + c = str(c).strip().lower() + if re.match(r'^#[0-9a-fA-F]{6}$', c): return c.upper() + m = re.search(r'rgb[a]?\s*\(\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)', c) + return f'#{int(m.group(1)):02X}{int(m.group(2)):02X}{int(m.group(3)):02X}' if m else '#000000' + + def is_bold(self, style): return style.get('font-weight', '') in ['bold', '700', '800', '900'] + +# ═══════════════════════════════════════════════════════════════ +# 번호 제거 유틸리티 +# ═══════════════════════════════════════════════════════════════ + +NUMBERING_PATTERNS = { + 'H1': re.compile(r'^(\d+)\.\s*'), # "1. " → "" + 'H2': re.compile(r'^(\d+)\.(\d+)\s*'), # "1.1 " → "" + 'H3': re.compile(r'^(\d+)\.(\d+)\.(\d+)\s*'), # "1.1.1 " → "" + 'H4': re.compile(r'^[가-하]\.\s*'), # "가. " → "" + 'H5': re.compile(r'^(\d+)\)\s*'), # "1) " → "" + 'H6': re.compile(r'^\((\d+)\)\s*'), # "(1) " → "" + 'H7': re.compile(r'^[①②③④⑤⑥⑦⑧⑨⑩]\s*'), # "① " → "" + 'LIST_ITEM': re.compile(r'^[•\-○]\s*'), # "• " → "" +} + +def strip_numbering(text: str, role: str) -> str: + """ + 역할에 따라 텍스트 앞의 번호/기호 제거 + HWP 개요 기능이 번호를 자동 생성하므로 중복 방지 + """ + if not text: + return text + + pattern = NUMBERING_PATTERNS.get(role) + if pattern: + return pattern.sub('', text).strip() + + return text.strip() + +# ═══════════════════════════════════════════════════════════════ +# 표 너비 파싱 유틸리티 (🆕 추가) +# ═══════════════════════════════════════════════════════════════ + +def _parse_width(width_str): + """너비 문자열 파싱 → mm 값 반환""" + if not width_str: + return None + + width_str = str(width_str).strip().lower() + + # style 속성에서 width 추출 + style_match = re.search(r'width\s*:\s*([^;]+)', width_str) + if style_match: + width_str = style_match.group(1).strip() + + # px → mm (96 DPI 기준) + px_match = re.search(r'([\d.]+)\s*px', width_str) + if px_match: + 
return float(px_match.group(1)) * 25.4 / 96 + + # mm 그대로 + mm_match = re.search(r'([\d.]+)\s*mm', width_str) + if mm_match: + return float(mm_match.group(1)) + + # % → 본문폭(170mm) 기준 계산 + pct_match = re.search(r'([\d.]+)\s*%', width_str) + if pct_match: + return float(pct_match.group(1)) * 170 / 100 + + # 숫자만 있으면 px로 간주 + num_match = re.search(r'^([\d.]+)$', width_str) + if num_match: + return float(num_match.group(1)) * 25.4 / 96 + + return None + + +def _parse_align(cell): + """셀의 정렬 속성 파싱""" + align = cell.get('align', '').lower() + if align in ['left', 'center', 'right']: + return align + + style = cell.get('style', '') + align_match = re.search(r'text-align\s*:\s*(\w+)', style) + if align_match: + return align_match.group(1).lower() + + return None + + +def _parse_bg_color(cell): + """셀의 배경색 파싱""" + bgcolor = cell.get('bgcolor', '') + if bgcolor: + return bgcolor if bgcolor.startswith('#') else f'#{bgcolor}' + + style = cell.get('style', '') + bg_match = re.search(r'background(?:-color)?\s*:\s*([^;]+)', style) + if bg_match: + color = bg_match.group(1).strip() + if color.startswith('#'): + return color + rgb_match = re.search(r'rgb\s*\(\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)', color) + if rgb_match: + r, g, b = int(rgb_match.group(1)), int(rgb_match.group(2)), int(rgb_match.group(3)) + return f'#{r:02X}{g:02X}{b:02X}' + + return None + + +class HtmlToHwpConverter: + def __init__(self, visible=True): + self.hwp = Hwp(visible=visible) + self.cfg = Config() + self.sp = StyleParser() + self.base_path = "" + self.is_first_h1 = True + self.image_count = 0 + self.table_widths = [] # 🆕 표 열 너비 정보 저장용 + self.style_map = {} # 역할 → 스타일 이름 매핑 + self.sty_path = None # .sty 파일 경로 + + def _mm(self, mm): return self.hwp.MiliToHwpUnit(mm) + def _pt(self, pt): return self.hwp.PointToHwpUnit(pt) + def _rgb(self, c): + c = c.lstrip('#') + return self.hwp.RGBColor(int(c[0:2],16), int(c[2:4],16), int(c[4:6],16)) if len(c)>=6 else self.hwp.RGBColor(0,0,0) + + def _setup_page(self): + try: + 
def _create_header(self, right_text=""):
    """Open the header area, write ``right_text`` right-aligned, then close it.

    The "HeaderFooter" action is run with style 0 and control type 0 (the
    footer code in this class passes control type 1). The text uses a 9pt,
    non-bold, '#333333' font; nothing is inserted when ``right_text`` is
    empty. Any failure is reported with a warning print and not re-raised.
    """
    print(f" → 머리말 생성: {right_text if right_text else '(초기화)'}")
    try:
        hwp = self.hwp
        header_footer = hwp.HParameterSet.HHeaderFooter

        hwp.HAction.GetDefault("HeaderFooter", header_footer.HSet)
        for item_name, item_value in (("HeaderFooterStyle", 0),
                                      ("HeaderFooterCtrlType", 0)):
            header_footer.HSet.SetItem(item_name, item_value)
        hwp.HAction.Execute("HeaderFooter", header_footer.HSet)

        hwp.HAction.Run("ParagraphShapeAlignRight")
        self._set_font(9, False, '#333333')
        if right_text:
            hwp.insert_text(right_text)

        hwp.HAction.Run("CloseEx")
    except Exception as e:
        print(f" [경고] 머리말: {e}")
쪽번호 (우측 하단) + self.hwp.HAction.GetDefault("PageNumPos", self.hwp.HParameterSet.HPageNumPos.HSet) + self.hwp.HParameterSet.HPageNumPos.DrawPos = self.hwp.PageNumPosition("BottomRight") + self.hwp.HAction.Execute("PageNumPos", self.hwp.HParameterSet.HPageNumPos.HSet) + + def _new_section_with_header(self, header_text): + """새 구역 생성 후 머리말 설정""" + print(f" → 새 구역 머리말: {header_text}") + try: + self.hwp.HAction.Run("BreakSection") + + self.hwp.HAction.GetDefault("HeaderFooter", self.hwp.HParameterSet.HHeaderFooter.HSet) + self.hwp.HParameterSet.HHeaderFooter.HSet.SetItem("HeaderFooterStyle", 0) + self.hwp.HParameterSet.HHeaderFooter.HSet.SetItem("HeaderFooterCtrlType", 0) + self.hwp.HAction.Execute("HeaderFooter", self.hwp.HParameterSet.HHeaderFooter.HSet) + + self.hwp.HAction.Run("SelectAll") + self.hwp.HAction.Run("Delete") + + self.hwp.HAction.Run("ParagraphShapeAlignRight") + self._set_font(9, False, '#333333') + self.hwp.insert_text(header_text) + + self.hwp.HAction.Run("CloseEx") + except Exception as e: + print(f" [경고] 구역 머리말: {e}") + + # 스타일 적용 관련 (🆕 NEW) + + def _load_style_template(self, sty_path: str): + """ + .sty 스타일 템플릿 로드 + HWP에서 스타일 불러오기 기능 사용 + """ + if not os.path.exists(sty_path): + print(f" [경고] 스타일 파일 없음: {sty_path}") + return False + + try: + # HWP 스타일 불러오기 + self.hwp.HAction.GetDefault("StyleTemplate", self.hwp.HParameterSet.HStyleTemplate.HSet) + self.hwp.HParameterSet.HStyleTemplate.filename = sty_path + self.hwp.HAction.Execute("StyleTemplate", self.hwp.HParameterSet.HStyleTemplate.HSet) + print(f" ✅ 스타일 템플릿 로드: {sty_path}") + return True + except Exception as e: + print(f" [경고] 스타일 로드 실패: {e}") + return False + + + def _apply_style_by_name(self, style_name: str): + """ + 현재 문단에 스타일 이름으로 적용 + 텍스트 삽입 후 호출 + """ + try: + # 현재 문단 선택 + self.hwp.HAction.Run("MoveLineBegin") + self.hwp.HAction.Run("MoveSelLineEnd") + + # 스타일 적용 + self.hwp.HAction.GetDefault("Style", self.hwp.HParameterSet.HStyle.HSet) + self.hwp.HParameterSet.HStyle.StyleName = 
style_name + self.hwp.HAction.Execute("Style", self.hwp.HParameterSet.HStyle.HSet) + + # 커서 문단 끝으로 + self.hwp.HAction.Run("MoveLineEnd") + + except Exception as e: + print(f" [경고] 스타일 적용 실패 '{style_name}': {e}") + + + def _build_dynamic_style_map(self, elements: list): + """HTML 분석 결과 기반 동적 스타일 매핑 생성 (숫자)""" + roles = set(elem.role for elem in elements) + + # 제목 역할 정렬 (H1, H2, H3...) + title_roles = sorted([r for r in roles if r.startswith('H') and r[1:].isdigit()], + key=lambda x: int(x[1:])) + + # 기타 역할 + other_roles = [r for r in roles if r not in title_roles] + + # 순차 할당 (개요 1~10) + self.style_map = {} + style_num = 1 + + for role in title_roles: + if style_num <= 10: + self.style_map[role] = style_num + style_num += 1 + + for role in other_roles: + if style_num <= 10: + self.style_map[role] = style_num + style_num += 1 + + print(f" 📝 동적 스타일 매핑: {self.style_map}") + return self.style_map + + + + def _set_font(self, size=11, bold=False, color='#000000'): + self.hwp.set_font(FaceName='맑은 고딕', Height=size, Bold=bold, TextColor=self._rgb(color)) + + def _set_para(self, align='justify', lh=170, left=0, indent=0, before=0, after=0): + acts = {'left':'ParagraphShapeAlignLeft','center':'ParagraphShapeAlignCenter', + 'right':'ParagraphShapeAlignRight','justify':'ParagraphShapeAlignJustify'} + if align in acts: self.hwp.HAction.Run(acts[align]) + try: + self.hwp.HAction.GetDefault("ParagraphShape", self.hwp.HParameterSet.HParaShape.HSet) + p = self.hwp.HParameterSet.HParaShape + p.LineSpaceType, p.LineSpacing = 0, lh + p.LeftMargin = self._mm(left) + p.IndentMargin = self._mm(indent) + p.SpaceBeforePara = self._pt(before) + p.SpaceAfterPara = self._pt(after) + p.BreakNonLatinWord = 0 + self.hwp.HAction.Execute("ParagraphShape", p.HSet) + except: pass + + def _set_cell_bg(self, color): + try: + self.hwp.HAction.GetDefault("CellBorderFill", self.hwp.HParameterSet.HCellBorderFill.HSet) + p = self.hwp.HParameterSet.HCellBorderFill + p.FillAttr.type = 
self.hwp.BrushType("NullBrush|WinBrush") + p.FillAttr.WinBrushFaceStyle = self.hwp.HatchStyle("None") + p.FillAttr.WinBrushHatchColor = self._rgb('#000000') + p.FillAttr.WinBrushFaceColor = self._rgb(color) + p.FillAttr.WindowsBrush = 1 + self.hwp.HAction.Execute("CellBorderFill", p.HSet) + except: pass + + def _underline_box(self, text, size=14, color='#008000'): + try: + self.hwp.HAction.GetDefault("TableCreate", self.hwp.HParameterSet.HTableCreation.HSet) + t = self.hwp.HParameterSet.HTableCreation + t.Rows, t.Cols, t.WidthType, t.HeightType = 1, 1, 0, 0 + t.WidthValue, t.HeightValue = self._mm(168), self._mm(10) + self.hwp.HAction.Execute("TableCreate", t.HSet) + self.hwp.HAction.GetDefault("InsertText", self.hwp.HParameterSet.HInsertText.HSet) + self.hwp.HParameterSet.HInsertText.Text = text + self.hwp.HAction.Execute("InsertText", self.hwp.HParameterSet.HInsertText.HSet) + self.hwp.HAction.Run("TableCellBlock") + self.hwp.HAction.GetDefault("CharShape", self.hwp.HParameterSet.HCharShape.HSet) + self.hwp.HParameterSet.HCharShape.Height = self._pt(size) + self.hwp.HParameterSet.HCharShape.TextColor = self._rgb(color) + self.hwp.HAction.Execute("CharShape", self.hwp.HParameterSet.HCharShape.HSet) + self.hwp.HAction.GetDefault("CellBorder", self.hwp.HParameterSet.HCellBorderFill.HSet) + c = self.hwp.HParameterSet.HCellBorderFill + c.BorderTypeTop = self.hwp.HwpLineType("None") + c.BorderTypeRight = self.hwp.HwpLineType("None") + c.BorderTypeLeft = self.hwp.HwpLineType("None") + self.hwp.HAction.Execute("CellBorder", c.HSet) + self.hwp.HAction.GetDefault("CellBorder", self.hwp.HParameterSet.HCellBorderFill.HSet) + c = self.hwp.HParameterSet.HCellBorderFill + c.BorderColorBottom = self._rgb(color) + c.BorderWidthBottom = self.hwp.HwpLineWidth("0.4mm") + self.hwp.HAction.Execute("CellBorder", c.HSet) + self.hwp.HAction.Run("Cancel") + self.hwp.HAction.Run("CloseEx") + self.hwp.HAction.Run("MoveDocEnd") + except: + self._set_font(size, True, color) + 
def _insert_heading(self, elem):
    """Insert an ``h1``/``h2``/``h3`` element as a heading.

    Font size and colour come from the style parser (defaults '14pt' and
    '#008000'). Any other tag name is treated as level 1.

    * Level 1: the first one creates the header with the heading text, later
      ones start a new section with that header; the title is then drawn via
      ``_underline_box`` and followed by spacing paragraphs.
    * Level 2: bold text prefixed with "■ ".
    * Level 3: bold text prefixed with "▸ " and a 3 left margin.
    """
    level = {'h1': 1, 'h2': 2, 'h3': 3}.get(elem.name, 1)
    heading_text = elem.get_text(strip=True)
    css = self.sp.get_element_style(elem)
    font_size = self.sp.parse_size(css.get('font-size', '14pt'))
    font_color = self.sp.parse_color(css.get('color', '#008000'))

    if level == 1:
        if not self.is_first_h1:
            self._new_section_with_header(heading_text)
        else:
            self._create_header(heading_text)
            self.is_first_h1 = False

        self._set_para('left', 130, before=0, after=0)
        self._underline_box(heading_text, font_size, font_color)
        self.hwp.BreakPara()
        self._set_para('left', 130, before=0, after=15)
        self.hwp.BreakPara()
        return

    # Levels 2 and 3 differ only in paragraph shape and bullet glyph.
    if level == 2:
        self._set_para('left', 150, before=20, after=8)
        bullet = "■ "
    else:
        self._set_para('left', 140, left=3, before=12, after=5)
        bullet = "▸ "
    self._set_font(font_size, True, font_color)
    self.hwp.insert_text(bullet + heading_text)
    self.hwp.BreakPara()
def _insert_table(self, table_elem):
    """Convert an HTML ``<table>`` element into an HWP table.

    Steps:
      1. Read the cell grid, expanding colspan/rowspan into empty
         placeholder cells, and collect per-cell style.
      2. Compute column widths in mm: explicit widths are kept, the rest
         share the remaining body width in proportion to their longest text.
      3. Append the widths to ``self.table_widths`` for HWPX post-processing.
      4-6. Create the table, fill the cells, and leave the table.

    Nothing is inserted when the table has no rows or no columns.

    Args:
        table_elem: the parsed ``<table>`` element.
    """

    def _span(value):
        """Parse a colspan/rowspan attribute; invalid or < 1 becomes 1."""
        try:
            return max(int(value), 1)
        except (TypeError, ValueError):
            return 1

    # === 1. Analyse the table structure ===
    rows_data = []
    cell_styles = {}
    occupied = {}
    max_cols = 0
    col_widths = []  # column widths in mm as specified in the HTML

    # Explicit widths from <colgroup>/<col>
    colgroup = table_elem.find('colgroup')
    if colgroup:
        for col in colgroup.find_all('col'):
            width = _parse_width(col.get('width') or col.get('style', ''))
            col_widths.append(width)

    # Collect row data
    for ri, tr in enumerate(table_elem.find_all('tr')):
        row = []
        ci = 0

        for cell in tr.find_all(['td', 'th']):
            # Skip grid positions already covered by a merged cell
            while (ri, ci) in occupied:
                row.append("")
                ci += 1

            txt = cell.get_text(strip=True)
            cs = _span(cell.get('colspan', 1))
            rs = _span(cell.get('rowspan', 1))

            # Remember the cell style
            cell_styles[(ri, ci)] = {
                'is_header': cell.name == 'th' or ri == 0,
                'align': _parse_align(cell),
                'bg_color': _parse_bg_color(cell)
            }

            # Take widths from the first row for columns the colgroup did not cover
            if ri == 0:
                width = _parse_width(cell.get('width') or cell.get('style', ''))
                for offset in range(cs):
                    if len(col_widths) <= ci + offset:
                        col_widths.append(width if offset == 0 else None)

            row.append(txt)

            # Mark the area covered by this cell's span
            for dr in range(rs):
                for dc in range(cs):
                    if dr > 0 or dc > 0:
                        occupied[(ri + dr, ci + dc)] = True

            # Placeholder cells for the rest of the colspan
            for _ in range(cs - 1):
                row.append("")
            ci += cs

        rows_data.append(row)
        max_cols = max(max_cols, len(row))

    # Make every row, and the width list, exactly max_cols long
    for row in rows_data:
        while len(row) < max_cols:
            row.append("")
    while len(col_widths) < max_cols:
        col_widths.append(None)
    # A <colgroup> may declare more columns than the table has; the extras
    # would otherwise index past the per-column text lengths below.
    del col_widths[max_cols:]

    rc = len(rows_data)
    if rc == 0 or max_cols == 0:
        return

    print(f" 표: {rc}행 × {max_cols}열")

    # === 2. Column widths (based on content length) ===
    body_width_mm = 170  # A4 body width (210mm minus 40mm of side margins)

    # Which columns already have a width
    specified_width = sum(w for w in col_widths if w is not None)
    unspecified_indices = [i for i, w in enumerate(col_widths) if w is None]

    if unspecified_indices:
        # Longest text per column; non-ASCII characters count double
        col_text_lengths = [0] * max_cols
        for row in rows_data:
            for ci, cell_text in enumerate(row):
                if ci < max_cols:
                    length = sum(2 if ord(c) > 127 else 1 for c in str(cell_text))
                    col_text_lengths[ci] = max(col_text_lengths[ci], length)

        # Guarantee a minimum of 8 units per column
        col_text_lengths = [max(length, 8) for length in col_text_lengths]

        # Total text length of the columns that still need a width
        unspecified_total_length = sum(col_text_lengths[i] for i in unspecified_indices)

        # Share the remaining width (at least 15mm per column) by text length
        remaining_width = max(body_width_mm - specified_width, 15 * len(unspecified_indices))

        for i in unspecified_indices:
            if unspecified_total_length > 0:
                ratio = col_text_lengths[i] / unspecified_total_length
                col_widths[i] = remaining_width * ratio
            else:
                col_widths[i] = remaining_width / len(unspecified_indices)

        print(f" 텍스트 길이: {col_text_lengths}")

    # Scale down proportionally when wider than the body
    total = sum(col_widths)
    if total > body_width_mm:
        ratio = body_width_mm / total
        col_widths = [w * ratio for w in col_widths]

    col_widths_mm = [round(w, 1) for w in col_widths]
    print(f" 열 너비(mm): {col_widths_mm}")

    # === 3. Keep the widths for HWPX post-processing ===
    self.table_widths.append(col_widths_mm)
    print(f" 📊 표 #{len(self.table_widths)} 저장 완료")

    # === 4. Create the HWP table ===
    self._set_para('left', 130, before=5, after=0)
    self.hwp.create_table(rc, max_cols, treat_as_char=True)

    # === 5. Fill in the cells ===
    for ri, row in enumerate(rows_data):
        for ci in range(max_cols):
            # Positions covered by a span get no content
            if (ri, ci) in occupied:
                self.hwp.HAction.Run("MoveRight")
                continue

            txt = row[ci] if ci < len(row) else ""
            style = cell_styles.get((ri, ci), {})
            hdr = style.get('is_header', False)

            # Background colour
            if hdr:
                self._set_cell_bg('#E8F5E9')
            elif style.get('bg_color'):
                self._set_cell_bg(style['bg_color'])

            # Alignment. The 'align' key is always present (None when the HTML
            # gives no alignment), so a .get() default would never apply;
            # `or` makes headers centre and data cells left-align as intended.
            align = style.get('align') or ('center' if hdr else 'left')
            if align == 'center':
                self.hwp.HAction.Run("ParagraphShapeAlignCenter")
            elif align == 'right':
                self.hwp.HAction.Run("ParagraphShapeAlignRight")
            else:
                self.hwp.HAction.Run("ParagraphShapeAlignLeft")

            # Font
            self._set_font(9 if hdr else 9.5, hdr, '#006400' if hdr else '#333333')
            self.hwp.insert_text(str(txt))

            # Move to the next cell (except after the last one)
            if not (ri == rc - 1 and ci == max_cols - 1):
                self.hwp.HAction.Run("MoveRight")

    # === 6. Leave the table ===
    self.hwp.HAction.Run("Cancel")
    self.hwp.HAction.Run("CloseEx")
    self.hwp.HAction.Run("MoveDocEnd")
    self._set_para('left', 130, before=5, after=5)
    self.hwp.BreakPara()
def _insert_table_from_element(self, elem: 'StyledElement'):
    """Insert a table described by ``elem.attributes['table_data']``.

    ``table_data['rows']`` is a list of rows, each a list of cell dicts with
    optional ``text`` and ``is_header`` keys. The table is created with as
    many columns as the longest row; shorter rows leave their trailing cells
    empty.

    Nothing happens when there is no table data, no rows, or every row is
    empty. Failures during insertion are printed, followed by a best-effort
    attempt to move the cursor out of the table.
    """
    table_data = elem.attributes.get('table_data', {})
    if not table_data:
        return

    rows = table_data.get('rows', [])
    if not rows:
        return

    num_rows = len(rows)
    num_cols = max(len(row) for row in rows)
    if num_cols == 0:
        return

    print(f" → 표 삽입: {num_rows}행 × {num_cols}열")

    try:
        # 1. Paragraph settings in front of the table
        self._set_para('left', 130, before=5, after=0)

        # 2. Create the table (pyhwpx built-in)
        self.hwp.create_table(num_rows, num_cols, treat_as_char=True)

        # 3. Fill the cells. Walk the full grid rather than each row's own
        #    length: a short row must still step across its missing cells,
        #    otherwise every later row lands in the wrong cells.
        for row_idx, row in enumerate(rows):
            for col_idx in range(num_cols):
                if col_idx < len(row):
                    cell = row[col_idx]
                    cell_text = cell.get('text', '')
                    is_header = cell.get('is_header', False)

                    # Header cell styling
                    if is_header:
                        self._set_cell_bg('#E8F5E9')
                        self.hwp.HAction.Run("ParagraphShapeAlignCenter")
                        self._set_font(9, True, '#006400')
                    else:
                        self._set_font(9.5, False, '#333333')

                    # Cell text
                    self.hwp.insert_text(cell_text)

                # Next cell (except after the last one)
                if not (row_idx == num_rows - 1 and col_idx == num_cols - 1):
                    self.hwp.HAction.Run("TableRightCell")

        # 4. Leave the table
        self.hwp.HAction.Run("Cancel")      # clear the selection
        self.hwp.HAction.Run("CloseEx")     # end table editing
        self.hwp.HAction.Run("MoveDocEnd")  # go to the end of the document

        # 5. Paragraph after the table
        self._set_para('left', 130, before=5, after=5)
        self.hwp.BreakPara()

        print(f" ✅ 표 삽입 완료")

    except Exception as e:
        print(f" [오류] 표 삽입 실패: {e}")
        # Try to escape in case the cursor is still inside the table
        try:
            self.hwp.HAction.Run("Cancel")
            self.hwp.HAction.Run("CloseEx")
            self.hwp.HAction.Run("MoveDocEnd")
        except Exception:
            pass
def _insert_highlight_box(self, elem):
    """Render an element's text inside a 1x1 table with a '#E2ECE2' background.

    Elements whose stripped text is empty are ignored. After the text is
    written the cursor is moved out of the table to the end of the document
    and a closing paragraph is added.
    """
    box_text = elem.get_text(strip=True)
    if box_text:
        self._set_para('left', 130, before=5, after=0)
        self.hwp.create_table(1, 1, treat_as_char=True)
        self._set_cell_bg('#E2ECE2')
        self._set_font(11, False, '#333333')
        self.hwp.insert_text(box_text)

        # Leave the table: clear selection, close editing, jump to document end.
        for action_name in ("Cancel", "CloseEx", "MoveDocEnd"):
            self.hwp.HAction.Run(action_name)

        self._set_para('left', 130, before=0, after=5)
        self.hwp.BreakPara()
def convert_with_styles(self, html_path, output_path, sty_path=None):
    """Convert HTML to HWP, then re-save as HWPX and fix table column widths.

    Workflow:
      1. Analyse the HTML with ``StyleAnalyzer`` and print a role summary.
      2. Build the document with ``convert()`` (saved to ``output_path``).
      3. Re-open that document and save it as HWPX next to it.
      4. When table widths were recorded, patch them into the HWPX via
         ``inject_table_widths``; a failure here is printed, not raised.

    Args:
        html_path: input HTML file (read as UTF-8).
        output_path: path ``convert()`` saves to.
        sty_path: accepted for API compatibility; not used by this method.

    Returns:
        Path of the HWPX file: ``output_path`` with its extension replaced
        by ``.hwpx``.
    """
    print("="*60)
    print("HTML → HWP 변환기 v11 (스타일 그루핑)")
    print("="*60)

    self.base_path = os.path.dirname(os.path.abspath(html_path))

    # === Step 1: analyse the HTML ===
    with open(html_path, 'r', encoding='utf-8') as f:
        html_content = f.read()

    analyzer = StyleAnalyzer()
    elements = analyzer.analyze(html_content)

    print(f" 🔧 HTML 전처리 중...")
    print(f" 📄 분석 완료: {len(elements)}개 요소")
    for role, count in analyzer.get_role_summary().items():
        print(f" {role}: {count}")

    # === Step 2: build the HWP with the regular convert() logic ===
    self.convert(html_path, output_path)

    # === Step 3: save again as .hwpx ===
    # Swap only the final extension. A plain str.replace('.hwp', '.hwpx')
    # also rewrites '.hwp' inside directory names and turns an existing
    # '.hwpx' into '.hwpxx'.
    root, _ext = os.path.splitext(output_path)
    hwpx_path = root + '.hwpx'

    # Re-open the HWP and save it as HWPX
    self.hwp.Open(output_path)
    self.hwp.SaveAs(hwpx_path, "HWPX")
    self.hwp.Clear(1)  # close the document

    print(f"\n 📦 HWPX 변환: {hwpx_path}")

    # === Step 4: style post-processing is skipped (convert() output is kept) ===
    print(f" ⏭️ 스타일 후처리 스킵 (convert 결과 유지)")

    # === Step 4-1: patch table column widths ===
    if self.table_widths:
        try:
            from converters.hwpx_table_injector import inject_table_widths
            inject_table_widths(hwpx_path, self.table_widths)
        except Exception as e:
            print(f" [경고] 표 열 너비 수정 실패: {e}")
            import traceback
            traceback.print_exc()

    # === Step 5: final output (the HWPX file) ===
    final_output = hwpx_path

    print(f"\n✅ 최종 저장: {final_output}")
    return final_output
def close(self):
    """Quit the HWP application on a best-effort basis.

    Failures while quitting are reported as a warning instead of being
    raised, so callers can always call this during cleanup.  Only ordinary
    exceptions are absorbed: ``KeyboardInterrupt`` and ``SystemExit`` still
    propagate, which a bare ``except`` would have swallowed.
    """
    try:
        self.hwp.Quit()
    except Exception as e:
        print(f" [경고] HWP 종료: {e}")
class Config:
    """Page layout settings for the briefing converter.

    The margin and header/footer values are passed through the converter's
    millimetre conversion in ``_setup_page``; the page and content sizes are
    presumably millimetres as well (210 x 297) — TODO confirm.
    """

    # Paper size
    PAGE_WIDTH = 210
    PAGE_HEIGHT = 297

    # Margins
    MARGIN_LEFT = 20
    MARGIN_RIGHT = 20
    MARGIN_TOP = 20
    MARGIN_BOTTOM = 15

    # Header / footer band heights
    HEADER_LEN = 10
    FOOTER_LEN = 10

    # Printable width: page width minus the left and right margins (170).
    CONTENT_WIDTH = PAGE_WIDTH - MARGIN_LEFT - MARGIN_RIGHT
def _rgb(self, hex_color):
    """Convert a hex colour string to an HWP colour value.

    Accepts ``#RRGGBB`` / ``RRGGBB`` (extra trailing characters such as an
    alpha pair are ignored) and the 3-digit shorthand ``#RGB``, which is
    expanded by doubling each digit.  Anything that cannot be parsed —
    ``None``, an empty string, too few digits, non-hex characters — yields
    black instead of raising.

    Args:
        hex_color: Colour string, with or without a leading ``#``.

    Returns:
        Whatever ``self.hwp.RGBColor(r, g, b)`` returns.
    """
    digits = str(hex_color).strip().lstrip('#') if hex_color else ''
    if len(digits) == 3:
        # CSS shorthand: '#abc' means '#aabbcc'.
        digits = ''.join(ch * 2 for ch in digits)
    digits = digits[:6]

    valid = len(digits) == 6 and all(ch in '0123456789abcdefABCDEF' for ch in digits)
    if not valid:
        return self.hwp.RGBColor(0, 0, 0)

    red, green, blue = (int(digits[i:i + 2], 16) for i in (0, 2, 4))
    return self.hwp.RGBColor(red, green, blue)
def _create_header(self, right_text=""):
    """Create the page header with right-aligned text.

    Runs the "HeaderFooter" action with style 0 and control type 0 (the
    footer counterpart in this class passes control type 1), right-aligns the
    paragraph, applies a 9pt '#4a5568' font, inserts ``right_text`` when it
    is non-empty and closes the header.  Any exception is reported as a
    warning and not re-raised.

    Args:
        right_text: Text to show in the header; empty leaves the header blank.
    """
    print(f" → 머리말 생성: {right_text or '(초기화)'}")
    try:
        hwp = self.hwp
        header_set = hwp.HParameterSet.HHeaderFooter.HSet

        hwp.HAction.GetDefault("HeaderFooter", header_set)
        for item in ("HeaderFooterStyle", "HeaderFooterCtrlType"):
            header_set.SetItem(item, 0)
        hwp.HAction.Execute("HeaderFooter", header_set)

        hwp.HAction.Run("ParagraphShapeAlignRight")
        self._set_font(9, False, '#4a5568')
        if right_text:
            hwp.insert_text(right_text)

        hwp.HAction.Run("CloseEx")
    except Exception as e:
        print(f" [경고] 머리말: {e}")
def _set_cell_bg(self, color_name):
    """Fill the current table cell with a palette colour.

    Loads the "CellBorderFill" defaults, sets the brush attributes on its
    ``FillAttr`` and executes the action.  The face colour is looked up in
    ``self.colors`` by name; unknown names fall back to the 'white' entry,
    which therefore must exist in the palette.

    Args:
        color_name: Key into ``self.colors``.
    """
    hwp = self.hwp
    border_fill = hwp.HParameterSet.HCellBorderFill
    hwp.HAction.GetDefault("CellBorderFill", border_fill.HSet)

    fill = border_fill.FillAttr
    fill.type = hwp.BrushType("NullBrush|WinBrush")
    fill.WinBrushFaceStyle = hwp.HatchStyle("None")
    fill.WinBrushHatchColor = hwp.RGBColor(0, 0, 0)

    palette = self.colors
    fallback = palette['white']
    fill.WinBrushFaceColor = palette.get(color_name, fallback)
    fill.WindowsBrush = 1

    hwp.HAction.Execute("CellBorderFill", border_fill.HSet)
def _convert_data_table(self, table):
    """Render an HTML ``data-table`` as an HWP table.

    The first row comes from the ``<th>`` cells of ``<thead>`` and the
    remaining rows from the ``<td>`` cells of each ``<tbody>`` row.  A cell
    containing a ``badge`` element is replaced by a bracketed marker:
    ``[✓ …]`` for badge-safe, ``[△ …]`` for badge-caution, ``[✗ …]`` for
    badge-risk and ``[…]`` for any other badge.

    Rows may have different cell counts (missing cells, a header narrower
    than the body).  The table is sized to the widest row and shorter rows
    are padded with empty cells, so that the one-"MoveRight"-per-cell
    navigation stays in step with the grid.  Nothing is created when no
    cells are found at all.

    Args:
        table: Parsed ``<table>`` element.
    """
    badge_marks = (('badge-safe', '✓'), ('badge-caution', '△'), ('badge-risk', '✗'))

    data = []

    thead = table.find("thead")
    if thead:
        data.append([th.get_text(strip=True) for th in thead.find_all("th")])

    tbody = table.find("tbody")
    if tbody:
        for tr in tbody.find_all("tr"):
            row = []
            for td in tr.find_all("td"):
                badge = td.find(class_="badge")
                if not badge:
                    row.append(td.get_text(strip=True))
                    continue
                badge_class = ' '.join(badge.get('class', []))
                badge_text = badge.get_text(strip=True)
                mark = next((m for name, m in badge_marks if name in badge_class), None)
                row.append(f"[{mark} {badge_text}]" if mark else f"[{badge_text}]")
            data.append(row)

    # Size the grid to the widest row; using only the first row would let
    # longer rows spill into the next one and shorter rows fall behind.
    cols = max((len(row) for row in data), default=0)
    if cols == 0:
        return
    rows = len(data)
    data = [row + [''] * (cols - len(row)) for row in data]
    print(f" → data-table: {rows}×{cols}")

    self.hwp.create_table(rows, cols, treat_as_char=True)

    last_cell = (rows - 1, cols - 1)
    for row_idx, row in enumerate(data):
        for col_idx, cell_text in enumerate(row):
            if row_idx == 0:
                # The first row is always styled as the header.
                self._set_cell_bg('primary-navy')
                self._font(9, 'white', True)
            elif col_idx == 0:
                self._set_cell_bg('bg-light')
                self._font(9.5, 'primary-navy', True)
            elif '[✓' in cell_text:
                self._font(9.5, 'badge-safe', True)
            elif '[△' in cell_text:
                self._font(9.5, 'badge-caution', True)
            elif '[✗' in cell_text:
                self._font(9.5, 'badge-risk', True)
            else:
                self._font(9.5, 'dark-gray', False)

            self._align('center')
            self.hwp.insert_text(cell_text)

            # Advance to the next cell, except after the final one.
            if (row_idx, col_idx) != last_cell:
                self.hwp.HAction.Run("MoveRight")

    self._exit_table()
def _convert_bottom_box(self, elem):
    """Render the ``bottom-box`` (key conclusion) as a 1x2 table.

    The left cell gets the 'primary-navy' background with white bold text,
    the right cell the 'bg-light' background with 'primary-navy' bold text;
    both are centred at 10.5pt.  Whitespace runs in the left text are
    collapsed to single spaces; the right text is only stripped.  Nothing is
    rendered unless both ``bottom-left`` and ``bottom-right`` are present.

    Args:
        elem: Parsed element containing the two halves.
    """
    left = elem.find(class_="bottom-left")
    right = elem.find(class_="bottom-right")
    if not (left and right):
        return

    # (cell background, font colour, text) for each cell, left to right.
    cells = (
        ('primary-navy', 'white', ' '.join(left.get_text().split())),
        ('bg-light', 'primary-navy', right.get_text(strip=True)),
    )
    print(" → bottom-box")

    self.hwp.create_table(1, 2, treat_as_char=True)
    for position, (background, font_color, text) in enumerate(cells):
        if position:
            self.hwp.HAction.Run("MoveRight")
        self._set_cell_bg(background)
        self._font(10.5, font_color, True)
        self._align('center')
        self.hwp.insert_text(text)

    self._exit_table()
1).strip() + + self._font(10.5, 'primary-navy', True) + self.hwp.insert_text(" • " + kw_text + " ") + self._font(10.5, 'dark-gray', False) + self.hwp.insert_text(rest) + self.hwp.BreakPara() + else: + self._para(" • " + li.get_text(strip=True), 10.5, 'dark-gray') + + qa_grid = section.find(class_="qa-grid") + if qa_grid: + self._convert_qa_grid(qa_grid) + + self._para() + + def _convert_sheet(self, sheet, is_first_page=False, footer_title=""): + """한 페이지(sheet) 변환""" + + # 첫 페이지에서만 머리말/꼬리말 설정 + if is_first_page: + # 머리말: page-header에서 텍스트 추출 + header = sheet.find(class_="page-header") + if header: + left = header.find(class_="header-left") + right = header.find(class_="header-right") + # 우측 텍스트 사용 (부서명 등) + header_text = right.get_text(strip=True) if right else "" + if header_text: + self._create_header(header_text) + + # 꼬리말: 제목 + 페이지번호 + self._create_footer(footer_title) + + # 대제목 + title = sheet.find(class_="header-title") + if title: + title_text = title.get_text(strip=True) + if '[첨부]' in title_text: + self._para(title_text, 15, 'primary-navy', True, 'left') + self._font(10, 'secondary-navy', False) + self._align('left') + self.hwp.insert_text("─" * 60) + self.hwp.BreakPara() + else: + self._para(title_text, 23, 'primary-navy', True, 'center') + self._font(10, 'secondary-navy', False) + self._align('center') + self.hwp.insert_text("━" * 45) + self.hwp.BreakPara() + + self._para() + + # 리드 박스 + lead_box = sheet.find(class_="lead-box") + if lead_box: + self._convert_lead_box(lead_box) + self._para() + + # 섹션들 + for section in sheet.find_all(class_="section"): + self._convert_section(section) + + # 하단 박스 + bottom_box = sheet.find(class_="bottom-box") + if bottom_box: + self._para() + self._convert_bottom_box(bottom_box) + + # ───────────────────────────────────────────────────────── + # 메인 변환 함수 + # ───────────────────────────────────────────────────────── + + def convert(self, html_path, output_path): + """HTML → HWP 변환 실행""" + + print("=" * 60) + print("HTML → 
def main(html_path=None, output_path=None):
    """Run the briefing conversion as a script entry point.

    Creates a visible ``HtmlToHwpConverter``, converts ``html_path`` to
    ``output_path`` and waits for Enter before finishing.  Errors are printed
    rather than raised.  The converter is closed in every case, including
    when the conversion fails, so a failed run does not leave the HWP
    instance open.

    Args:
        html_path: Input HTML file; defaults to the built-in sample path.
        output_path: Output HWP file; defaults to the built-in sample path.
    """
    if html_path is None:
        html_path = r"D:\for python\geulbeot-light\geulbeot-light\output\briefing.html"
    if output_path is None:
        output_path = r"D:\for python\geulbeot-light\geulbeot-light\output\briefing.hwp"

    print("=" * 60)
    print("HTML → HWP 변환기 (기획서)")
    print("=" * 60)
    print()

    converter = None
    try:
        converter = HtmlToHwpConverter(visible=True)
        converter.convert(html_path, output_path)

        print("\n" + "=" * 60)
        print("✅ 변환 완료!")
        print("=" * 60)

        input("\nEnter를 누르면 HWP가 닫힙니다...")

    except FileNotFoundError:
        print(f"\n[에러] 파일을 찾을 수 없습니다: {html_path}")
        print("경로를 확인해주세요.")
    except Exception as e:
        print(f"\n[에러] {e}")
        import traceback
        traceback.print_exc()
    finally:
        # Close on success and on failure; None means construction failed.
        if converter is not None:
            converter.close()
@dataclass
class HwpStyle:
    """Definition of one HWP style: identity plus character and paragraph formatting.

    The units noted below follow how ``HwpStyGenerator`` in this module
    converts each field when applying it.
    """
    # Numeric style identifier (presumably the HWP style slot — TODO confirm).
    id: int
    # Style name as shown in HWP, e.g. "개요 1" or "바탕글".
    name: str
    # Paragraph or character style.
    type: HwpStyleType
    # Font size in points.
    font_size: float
    font_bold: bool = False
    # Text colour as an RRGGBB hex string without a leading '#'.
    font_color: str = "000000"
    # One of 'left', 'center', 'right', 'justify'.
    align: str = "justify"
    # Line spacing as a percentage.
    line_spacing: float = 160
    # Space before / after the paragraph, in points.
    space_before: float = 0
    space_after: float = 0
    # Left margin and first-line indent, in millimetres.
    indent_left: float = 0
    indent_first: float = 0
    # Background colour as a hex string, or None for no background.
    # NOTE(review): not read by the apply_* helpers visible in this module.
    bg_color: Optional[str] = None
font_size=20, font_bold=True, align="left", + space_before=30, space_after=15, font_color="1a365d" + ), + "H2": HwpStyle( + id=2, name="개요 2", type=HwpStyleType.PARAGRAPH, + font_size=16, font_bold=True, align="left", + space_before=20, space_after=10, font_color="2c5282" + ), + "H3": HwpStyle( + id=3, name="개요 3", type=HwpStyleType.PARAGRAPH, + font_size=14, font_bold=True, align="left", + space_before=15, space_after=8, font_color="2b6cb0" + ), + "H4": HwpStyle( + id=4, name="개요 4", type=HwpStyleType.PARAGRAPH, + font_size=12, font_bold=True, align="left", + space_before=10, space_after=5, indent_left=10 + ), + "H5": HwpStyle( + id=5, name="개요 5", type=HwpStyleType.PARAGRAPH, + font_size=11, font_bold=True, align="left", + space_before=8, space_after=4, indent_left=20 + ), + "H6": HwpStyle( + id=6, name="개요 6", type=HwpStyleType.PARAGRAPH, + font_size=11, font_bold=False, align="left", + indent_left=30 + ), + "H7": HwpStyle( + id=7, name="개요 7", type=HwpStyleType.PARAGRAPH, + font_size=10.5, font_bold=False, align="left", + indent_left=40 + ), + + # 본문 + "BODY": HwpStyle( + id=20, name="바탕글", type=HwpStyleType.PARAGRAPH, + font_size=11, align="justify", + line_spacing=180, indent_first=10 + ), + "LIST_ITEM": HwpStyle( + id=8, name="개요 8", type=HwpStyleType.PARAGRAPH, + font_size=11, align="left", + indent_left=15, line_spacing=160 + ), + "HIGHLIGHT_BOX": HwpStyle( + id=21, name="강조박스", type=HwpStyleType.PARAGRAPH, + font_size=10.5, align="left", + bg_color="f7fafc", indent_left=10, indent_first=0 + ), + + # 표 + "TABLE": HwpStyle( + id=30, name="표", type=HwpStyleType.PARAGRAPH, + font_size=10, align="center" + ), + "TH": HwpStyle( + id=11, name="표제목", type=HwpStyleType.PARAGRAPH, + font_size=10, font_bold=True, align="center", + bg_color="e2e8f0" + ), + "TD": HwpStyle( + id=31, name="표내용", type=HwpStyleType.PARAGRAPH, + font_size=10, align="left" + ), + "TABLE_CAPTION": HwpStyle( + id=19, name="표캡션", type=HwpStyleType.PARAGRAPH, + font_size=10, font_bold=True, 
class HwpStyleMapper:
    """Look up the HWP style for an HTML role.

    Starts from a shallow copy of ``DEFAULT_STYLES`` and overlays any custom
    styles passed to the constructor.
    """

    def __init__(self, custom_styles: "Optional[Dict[str, HwpStyle]]" = None):
        """Build the role → style table.

        Args:
            custom_styles: Optional overrides keyed by role name.
        """
        merged = dict(DEFAULT_STYLES)
        if custom_styles:
            merged.update(custom_styles)
        self.styles = merged

    def get_style(self, role: str) -> "HwpStyle":
        """Return the style for ``role``, or the "UNKNOWN" style if unmapped."""
        fallback = self.styles["UNKNOWN"]
        return self.styles.get(role, fallback)

    def get_style_id(self, role: str) -> int:
        """Return the ``id`` of the style resolved for ``role``."""
        style = self.get_style(role)
        return style.id

    def get_all_styles(self) -> "Dict[str, HwpStyle]":
        """Return the mapper's own role → style dict (not a copy)."""
        return self.styles
def update_from_html(self, html_styles: Dict[str, Dict]):
    """Merge styles extracted from HTML into ``self.styles``.

    For a role that exists in ``DEFAULT_STYLES`` a new ``HwpStyle`` is built:
    id, name and type come from the default, every other field is taken from
    the HTML style dict when present and from the default otherwise.  A
    string 'color' has its leading '#' removed.  Roles unknown to
    ``DEFAULT_STYLES`` are mapped to the default "UNKNOWN" style, ignoring
    their HTML values.  Finally every default role still missing from
    ``self.styles`` is filled in with its default.

    Args:
        html_styles: Mapping of role name to a dict with optional keys
            'font_size', 'bold', 'color', 'align', 'line_spacing',
            'space_before', 'space_after', 'indent_left', 'indent_first'
            and 'bg_color'.
    """
    # (key in the HTML style dict, HwpStyle field it overrides)
    overridable = (
        ('font_size', 'font_size'),
        ('bold', 'font_bold'),
        ('align', 'align'),
        ('line_spacing', 'line_spacing'),
        ('space_before', 'space_before'),
        ('space_after', 'space_after'),
        ('indent_left', 'indent_left'),
        ('indent_first', 'indent_first'),
        ('bg_color', 'bg_color'),
    )

    for role, style_dict in html_styles.items():
        if role not in DEFAULT_STYLES:
            # Unknown role: use the default style.
            self.styles[role] = DEFAULT_STYLES.get('UNKNOWN')
            continue

        base = DEFAULT_STYLES[role]

        # Colours are stored without the leading '#'.
        color = style_dict.get('color', base.font_color)
        if isinstance(color, str):
            color = color.lstrip('#')

        fields = {
            attr: style_dict.get(key, getattr(base, attr))
            for key, attr in overridable
        }
        self.styles[role] = HwpStyle(
            id=base.id,
            name=base.name,
            type=base.type,
            font_color=color,
            **fields,
        )

    # Fill in every role that was not provided.
    for role, default in DEFAULT_STYLES.items():
        self.styles.setdefault(role, default)
def get_style(self, role: str) -> "HwpStyle":
    """Return the style registered for ``role``.

    Falls back to the "UNKNOWN" entry of ``DEFAULT_STYLES`` (``None`` if
    that entry is absent) when the role has no style of its own.
    """
    if role in self.styles:
        return self.styles[role]
    return DEFAULT_STYLES.get('UNKNOWN')
import re

# Leading numbering/bullet pattern per role.  The numeric patterns refuse to
# match when the number continues, so decimals ("1.5배") and deeper outline
# numbers ("1.1.1" seen as H2) are left intact rather than cut in half.  A
# trailing dot after the last number ("1.1. ") is consumed with the numbering.
NUMBERING_PATTERNS = {
    'H1': re.compile(r'^(\d+)\.(?!\d)\s*'),                                    # "1. " → ""
    'H2': re.compile(r'^(\d+)\.(\d+)(?:\.(?!\d))?(?![\d.])\s*'),              # "1.1 " → ""
    'H3': re.compile(r'^(\d+)\.(\d+)\.(\d+)(?:\.(?!\d))?(?![\d.])\s*'),       # "1.1.1 " → ""
    # Only the fourteen outline syllables; the range [가-하] would match
    # every Hangul syllable between those two code points.
    'H4': re.compile(r'^[가나다라마바사아자차카타파하]\.\s*'),                 # "가. " → ""
    'H5': re.compile(r'^(\d+)\)\s*'),                                          # "1) " → ""
    'H6': re.compile(r'^\((\d+)\)\s*'),                                        # "(1) " → ""
    'H7': re.compile(r'^[①②③④⑤⑥⑦⑧⑨⑩]\s*'),                                    # "① " → ""
    'LIST_ITEM': re.compile(r'^[•\-○]\s*'),                                    # "• " → ""
}


def strip_numbering(text: str, role: str) -> str:
    """
    Remove the leading number or bullet that belongs to ``role``.

    HWP's outline feature generates numbering itself, so numbering carried
    over from the HTML text would appear twice.

    Args:
        text: Heading or list-item text; empty or ``None`` is returned as is.
        role: Role name used to pick the pattern from ``NUMBERING_PATTERNS``.

    Returns:
        The text with surrounding whitespace removed and, when the role has a
        pattern that matches at the start, without that numbering prefix.
    """
    if not text:
        return text

    pattern = NUMBERING_PATTERNS.get(role)
    if pattern:
        # Strip leading whitespace first so the '^' anchor sees the numbering.
        return pattern.sub('', text.lstrip()).strip()

    return text.strip()
Code/geulbeot_10th/converters/hwpx_generator.py @@ -0,0 +1,431 @@ +""" +HWPX 파일 생성기 +StyleAnalyzer 결과를 받아 스타일이 적용된 HWPX 파일 생성 +""" + +import os +import zipfile +import xml.etree.ElementTree as ET +from typing import List, Dict, Optional +from dataclasses import dataclass +from pathlib import Path + +from style_analyzer import StyleAnalyzer, StyledElement +from hwp_style_mapping import HwpStyleMapper, HwpStyle, ROLE_TO_STYLE_NAME + + +@dataclass +class HwpxConfig: + """HWPX 생성 설정""" + paper_width: int = 59528 # A4 너비 (hwpunit, 1/7200 inch) + paper_height: int = 84188 # A4 높이 + margin_left: int = 8504 + margin_right: int = 8504 + margin_top: int = 5668 + margin_bottom: int = 4252 + default_font: str = "함초롬바탕" + default_font_size: int = 1000 # 10pt (hwpunit) + + +class HwpxGenerator: + """HWPX 파일 생성기""" + + def __init__(self, config: Optional[HwpxConfig] = None): + self.config = config or HwpxConfig() + self.mapper = HwpStyleMapper() + self.used_styles: set = set() + + def generate(self, elements: List[StyledElement], output_path: str) -> str: + """ + StyledElement 리스트로부터 HWPX 파일 생성 + + Args: + elements: StyleAnalyzer로 분류된 요소 리스트 + output_path: 출력 파일 경로 (.hwpx) + + Returns: + 생성된 파일 경로 + """ + # 사용된 스타일 수집 + self.used_styles = {e.role for e in elements} + + # 임시 디렉토리 생성 + temp_dir = Path(output_path).with_suffix('.temp') + temp_dir.mkdir(parents=True, exist_ok=True) + + try: + # HWPX 구조 생성 + self._create_mimetype(temp_dir) + self._create_meta_inf(temp_dir) + self._create_version(temp_dir) + self._create_header(temp_dir) + self._create_content(temp_dir, elements) + self._create_settings(temp_dir) + + # ZIP으로 압축 + self._create_hwpx(temp_dir, output_path) + + return output_path + + finally: + # 임시 파일 정리 + import shutil + if temp_dir.exists(): + shutil.rmtree(temp_dir) + + def _create_mimetype(self, temp_dir: Path): + """mimetype 파일 생성""" + mimetype_path = temp_dir / "mimetype" + mimetype_path.write_text("application/hwp+zip") + + def _create_meta_inf(self, temp_dir: 
Path): + """META-INF/manifest.xml 생성""" + meta_dir = temp_dir / "META-INF" + meta_dir.mkdir(exist_ok=True) + + manifest = """ + + + + + + +""" + + (meta_dir / "manifest.xml").write_text(manifest, encoding='utf-8') + + def _create_version(self, temp_dir: Path): + """version.xml 생성""" + version = """ +""" + + (temp_dir / "version.xml").write_text(version, encoding='utf-8') + + def _create_header(self, temp_dir: Path): + """Contents/header.xml 생성 (스타일 정의 포함)""" + contents_dir = temp_dir / "Contents" + contents_dir.mkdir(exist_ok=True) + + # 스타일별 속성 생성 + char_props_xml = self._generate_char_properties() + para_props_xml = self._generate_para_properties() + styles_xml = self._generate_styles_xml() + + header = f""" + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +{char_props_xml} +{para_props_xml} +{styles_xml} + + + + + +""" + + (contents_dir / "header.xml").write_text(header, encoding='utf-8') + + def _generate_char_properties(self) -> str: + """글자 속성 XML 생성""" + lines = [f' '] + + # 기본 글자 속성 (id=0) + lines.append(''' + + + + + + + + + + ''') + + # 역할별 글자 속성 + for idx, role in enumerate(sorted(self.used_styles), start=1): + style = self.mapper.get_style(role) + height = int(style.font_size * 100) # pt → hwpunit + color = style.font_color.lstrip('#') + font_id = "1" if style.font_bold else "0" # 굵게면 함초롬돋움 + + lines.append(f''' + + + + + + + + + + ''') + + lines.append(' ') + return '\n'.join(lines) + + def _generate_para_properties(self) -> str: + """문단 속성 XML 생성""" + lines = [f' '] + + # 기본 문단 속성 (id=0) + lines.append(''' + + + + + + + + + + + + + + + + ''') + + # 역할별 문단 속성 + align_map = {"left": "LEFT", "center": "CENTER", "right": "RIGHT", "justify": "JUSTIFY"} + + for idx, role in enumerate(sorted(self.used_styles), start=1): + style = self.mapper.get_style(role) + align_val = align_map.get(style.align, "JUSTIFY") + line_spacing = int(style.line_spacing) + left_margin = int(style.indent_left * 100) + indent = 
int(style.indent_first * 100) + space_before = int(style.space_before * 100) + space_after = int(style.space_after * 100) + + lines.append(f''' + + + + + + + + + + + + + + + + ''') + + lines.append(' ') + return '\n'.join(lines) + + def _generate_styles_xml(self) -> str: + """스타일 정의 XML 생성 (charPrIDRef, paraPrIDRef 참조)""" + lines = [f' '] + + # 기본 스타일 (id=0, 바탕글) + lines.append(' ') + + # 역할별 스타일 (charPrIDRef, paraPrIDRef 참조) + for idx, role in enumerate(sorted(self.used_styles), start=1): + style = self.mapper.get_style(role) + style_name = style.name.replace('<', '<').replace('>', '>') + + lines.append(f' ') + + lines.append(' ') + return '\n'.join(lines) + + def _create_content(self, temp_dir: Path, elements: List[StyledElement]): + """Contents/section0.xml 생성 (본문 + 스타일 참조)""" + contents_dir = temp_dir / "Contents" + + # 문단 XML 생성 + paragraphs = [] + current_table = None + + # 역할 → 스타일 인덱스 매핑 생성 + role_to_idx = {role: idx for idx, role in enumerate(sorted(self.used_styles), start=1)} + + for elem in elements: + style = self.mapper.get_style(elem.role) + style_idx = role_to_idx.get(elem.role, 0) + + # 테이블 요소는 특수 처리 + if elem.role in ["TH", "TD", "TABLE_CAPTION", "TABLE", "FIGURE"]: + continue # 테이블/그림은 별도 처리 필요 + + # 일반 문단 + para_xml = self._create_paragraph(elem.text, style, style_idx) + paragraphs.append(para_xml) + + section = f""" + +{"".join(paragraphs)} +""" + + (contents_dir / "section0.xml").write_text(section, encoding='utf-8') + + def _create_paragraph(self, text: str, style: HwpStyle, style_idx: int) -> str: + """단일 문단 XML 생성""" + text = self._escape_xml(text) + + return f''' + + + {text} + + ''' + + def _escape_xml(self, text: str) -> str: + """XML 특수문자 이스케이프""" + return (text + .replace("&", "&") + .replace("<", "<") + .replace(">", ">") + .replace('"', """) + .replace("'", "'")) + + def _create_settings(self, temp_dir: Path): + """settings.xml 생성""" + settings = """ + + + + + +""" + + (temp_dir / "settings.xml").write_text(settings, 
encoding='utf-8') + + def _create_hwpx(self, temp_dir: Path, output_path: str): + """HWPX 파일 생성 (ZIP 압축)""" + with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zf: + # mimetype은 압축하지 않고 첫 번째로 + mimetype_path = temp_dir / "mimetype" + zf.write(mimetype_path, "mimetype", compress_type=zipfile.ZIP_STORED) + + # 나머지 파일들 + for root, dirs, files in os.walk(temp_dir): + for file in files: + if file == "mimetype": + continue + file_path = Path(root) / file + arcname = file_path.relative_to(temp_dir) + zf.write(file_path, arcname) + + +def convert_html_to_hwpx(html: str, output_path: str) -> str: + """ + HTML → HWPX 변환 메인 함수 + + Args: + html: HTML 문자열 + output_path: 출력 파일 경로 + + Returns: + 생성된 파일 경로 + """ + # 1. HTML 분석 → 역할 분류 + analyzer = StyleAnalyzer() + elements = analyzer.analyze(html) + + print(f"📊 분석 완료: {len(elements)}개 요소") + for role, count in analyzer.get_role_summary().items(): + print(f" {role}: {count}") + + # 2. HWPX 생성 + generator = HwpxGenerator() + result_path = generator.generate(elements, output_path) + + print(f"✅ 생성 완료: {result_path}") + return result_path + + +if __name__ == "__main__": + # 테스트 + test_html = """ + + +
+

건설·토목 측량 DX 실무지침

+

드론/UAV·GIS·지형/지반 모델 기반

+

2024년 1월

+
+ +

1. 개요

+

본 보고서는 건설 및 토목 분야의 측량 디지털 전환에 대한 실무 지침을 제공합니다.

+ +

1.1 배경

+

최근 드론과 GIS 기술의 발전으로 측량 업무가 크게 변화하고 있습니다.

+ +

1.1.1 기술 동향

+

1) 드론 측량의 발전

+

드론을 활용한 측량은 기존 방식 대비 효율성이 크게 향상되었습니다.

+ +

(1) RTK 드론

+

실시간 보정 기능을 갖춘 RTK 드론이 보급되고 있습니다.

+ +
    +
  • 고정밀 GPS 수신기 내장
  • +
  • 센티미터 단위 정확도
  • +
+ + + """ + + output = "/home/claude/test_output.hwpx" + convert_html_to_hwpx(test_html, output) \ No newline at end of file diff --git a/03. Code/geulbeot_10th/converters/hwpx_style_injector.py b/03. Code/geulbeot_10th/converters/hwpx_style_injector.py new file mode 100644 index 0000000..9719876 --- /dev/null +++ b/03. Code/geulbeot_10th/converters/hwpx_style_injector.py @@ -0,0 +1,750 @@ +""" +HWPX 스타일 주입기 +pyhwpx로 생성된 HWPX 파일에 커스텀 스타일을 후처리로 주입 + +워크플로우: +1. HWPX 압축 해제 +2. header.xml에 커스텀 스타일 정의 추가 +3. section*.xml에서 역할별 styleIDRef 매핑 +4. 다시 압축 +""" + +import os +import re +import zipfile +import shutil +import tempfile +from pathlib import Path +from typing import Dict, List, Optional +from dataclasses import dataclass + + +@dataclass +class StyleDefinition: + """스타일 정의""" + id: int + name: str + font_size: int # hwpunit (pt * 100) + font_bold: bool + font_color: str # #RRGGBB + align: str # LEFT, CENTER, RIGHT, JUSTIFY + line_spacing: int # percent (160 = 160%) + indent_left: int # hwpunit + indent_first: int # hwpunit + space_before: int # hwpunit + space_after: int # hwpunit + outline_level: int = -1 # 🆕 개요 수준 (-1=없음, 0=1수준, 1=2수준, ...) + + +# 역할 → 스타일 정의 매핑 +ROLE_STYLES: Dict[str, StyleDefinition] = { + # 🆕 개요 문단 (자동 번호 매기기!) 
+ 'H1': StyleDefinition( + id=101, name='제1장 제목', font_size=2200, font_bold=True, + font_color='#006400', align='CENTER', line_spacing=200, + indent_left=0, indent_first=0, space_before=400, space_after=200, + outline_level=0 # 🆕 제^1장 + ), + 'H2': StyleDefinition( + id=102, name='1.1 제목', font_size=1500, font_bold=True, + font_color='#03581d', align='LEFT', line_spacing=200, + indent_left=0, indent_first=0, space_before=300, space_after=100, + outline_level=1 # 🆕 ^1.^2 + ), + 'H3': StyleDefinition( + id=103, name='1.1.1 제목', font_size=1400, font_bold=True, + font_color='#228B22', align='LEFT', line_spacing=200, + indent_left=500, indent_first=0, space_before=200, space_after=100, + outline_level=2 # 🆕 ^1.^2.^3 + ), + 'H4': StyleDefinition( + id=104, name='가. 제목', font_size=1300, font_bold=True, + font_color='#000000', align='LEFT', line_spacing=200, + indent_left=1000, indent_first=0, space_before=150, space_after=50, + outline_level=3 # 🆕 ^4. + ), + 'H5': StyleDefinition( + id=105, name='1) 제목', font_size=1200, font_bold=True, + font_color='#000000', align='LEFT', line_spacing=200, + indent_left=1500, indent_first=0, space_before=100, space_after=50, + outline_level=4 # 🆕 ^5) + ), + 'H6': StyleDefinition( + id=106, name='가) 제목', font_size=1150, font_bold=True, + font_color='#000000', align='LEFT', line_spacing=200, + indent_left=2000, indent_first=0, space_before=100, space_after=50, + outline_level=5 # 🆕 ^6) + ), + 'H7': StyleDefinition( + id=115, name='① 제목', font_size=1100, font_bold=True, + font_color='#000000', align='LEFT', line_spacing=200, + indent_left=2300, indent_first=0, space_before=100, space_after=50, + outline_level=6 # 🆕 ^7 (원문자) + ), + # 본문 스타일 (개요 아님) + 'BODY': StyleDefinition( + id=107, name='○본문', font_size=1100, font_bold=False, + font_color='#000000', align='JUSTIFY', line_spacing=200, + indent_left=1500, indent_first=0, space_before=0, space_after=0 + ), + 'LIST_ITEM': StyleDefinition( + id=108, name='●본문', font_size=1050, font_bold=False, 
+ font_color='#000000', align='JUSTIFY', line_spacing=200, + indent_left=2500, indent_first=0, space_before=0, space_after=0 + ), + 'TABLE_CAPTION': StyleDefinition( + id=109, name='<표 제목>', font_size=1100, font_bold=True, + font_color='#000000', align='LEFT', line_spacing=130, + indent_left=0, indent_first=0, space_before=200, space_after=100 + ), + 'FIGURE_CAPTION': StyleDefinition( + id=110, name='<그림 제목>', font_size=1100, font_bold=True, + font_color='#000000', align='CENTER', line_spacing=130, + indent_left=0, indent_first=0, space_before=100, space_after=200 + ), + 'COVER_TITLE': StyleDefinition( + id=111, name='표지제목', font_size=2800, font_bold=True, + font_color='#1a365d', align='CENTER', line_spacing=150, + indent_left=0, indent_first=0, space_before=0, space_after=200 + ), + 'COVER_SUBTITLE': StyleDefinition( + id=112, name='표지부제', font_size=1800, font_bold=False, + font_color='#2d3748', align='CENTER', line_spacing=150, + indent_left=0, indent_first=0, space_before=0, space_after=100 + ), + 'TOC_1': StyleDefinition( + id=113, name='목차1수준', font_size=1200, font_bold=True, + font_color='#000000', align='LEFT', line_spacing=180, + indent_left=0, indent_first=0, space_before=100, space_after=50 + ), + 'TOC_2': StyleDefinition( + id=114, name='목차2수준', font_size=1100, font_bold=False, + font_color='#000000', align='LEFT', line_spacing=180, + indent_left=500, indent_first=0, space_before=0, space_after=0 + ), +} + +# ⚠️ 개요 자동 번호 기능 활성화! 
+# idRef="0"은 numbering id=1을 참조하므로, 해당 패턴을 교체하면 동작함 + + +class HwpxStyleInjector: + """HWPX 스타일 주입기""" + + def __init__(self): + self.temp_dir: Optional[Path] = None + self.role_to_style_id: Dict[str, int] = {} + self.role_to_para_id: Dict[str, int] = {} # 🆕 + self.role_to_char_id: Dict[str, int] = {} # 🆕 + self.next_char_id = 0 + self.next_para_id = 0 + self.next_style_id = 0 + + def _find_max_ids(self): + """기존 스타일 교체: 바탕글(id=0)만 유지, 나머지는 우리 스타일로 교체""" + header_path = self.temp_dir / "Contents" / "header.xml" + if not header_path.exists(): + self.next_char_id = 1 + self.next_para_id = 1 + self.next_style_id = 1 + return + + content = header_path.read_text(encoding='utf-8') + + # 🆕 기존 "본문", "개요 1~10" 등 스타일 제거 (id=1~22) + # 바탕글(id=0)만 유지! + + # style id=1~30 제거 (바탕글 제외) + content = re.sub(r'\s*', '', content) + + # itemCnt는 나중에 _update_item_counts에서 자동 업데이트됨 + + # 파일 저장 + header_path.write_text(content, encoding='utf-8') + print(f" [INFO] 기존 스타일(본문, 개요1~10 등) 제거 완료") + + # charPr, paraPr은 기존 것 다음부터 (참조 깨지지 않도록) + char_ids = [int(m) for m in re.findall(r' str: + """ + HWPX 파일에 커스텀 스타일 주입 + + Args: + hwpx_path: 원본 HWPX 파일 경로 + role_positions: 역할별 위치 정보 {role: [(section_idx, para_idx), ...]} + + Returns: + 수정된 HWPX 파일 경로 + """ + print(f"\n🎨 HWPX 스타일 주입 시작...") + print(f" 입력: {hwpx_path}") + + # 1. 임시 디렉토리에 압축 해제 + self.temp_dir = Path(tempfile.mkdtemp(prefix='hwpx_inject_')) + print(f" 임시 폴더: {self.temp_dir}") + + try: + with zipfile.ZipFile(hwpx_path, 'r') as zf: + zf.extractall(self.temp_dir) + + # 압축 해제 직후 section 파일 크기 확인 + print(f" [DEBUG] After unzip:") + for sec in ['section0.xml', 'section1.xml', 'section2.xml']: + sec_path = self.temp_dir / "Contents" / sec + if sec_path.exists(): + print(f" [DEBUG] {sec} size: {sec_path.stat().st_size} bytes") + + # 🆕 기존 최대 ID 찾기 (연속 ID 할당을 위해) + self._find_max_ids() + print(f" [DEBUG] Starting IDs: char={self.next_char_id}, para={self.next_para_id}, style={self.next_style_id}") + + # 2. 
header.xml에 스타일 정의 추가 + used_roles = set(role_positions.keys()) + self._inject_header_styles(used_roles) + + # 3. section*.xml에 styleIDRef 매핑 + self._inject_section_styles(role_positions) + + # 4. 다시 압축 + output_path = hwpx_path # 원본 덮어쓰기 + self._repack_hwpx(output_path) + + print(f" ✅ 스타일 주입 완료: {output_path}") + return output_path + + finally: + # 임시 폴더 정리 + if self.temp_dir and self.temp_dir.exists(): + shutil.rmtree(self.temp_dir) + + def _inject_header_styles(self, used_roles: set): + """header.xml에 스타일 정의 추가 (모든 ROLE_STYLES 주입)""" + header_path = self.temp_dir / "Contents" / "header.xml" + if not header_path.exists(): + print(" [경고] header.xml 없음") + return + + content = header_path.read_text(encoding='utf-8') + + # 🆕 모든 ROLE_STYLES 주입 (used_roles 무시) + char_props = [] + para_props = [] + styles = [] + + for role, style_def in ROLE_STYLES.items(): + char_id = self.next_char_id + para_id = self.next_para_id + style_id = self.next_style_id + + self.role_to_style_id[role] = style_id + self.role_to_para_id[role] = para_id # 🆕 + self.role_to_char_id[role] = char_id # 🆕 + + # charPr 생성 + char_props.append(self._make_char_pr(char_id, style_def)) + + # paraPr 생성 + para_props.append(self._make_para_pr(para_id, style_def)) + + # style 생성 + styles.append(self._make_style(style_id, style_def.name, para_id, char_id)) + + self.next_char_id += 1 + self.next_para_id += 1 + self.next_style_id += 1 + + if not styles: + print(" [정보] 주입할 스타일 없음") + return + + # charProperties에 추가 + content = self._insert_before_tag( + content, '', '\n'.join(char_props) + '\n' + ) + + # paraProperties에 추가 + content = self._insert_before_tag( + content, '', '\n'.join(para_props) + '\n' + ) + + # styles에 추가 + content = self._insert_before_tag( + content, '', '\n'.join(styles) + '\n' + ) + + # 🆕 numbering id=1 패턴 교체 (idRef="0"이 참조하는 기본 번호 모양) + # 이렇게 하면 개요 자동 번호가 "제1장, 1.1, 1.1.1..." 형식으로 동작! 
+ content = self._replace_default_numbering(content) + + # itemCnt 업데이트 + content = self._update_item_counts(content) + + header_path.write_text(content, encoding='utf-8') + print(f" → header.xml 수정 완료 ({len(styles)}개 스타일 추가)") + + def _make_char_pr(self, id: int, style: StyleDefinition) -> str: + """charPr XML 생성 (한 줄로!)""" + color = style.font_color.lstrip('#') + font_id = "1" if style.font_bold else "0" + + return f'' + + def _make_para_pr(self, id: int, style: StyleDefinition) -> str: + """paraPr XML 생성 (한 줄로!)""" + # 개요 문단이면 type="OUTLINE", 아니면 type="NONE" + # idRef="0"은 numbering id=1 (기본 번호 모양)을 참조 + if style.outline_level >= 0: + heading = f'' + else: + heading = '' + + return f'{heading}' + + def _make_style(self, id: int, name: str, para_id: int, char_id: int) -> str: + """style XML 생성""" + safe_name = name.replace('<', '<').replace('>', '>') + return f'' + + def _insert_before_tag(self, content: str, tag: str, insert_text: str) -> str: + """특정 태그 앞에 텍스트 삽입""" + return content.replace(tag, insert_text + tag) + + def _update_item_counts(self, content: str) -> str: + """itemCnt 속성 업데이트""" + # charProperties itemCnt + char_count = content.count(' str: + """numbering id=1의 패턴을 우리 패턴으로 교체""" + # 우리가 원하는 개요 번호 패턴 + new_patterns = [ + {'level': '1', 'format': 'DIGIT', 'pattern': '제^1장'}, + {'level': '2', 'format': 'DIGIT', 'pattern': '^1.^2'}, + {'level': '3', 'format': 'DIGIT', 'pattern': '^1.^2.^3'}, + {'level': '4', 'format': 'HANGUL_SYLLABLE', 'pattern': '^4.'}, + {'level': '5', 'format': 'DIGIT', 'pattern': '^5)'}, + {'level': '6', 'format': 'HANGUL_SYLLABLE', 'pattern': '^6)'}, + {'level': '7', 'format': 'CIRCLED_DIGIT', 'pattern': '^7'}, + ] + + # numbering id="1" 찾기 + match = re.search(r'(]*>)(.*?)()', content, re.DOTALL) + if not match: + print(" [경고] numbering id=1 없음, 교체 건너뜀") + return content + + numbering_content = match.group(2) + + for np in new_patterns: + level = np['level'] + fmt = np['format'] + pattern = np['pattern'] + + # 해당 level의 paraHead 
찾아서 교체 + def replace_parahead(m): + tag = m.group(0) + # numFormat 변경 + tag = re.sub(r'numFormat="[^"]*"', f'numFormat="{fmt}"', tag) + # 패턴(텍스트 내용) 변경 + tag = re.sub(r'>([^<]*)', f'>{pattern}', tag) + return tag + + numbering_content = re.sub( + rf']*level="{level}"[^>]*>.*?', + replace_parahead, + numbering_content + ) + + new_content = match.group(1) + numbering_content + match.group(3) + print(" [INFO] numbering id=1 패턴 교체 완료 (제^1장, ^1.^2, ^1.^2.^3...)") + return content.replace(match.group(0), new_content) + + def _adjust_tables(self, content: str) -> str: + """표 셀 크기 자동 조정 + + 1. 행 높이: 최소 800 hwpunit (내용 잘림 방지) + 2. 열 너비: 표 전체 너비를 열 개수로 균등 분배 (또는 첫 열 좁게) + """ + + def adjust_table(match): + tbl = match.group(0) + + # 표 전체 너비 추출 + sz_match = re.search(r' 1 else table_width + + # 행 높이 최소값 설정 + min_height = 800 # 약 8mm + + # 셀 크기 조정 + col_idx = [0] # closure용 + + def adjust_cell_sz(cell_match): + width = int(cell_match.group(1)) + height = int(cell_match.group(2)) + + # 높이 조정 + new_height = max(height, min_height) + + return f'' + + tbl = re.sub( + r'', + adjust_cell_sz, + tbl + ) + + return tbl + + return re.sub(r']*>.*?', adjust_table, content, flags=re.DOTALL) + + def _inject_section_styles(self, role_positions: Dict[str, List[tuple]]): + """section*.xml에 styleIDRef 매핑 (텍스트 매칭 방식)""" + contents_dir = self.temp_dir / "Contents" + + # 🔍 디버그: role_to_style_id 확인 + print(f" [DEBUG] role_to_style_id: {self.role_to_style_id}") + + # section 파일들 찾기 + section_files = sorted(contents_dir.glob("section*.xml")) + print(f" [DEBUG] section files: {[f.name for f in section_files]}") + + total_modified = 0 + + for section_file in section_files: + print(f" [DEBUG] Processing: {section_file.name}") + original_content = section_file.read_text(encoding='utf-8') + print(f" [DEBUG] File size: {len(original_content)} bytes") + + content = original_content # 작업용 복사본 + + # 🆕 머리말/꼬리말 영역 보존 (placeholder로 교체) + header_footer_map = {} + placeholder_idx = 0 + + def 
save_header_footer(match): + nonlocal placeholder_idx + key = f"__HF_PLACEHOLDER_{placeholder_idx}__" + header_footer_map[key] = match.group(0) + placeholder_idx += 1 + return key + + # 머리말/꼬리말 임시 교체 + content = re.sub(r']*>.*?', save_header_footer, content, flags=re.DOTALL) + content = re.sub(r']*>.*?', save_header_footer, content, flags=re.DOTALL) + + # 모든 태그와 내부 텍스트 추출 + para_pattern = r'(]*>)(.*?)()' + + section_modified = 0 + + def replace_style(match): + nonlocal total_modified, section_modified + open_tag = match.group(1) + inner = match.group(2) + close_tag = match.group(3) + + # 텍스트 추출 (태그 제거) + text = re.sub(r'<[^>]+>', '', inner).strip() + if not text: + return match.group(0) + + # 텍스트 앞부분으로 역할 판단 + text_start = text[:50] # 처음 50자로 판단 + + matched_role = None + matched_style_id = None + matched_para_id = None + matched_char_id = None + + # 제목 패턴 매칭 (앞에 특수문자 허용) + # Unicode: ■\u25a0 ▸\u25b8 ◆\u25c6 ▶\u25b6 ●\u25cf ○\u25cb ▪\u25aa ►\u25ba ☞\u261e ★\u2605 ※\u203b ·\u00b7 + prefix = r'^[\u25a0\u25b8\u25c6\u25b6\u25cf\u25cb\u25aa\u25ba\u261e\u2605\u203b\u00b7\s]*' + + # 🆕 FIGURE_CAPTION: "[그림 1-1]", "[그림 1-2]" 등 (가장 먼저 체크!) + # 그림 = \uadf8\ub9bc + if re.match(r'^\[\uadf8\ub9bc\s*[\d-]+\]', text_start): + matched_role = 'FIGURE_CAPTION' + # 🆕 TABLE_CAPTION: "<표 1-1>", "[표 1-1]" 등 + # 표 = \ud45c + elif re.match(r'^[<\[]\ud45c\s*[\d-]+[>\]]', text_start): + matched_role = 'TABLE_CAPTION' + # H1: "제1장", "1 개요" 등 + elif re.match(prefix + r'\uc81c?\s*\d+\uc7a5?\s', text_start) or re.match(prefix + r'[1-9]\s+[\uac00-\ud7a3]', text_start): + matched_role = 'H1' + # H3: "1.1.1 " (H2보다 먼저 체크!) + elif re.match(prefix + r'\d+\.\d+\.\d+\s', text_start): + matched_role = 'H3' + # H2: "1.1 " + elif re.match(prefix + r'\d+\.\d+\s', text_start): + matched_role = 'H2' + # H4: "가. 
" + elif re.match(prefix + r'[\uac00-\ud7a3]\.\s', text_start): + matched_role = 'H4' + # H5: "1) " + elif re.match(prefix + r'\d+\)\s', text_start): + matched_role = 'H5' + # H6: "(1) " 또는 "가) " + elif re.match(prefix + r'\(\d+\)\s', text_start): + matched_role = 'H6' + elif re.match(prefix + r'[\uac00-\ud7a3]\)\s', text_start): + matched_role = 'H6' + # LIST_ITEM: "○ ", "● ", "• " 등 + elif re.match(r'^[\u25cb\u25cf\u25e6\u2022\u2023\u25b8]\s', text_start): + matched_role = 'LIST_ITEM' + elif re.match(r'^[-\u2013\u2014]\s', text_start): + matched_role = 'LIST_ITEM' + + # 매칭된 역할이 있고 스타일 ID가 있으면 적용 + if matched_role and matched_role in self.role_to_style_id: + matched_style_id = self.role_to_style_id[matched_role] + matched_para_id = self.role_to_para_id[matched_role] + matched_char_id = self.role_to_char_id[matched_role] + elif 'BODY' in self.role_to_style_id and len(text) > 20: + # 긴 텍스트는 본문으로 간주 + matched_role = 'BODY' + matched_style_id = self.role_to_style_id['BODY'] + matched_para_id = self.role_to_para_id['BODY'] + matched_char_id = self.role_to_char_id['BODY'] + + if matched_style_id: + # 1. hp:p 태그의 styleIDRef 변경 + if 'styleIDRef="' in open_tag: + new_open = re.sub(r'styleIDRef="[^"]*"', f'styleIDRef="{matched_style_id}"', open_tag) + else: + new_open = open_tag.replace(']*charPrIDRef=")[^"]*(")', f'\\g<1>{matched_char_id}\\2', inner) + + # 🆕 4. 개요 문단이면 수동 번호 제거 (자동 번호가 붙으니까!) 
+ if matched_role in ROLE_STYLES and ROLE_STYLES[matched_role].outline_level >= 0: + new_inner = self._remove_manual_numbering(new_inner, matched_role) + + total_modified += 1 + section_modified += 1 + return new_open + new_inner + close_tag + + return match.group(0) + + new_content = re.sub(para_pattern, replace_style, content, flags=re.DOTALL) + + # 🆕 표 크기 자동 조정 + new_content = self._adjust_tables(new_content) + + # 🆕 outlineShapeIDRef를 1로 변경 (우리가 교체한 numbering id=1 사용) + new_content = re.sub( + r'outlineShapeIDRef="[^"]*"', + 'outlineShapeIDRef="1"', + new_content + ) + + + # 🆕 머리말/꼬리말 복원 + for key, original in header_footer_map.items(): + new_content = new_content.replace(key, original) + + print(f" [DEBUG] {section_file.name}: {section_modified} paras modified, content changed: {new_content != original_content}") + + if new_content != original_content: + section_file.write_text(new_content, encoding='utf-8') + print(f" -> {section_file.name} saved") + + print(f" -> Total {total_modified} paragraphs styled") + + def _update_para_style(self, content: str, para_idx: int, style_id: int) -> str: + """특정 인덱스의 문단 styleIDRef 변경""" + # 태그들 찾기 + pattern = r']*>' + matches = list(re.finditer(pattern, content)) + + if para_idx >= len(matches): + return content + + match = matches[para_idx] + old_tag = match.group(0) + + # styleIDRef 속성 변경 또는 추가 + if 'styleIDRef=' in old_tag: + new_tag = re.sub(r'styleIDRef="[^"]*"', f'styleIDRef="{style_id}"', old_tag) + else: + # 속성 추가 + new_tag = old_tag.replace(' str: + """🆕 개요 문단에서 수동 번호 제거 (자동 번호가 붙으니까!) + + HTML에서 "제1장 DX 개요" → "DX 개요" (자동으로 "제1장" 붙음) + HTML에서 "1.1 측량 DX" → "측량 DX" (자동으로 "1.1" 붙음) + """ + # 역할별 번호 패턴 + patterns = { + 'H1': r'^(제\s*\d+\s*장\s*)', # "제1장 " → 제거 + 'H2': r'^(\d+\.\d+\s+)', # "1.1 " → 제거 + 'H3': r'^(\d+\.\d+\.\d+\s+)', # "1.1.1 " → 제거 + 'H4': r'^([가-힣]\.\s+)', # "가. 
" → 제거 + 'H5': r'^(\d+\)\s+)', # "1) " → 제거 + 'H6': r'^([가-힣]\)\s+|\(\d+\)\s+)', # "가) " 또는 "(1) " → 제거 + 'H7': r'^([①②③④⑤⑥⑦⑧⑨⑩]+\s*)', # "① " → 제거 + } + + if role not in patterns: + return inner + + pattern = patterns[role] + + # 태그 내 텍스트에서 번호 제거 + def remove_number(match): + text = match.group(1) + # 첫 번째 내용에서만 번호 제거 + new_text = re.sub(pattern, '', text, count=1) + return f'{new_text}' + + # 첫 번째 hp:t 태그만 처리 + new_inner = re.sub(r'([^<]*)', remove_number, inner, count=1) + + return new_inner + + def _repack_hwpx(self, output_path: str): + """HWPX 재압축""" + print(f" [DEBUG] Repacking to: {output_path}") + print(f" [DEBUG] Source dir: {self.temp_dir}") + + # 압축 전 section 파일 크기 확인 + for sec in ['section0.xml', 'section1.xml', 'section2.xml']: + sec_path = self.temp_dir / "Contents" / sec + if sec_path.exists(): + print(f" [DEBUG] {sec} size before zip: {sec_path.stat().st_size} bytes") + + # 🆕 임시 파일에 먼저 저장 (원본 파일 잠금 문제 회피) + temp_output = output_path + ".tmp" + + with zipfile.ZipFile(temp_output, 'w', zipfile.ZIP_DEFLATED) as zf: + # mimetype은 압축 없이 첫 번째로 + mimetype_path = self.temp_dir / "mimetype" + if mimetype_path.exists(): + zf.write(mimetype_path, "mimetype", compress_type=zipfile.ZIP_STORED) + + # 나머지 파일들 + file_count = 0 + for root, dirs, files in os.walk(self.temp_dir): + for file in files: + if file == "mimetype": + continue + file_path = Path(root) / file + arcname = file_path.relative_to(self.temp_dir) + zf.write(file_path, arcname) + file_count += 1 + + print(f" [DEBUG] Total files zipped: {file_count}") + + # 🆕 원본 삭제 후 임시 파일을 원본 이름으로 변경 + import time + for attempt in range(3): + try: + if os.path.exists(output_path): + os.remove(output_path) + os.rename(temp_output, output_path) + break + except PermissionError: + print(f" [DEBUG] 파일 잠금 대기 중... 
({attempt + 1}/3)") + time.sleep(0.5) + else: + # 3번 시도 실패 시 임시 파일 이름으로 유지 + print(f" [경고] 원본 덮어쓰기 실패, 임시 파일 사용: {temp_output}") + output_path = temp_output + + # 압축 후 결과 확인 + print(f" [DEBUG] Output file size: {Path(output_path).stat().st_size} bytes") + + +def inject_styles_to_hwpx(hwpx_path: str, elements: list) -> str: + """ + 편의 함수: StyledElement 리스트로부터 역할 위치 추출 후 스타일 주입 + + Args: + hwpx_path: HWPX 파일 경로 + elements: StyleAnalyzer의 StyledElement 리스트 + + Returns: + 수정된 HWPX 파일 경로 + """ + # 역할별 위치 수집 + # 참고: 현재는 section 0, para 순서대로 가정 + role_positions: Dict[str, List[tuple]] = {} + + for idx, elem in enumerate(elements): + role = elem.role + if role not in role_positions: + role_positions[role] = [] + # (section_idx, para_idx) - 현재는 section 0 가정 + role_positions[role].append((0, idx)) + + injector = HwpxStyleInjector() + return injector.inject(hwpx_path, role_positions) + + +# 테스트 +if __name__ == "__main__": + # 테스트용 + test_positions = { + 'H1': [(0, 0), (0, 5)], + 'H2': [(0, 1), (0, 6)], + 'BODY': [(0, 2), (0, 3), (0, 4)], + } + + # injector = HwpxStyleInjector() + # injector.inject("test.hwpx", test_positions) + print("HwpxStyleInjector 모듈 로드 완료") \ No newline at end of file diff --git a/03. Code/geulbeot_10th/converters/hwpx_table_injector.py b/03. Code/geulbeot_10th/converters/hwpx_table_injector.py new file mode 100644 index 0000000..fb6b6da --- /dev/null +++ b/03. Code/geulbeot_10th/converters/hwpx_table_injector.py @@ -0,0 +1,174 @@ +# -*- coding: utf-8 -*- +""" +HWPX 표 열 너비 수정기 v2 +표 생성 후 HWPX 파일을 직접 수정하여 열 너비 적용 +""" + +import zipfile +import re +from pathlib import Path +import tempfile +import shutil + +# mm → HWPML 단위 변환 (1mm ≈ 283.46 HWPML units) +MM_TO_HWPML = 7200 / 25.4 # ≈ 283.46 + + +def inject_table_widths(hwpx_path: str, table_widths_list: list): + """ + HWPX 파일의 표 열 너비를 수정 + + Args: + hwpx_path: HWPX 파일 경로 + table_widths_list: [[w1, w2, w3], [w1, w2], ...] 
형태 (mm 단위) + """ + if not table_widths_list: + print(" [INFO] 수정할 표 없음") + return + + print(f"📐 HWPX 표 열 너비 수정 시작... ({len(table_widths_list)}개 표)") + + # HWPX 압축 해제 + temp_dir = Path(tempfile.mkdtemp(prefix="hwpx_table_")) + + with zipfile.ZipFile(hwpx_path, 'r') as zf: + zf.extractall(temp_dir) + + # section*.xml 파일들에서 표 찾기 + contents_dir = temp_dir / "Contents" + + table_idx = 0 + total_modified = 0 + + for section_file in sorted(contents_dir.glob("section*.xml")): + with open(section_file, 'r', encoding='utf-8') as f: + content = f.read() + + original_content = content + + # 모든 표(...) 찾기 + tbl_pattern = re.compile(r'(]*>)(.*?)()', re.DOTALL) + + def process_table(match): + nonlocal table_idx, total_modified + + if table_idx >= len(table_widths_list): + return match.group(0) + + tbl_open = match.group(1) + tbl_content = match.group(2) + tbl_close = match.group(3) + + col_widths_mm = table_widths_list[table_idx] + col_widths_hwpml = [int(w * MM_TO_HWPML) for w in col_widths_mm] + + # 표 전체 너비 수정 (hp:sz width="...") + total_width = int(sum(col_widths_mm) * MM_TO_HWPML) + tbl_content = re.sub( + r'(= len(col_widths_hwpml): + return tc_content + + new_width = col_widths_hwpml[col_idx] + + # cellSz width 교체 + tc_content = re.sub( + r'(... 
블록 처리 + tbl_content = re.sub( + r']*>.*?', + replace_cell_width, + tbl_content, + flags=re.DOTALL + ) + + print(f" ✅ 표 #{table_idx + 1}: {col_widths_mm} mm → HWPML 적용") + table_idx += 1 + total_modified += 1 + + return tbl_open + tbl_content + tbl_close + + # 표 처리 + new_content = tbl_pattern.sub(process_table, content) + + # 변경사항 있으면 저장 + if new_content != original_content: + with open(section_file, 'w', encoding='utf-8') as f: + f.write(new_content) + print(f" → {section_file.name} 저장됨") + + # 다시 압축 + repack_hwpx(temp_dir, hwpx_path) + + # 임시 폴더 삭제 + shutil.rmtree(temp_dir) + + print(f" ✅ 총 {total_modified}개 표 열 너비 수정 완료") + + +def repack_hwpx(source_dir: Path, output_path: str): + """HWPX 파일 다시 압축""" + import os + import time + + temp_output = output_path + ".tmp" + + with zipfile.ZipFile(temp_output, 'w', zipfile.ZIP_DEFLATED) as zf: + # mimetype은 압축 없이 첫 번째로 + mimetype_path = source_dir / "mimetype" + if mimetype_path.exists(): + zf.write(mimetype_path, "mimetype", compress_type=zipfile.ZIP_STORED) + + # 나머지 파일들 + for root, dirs, files in os.walk(source_dir): + for file in files: + if file == "mimetype": + continue + file_path = Path(root) / file + arcname = file_path.relative_to(source_dir) + zf.write(file_path, arcname) + + # 원본 교체 + for attempt in range(3): + try: + if os.path.exists(output_path): + os.remove(output_path) + os.rename(temp_output, output_path) + break + except PermissionError: + time.sleep(0.5) + + +# 테스트용 +if __name__ == "__main__": + test_widths = [ + [18.2, 38.9, 42.8, 70.1], + [19.9, 79.6, 70.5], + [28.7, 81.4, 59.9], + [19.2, 61.4, 89.5], + ] + + hwpx_path = r"C:\Users\User\AppData\Local\Temp\geulbeot_output.hwpx" + inject_table_widths(hwpx_path, test_widths) \ No newline at end of file diff --git a/03. Code/geulbeot_10th/converters/pipeline/__init__.py b/03. Code/geulbeot_10th/converters/pipeline/__init__.py new file mode 100644 index 0000000..d698245 --- /dev/null +++ b/03. 
def count_characters(html_content: str) -> int:
    """Return the number of visible text characters in an HTML string.

    Tags are removed, character references such as ``&amp;`` or ``&nbsp;``
    are decoded to the single character they stand for, and every run of
    whitespace is collapsed to one space before counting.

    Args:
        html_content: HTML markup to measure.

    Returns:
        Length of the cleaned text.
    """
    import html  # function-scope import: only this helper needs it

    # Strip tags first so that escaped markup like "&lt;b&gt;" is not
    # mistaken for a real tag after decoding.
    text_only = re.sub(r'<[^>]+>', '', html_content)
    text_only = html.unescape(text_only)
    # str.split() with no argument also splits on non-breaking spaces.
    text_only = ' '.join(text_only.split())
    return len(text_only)


def is_long_document(html_content: str) -> bool:
    """Return True when the text length reaches ``LONG_DOC_THRESHOLD``."""
    return count_characters(html_content) >= LONG_DOC_THRESHOLD
def run_long_pipeline(html_content: str, options: dict) -> Dict[str, Any]:
    """Handle a long document (at or above the length threshold).

    The HTML is passed through with its image paths rewritten. When
    ``options['folder_path']`` is set, the result additionally carries
    ``needs_pipeline=True`` so the caller can branch into the multi-step
    report pipeline, which is driven elsewhere (see the API endpoints named
    in the inline comment).

    Args:
        html_content: Source HTML.
        options: Request options; only ``folder_path`` is read here.

    Returns:
        A result dict with ``success``, ``pipeline``, ``char_count`` and
        ``html``; on any exception a dict with ``success=False`` and the
        error message.
    """
    try:
        processed_html = convert_image_paths(html_content)

        result = {
            'success': True,
            'pipeline': 'long',
            'char_count': count_characters(html_content),
            'html': processed_html,
        }

        # The pipeline itself runs via /api/generate-toc →
        # /api/generate-report-from-toc; the router only flags the need.
        if options.get('folder_path', ''):
            result['needs_pipeline'] = True

        return result

    except Exception as e:
        return {'success': False, 'error': str(e), 'pipeline': 'long'}
def _ensure_ffmpeg_on_path(self):
    """Make an ``ffmpeg`` executable resolvable through ``PATH``.

    If ``ffmpeg`` is already found it is recorded in ``self.ffmpeg_exe``.
    Otherwise the binary reported by ``imageio_ffmpeg`` is copied to
    ``<output_dir>/tools_ffmpeg/ffmpeg.exe`` and that directory is prepended
    to ``PATH``.

    Returns:
        True when ``ffmpeg`` can be resolved afterwards, else False.
    """
    already_there = shutil.which("ffmpeg")
    self._dbg(f"DEBUG ffmpeg which before: {already_there}")
    if already_there:
        self.ffmpeg_exe = already_there
        return True

    try:
        import imageio_ffmpeg

        bundled = Path(imageio_ffmpeg.get_ffmpeg_exe())
        self._dbg(f"DEBUG imageio ffmpeg exe: {bundled}")
        self._dbg(f"DEBUG imageio ffmpeg exists: {bundled.exists()}")
        if not bundled.exists():
            return False

        tools_dir = Path(self.output_dir) / "tools_ffmpeg"
        tools_dir.mkdir(parents=True, exist_ok=True)

        # Copy under the plain name so a PATH lookup for "ffmpeg" matches.
        target = tools_dir / "ffmpeg.exe"
        if not target.exists():
            shutil.copyfile(str(bundled), str(target))

        os.environ["PATH"] = os.pathsep.join(
            [str(tools_dir), os.environ.get("PATH", "")]
        )

        resolved = shutil.which("ffmpeg")
        self._dbg(f"DEBUG ffmpeg which after: {resolved}")
        if not resolved:
            return False

        self.ffmpeg_exe = resolved
        return True

    except Exception as e:
        self._dbg(f"DEBUG ensure ffmpeg error: {e}")
        return False
def extract_audio_from_video(self, video_path, audio_output_path):
    """Extract the audio track of a video into a WAV file with ffmpeg.

    The command requests no video stream, 16-bit PCM, a 16 kHz sample rate
    and one channel, and overwrites an existing output file.

    Args:
        video_path: Input video file.
        audio_output_path: WAV file to create.

    Returns:
        True if ffmpeg exited successfully, False on any failure (ffmpeg
        missing, non-zero exit status, timeout, ...).
    """
    try:
        # Use the binary already resolved by _ensure_ffmpeg_on_path() so a
        # system-wide ffmpeg works without imageio_ffmpeg installed; fall
        # back to the bundled one only when nothing was resolved.
        ffmpeg_exe = getattr(self, "ffmpeg_exe", None)
        if not ffmpeg_exe:
            import imageio_ffmpeg
            ffmpeg_exe = imageio_ffmpeg.get_ffmpeg_exe()

        self._dbg(f"DEBUG extract ffmpeg_exe: {ffmpeg_exe}")
        self._dbg(f"DEBUG extract ffmpeg_exe exists: {Path(ffmpeg_exe).exists()}")
        self._dbg(f"DEBUG extract input exists: {Path(video_path).exists()}")
        self._dbg(f"DEBUG extract out path: {audio_output_path}")

        cmd = [
            str(ffmpeg_exe),
            "-i", str(video_path),
            "-vn",
            "-acodec", "pcm_s16le",
            "-ar", "16000",
            "-ac", "1",
            "-y",
            str(audio_output_path),
        ]
        self._dbg("DEBUG extract cmd: " + " ".join(cmd))

        # errors="replace": the captured output is only used for debugging,
        # so an undecodable byte must not turn a successful run into a
        # failure.
        result = subprocess.run(
            cmd,
            capture_output=True,
            timeout=300,
            check=True,
            text=True,
            encoding="utf-8",
            errors="replace",
        )
        self._dbg(f"DEBUG extract returncode: {result.returncode}")
        self._dbg(f"DEBUG extract stderr tail: {(result.stderr or '')[-300:]}")
        return True

    except subprocess.CalledProcessError as e:
        self._dbg(f"DEBUG extract CalledProcessError returncode: {e.returncode}")
        self._dbg(f"DEBUG extract stderr tail: {(e.stderr or '')[-300:]}")
        return False
    except subprocess.TimeoutExpired as e:
        self._dbg(f"DEBUG extract timeout: {e}")
        return False
    except Exception as e:
        self._dbg(f"DEBUG extract exception: {e}")
        return False
def clean_transcript(self, text):
    """Post-process a Whisper transcript.

    Removes runs of three or more Latin letters, kana and CJK ideographs,
    drops repeated sentences (split on ``.``) while keeping the first
    occurrence, and normalises whitespace.

    Args:
        text: Raw transcript.

    Returns:
        The cleaned transcript.
    """
    import re

    # Latin words, Japanese kana, CJK ideographs - in that order.
    unwanted_patterns = (
        r'[A-Za-z]{3,}',
        r'[\u3040-\u309F\u30A0-\u30FF]+',
        r'[\u4E00-\u9FFF]+',
    )
    for pattern in unwanted_patterns:
        text = re.sub(pattern, '', text)

    # dict keys keep insertion order, which gives first-seen de-duplication.
    fragments = (piece.strip() for piece in text.split('.'))
    distinct = dict.fromkeys(fragment for fragment in fragments if fragment)

    return re.sub(r'\s+', ' ', '. '.join(distinct)).strip()
def convert_dwg_to_pdf_trueview(self, dwg_path, pdf_path):
    """Convert a DWG file to PDF by running DWG TrueView with a script.

    A temporary ``<stem>_plot.scr`` script is written next to the drawing,
    TrueView is launched with it, and success is decided by whether
    ``pdf_path`` exists afterwards. The script file is removed on every exit
    path, including timeouts and errors.

    Args:
        dwg_path: Drawing to convert (``pathlib.Path``).
        pdf_path: Expected PDF output (``pathlib.Path``).

    Returns:
        ``(success, message)``.
    """
    if not self.trueview_path:
        return False, "DWG TrueView가 설치되지 않음"

    script_path = None
    try:
        # Script handed to TrueView via /b.
        script_content = f"""_-EXPORT_PDF{pdf_path}_Y"""
        script_path = dwg_path.parent / f"{dwg_path.stem}_plot.scr"
        with open(script_path, 'w') as f:
            f.write(script_content)

        cmd = [
            self.trueview_path,
            str(dwg_path.absolute()),
            "/b", str(script_path.absolute()),
            "/nologo"
        ]

        subprocess.run(cmd, timeout=120, capture_output=True)

        # The exit status is not inspected; the PDF's presence decides.
        if pdf_path.exists():
            return True, "성공"
        return False, "PDF 생성 실패"

    except subprocess.TimeoutExpired:
        return False, "변환 시간 초과"
    except Exception as e:
        return False, f"DWG 변환 실패: {str(e)}"
    finally:
        # Best-effort cleanup so failed runs do not litter the source folder.
        if script_path is not None:
            try:
                script_path.unlink()
            except OSError:
                pass
def convert_word_to_pdf(self, file_path, output_path):
    """Convert a Word document to PDF through Word's COM automation.

    The document and the Word application are released in ``finally`` so a
    failure while opening or saving does not leave a hidden Word process
    behind.

    Args:
        file_path: Source document (``pathlib.Path``).
        output_path: PDF to write (``pathlib.Path``).

    Returns:
        ``(success, message)``.
    """
    word = None
    doc = None
    try:
        word = win32com.client.Dispatch("Word.Application")
        word.Visible = False
        doc = word.Documents.Open(str(file_path.absolute()))
        doc.SaveAs(str(output_path.absolute()), FileFormat=17)  # 17 = PDF
        return True, "성공"
    except Exception as e:
        return False, f"Word 변환 실패: {str(e)}"
    finally:
        # Cleanup is best effort: an error here must not mask the result.
        if doc is not None:
            try:
                doc.Close()
            except Exception:
                pass
        if word is not None:
            try:
                word.Quit()
            except Exception:
                pass
def convert_ppt_to_pdf(self, file_path, output_path):
    """Convert a PowerPoint file to PDF through PowerPoint's COM automation.

    The presentation and the PowerPoint application are released in
    ``finally`` so a failure while opening or saving does not leave a
    PowerPoint process behind.

    Args:
        file_path: Source presentation (``pathlib.Path``).
        output_path: PDF to write (``pathlib.Path``).

    Returns:
        ``(success, message)``.
    """
    ppt = None
    presentation = None
    try:
        ppt = win32com.client.Dispatch("PowerPoint.Application")
        ppt.Visible = True
        presentation = ppt.Presentations.Open(str(file_path.absolute()))
        presentation.SaveAs(str(output_path.absolute()), 32)  # 32 = PDF
        return True, "성공"
    except Exception as e:
        return False, f"PowerPoint 변환 실패: {str(e)}"
    finally:
        # Cleanup is best effort: an error here must not mask the result.
        if presentation is not None:
            try:
                presentation.Close()
            except Exception:
                pass
        if ppt is not None:
            try:
                ppt.Quit()
            except Exception:
                pass
def convert_text_to_pdf(self, text_path, output_path):
    """Render a UTF-8 text file into an A4 PDF with reportlab.

    Lines are wrapped to the printable width using the measured glyph widths
    of the active font, and a new page is started whenever the next drawn
    line would fall below the bottom margin - including the continuation
    lines produced by wrapping.

    Args:
        text_path: Text file to read (undecodable bytes are ignored).
        output_path: PDF file to create.

    Returns:
        ``(success, message)``.
    """
    try:
        from reportlab.lib.pagesizes import A4
        from reportlab.pdfgen import canvas
        from reportlab.pdfbase import pdfmetrics
        from reportlab.pdfbase.ttfonts import TTFont

        # Try the Malgun font for Korean text; fall back to a built-in font.
        try:
            pdfmetrics.registerFont(TTFont('Malgun', 'malgun.ttf'))
            font_name = 'Malgun'
        except Exception:
            font_name = 'Helvetica'

        with open(text_path, 'r', encoding='utf-8', errors='ignore') as f:
            content = f.read()

        c = canvas.Canvas(str(output_path), pagesize=A4)
        width, height = A4

        font_size = 10
        margin = 50
        line_height = 14
        max_width = width - 2 * margin

        def wrap(line):
            """Split *line* into pieces that fit within ``max_width``."""
            pieces = []
            current = []
            current_width = 0.0
            for ch in line:
                ch_width = pdfmetrics.stringWidth(ch, font_name, font_size)
                if current and current_width + ch_width > max_width:
                    pieces.append(''.join(current))
                    current = []
                    current_width = 0.0
                current.append(ch)
                current_width += ch_width
            # Also emits one empty piece for an empty line, so blank lines
            # still take up vertical space.
            pieces.append(''.join(current))
            return pieces

        c.setFont(font_name, font_size)
        y = height - margin

        for line in content.split('\n'):
            for piece in wrap(line):
                if y < margin:  # page break, checked for every drawn line
                    c.showPage()
                    c.setFont(font_name, font_size)
                    y = height - margin
                c.drawString(margin, y, piece)
                y -= line_height

        c.save()
        return True, "성공"
    except Exception as e:
        return False, f"텍스트 변환 실패: {str(e)}"
= wb.active + ws.title = "변환 결과" + + # 헤더 스타일 + header_fill = PatternFill(start_color="366092", end_color="366092", fill_type="solid") + header_font = Font(bold=True, color="FFFFFF") + + # 헤더 작성 + headers = ['번호', '원본 경로', '파일명', '파일 형식', '변환 PDF 경로', '상태', '메시지', '처리 시간'] + for col, header in enumerate(headers, 1): + cell = ws.cell(row=1, column=col, value=header) + cell.fill = header_fill + cell.font = header_font + cell.alignment = Alignment(horizontal='center', vertical='center') + + # 데이터 작성 + for idx, log in enumerate(self.conversion_log, 2): + ws.cell(row=idx, column=1, value=idx-1) + ws.cell(row=idx, column=2, value=log['원본 경로']) + ws.cell(row=idx, column=3, value=log['파일명']) + ws.cell(row=idx, column=4, value=log['파일 형식']) + ws.cell(row=idx, column=5, value=log['변환 PDF 경로']) + + # 상태에 따라 색상 표시 + status_cell = ws.cell(row=idx, column=6, value=log['상태']) + if log['상태'] == "성공": + status_cell.fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid") + status_cell.font = Font(color="006100") + else: + status_cell.fill = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid") + status_cell.font = Font(color="9C0006") + + ws.cell(row=idx, column=7, value=log['메시지']) + ws.cell(row=idx, column=8, value=log['처리 시간']) + + # 열 너비 자동 조정 + for column in ws.columns: + max_length = 0 + column_letter = column[0].column_letter + for cell in column: + try: + if len(str(cell.value)) > max_length: + max_length = len(str(cell.value)) + except: + pass + adjusted_width = min(max_length + 2, 50) + ws.column_dimensions[column_letter].width = adjusted_width + + # 요약 시트 추가 + summary_ws = wb.create_sheet(title="요약") + + total_files = len(self.conversion_log) + success_count = sum(1 for log in self.conversion_log if log['상태'] == "성공") + fail_count = total_files - success_count + + summary_data = [ + ['항목', '값'], + ['총 파일 수', total_files], + ['변환 성공', success_count], + ['변환 실패', fail_count], + ['성공률', f"{(success_count/total_files*100):.1f}%" if 
total_files > 0 else "0%"], + ['', ''], + ['원본 폴더', str(self.source_dir)], + ['출력 폴더', str(self.output_dir)], + ['작업 완료 시간', datetime.now().strftime('%Y-%m-%d %H:%M:%S')] + ] + + for row_idx, row_data in enumerate(summary_data, 1): + for col_idx, value in enumerate(row_data, 1): + cell = summary_ws.cell(row=row_idx, column=col_idx, value=value) + if row_idx == 1: + cell.fill = header_fill + cell.font = header_font + cell.alignment = Alignment(horizontal='center' if col_idx == 1 else 'left') + + summary_ws.column_dimensions['A'].width = 20 + summary_ws.column_dimensions['B'].width = 60 + + # 저장 + wb.save(excel_path) + print(f"\n엑셀 보고서 생성 완료: {excel_path}") + + def run(self): + """전체 변환 작업 실행""" + print(f"작업 시작: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") + print(f"원본 폴더: {self.source_dir}") + print(f"출력 폴더: {self.output_dir}") + + # DWG TrueView 확인 + if self.trueview_path: + print(f"DWG TrueView 발견: {self.trueview_path}") + else: + print("경고: DWG TrueView를 찾을 수 없습니다. DWG 파일 변환이 불가능합니다.") + + print("-" * 80) + + # 모든 파일 가져오기 + all_files = self.get_all_files() + total_files = len(all_files) + + # ★ 파일 분류: 동영상 vs 나머지 + video_files = [] + other_files = [] + + for file_path in all_files: + if file_path.suffix.lower() in self.video_extensions: + video_files.append(file_path) + else: + other_files.append(file_path) + + print(f"\n총 {total_files}개 파일 발견") + print(f" - 문서/이미지 등: {len(other_files)}개") + print(f" - 동영상: {len(video_files)}개") + print("\n[1단계] 문서 파일 변환 시작...\n") + + # ★ 1단계: 문서 파일 먼저 처리 + for idx, file_path in enumerate(other_files, 1): + print(f"[{idx}/{len(other_files)}] {file_path.name} 처리 중...", end=' ') + success, message = self.process_file(file_path) + print(f"{'✓' if success else '✗'} {message}") + + # ★ 2단계: domain.txt 로드 + domain_path = self.source_dir.parent / "domain.txt" # D:\for python\테스트 중(측량)\domain.txt + if domain_path.exists(): + self.domain_terms = domain_path.read_text(encoding='utf-8') + print(f"\n[2단계] 도메인 용어 사전 로드 완료: {domain_path}") 
+ print(f" - 용어 수: 약 {len(self.domain_terms.split())}개 단어") + else: + print(f"\n[2단계] 도메인 용어 사전 없음: {domain_path}") + print(" - 기본 음성 인식으로 진행합니다.") + + # ★ 3단계: 동영상 파일 처리 + if video_files: + print(f"\n[3단계] 동영상 음성 인식 시작...\n") + for idx, file_path in enumerate(video_files, 1): + print(f"[{idx}/{len(video_files)}] {file_path.name} 처리 중...", end=' ') + success, message = self.process_file(file_path) + print(f"{'✓' if success else '✗'} {message}") + + # 엑셀 보고서 생성 + excel_path = self.output_dir / f"변환_결과_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx" + self.create_excel_report(excel_path) + + # 최종 요약 + success_count = sum(1 for log in self.conversion_log if log['상태'] == "성공") + print("\n" + "=" * 80) + print(f"작업 완료!") + print(f"총 파일: {total_files}개") + print(f"성공: {success_count}개") + print(f"실패: {total_files - success_count}개") + print(f"성공률: {(success_count/total_files*100):.1f}%" if total_files > 0 else "0%") + print("=" * 80) + +if __name__ == "__main__": + # 경로 설정 + SOURCE_DIR = r"D:\for python\geulbeot-light\geulbeot-light\00.test\hwpx\in" + OUTPUT_DIR = r"D:\for python\geulbeot-light\geulbeot-light\00.test\hwpx\out" + + # 변환기 실행 + converter = SurveyingFileConverter(SOURCE_DIR, OUTPUT_DIR) + converter.run() \ No newline at end of file diff --git a/03. Code/geulbeot_10th/converters/pipeline/step2_extract.py b/03. Code/geulbeot_10th/converters/pipeline/step2_extract.py new file mode 100644 index 0000000..9e9554f --- /dev/null +++ b/03. 
Code/geulbeot_10th/converters/pipeline/step2_extract.py @@ -0,0 +1,789 @@ +# -*- coding: utf-8 -*- +""" +extract_1_v2.py + +PDF에서 텍스트(md)와 이미지(png)를 추출 +- 하위 폴더 구조 유지 +- 이미지 메타데이터 JSON 생성 (폴더경로, 파일명, 페이지, 위치, 캡션 등) +""" + +import fitz # PyMuPDF +import os +import re +import json +import numpy as np +from pathlib import Path +from datetime import datetime +from PIL import Image +import io + +# ===== OCR 설정 (선택적) ===== +try: + import pytesseract + pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe" + TESSERACT_AVAILABLE = True +except ImportError: + TESSERACT_AVAILABLE = False + print("[INFO] pytesseract 미설치 - 텍스트 잘림 필터 비활성화") + +# ===== 경로 설정 ===== +BASE_DIR = Path(r"D:\for python\geulbeot-light\geulbeot-light\00.test\hwpx\out") # PDF 원본 위치 +OUTPUT_BASE = Path(r"D:\for python\geulbeot-light\geulbeot-light\00.test\hwpx\out\out") # 출력 위치 + +CAPTION_PATTERN = re.compile( + r'^\s*(?:[<\[\(\{]\s*)?(그림|figure|fig)\s*\.?\s*(?:[<\[\(\{]\s*)?0*\d+(?:\s*[-–]\s*\d+)?', + re.IGNORECASE +) + + +def get_figure_rects(page): + """ + Identifies figure regions based on '<그림 N>' captions and vector drawings. 
+ Returns a list of dicts: {'rect': fitz.Rect, 'caption_block': block_index} + """ + drawings = page.get_drawings() + + blocks = page.get_text("blocks") + captions = [] + + for i, b in enumerate(blocks): + text = b[4] + if CAPTION_PATTERN.search(text): + captions.append({'rect': fitz.Rect(b[:4]), 'index': i, 'text': text, 'drawings': []}) + + if not captions: + return [] + + filtered_drawings_rects = [] + for d in drawings: + r = d["rect"] + if r.height > page.rect.height / 3 and r.width < 5: + continue + if r.width > page.rect.width * 0.9: + continue + filtered_drawings_rects.append(r) + + page_area = page.rect.get_area() + img_rects = [] + for b in page.get_text("dict")["blocks"]: + if b.get("type") == 1: + ir = fitz.Rect(b["bbox"]) + if ir.get_area() < page_area * 0.01: + continue + img_rects.append(ir) + + remaining_drawings = filtered_drawings_rects + img_rects + caption_clusters = {cap['index']: [cap['rect']] for cap in captions} + + def is_text_between(r1, r2, text_blocks): + if r1.intersects(r2): + return False + union = r1 | r2 + for b in text_blocks: + b_rect = fitz.Rect(b[:4]) + text_content = b[4] + if len(text_content.strip()) < 20: + continue + if not b_rect.intersects(union): + continue + if b_rect.intersects(r1) or b_rect.intersects(r2): + continue + return True + return False + + changed = True + while changed: + changed = False + to_remove = [] + + for d_rect in remaining_drawings: + best_cluster_key = None + min_dist = float('inf') + + for cap_index, cluster_rects in caption_clusters.items(): + for r in cluster_rects: + dist = 0 + if d_rect.intersects(r): + dist = 0 + else: + x_dist = 0 + if d_rect.x1 < r.x0: x_dist = r.x0 - d_rect.x1 + elif d_rect.x0 > r.x1: x_dist = d_rect.x0 - r.x1 + + y_dist = 0 + if d_rect.y1 < r.y0: y_dist = r.y0 - d_rect.y1 + elif d_rect.y0 > r.y1: y_dist = d_rect.y0 - r.y1 + + if x_dist < 150 and y_dist < 150: + dist = max(x_dist, y_dist) + 0.1 + else: + dist = float('inf') + + if dist < min_dist: + if not 
def pixmap_metrics(pix):
    """Compute simple content statistics for a rendered pixmap.

    Args:
        pix: Object exposing ``samples`` (raw bytes), ``height``, ``width``
            and ``alpha``; ``n`` (components per pixel) is used when present.

    Returns:
        ``(nonwhite_ratio, edge_ratio, var)`` where ``nonwhite_ratio`` is the
        share of pixels whose gray level is not above 245, ``edge_ratio`` the
        share of positions whose summed horizontal and vertical gray
        difference exceeds 40, and ``var`` the variance of the gray levels.

    Raises:
        ValueError: If the pixmap has neither one nor three colour channels.
    """
    # Read the real component count instead of guessing it from alpha, so
    # grayscale pixmaps reshape correctly.
    channels = getattr(pix, "n", None) or (4 if pix.alpha else 3)
    arr = np.frombuffer(pix.samples, dtype=np.uint8)
    arr = arr.reshape(pix.height, pix.width, channels)

    color_channels = channels - (1 if pix.alpha else 0)
    if color_channels == 1:
        gray = arr[:, :, 0]
    elif color_channels == 3:
        gray = (
            0.299 * arr[:, :, 0] + 0.587 * arr[:, :, 1] + 0.114 * arr[:, :, 2]
        ).astype(np.uint8)
    else:
        raise ValueError(
            f"unsupported pixmap layout: {channels} components per pixel"
        )

    if gray.size == 0:
        return 0.0, 0.0, 0.0

    white = gray > 245
    nonwhite_ratio = float(1.0 - white.mean())

    # int16 keeps the differences from wrapping around in uint8.
    gx = np.abs(np.diff(gray.astype(np.int16), axis=1))
    gy = np.abs(np.diff(gray.astype(np.int16), axis=0))
    edge = (gx[:-1, :] + gy[:, :-1]) > 40
    # A single row or column leaves no gradient positions; report no edges
    # rather than the NaN an empty mean would give.
    edge_ratio = float(edge.mean()) if edge.size else 0.0

    var = float(gray.var())
    return nonwhite_ratio, edge_ratio, var
있는 장식용 이미지인지 감지 + - 엣지가 적고 (복잡한 도표/사진이 아님) + - 색상 다양성이 낮으면 (단순 그라데이션 배경) + + Args: + pix: PyMuPDF Pixmap + edge_threshold: 엣지 비율 임계값 (기본 0.02 = 2%) + color_var_threshold: 색상 분산 임계값 + + Returns: + bool: 장식용 배경이면 True + """ + try: + nonwhite_ratio, edge_ratio, var = pixmap_metrics(pix) + + # 엣지가 거의 없고 (단순한 이미지) + # 색상 분산도 낮으면 (배경 패턴) + if edge_ratio < edge_threshold and var < color_var_threshold: + # 추가 확인: 텍스트만 있는지 OCR로 체크 + if TESSERACT_AVAILABLE: + try: + img = pix_to_pil(pix) + text = pytesseract.image_to_string(img, lang='kor+eng').strip() + + # 텍스트가 있고, 이미지가 단순하면 = 텍스트 배경 + if len(text) > 3 and edge_ratio < 0.015: + return True + except: + pass + + return True + + return False + + except Exception: + return False + + +def is_header_footer_region(rect, page_rect, height_threshold=0.12): + """ + 헤더/푸터 영역에 있는 이미지인지 감지 + - 페이지 상단 12% 또는 하단 12%에 위치 + - 높이가 낮은 strip 형태 + + Args: + rect: 이미지 영역 (fitz.Rect) + page_rect: 페이지 전체 영역 (fitz.Rect) + height_threshold: 헤더/푸터 영역 비율 (기본 12%) + + Returns: + bool: 헤더/푸터 영역이면 True + """ + page_height = page_rect.height + img_height = rect.height + + # 상단 영역 체크 + if rect.y0 < page_height * height_threshold: + # 높이가 페이지의 15% 미만인 strip이면 헤더 + if img_height < page_height * 0.15: + return True + + # 하단 영역 체크 + if rect.y1 > page_height * (1 - height_threshold): + # 높이가 페이지의 15% 미만인 strip이면 푸터 + if img_height < page_height * 0.15: + return True + + return False + + +def should_filter_image(pix, rect, page_rect): + """ + 이미지를 필터링해야 하는지 종합 판단 + + Args: + pix: PyMuPDF Pixmap + rect: 이미지 영역 + page_rect: 페이지 전체 영역 + + Returns: + tuple: (필터링 여부, 필터링 사유) + """ + # 1. 헤더/푸터 영역 체크 + if is_header_footer_region(rect, page_rect): + return True, "header_footer" + + # 2. 텍스트 잘림 체크 + if has_cut_text_at_boundary(pix): + return True, "cut_text" + + # 3. 
장식용 배경 체크 + if is_decorative_background(pix): + return True, "decorative_background" + + return False, None + + +def extract_pdf_content(pdf_path, output_md_path, img_dir, metadata): + """ + PDF 내용 추출 + + Args: + pdf_path: PDF 파일 경로 + output_md_path: 출력 MD 파일 경로 + img_dir: 이미지 저장 폴더 + metadata: 메타데이터 딕셔너리 (폴더 경로, 파일명 등) + + Returns: + image_metadata_list: 추출된 이미지들의 메타데이터 리스트 + """ + os.makedirs(img_dir, exist_ok=True) + + image_metadata_list = [] # ★ 이미지 메타데이터 수집 + + doc = fitz.open(pdf_path) + total_pages = len(doc) + + with open(output_md_path, "w", encoding="utf-8") as md_file: + # ★ 메타데이터 헤더 추가 + md_file.write(f"---\n") + md_file.write(f"source_pdf: {metadata['pdf_name']}\n") + md_file.write(f"source_folder: {metadata['relative_folder']}\n") + md_file.write(f"total_pages: {total_pages}\n") + md_file.write(f"extracted_at: {datetime.now().isoformat()}\n") + md_file.write(f"---\n\n") + md_file.write(f"# {metadata['pdf_name']}\n\n") + + for page_num, page in enumerate(doc): + md_file.write(f"\n## Page {page_num + 1}\n\n") + img_rel_dir = os.path.basename(img_dir) + + figure_regions = get_figure_rects(page) + + kept_figures = [] + for i, fig in enumerate(figure_regions): + rect = fig['rect'] + pix_preview = page.get_pixmap(clip=rect, dpi=100, colorspace=fitz.csRGB) + ok, nonwhite_ratio, edge_ratio, var = keep_figure(pix_preview) + if not ok: + continue + + pix = page.get_pixmap(clip=rect, dpi=150, colorspace=fitz.csRGB) + + # ★ 추가 필터 적용 (v2.1) + should_filter, filter_reason = should_filter_image(pix, rect, page.rect) + if should_filter: + continue + + img_name = f"p{page_num + 1:03d}_fig{len(kept_figures):02d}.png" + img_path = os.path.join(img_dir, img_name) + pix.save(img_path) + + fig['img_path'] = os.path.join(img_rel_dir, img_name).replace("\\", "/") + fig['img_name'] = img_name + kept_figures.append(fig) + + # ★ 이미지 메타데이터 수집 + image_metadata_list.append({ + "image_file": img_name, + "image_path": str(Path(img_dir) / img_name), + "type": "figure", + 
"source_pdf": metadata['pdf_name'], + "source_folder": metadata['relative_folder'], + "full_path": metadata['full_path'], + "page": page_num + 1, + "total_pages": total_pages, + "caption": fig.get('caption_text', ''), + "rect": { + "x0": round(rect.x0, 2), + "y0": round(rect.y0, 2), + "x1": round(rect.x1, 2), + "y1": round(rect.y1, 2) + } + }) + + figure_regions = kept_figures + + caption_present = any( + CAPTION_PATTERN.search((tb[4] or "")) for tb in page.get_text("blocks") + ) + uncaptioned_idx = 0 + + items = [] + + def inside_any_figure(block_rect, figures): + for fig in figures: + intersect = block_rect & fig["rect"] + if intersect.get_area() > 0.5 * block_rect.get_area(): + return True + return False + + def is_full_width_rect(r, page_rect): + return r.width >= page_rect.width * 0.78 + + def figure_anchor_rect(fig, page_rect): + cap = fig["caption_rect"] + rect = fig["rect"] + if cap.y0 >= rect.y0: + y = max(0.0, cap.y0 - 0.02) + else: + y = min(page_rect.height - 0.02, cap.y1 + 0.02) + return fitz.Rect(cap.x0, y, cap.x1, y + 0.02) + + for fig in figure_regions: + anchor = figure_anchor_rect(fig, page.rect) + md = ( + f"\n![{fig.get('caption_text', 'Figure')}]({fig['img_path']})\n" + f"*{fig.get('caption_text', '')}*\n\n" + ) + items.append({ + "kind": "figure", + "rect": anchor, + "kind_order": 0, + "md": md, + }) + + raw_blocks = page.get_text("dict")["blocks"] + + for block in raw_blocks: + block_rect = fitz.Rect(block["bbox"]) + + if block.get("type") == 0: + if inside_any_figure(block_rect, figure_regions): + continue + items.append({ + "kind": "text", + "rect": block_rect, + "kind_order": 2, + "block": block, + }) + continue + + if block.get("type") == 1: + if inside_any_figure(block_rect, figure_regions): + continue + if caption_present: + continue + + page_area = page.rect.get_area() + if block_rect.get_area() < page_area * 0.005: + continue + + ratio = block_rect.width / max(1.0, block_rect.height) + if ratio < 0.25 or ratio > 4.0: + continue + + 
pix_preview = page.get_pixmap( + clip=block_rect, dpi=80, colorspace=fitz.csRGB + ) + ok, nonwhite_ratio, edge_ratio, var = keep_figure(pix_preview) + if not ok: + continue + + pix = page.get_pixmap( + clip=block_rect, dpi=150, colorspace=fitz.csRGB + ) + + # ★ 추가 필터 적용 (v2.1) + should_filter, filter_reason = should_filter_image(pix, block_rect, page.rect) + if should_filter: + continue + + img_name = f"p{page_num + 1:03d}_photo{uncaptioned_idx:02d}.png" + img_path = os.path.join(img_dir, img_name) + pix.save(img_path) + + rel = os.path.join(img_rel_dir, img_name).replace("\\", "/") + r = block_rect + md = ( + f'\n![Photo]({rel})\n' + f'*Page {page_num + 1} Photo*\n\n' + ) + + items.append({ + "kind": "raster", + "rect": block_rect, + "kind_order": 1, + "md": md, + }) + + # ★ 캡션 없는 이미지 메타데이터 + image_metadata_list.append({ + "image_file": img_name, + "image_path": str(Path(img_dir) / img_name), + "type": "photo", + "source_pdf": metadata['pdf_name'], + "source_folder": metadata['relative_folder'], + "full_path": metadata['full_path'], + "page": page_num + 1, + "total_pages": total_pages, + "caption": "", + "rect": { + "x0": round(r.x0, 2), + "y0": round(r.y0, 2), + "x1": round(r.x1, 2), + "y1": round(r.y1, 2) + } + }) + + uncaptioned_idx += 1 + continue + + # 읽기 순서 정렬 + text_items = [it for it in items if it["kind"] == "text"] + page_w = page.rect.width + mid = page_w / 2.0 + + candidates = [] + for it in text_items: + r = it["rect"] + if is_full_width_rect(r, page.rect): + continue + if r.width < page_w * 0.2: + continue + candidates.append(it) + + left = [it for it in candidates if it["rect"].x0 < mid * 0.95] + right = [it for it in candidates if it["rect"].x0 > mid * 1.05] + two_cols = len(left) >= 3 and len(right) >= 3 + + col_y0 = None + col_y1 = None + seps = [] + + if two_cols and left and right: + col_y0 = min( + min(it["rect"].y0 for it in left), + min(it["rect"].y0 for it in right), + ) + col_y1 = max( + max(it["rect"].y1 for it in left), + 
max(it["rect"].y1 for it in right), + ) + for it in text_items: + r = it["rect"] + if col_y0 < r.y0 < col_y1 and is_full_width_rect(r, page.rect): + seps.append(r.y0) + seps = sorted(set(seps)) + + def seg_index(y0, separators): + if not separators: + return 0 + n = 0 + for s in separators: + if y0 >= s: + n += 1 + else: + break + return n + + def order_key(it): + r = it["rect"] + if not two_cols: + return (r.y0, r.x0, it["kind_order"]) + if col_y0 is not None and r.y1 <= col_y0: + return (0, r.y0, r.x0, it["kind_order"]) + if col_y1 is not None and r.y0 >= col_y1: + return (2, r.y0, r.x0, it["kind_order"]) + seg = seg_index(r.y0, seps) + if is_full_width_rect(r, page.rect): + col = 2 + else: + col = 0 if r.x0 < mid else 1 + return (1, seg, col, r.y0, r.x0, it["kind_order"]) + + items.sort(key=order_key) + + for it in items: + if it["kind"] in ("figure", "raster"): + md_file.write(it["md"]) + continue + + block = it["block"] + for line in block.get("lines", []): + for span in line.get("spans", []): + md_file.write(span.get("text", "") + " ") + md_file.write("\n") + md_file.write("\n") + + doc.close() + return image_metadata_list + + +def process_all_pdfs(): + """ + BASE_DIR 하위의 모든 PDF를 재귀적으로 처리 + 폴더 구조를 유지하면서 OUTPUT_BASE에 저장 + """ + # 출력 폴더 생성 + OUTPUT_BASE.mkdir(parents=True, exist_ok=True) + + # 전체 이미지 메타데이터 수집 + all_image_metadata = [] + + # 처리 통계 + stats = { + "total_pdfs": 0, + "success": 0, + "failed": 0, + "total_images": 0 + } + + # 실패 로그 + failed_files = [] + + print(f"=" * 60) + print(f"PDF 추출 시작") + print(f"원본 폴더: {BASE_DIR}") + print(f"출력 폴더: {OUTPUT_BASE}") + print(f"=" * 60) + + # 모든 PDF 파일 찾기 + pdf_files = list(BASE_DIR.rglob("*.pdf")) + stats["total_pdfs"] = len(pdf_files) + + print(f"\n총 {len(pdf_files)}개 PDF 발견\n") + + for idx, pdf_path in enumerate(pdf_files, 1): + try: + # 상대 경로 계산 + relative_path = pdf_path.relative_to(BASE_DIR) + relative_folder = str(relative_path.parent) + if relative_folder == ".": + relative_folder = "" + + pdf_name = 
pdf_path.name + pdf_stem = pdf_path.stem + + # 출력 경로 설정 (폴더 구조 유지) + output_folder = OUTPUT_BASE / relative_path.parent + output_folder.mkdir(parents=True, exist_ok=True) + + output_md = output_folder / f"{pdf_stem}.md" + img_folder = output_folder / f"{pdf_stem}_img" + + # 메타데이터 준비 + metadata = { + "pdf_name": pdf_name, + "pdf_stem": pdf_stem, + "relative_folder": relative_folder, + "full_path": str(relative_path), + } + + print(f"[{idx}/{len(pdf_files)}] {relative_path}") + + # PDF 처리 + image_metas = extract_pdf_content( + str(pdf_path), + str(output_md), + str(img_folder), + metadata + ) + + all_image_metadata.extend(image_metas) + stats["success"] += 1 + stats["total_images"] += len(image_metas) + + print(f" ✓ 완료 (이미지 {len(image_metas)}개)") + + except Exception as e: + stats["failed"] += 1 + failed_files.append({ + "file": str(pdf_path), + "error": str(e) + }) + print(f" ✗ 실패: {e}") + + # 전체 이미지 메타데이터 저장 + meta_output_path = OUTPUT_BASE / "image_metadata.json" + with open(meta_output_path, "w", encoding="utf-8") as f: + json.dump(all_image_metadata, f, ensure_ascii=False, indent=2) + + # 처리 요약 저장 + summary = { + "processed_at": datetime.now().isoformat(), + "source_dir": str(BASE_DIR), + "output_dir": str(OUTPUT_BASE), + "statistics": stats, + "failed_files": failed_files + } + + summary_path = OUTPUT_BASE / "extraction_summary.json" + with open(summary_path, "w", encoding="utf-8") as f: + json.dump(summary, f, ensure_ascii=False, indent=2) + + # 결과 출력 + print(f"\n" + "=" * 60) + print(f"추출 완료!") + print(f"=" * 60) + print(f"총 PDF: {stats['total_pdfs']}개") + print(f"성공: {stats['success']}개") + print(f"실패: {stats['failed']}개") + print(f"추출된 이미지: {stats['total_images']}개") + print(f"\n이미지 메타데이터: {meta_output_path}") + print(f"처리 요약: {summary_path}") + + if failed_files: + print(f"\n실패한 파일:") + for f in failed_files: + print(f" - {f['file']}: {f['error']}") + + +if __name__ == "__main__": + process_all_pdfs() \ No newline at end of file diff --git a/03. 
# -*- coding: utf-8 -*-
"""
step3_domain.py (the original header called this file domain_prompt.py)

Purpose:
- Sample the names and partial contents of the pdf/xlsx/png/txt/md files
  found under DATA_ROOT.
- Ask GPT to infer the field / business context of the document set from
  those samples and to write a domain-specific system prompt of the form
  "You are an expert in ~~. I want to ~~ ...".
- Save the result to <OUTPUT_ROOT>/context/domain_prompt.txt.

The contents of domain_prompt.txt are meant to be prepended (system role)
to every later GPT call in the pipeline.
"""

import json
import os
import sys
from pathlib import Path

import fitz  # PyMuPDF
import pandas as pd
import pdfplumber
import pytesseract
from openai import OpenAI
from PIL import Image

from api_config import API_KEYS

pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# ===== Paths =====
DATA_ROOT = Path(r"D:\for python\geulbeot-light\geulbeot-light\00.test\hwpx\out")
OUTPUT_ROOT = Path(r"D:\for python\geulbeot-light\geulbeot-light\00.test\hwpx\out\out")  # output location
CONTEXT_DIR = OUTPUT_ROOT / "context"
LOG_DIR = OUTPUT_ROOT / "logs"

for d in [OUTPUT_ROOT, CONTEXT_DIR, LOG_DIR]:
    d.mkdir(parents=True, exist_ok=True)

# ===== OpenAI settings (structure only; the key comes from api_config) =====
OPENAI_API_KEY = API_KEYS.get('GPT_API_KEY', '')
GPT_MODEL = "gpt-5-2025-08-07"

client = OpenAI(api_key=OPENAI_API_KEY)

# ===== OCR settings =====
OCR_LANG = "kor+eng"

SKIP_DIR_NAMES = {"System Volume Information", "$RECYCLE.BIN", ".git", "__pycache__"}


def log(msg: str):
    """Print *msg* and append it to logs/domain_prompt_log.txt."""
    print(msg, flush=True)
    with (LOG_DIR / "domain_prompt_log.txt").open("a", encoding="utf-8") as f:
        f.write(msg + "\n")


def safe_rel(p: Path) -> str:
    """Return *p* relative to DATA_ROOT, or *p* unchanged when that fails."""
    try:
        return str(p.relative_to(DATA_ROOT))
    except Exception:
        return str(p)


def ocr_image(img_path: Path) -> str:
    """OCR one image file and return the stripped text ("" on any failure)."""
    try:
        # The context manager releases the file handle as soon as OCR is
        # done; a bare Image.open() keeps it open until garbage collection.
        with Image.open(img_path) as img:
            return pytesseract.image_to_string(img, lang=OCR_LANG).strip()
    except Exception as e:
        log(f"[WARN] OCR 실패: {safe_rel(img_path)} | {e}")
        return ""


def sample_from_pdf(p: Path, max_chars: int = 1000) -> str:
    """Return up to *max_chars* characters of text from the first 3 pages."""
    texts = []
    try:
        with pdfplumber.open(str(p)) as pdf:
            # Only the leading pages are sampled.
            for page in pdf.pages[:3]:
                t = page.extract_text() or ""
                if t:
                    texts.append(t)
                    if sum(len(x) for x in texts) >= max_chars:
                        break
    except Exception as e:
        log(f"[WARN] PDF 샘플 추출 실패: {safe_rel(p)} | {e}")
    joined = "\n".join(texts)
    return joined[:max_chars]


def sample_from_xlsx(p: Path, max_chars: int = 1000) -> str:
    """Return a short textual sample (columns + first rows) of a workbook.

    The result always starts with the file-name line, so it is non-empty
    even when the workbook cannot be opened.
    """
    texts = [f"[파일명] {p.name}"]
    try:
        # ExcelFile is a context manager; closing it releases the workbook
        # handle instead of leaking it for the rest of the walk.
        with pd.ExcelFile(str(p)) as xls:
            for sheet_name in xls.sheet_names[:3]:
                try:
                    df = xls.parse(sheet_name)
                except Exception as e:
                    log(f"[WARN] 시트 로딩 실패: {safe_rel(p)} | {sheet_name} | {e}")
                    continue
                texts.append(f"\n[시트] {sheet_name}")
                texts.append("컬럼: " + ", ".join(map(str, df.columns)))
                head = df.head(5)
                texts.append(head.to_string(index=False))
                if sum(len(x) for x in texts) >= max_chars:
                    break
    except Exception as e:
        log(f"[WARN] XLSX 샘플 추출 실패: {safe_rel(p)} | {e}")
    joined = "\n".join(texts)
    return joined[:max_chars]


def sample_from_text_file(p: Path, max_chars: int = 1000) -> str:
    """Return the first *max_chars* characters of a text/markdown file.

    Decoding is tried strictly as UTF-8 (BOM tolerated) and then as cp949.
    The original passed errors="ignore" on the first attempt, which never
    raises a decode error, so its cp949 fallback could not run and cp949
    files were sampled as garbage.  OSError still propagates to the caller.
    """
    for enc in ("utf-8-sig", "cp949"):
        try:
            with p.open("r", encoding=enc) as f:
                return f.read(max_chars)
        except UnicodeDecodeError:
            continue
    # Neither encoding fits: salvage whatever decodes as UTF-8.
    with p.open("r", encoding="utf-8", errors="ignore") as f:
        return f.read(max_chars)


def gather_file_samples(
    max_files_per_type: int = 100,
    max_total_samples: int = 300,
    max_chars_per_sample: int = 1000,
):
    """Walk DATA_ROOT and collect file names plus content samples.

    Returns:
        (file_names, samples): every visited file's relative path, and up to
        *max_total_samples* labelled text samples (at most
        *max_files_per_type* per file category).
    """
    file_names = []
    samples = []

    count_pdf = 0
    count_xlsx = 0
    count_img = 0
    count_txt = 0

    for root, dirs, files in os.walk(DATA_ROOT):
        cur_dir = Path(root)
        # OUTPUT_ROOT is nested inside DATA_ROOT.  Without pruning it, the
        # walk would sample this pipeline's own logs and generated text as
        # if they were source documents.
        dirs[:] = [
            d for d in dirs
            if d not in SKIP_DIR_NAMES
            and not d.startswith(".")
            and (cur_dir / d) != OUTPUT_ROOT
        ]

        for fname in files:
            fpath = cur_dir / fname
            ext = fpath.suffix.lower()

            # Every file name is recorded; only sampling is capped.
            file_names.append(safe_rel(fpath))

            if len(samples) >= max_total_samples:
                continue

            try:
                if ext == ".pdf" and count_pdf < max_files_per_type:
                    s = sample_from_pdf(fpath, max_chars=max_chars_per_sample)
                    if s.strip():
                        samples.append(f"[PDF] {safe_rel(fpath)}\n{s}")
                        count_pdf += 1
                    continue

                if ext in {".xlsx", ".xls"} and count_xlsx < max_files_per_type:
                    s = sample_from_xlsx(fpath, max_chars=max_chars_per_sample)
                    if s.strip():
                        samples.append(f"[XLSX] {safe_rel(fpath)}\n{s}")
                        count_xlsx += 1
                    continue

                if ext in {".png", ".jpg", ".jpeg"} and count_img < max_files_per_type:
                    s = ocr_image(fpath)
                    if s.strip():
                        samples.append(f"[IMG] {safe_rel(fpath)}\n{s[:max_chars_per_sample]}")
                        count_img += 1
                    continue

                if ext in {".txt", ".md"} and count_txt < max_files_per_type:
                    s = sample_from_text_file(fpath, max_chars=max_chars_per_sample)
                    if s.strip():
                        samples.append(f"[TEXT] {safe_rel(fpath)}\n{s}")
                        count_txt += 1
                    continue

            except Exception as e:
                log(f"[WARN] 샘플 추출 실패: {safe_rel(fpath)} | {e}")
                continue

    return file_names, samples


def build_domain_prompt():
    """Send file names and content samples to GPT and save the reply.

    The reply (a short description of the domain followed by a system-prompt
    preamble) is written to CONTEXT_DIR/domain_prompt.txt and returned.
    Exits the process with status 1 when there is nothing to sample or the
    model returns an empty reply.
    """
    log("도메인 프롬프트 생성을 위한 샘플 수집 중...")
    file_names, samples = gather_file_samples()

    if not file_names and not samples:
        log("파일 샘플이 없어 도메인 프롬프트를 생성할 수 없습니다.")
        sys.exit(1)

    file_names_text = "\n".join(file_names[:80])
    sample_text = "\n\n".join(samples[:30])

    prompt = f"""
다음은 한 기업의 '이슈 리포트 및 시스템 관련 자료'로 추정되는 파일들의 목록과,
각 파일에서 일부 추출한 내용 샘플이다.

[파일명 목록]
{file_names_text}

[내용 샘플]
{sample_text}

위 자료를 바탕으로 다음을 수행하라.

1) 이 문서 묶음이 어떤 산업, 업무, 분야에 대한 것인지,
   핵심 키워드를 포함해 2~3줄 정도로 설명하라.

2) 이후, 이 문서들을 다루는 AI에게 사용할 "프롬프트 머리말"을 작성하라.
   이 머리말은 모든 후속 프롬프트 앞에 항상 붙일 예정이며,
   다음 조건을 만족해야 한다.

   - 첫 문단: "너는 ~~ 분야의 전문가이다." 형식으로, 이 문서 묶음의 분야와 역할을 정의한다.
   - 두 번째 문단 이후: "나는 ~~을 하고 싶다.", "우리는 ~~ 의 문제를 분석하고 개선방안을 찾고자 한다." 등
     사용자가 AI에게 요구하는 전반적 목적과 관점을 정리한다.
   - 총 5~7줄 정도의 한국어 문장으로 작성한다.
   - 이후에 붙을 프롬프트(청킹, 요약, RAG, 보고서 작성 등)와 자연스럽게 연결될 수 있도록,
     역할(role), 목적, 기준(추측 금지, 사실 기반, 근거 명시 등)을 모두 포함한다.

출력 형식:
- 설명과 머리말을 한 번에 출력하되,
  별도의 마크다운 없이 순수 텍스트로만 작성하라.
- 이 출력 전체를 domain_prompt.txt에 그대로 저장할 것이다.
"""

    resp = client.chat.completions.create(
        model=GPT_MODEL,
        messages=[
            {
                "role": "system",
                "content": "너는 문서 묶음의 분야를 식별하고, 그에 맞는 AI 시스템 프롬프트와 컨텍스트를 설계하는 컨설턴트이다."
            },
            {
                "role": "user",
                "content": prompt
            }
        ],
    )

    content = (resp.choices[0].message.content or "").strip()
    if not content:
        # main() skips regeneration whenever domain_prompt.txt exists, so an
        # empty file written here would stick; fail without writing instead.
        log("[ERROR] GPT 응답이 비어 있어 domain_prompt.txt를 저장하지 않습니다.")
        sys.exit(1)

    out_path = CONTEXT_DIR / "domain_prompt.txt"
    out_path.write_text(content, encoding="utf-8")

    log(f"도메인 프롬프트 생성 완료: {out_path}")
    return content


def main():
    """Generate domain_prompt.txt unless one already exists."""
    log("=== 도메인 프롬프트 생성 시작 ===")
    out_path = CONTEXT_DIR / "domain_prompt.txt"
    if out_path.exists():
        log(f"이미 domain_prompt.txt가 존재합니다: {out_path}")
        log("기존 파일을 사용하려면 종료하고, 재생성이 필요하면 파일을 삭제한 뒤 다시 실행하십시오.")
    else:
        build_domain_prompt()
    log("=== 도메인 프롬프트 작업 종료 ===")


if __name__ == "__main__":
    main()
# -*- coding: utf-8 -*-
"""
step4_chunk.py (the original header called this file chunk_and_summary_v2.py)

Purpose, for every .md file under DATA_ROOT:
  1) GPT semantic chunking driven by domain_prompt.txt
  2) a summary per chunk
  3) image references inside each chunk are preserved
  4) JSON output (source text + chunks + summaries + images)
  5) *_chunks.json output for the RAG step

Prerequisites:
- the extraction step has produced the .md files
- the domain-prompt step has produced domain_prompt.txt
"""

import os
import sys
import json
import re
from pathlib import Path
from datetime import datetime

from openai import OpenAI
from api_config import API_KEYS

# ===== Paths =====
DATA_ROOT = Path(r"D:\for python\geulbeot-light\geulbeot-light\00.test\hwpx\out")
OUTPUT_ROOT = Path(r"D:\for python\geulbeot-light\geulbeot-light\00.test\hwpx\out\out")  # output location

TEXT_DIR = OUTPUT_ROOT / "text"
JSON_DIR = OUTPUT_ROOT / "json"
RAG_DIR = OUTPUT_ROOT / "rag"
CONTEXT_DIR = OUTPUT_ROOT / "context"
LOG_DIR = OUTPUT_ROOT / "logs"

for d in [TEXT_DIR, JSON_DIR, RAG_DIR, CONTEXT_DIR, LOG_DIR]:
    d.mkdir(parents=True, exist_ok=True)

# ===== OpenAI settings =====
OPENAI_API_KEY = API_KEYS.get('GPT_API_KEY', '')
GPT_MODEL = "gpt-5-2025-08-07"

client = OpenAI(api_key=OPENAI_API_KEY)

# ===== Folders to skip =====
SKIP_DIR_NAMES = {"System Volume Information", "$RECYCLE.BIN", ".git", "__pycache__", "output"}

# ===== Markdown image reference: ![alt](path) =====
IMAGE_PATTERN = re.compile(r'!\[([^\]]*)\]\(([^)]+)\)')

# Page headers in the extracted markdown ("## Page N" on its own line).
# The page number is captured so re.split() keeps it next to its content.
PAGE_HEADER_PATTERN = re.compile(r'(?:^|\n)## Page (\d+)\n')

# Only this many leading characters of a document are sent for chunking.
MAX_CHUNK_INPUT_CHARS = 12000


def log(msg: str):
    """Print *msg* and append it, time-stamped, to the chunking log file."""
    print(msg, flush=True)
    with (LOG_DIR / "chunk_and_summary_log.txt").open("a", encoding="utf-8") as f:
        f.write(f"[{datetime.now().strftime('%H:%M:%S')}] {msg}\n")


def load_domain_prompt() -> str:
    """Return the stripped contents of domain_prompt.txt; exit(1) if missing."""
    p = CONTEXT_DIR / "domain_prompt.txt"
    if not p.exists():
        log(f"domain_prompt.txt가 없습니다: {p}")
        log("먼저 step1_domainprompt.py를 실행해야 합니다.")
        sys.exit(1)
    return p.read_text(encoding="utf-8", errors="ignore").strip()


def safe_rel(p: Path) -> str:
    """Return *p* relative to DATA_ROOT, or *p* unchanged when that fails."""
    try:
        return str(p.relative_to(DATA_ROOT))
    except Exception:
        return str(p)


def extract_text_md(p: Path) -> str:
    """Read a markdown file, trying UTF-8 (BOM tolerated) and then cp949.

    The original passed errors="ignore" on the first read, which never
    raises a decode error, so its cp949 fallback could not run.  Decoding is
    now strict per encoding, with a lossy UTF-8 read as the last resort.
    """
    for enc in ("utf-8-sig", "cp949"):
        try:
            return p.read_text(encoding=enc)
        except UnicodeDecodeError:
            continue
    return p.read_text(encoding="utf-8", errors="ignore")


def find_images_in_text(text: str) -> list:
    """Return [{"alt": ..., "path": ...}] for each image reference in *text*."""
    matches = IMAGE_PATTERN.findall(text)
    return [{"alt": m[0], "path": m[1]} for m in matches]


def _parse_chunk_reply(raw: str) -> list:
    """Extract the chunk dicts from a model reply.

    Accepts a bare JSON array or one wrapped in a ``` / ```json fence.
    Returns [] when the reply is not a JSON array; non-dict items are
    dropped because callers use each chunk through .get().
    Raises json.JSONDecodeError on malformed JSON.
    """
    data = raw.strip()
    if "```json" in data:
        data = data.split("```json")[1].split("```")[0].strip()
    elif "```" in data:
        data = data.split("```")[1].split("```")[0].strip()

    if not data.startswith("["):
        return []

    parsed = json.loads(data)
    if not isinstance(parsed, list):
        return []
    return [item for item in parsed if isinstance(item, dict)]


def semantic_chunk(domain_prompt: str, text: str, source_name: str):
    """Split *text* into semantic chunks with GPT.

    Returns a list of dicts with title / keywords / content.  Falls back to
    fallback_chunk() when the API call fails or yields no usable chunk.
    """
    if not text.strip():
        return []

    # Very short documents become a single chunk without an API call.
    if len(text) < 500:
        return [{
            "title": "전체 내용",
            "keywords": "",
            "content": text
        }]

    if len(text) > MAX_CHUNK_INPUT_CHARS:
        # Only the leading part is sent to the model; say so in the log
        # instead of dropping the rest silently.
        log(
            f"[WARN] 문서가 {MAX_CHUNK_INPUT_CHARS}자를 초과하여 앞부분만 청킹됩니다 "
            f"({source_name}, {len(text)}자)"
        )

    user_prompt = f"""
아래 문서를 의미 단위(문단/항목/섹션 등)로 분리하고,
각 청크는 title / keywords / content 를 포함한 JSON 배열로 출력하라.

규칙:
1. 추측 금지, 문서 내용 기반으로만 분리
2. 이미지 참조(![...](...))는 관련 텍스트와 같은 청크에 포함
3. 각 청크는 최소 100자 이상
4. keywords는 쉼표로 구분된 핵심 키워드 3~5개

문서:
{text[:MAX_CHUNK_INPUT_CHARS]}

JSON 배열만 출력하라. 다른 설명 없이.
"""

    try:
        resp = client.chat.completions.create(
            model=GPT_MODEL,
            messages=[
                {"role": "system", "content": domain_prompt + "\n\n너는 의미 기반 청킹 전문가이다. JSON 배열만 출력한다."},
                {"role": "user", "content": user_prompt},
            ],
        )
        chunks = _parse_chunk_reply(resp.choices[0].message.content or "")
        if chunks:
            return chunks

    except json.JSONDecodeError as e:
        log(f"[WARN] JSON 파싱 실패 ({source_name}): {e}")
    except Exception as e:
        log(f"[WARN] semantic_chunk API 실패 ({source_name}): {e}")

    # Fallback: split by page / section.
    log(f"[INFO] Fallback 청킹 적용: {source_name}")
    return fallback_chunk(text)


def fallback_chunk(text: str) -> list:
    """Chunk *text* without GPT: by "## Page N" headers, else by blank lines.

    Page chunks are titled with the number found in their own header.  The
    original numbered the split results positionally, so the front-matter
    before the first header became "Page 1" and every real page was titled
    one too high.
    """
    chunks = []

    parts = PAGE_HEADER_PATTERN.split(text) if "## Page " in text else [text]

    if len(parts) > 1:
        # parts == [preamble, "1", page1_text, "2", page2_text, ...]
        preamble = parts[0].strip()
        if preamble:
            chunks.append({
                "title": "문서 헤더",
                "keywords": "",
                "content": preamble
            })
        for page_no, page_content in zip(parts[1::2], parts[2::2]):
            if page_content.strip():
                chunks.append({
                    "title": f"Page {page_no}",
                    "keywords": "",
                    "content": page_content.strip()
                })
    else:
        # Split on two or more blank lines.
        sections = re.split(r'\n{3,}', text)
        for i, section in enumerate(sections):
            if section.strip() and len(section.strip()) > 50:
                chunks.append({
                    "title": f"섹션 {i+1}",
                    "keywords": "",
                    "content": section.strip()
                })

    # Nothing usable: return the whole text as one chunk.
    if not chunks:
        chunks.append({
            "title": "전체 내용",
            "keywords": "",
            "content": text.strip()
        })

    return chunks


def summary_chunk(domain_prompt: str, text: str, limit: int = 300) -> str:
    """Summarise one chunk in at most *limit* characters.

    Image references are removed first.  Text under 100 characters is
    returned as is; on API failure the first *limit* characters are returned.
    """
    if not text.strip():
        return ""

    text_only = IMAGE_PATTERN.sub('', text).strip()

    if len(text_only) < 100:
        return text_only

    prompt = f"""
아래 텍스트를 {limit}자 이내로 사실 기반으로 요약하라.
추측 금지, 고유명사와 수치는 보존.

{text_only[:8000]}
"""
    try:
        resp = client.chat.completions.create(
            model=GPT_MODEL,
            messages=[
                {"role": "system", "content": domain_prompt + "\n\n너는 사실만 요약하는 전문가이다."},
                {"role": "user", "content": prompt},
            ],
        )
        return resp.choices[0].message.content.strip()
    except Exception as e:
        log(f"[WARN] summary 실패: {e}")
        return text_only[:limit]


def save_chunk_files(src: Path, text: str, domain_prompt: str) -> int:
    """Chunk *text*, summarise each chunk and write the JSON outputs.

    Returns:
        the number of chunks produced (0 when chunking yields nothing).
    """
    stem = src.stem
    folder_ctx = safe_rel(src.parent)

    # NOTE(review): outputs are keyed by file stem only, so two sources with
    # the same name in different folders overwrite each other.
    (TEXT_DIR / f"{stem}_text.txt").write_text(text, encoding="utf-8", errors="ignore")

    chunks = semantic_chunk(domain_prompt, text, src.name)

    if not chunks:
        log(f"[WARN] 청크 없음: {src.name}")
        return 0

    rag_items = []

    for idx, ch in enumerate(chunks, start=1):
        content = ch.get("content") or ""
        if not isinstance(content, str):
            # The model occasionally returns non-string content; the string
            # helpers below need text.
            content = str(content)

        summ = summary_chunk(domain_prompt, content, 300)
        images_in_chunk = find_images_in_text(content)

        rag_items.append({
            "source": src.name,
            "source_path": safe_rel(src),
            "chunk": idx,
            "total_chunks": len(chunks),
            "title": ch.get("title", ""),
            "keywords": ch.get("keywords", ""),
            "text": content,
            "summary": summ,
            "folder_context": folder_ctx,
            "images": images_in_chunk,
            "has_images": len(images_in_chunk) > 0
        })

    # The same payload is written twice: once as the document JSON and once
    # as the RAG input.
    payload = json.dumps(rag_items, ensure_ascii=False, indent=2)
    (JSON_DIR / f"{stem}.json").write_text(payload, encoding="utf-8")
    (RAG_DIR / f"{stem}_chunks.json").write_text(payload, encoding="utf-8")

    return len(chunks)


def main():
    """Chunk and summarise every .md file under DATA_ROOT and log statistics."""
    log("=" * 60)
    log("청킹/요약 파이프라인 시작")
    log(f"데이터 폴더: {DATA_ROOT}")
    log(f"출력 폴더: {OUTPUT_ROOT}")
    log("=" * 60)

    domain_prompt = load_domain_prompt()
    log(f"도메인 프롬프트 로드 완료 ({len(domain_prompt)}자)")

    stats = {"docs": 0, "chunks": 0, "images": 0, "errors": 0}

    md_files = []
    for root, dirs, files in os.walk(DATA_ROOT):
        cur_dir = Path(root)
        # OUTPUT_ROOT is nested inside DATA_ROOT and is named "out", which
        # the "output" entry in SKIP_DIR_NAMES does not match; prune it by
        # path so generated artefacts are never re-processed.
        dirs[:] = [
            d for d in dirs
            if d not in SKIP_DIR_NAMES
            and not d.startswith(".")
            and (cur_dir / d) != OUTPUT_ROOT
        ]
        for fname in files:
            if fname.lower().endswith(".md"):
                md_files.append(cur_dir / fname)

    log(f"\n총 {len(md_files)}개 .md 파일 발견\n")

    for idx, fpath in enumerate(md_files, 1):
        try:
            rel_path = safe_rel(fpath)
            log(f"[{idx}/{len(md_files)}] {rel_path}")

            text = extract_text_md(fpath)

            if not text.strip():
                log("  ⚠ 빈 파일, 스킵")
                continue

            images = find_images_in_text(text)
            stats["images"] += len(images)

            chunk_count = save_chunk_files(fpath, text, domain_prompt)

            stats["docs"] += 1
            stats["chunks"] += chunk_count

            log(f"  ✓ {chunk_count}개 청크, {len(images)}개 이미지")

        except Exception as e:
            stats["errors"] += 1
            log(f"  ✗ 오류: {e}")

    summary = {
        "processed_at": datetime.now().isoformat(),
        "data_root": str(DATA_ROOT),
        "output_root": str(OUTPUT_ROOT),
        "statistics": stats
    }

    (LOG_DIR / "chunk_summary_stats.json").write_text(
        json.dumps(summary, ensure_ascii=False, indent=2),
        encoding="utf-8"
    )

    log("\n" + "=" * 60)
    log("청킹/요약 완료!")
    log("=" * 60)
    log(f"처리된 문서: {stats['docs']}개")
    log(f"생성된 청크: {stats['chunks']}개")
    log(f"포함된 이미지: {stats['images']}개")
    log(f"오류: {stats['errors']}개")
    log("\n결과 저장 위치:")
    log(f"  - 원문: {TEXT_DIR}")
    log(f"  - JSON: {JSON_DIR}")
    log(f"  - RAG: {RAG_DIR}")


if __name__ == "__main__":
    main()
# -*- coding: utf-8 -*-
"""
step5_rag.py (the original header called this file build_rag.py)

Purpose:
- Read the *_chunks.json files in <OUTPUT_ROOT>/rag produced by the chunking
  step and embed text + summary with text-embedding-3-small.
- Build a FAISS IndexFlatIP index and write faiss.index, meta.json and
  vectors.npy into the same folder.
"""

import os
import sys
import json
from pathlib import Path

import numpy as np
import faiss
from openai import OpenAI
from api_config import API_KEYS

# ===== Paths =====
DATA_ROOT = Path(r"D:\for python\geulbeot-light\geulbeot-light\00.test\hwpx\out")
OUTPUT_ROOT = Path(r"D:\for python\geulbeot-light\geulbeot-light\00.test\hwpx\out\out")  # output location
RAG_DIR = OUTPUT_ROOT / "rag"
LOG_DIR = OUTPUT_ROOT / "logs"

for d in [RAG_DIR, LOG_DIR]:
    d.mkdir(parents=True, exist_ok=True)

# ===== OpenAI settings (structure kept) =====
OPENAI_API_KEY = API_KEYS.get('GPT_API_KEY', '')
GPT_MODEL = "gpt-5-2025-08-07"
EMBED_MODEL = "text-embedding-3-small"

# Width used for the empty result of embed_texts().
EMBED_DIM = 1536
# Number of texts sent per embeddings request.
EMBED_BATCH_SIZE = 96

client = OpenAI(api_key=OPENAI_API_KEY)


def log(msg: str):
    """Print *msg* and append it to logs/build_rag_log.txt."""
    print(msg, flush=True)
    with (LOG_DIR / "build_rag_log.txt").open("a", encoding="utf-8") as f:
        f.write(msg + "\n")


def embed_texts(texts):
    """Embed *texts* in batches and return a float32 matrix, one row per text.

    Returns an empty (0, EMBED_DIM) matrix when *texts* is empty.
    """
    if not texts:
        return np.zeros((0, EMBED_DIM), dtype="float32")
    embs = []
    for i in range(0, len(texts), EMBED_BATCH_SIZE):
        batch = texts[i:i + EMBED_BATCH_SIZE]
        resp = client.embeddings.create(model=EMBED_MODEL, input=batch)
        # Order rows by the index the API attaches to each embedding, so
        # row k always belongs to batch[k].
        for item in sorted(resp.data, key=lambda e: e.index):
            embs.append(np.array(item.embedding, dtype="float32"))
    return np.vstack(embs)


def _build_embed_input(u: dict) -> str:
    """Merge a chunk's text and summary into one embedding input.

    - whichever of text / summary is missing is left out
    - whitespace is collapsed to single spaces
    - the result is capped at 4000 characters
    """
    sum_ = (u.get("summary") or "").strip()
    txt = (u.get("text") or "").strip()

    if txt and sum_:
        merged = txt + "\n\n요약: " + sum_[:1000]
    else:
        merged = txt or sum_

    merged = " ".join(merged.split())
    if not merged:
        return ""
    if len(merged) > 4000:
        merged = merged[:4000]
    return merged


def _to_int(value, default: int = 0) -> int:
    """Return int(value), or *default* when the value is not numeric."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return default


def build_faiss_index():
    """Embed every usable chunk and write the FAISS index plus metadata.

    Exits with status 1 when there is nothing to embed, or when the number
    of embeddings differs from the number of texts.
    """
    docs = []
    metas = []

    # Sorted so vector and metadata order is reproducible between runs.
    rag_files = sorted(RAG_DIR.glob("*_chunks.json"))
    if not rag_files:
        log("RAG 파일(*_chunks.json)이 없습니다. 먼저 chunk_and_summary.py를 실행해야 합니다.")
        sys.exit(1)

    for f in rag_files:
        try:
            units = json.loads(f.read_text(encoding="utf-8", errors="ignore"))
        except Exception as e:
            log(f"[WARN] RAG 파일 읽기 실패: {f.name} | {e}")
            continue

        if not isinstance(units, list):
            # One malformed file must not abort the whole build.
            log(f"[WARN] RAG 파일 형식 오류(배열 아님): {f.name}")
            continue

        for u in units:
            if not isinstance(u, dict):
                continue
            embed_input = _build_embed_input(u)
            # Inputs under 40 characters are not indexed.
            if len(embed_input) < 40:
                continue
            docs.append(embed_input)
            metas.append({
                "source": u.get("source", ""),
                "chunk": _to_int(u.get("chunk", 0)),
                "folder_context": u.get("folder_context", "")
            })

    if not docs:
        log("임베딩할 텍스트가 없습니다.")
        sys.exit(1)

    log(f"임베딩 대상 텍스트 수: {len(docs)}")

    E = embed_texts(docs)
    if E.shape[0] != len(docs):
        # Row i of the index is looked up as metas[i]; saving a mismatched
        # pair would attach results to the wrong chunks, so stop here.
        log(f"[ERROR] 임베딩 수 불일치: E={E.shape[0]}, docs={len(docs)}")
        sys.exit(1)

    # With unit-length vectors, inner product equals cosine similarity.
    faiss.normalize_L2(E)
    index = faiss.IndexFlatIP(E.shape[1])
    index.add(E)

    np.save(str(RAG_DIR / "vectors.npy"), E)
    (RAG_DIR / "meta.json").write_text(
        json.dumps(metas, ensure_ascii=False, indent=2),
        encoding="utf-8"
    )
    faiss.write_index(index, str(RAG_DIR / "faiss.index"))

    log(f"FAISS 인덱스 구축 완료: 벡터 수={len(metas)}")


def main():
    """Build the FAISS index, logging start and end."""
    log("=== FAISS RAG 인덱스 구축 시작 ===")
    build_faiss_index()
    log("=== FAISS RAG 인덱스 구축 종료 ===")


if __name__ == "__main__":
    main()
+""" + +import os +import sys +import json +from pathlib import Path +from datetime import datetime + +from openai import OpenAI +from api_config import API_KEYS + +# ===== 경로 설정 ===== +DATA_ROOT = Path(r"D:\for python\geulbeot-light\geulbeot-light\00.test\hwpx\out") +OUTPUT_ROOT = Path(r"D:\for python\geulbeot-light\geulbeot-light\00.test\hwpx\out\out") # 출력 위치 +RAG_DIR = OUTPUT_ROOT / "rag" +CONTEXT_DIR = OUTPUT_ROOT / "context" +LOG_DIR = OUTPUT_ROOT / "logs" + +for d in [RAG_DIR, CONTEXT_DIR, LOG_DIR]: + d.mkdir(parents=True, exist_ok=True) + +# ===== OpenAI 설정 ===== +OPENAI_API_KEY = API_KEYS.get('GPT_API_KEY', '') +GPT_MODEL = "gpt-5-2025-08-07" + +client = OpenAI(api_key=OPENAI_API_KEY) + +# ===== 압축 설정 ===== +BATCH_SIZE = 80 # 한 번에 처리할 요약 개수 +MAX_CHARS_PER_BATCH = 3000 # 배치당 압축 결과 글자수 +MAX_FINAL_CHARS = 8000 # 최종 corpus 글자수 + + +def log(msg: str): + print(msg, flush=True) + with (LOG_DIR / "make_corpus_log.txt").open("a", encoding="utf-8") as f: + f.write(f"[{datetime.now().strftime('%H:%M:%S')}] {msg}\n") + + +def load_domain_prompt() -> str: + p = CONTEXT_DIR / "domain_prompt.txt" + if not p.exists(): + log("domain_prompt.txt가 없습니다. 먼저 step1을 실행해야 합니다.") + sys.exit(1) + return p.read_text(encoding="utf-8", errors="ignore").strip() + + +def load_all_summaries() -> list: + """모든 청크의 summary + 출처 정보 수집""" + summaries = [] + rag_files = sorted(RAG_DIR.glob("*_chunks.json")) + + if not rag_files: + log("RAG 파일(*_chunks.json)이 없습니다. 
먼저 chunk_and_summary.py를 실행해야 합니다.") + sys.exit(1) + + for f in rag_files: + try: + units = json.loads(f.read_text(encoding="utf-8", errors="ignore")) + except Exception as e: + log(f"[WARN] RAG 파일 읽기 실패: {f.name} | {e}") + continue + + for u in units: + summ = (u.get("summary") or "").strip() + source = (u.get("source") or "").strip() + keywords = (u.get("keywords") or "") + + if summ: + # 출처와 키워드 포함 + entry = f"[{source}] {summ}" + if keywords: + entry += f" (키워드: {keywords})" + summaries.append(entry) + + return summaries + + +def compress_batch(domain_prompt: str, batch: list, batch_num: int, total_batches: int) -> str: + """배치 단위로 요약들을 AI가 압축""" + + batch_text = "\n".join([f"{i+1}. {s}" for i, s in enumerate(batch)]) + + prompt = f""" +아래는 문서에서 추출한 요약 {len(batch)}개이다. (배치 {batch_num}/{total_batches}) + +[요약 목록] +{batch_text} + +다음 기준으로 이 요약들을 압축 정리하라: + +1) 중복/유사 내용: 하나로 통합하되, 여러 문서에서 언급되면 "(N회 언급)" 표시 +2) domain_prompt에 명시된 핵심 솔루션/시스템: 반드시 보존하고 [솔루션] 표시 +3) domain_prompt의 목적에 중요한 내용 우선 보존: + - 해당 분야의 기초 개념 + - 기존 방식의 한계점과 문제점 + - 새로운 기술/방식의 장점 +4) 단순 나열/절차만 있는 내용: 과감히 축약 +5) 희귀하지만 핵심적인 인사이트: [핵심] 표시 + +출력 형식: +- 주제별로 그룹핑 +- 각 항목은 1~2문장으로 간결하게 +- 전체 {MAX_CHARS_PER_BATCH}자 이내 +- 마크다운 없이 순수 텍스트로 +""" + + try: + resp = client.chat.completions.create( + model=GPT_MODEL, + messages=[ + {"role": "system", "content": domain_prompt + "\n\n너는 문서 요약을 주제별로 압축 정리하는 전문가이다."}, + {"role": "user", "content": prompt} + ] + ) + result = resp.choices[0].message.content.strip() + log(f" 배치 {batch_num}/{total_batches} 압축 완료 ({len(result)}자)") + return result + except Exception as e: + log(f"[ERROR] 배치 {batch_num} 압축 실패: {e}") + # 실패 시 원본 일부 반환 + return "\n".join(batch[:10]) + + +def merge_compressed_parts(domain_prompt: str, parts: list) -> str: + """배치별 압축 결과를 최종 통합""" + + if len(parts) == 1: + return parts[0] + + all_parts = "\n\n---\n\n".join([f"[파트 {i+1}]\n{p}" for i, p in enumerate(parts)]) + + prompt = f""" +아래는 대량의 문서 요약을 배치별로 압축한 결과이다. +이것을 최종 corpus로 통합하라. 
+ +[배치별 압축 결과] +{all_parts} + +통합 기준: +1) 파트 간 중복 내용 제거 및 통합 +2) domain_prompt에 명시된 목적과 흐름에 맞게 재구성 +3) [솔루션], [핵심], (N회 언급) 표시는 유지 +4) 전체 {MAX_FINAL_CHARS}자 이내 + +출력: 주제별로 정리된 최종 corpus (마크다운 없이) +""" + + try: + resp = client.chat.completions.create( + model=GPT_MODEL, + messages=[ + {"role": "system", "content": domain_prompt + "\n\n너는 CEL 교육 콘텐츠 기획을 위한 corpus를 설계하는 전문가이다."}, + {"role": "user", "content": prompt} + ] + ) + return resp.choices[0].message.content.strip() + except Exception as e: + log(f"[ERROR] 최종 통합 실패: {e}") + return "\n\n".join(parts) + + +def main(): + log("=" * 60) + log("corpus 생성 시작 (AI 압축 버전)") + log("=" * 60) + + # 도메인 프롬프트 로드 + domain_prompt = load_domain_prompt() + log(f"도메인 프롬프트 로드 완료 ({len(domain_prompt)}자)") + + # 모든 요약 수집 + summaries = load_all_summaries() + if not summaries: + log("summary가 없습니다. corpus를 생성할 수 없습니다.") + sys.exit(1) + + log(f"원본 요약 수집 완료: {len(summaries)}개") + + # 원본 저장 (백업) + raw_corpus = "\n".join(summaries) + raw_path = CONTEXT_DIR / "corpus_raw.txt" + raw_path.write_text(raw_corpus, encoding="utf-8") + log(f"원본 corpus 백업: {raw_path} ({len(raw_corpus)}자)") + + # 배치별 압축 + total_batches = (len(summaries) + BATCH_SIZE - 1) // BATCH_SIZE + log(f"\n배치 압축 시작 ({BATCH_SIZE}개씩, 총 {total_batches}배치)") + + compressed_parts = [] + for i in range(0, len(summaries), BATCH_SIZE): + batch = summaries[i:i+BATCH_SIZE] + batch_num = (i // BATCH_SIZE) + 1 + + compressed = compress_batch(domain_prompt, batch, batch_num, total_batches) + compressed_parts.append(compressed) + + # 최종 통합 + log(f"\n최종 통합 시작 ({len(compressed_parts)}개 파트)") + final_corpus = merge_compressed_parts(domain_prompt, compressed_parts) + + # 저장 + out_path = CONTEXT_DIR / "corpus.txt" + out_path.write_text(final_corpus, encoding="utf-8") + + # 통계 + log("\n" + "=" * 60) + log("corpus 생성 완료!") + log("=" * 60) + log(f"원본 요약: {len(summaries)}개 ({len(raw_corpus)}자)") + log(f"압축 corpus: {len(final_corpus)}자") + log(f"압축률: {100 - (len(final_corpus) / len(raw_corpus) * 
100):.1f}%") + log(f"\n저장 위치:") + log(f" - 원본: {raw_path}") + log(f" - 압축: {out_path}") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/03. Code/geulbeot_10th/converters/pipeline/step7_index.py b/03. Code/geulbeot_10th/converters/pipeline/step7_index.py new file mode 100644 index 0000000..4f40baf --- /dev/null +++ b/03. Code/geulbeot_10th/converters/pipeline/step7_index.py @@ -0,0 +1,504 @@ +# -*- coding: utf-8 -*- +""" +make_outline.py + +기능: +- output_context/context/domain_prompt.txt +- output_context/context/corpus.txt +을 기반으로 목차를 생성하고, + +1) outline_issue_report.txt 저장 +2) outline_issue_report.html 저장 (테스트.html 레이아웃 기반 표 형태) +""" + +import os +import sys +import re +from pathlib import Path +from datetime import datetime +from typing import List, Dict, Any, Tuple + +from openai import OpenAI +from api_config import API_KEYS + +# ===== 경로 설정 ===== +DATA_ROOT = Path(r"D:\for python\geulbeot-light\geulbeot-light\00.test\hwpx\out") +OUTPUT_ROOT = Path(r"D:\for python\geulbeot-light\geulbeot-light\00.test\hwpx\out\out") # 출력 위치 +CONTEXT_DIR = OUTPUT_ROOT / "context" +LOG_DIR = OUTPUT_ROOT / "logs" + +for d in [CONTEXT_DIR, LOG_DIR]: + d.mkdir(parents=True, exist_ok=True) + +# ===== OpenAI 설정 (구조 유지) ===== +OPENAI_API_KEY = API_KEYS.get('GPT_API_KEY', '') +GPT_MODEL = "gpt-5-2025-08-07" + +client = OpenAI(api_key=OPENAI_API_KEY) + +# ===== 목차 파싱용 정규식 보완 (5분할 대응) ===== +RE_KEYWORDS = re.compile(r"(#\S+)") +RE_L1 = re.compile(r"^\s*(\d+)\.\s+(.+?)\s*$") +RE_L2 = re.compile(r"^\s*(\d+\.\d+)\s+(.+?)\s*$") +RE_L3 = re.compile(r"^\s*(\d+\.\d+\.\d+)\s+(.+?)\s*$") + +def log(msg: str): + print(msg, flush=True) + with (LOG_DIR / "make_outline_log.txt").open("a", encoding="utf-8") as f: + f.write(msg + "\n") + +def load_domain_prompt() -> str: + p = CONTEXT_DIR / "domain_prompt.txt" + if not p.exists(): + log("domain_prompt.txt가 없습니다. 
먼저 domain_prompt.py를 실행해야 합니다.") + sys.exit(1) + return p.read_text(encoding="utf-8", errors="ignore").strip() + +def load_corpus() -> str: + p = CONTEXT_DIR / "corpus.txt" + if not p.exists(): + log("corpus.txt가 없습니다. 먼저 make_corpus.py를 실행해야 합니다.") + sys.exit(1) + return p.read_text(encoding="utf-8", errors="ignore").strip() + + +# 기존 RE_L1, RE_L2는 유지하고 아래 두 개를 추가/교체합니다. +RE_L3_HEAD = re.compile(r"^\s*(\d+\.\d+\.\d+)\s+(.+)$") +RE_L3_TOPIC = re.compile(r"^\s*[\-\*]\s+(.+?)\s*\|\s*(.+?)\s*\|\s*(\[.+?\])\s*\|\s*(.+)$") + +def generate_outline(domain_prompt: str, corpus: str) -> str: + sys_msg = { + "role": "system", + "content": ( + domain_prompt + "\n\n" + "너는 건설/측량 DX 기술 보고서의 구조를 설계하는 시니어 기술사이다. " + "주어진 corpus를 분석하여, 실무자가 즉시 활용 가능한 고밀도 지침서 목차를 설계하라." + ), + } + + user_msg = { + "role": "user", + "content": f""" +아래 [corpus]를 바탕으로 보고서 제목과 전략적 목차를 설계하라. + +[corpus] +{corpus} + +요구 사항: +1) 첫 줄에 보고서 제목 1개를 작성하라. +2) 그 아래 목차를 번호 기반 계측 구조로 작성하라. + - 대목차: 1. / 2. / 3. ... + - 중목차: 1.1 / 1.2 / ... + - 소목차: 1.1.1 / 1.1.2 / ... +3) **수량 제약 (중요)**: + - 대목차(1.)는 5~8개로 구성하라. + - **중목차(1.1) 하나당 소목차(1.1.1, 1.1.2...)는 반드시 2개에서 4개 사이로 구성하라.** (절대 1개만 만들지 말 것) + - 소목차(1.1.1) 하나당 '핵심주제(꼭지)'는 반드시 2개에서 3개 사이로 구성하라. + +[소목차 작성 형식] +1.1.1 소목차 제목 + - 핵심주제 1 | #키워드 | [유형] | 집필가이드(데이터/표 구성 지침) + - 핵심주제 2 | #키워드 | [유형] | 집필가이드(데이터/표 구성 지침) + +5) [유형] 분류 가이드: + - [비교형]: 기존 vs DX 방식의 비교표(Table)가 필수적인 경우 + - [기술형]: RMSE, GSD, 중복도 등 정밀 수치와 사양 설명이 핵심인 경우 + - [절차형]: 단계별 워크플로 및 체크리스트가 중심인 경우 + - [인사이트형]: 한계점 분석 및 전문가 제언(☞)이 중심인 경우 +6) 집필가이드는 50자 내외로, "어떤 데이터를 검색해서 어떤 표를 그려라"와 같이 구체적으로 지시하라. +7) 대목차는 최대 8개 이내로 구성하라. 
+""" + } + resp = client.chat.completions.create( + model=GPT_MODEL, + messages=[sys_msg, user_msg], + ) + return (resp.choices[0].message.content or "").strip() + + + +def parse_outline(outline_text: str) -> Tuple[str, List[Dict[str, Any]]]: + lines = [ln.rstrip() for ln in outline_text.splitlines() if ln.strip()] + if not lines: return "", [] + + title = lines[0].strip() # 첫 줄은 보고서 제목 + rows = [] + current_section = None # 현재 처리 중인 소목차(1.1.1)를 추적 + + for ln in lines[1:]: + raw = ln.strip() + + # 1. 소목차 헤더(1.1.1 제목) 발견 시 + m3_head = RE_L3_HEAD.match(raw) + if m3_head: + num, s_title = m3_head.groups() + current_section = { + "depth": 3, + "num": num, + "title": s_title, + "sub_topics": [] # 여기에 아래 줄의 꼭지들을 담을 예정 + } + rows.append(current_section) + continue + + # 2. 세부 꼭지(- 주제 | #키워드 | [유형] | 가이드) 발견 시 + m_topic = RE_L3_TOPIC.match(raw) + if m_topic and current_section: + t_title, kws_raw, t_type, guide = m_topic.groups() + # 키워드 추출 (#키워드 형태) + kws = [k.lstrip("#").strip() for k in RE_KEYWORDS.findall(kws_raw)] + + # 현재 소목차(current_section)의 리스트에 추가 + current_section["sub_topics"].append({ + "topic_title": t_title, + "keywords": kws, + "type": t_type, + "guide": guide + }) + continue + + # 3. 대목차(1.) 처리 + m1 = RE_L1.match(raw) + if m1: + rows.append({"depth": 1, "num": m1.group(1).strip(), "title": m1.group(2).strip()}) + current_section = None # 소목차 구간 종료 + continue + + # 4. 중목차(1.1) 처리 + m2 = RE_L2.match(raw) + if m2: + rows.append({"depth": 2, "num": m2.group(1).strip(), "title": m2.group(2).strip()}) + current_section = None # 소목차 구간 종료 + continue + + return title, rows + +def html_escape(s: str) -> str: + s = s or "" + return (s.replace("&", "&") + .replace("<", "<") + .replace(">", ">") + .replace('"', """) + .replace("'", "'")) + +def chunk_rows(rows: List[Dict[str, Any]], max_rows_per_page: int = 26) -> List[List[Dict[str, Any]]]: + """ + A4 1장에 표가 길어지면 넘치므로, 단순 행 개수로 페이지 분할한다. 
+ """ + out = [] + cur = [] + for r in rows: + cur.append(r) + if len(cur) >= max_rows_per_page: + out.append(cur) + cur = [] + if cur: + out.append(cur) + return out + +def build_outline_table_html(rows: List[Dict[str, Any]]) -> str: + """ + 테스트.html의 table 스타일을 그대로 쓰는 전제의 표 HTML + """ + head = """ + + + + + + + + + + + """ + + body_parts = [] + for r in rows: + depth = r["depth"] + num = html_escape(r["num"]) + title = html_escape(r["title"]) + kw = " ".join([f"#{k}" for k in r.get("keywords", []) if k]) + kw = html_escape(kw) + + if depth == 1: + body_parts.append( + f""" + + + + + + + """ + ) + elif depth == 2: + body_parts.append( + f""" + + + + + + + """ + ) + else: + body_parts.append( + f""" + + + + + + + """ + ) + + tail = """ + +
구분번호제목키워드
대목차{num}{title}
중목차{num}{title}
소목차{num}{title}{kw}
+ """ + return head + "\n".join(body_parts) + tail + +def build_outline_html(report_title: str, rows: List[Dict[str, Any]]) -> str: + """ + 테스트.html 레이아웃 구조를 그대로 따라 A4 시트 형태로 HTML 생성 + """ + css = r""" + @import url('https://fonts.googleapis.com/css2?family=Noto+Sans+KR:wght@300;400;500;700;900&display=swap'); + + :root { + --primary-blue: #3057B9; + --gray-light: #F2F2F2; + --gray-medium: #E6E6E6; + --gray-dark: #666666; + --border-light: #DDDDDD; + --text-black: #000000; + } + + * { + margin: 0; + padding: 0; + box-sizing: border-box; + -webkit-print-color-adjust: exact; + } + + body { + font-family: 'Noto Sans KR', sans-serif; + background-color: #f0f0f0; + color: var(--text-black); + line-height: 1.35; + display: flex; + justify-content: center; + padding: 10px 0; + } + + .sheet { + background-color: white; + width: 210mm; + height: 297mm; + padding: 20mm 20mm; + box-shadow: 0 0 10px rgba(0,0,0,0.1); + position: relative; + display: flex; + flex-direction: column; + overflow: hidden; + margin-bottom: 12px; + } + + @media print { + body { background: none; padding: 0; } + .sheet { box-shadow: none; margin: 0; border: none; page-break-after: always; } + } + + .page-header { + display: flex; + justify-content: space-between; + align-items: flex-start; + margin-bottom: 15px; + font-size: 8.5pt; + color: var(--gray-dark); + } + + .header-title { + font-size: 24pt; + font-weight: 900; + margin-bottom: 8px; + letter-spacing: -1.5px; + color: #111; + } + + .title-divider { + height: 4px; + background-color: var(--primary-blue); + width: 100%; + margin-bottom: 20px; + } + + .lead-box { + background-color: var(--gray-light); + padding: 18px 20px; + margin-bottom: 5px; + border-radius: 2px; + text-align: center; + } + + .lead-box div { + font-size: 13pt; + font-weight: 700; + color: var(--primary-blue); + letter-spacing: -0.5px; + } + + .lead-notes { + font-size: 8.5pt; + color: #777; + margin-bottom: 20px; + padding-left: 5px; + text-align: right; + } + + .body-content { 
flex: 1; } + + .section { margin-bottom: 22px; } + + .section-title { + font-size: 13pt; + font-weight: 700; + display: flex; + align-items: center; + margin-bottom: 10px; + color: #111; + } + + .section-title::before { + content: ""; + display: inline-block; + width: 10px; + height: 10px; + background-color: #999; + margin-right: 10px; + } + + table { + width: 100%; + border-collapse: collapse; + margin: 8px 0; + font-size: 9.5pt; + border-top: 1.5px solid #333; + } + + th { + background-color: var(--gray-medium); + font-weight: 700; + padding: 10px; + border: 1px solid var(--border-light); + } + + td { + padding: 10px; + border: 1px solid var(--border-light); + vertical-align: middle; + } + + .group-cell { + background-color: #F9F9F9; + font-weight: 700; + width: 16%; + text-align: center; + color: var(--primary-blue); + white-space: nowrap; + } + + .page-footer { + margin-top: 15px; + padding-top: 10px; + display: flex; + justify-content: space-between; + font-size: 8.5pt; + color: var(--gray-dark); + border-top: 1px solid #EEE; + } + + .footer-page { flex: 1; text-align: center; } + """ + + pages = chunk_rows(rows, max_rows_per_page=26) + + html_pages = [] + total_pages = len(pages) if pages else 1 + for i, page_rows in enumerate(pages, start=1): + table_html = build_outline_table_html(page_rows) + + html_pages.append(f""" +
+ + +
+

{html_escape(report_title)}

+
+
+ +
+
+
확정 목차 표 형태 정리본
+
+
목차는 outline_issue_report.txt를 기반으로 표로 재구성됨
+ +
+
목차
+ {table_html} +
+
+ +
+ + + +
+
+ """) + + return f""" + + + + {html_escape(report_title)} - Outline + + + + {''.join(html_pages)} + + +""" + +def main(): + log("=== 목차 생성 시작 ===") + domain_prompt = load_domain_prompt() + corpus = load_corpus() + + outline = generate_outline(domain_prompt, corpus) + + # TXT 저장 유지 + out_txt = CONTEXT_DIR / "outline_issue_report.txt" + out_txt.write_text(outline, encoding="utf-8") + log(f"목차 TXT 저장 완료: {out_txt}") + + # HTML 추가 저장 + title, rows = parse_outline(outline) + out_html = CONTEXT_DIR / "outline_issue_report.html" + out_html.write_text(build_outline_html(title, rows), encoding="utf-8") + log(f"목차 HTML 저장 완료: {out_html}") + + log("=== 목차 생성 종료 ===") + +if __name__ == "__main__": + main() diff --git a/03. Code/geulbeot_10th/converters/pipeline/step8_content.py b/03. Code/geulbeot_10th/converters/pipeline/step8_content.py new file mode 100644 index 0000000..4330251 --- /dev/null +++ b/03. Code/geulbeot_10th/converters/pipeline/step8_content.py @@ -0,0 +1,1021 @@ +# -*- coding: utf-8 -*- +""" +step8_generate_report_gemini.py + +기능 +- 확정 목차(outline_issue_report.txt)를 읽어 섹션(소목차) 목록을 만든다. +- 섹션별로 RAG에서 근거 청크를 검색한다(FAISS 있으면 FAISS, 없으면 키워드 기반). +- 섹션별 본문 초안을 생성한다(내부 근거 우선, 원문 보존 원칙). +- 섹션별 이미지 후보를 매핑하고, md에는 이미지 자리표시자를 삽입한다. +- 산출물 2개를 만든다. 
+ 1) report_draft.md + 2) report_sections.json + +변경사항 (OpenAI → Gemini) +- google.genai 라이브러리 사용 +- 자율성 통제: temperature=0.3, thinking_budget=0 +- 원문 보존 원칙 강화 +- 소목차별 중복 방지 로직 추가 +- ★ 이미지 assets 복사 로직 추가 +""" + +import os +import re +import json +import shutil # ★ 추가: 이미지 복사용 +from dataclasses import dataclass, field +from pathlib import Path +from datetime import datetime +from typing import List, Dict, Any, Optional, Tuple + +import numpy as np + +try: + import faiss # type: ignore +except Exception: + faiss = None + +# ===== 하이브리드 API 설정 ===== +# 검색/임베딩: OpenAI (기존 FAISS 인덱스 호환) +# 본문 작성: Gemini (글쓰기 품질) + +from google import genai +from google.genai import types +from openai import OpenAI +from api_config import API_KEYS + +# OpenAI (임베딩/검색용) +OPENAI_API_KEY = API_KEYS.get('GPT_API_KEY', '') +EMBED_MODEL = "text-embedding-3-small" +openai_client = OpenAI(api_key=OPENAI_API_KEY) + +# Gemini (본문 작성용) +GEMINI_API_KEY = API_KEYS.get('GEMINI_API_KEY', '') +GEMINI_MODEL = "gemini-3-pro-preview" +gemini_client = genai.Client(api_key=GEMINI_API_KEY) + +# ===== 경로 설정 ===== +DATA_ROOT = Path(r"D:\for python\geulbeot-light\geulbeot-light\00.test\hwpx\out") +OUTPUT_ROOT = Path(r"D:\for python\geulbeot-light\geulbeot-light\00.test\hwpx\out\out") # 출력 위치 +CONTEXT_DIR = OUTPUT_ROOT / "context" +LOG_DIR = OUTPUT_ROOT / "logs" +RAG_DIR = OUTPUT_ROOT / "rag" +GEN_DIR = OUTPUT_ROOT / "generated" + +# ★ 추가: 이미지 assets 경로 +ASSETS_DIR = GEN_DIR / "assets" +IMAGES_ROOT = DATA_ROOT / "images" # 추출된 이미지 원본 위치 + +for d in [CONTEXT_DIR, LOG_DIR, RAG_DIR, GEN_DIR, ASSETS_DIR]: + d.mkdir(parents=True, exist_ok=True) + +# 파일명 +OUTLINE_PATH = CONTEXT_DIR / "outline_issue_report.txt" +DOMAIN_PROMPT_PATH = CONTEXT_DIR / "domain_prompt.txt" + +# 선택 파일(있으면 사용) +FAISS_INDEX_PATH = RAG_DIR / "faiss.index" +FAISS_META_PATH = RAG_DIR / "meta.json" +FAISS_VECTORS_PATH = RAG_DIR / "vectors.npy" + +# 이미지 메타(있으면 캡션 보강) +IMAGE_META_PATH = DATA_ROOT / "image_metadata.json" + +# 출력 파일 +REPORT_MD_PATH = 
GEN_DIR / "report_draft.md" +REPORT_JSON_PATH = GEN_DIR / "report_sections.json" + +# 설정값 +TOP_K_EVIDENCE = int(os.getenv("TOP_K_EVIDENCE", "10")) +MAX_IMAGES_PER_SECTION = int(os.getenv("MAX_IMAGES_PER_SECTION", "3")) +MAX_EVIDENCE_SNIPPET_CHARS = int(os.getenv("MAX_EVIDENCE_SNIPPET_CHARS", "900")) + +# 패턴 +RE_TITLE_LINE = re.compile(r"^\s*(.+?)\s*$") +RE_L1 = re.compile(r"^\s*(\d+)\.\s+(.+?)\s*$") +RE_L2 = re.compile(r"^\s*(\d+\.\d+)\s+(.+?)\s*$") +RE_L3 = re.compile(r"^\s*(\d+\.\d+\.\d+)\s+(.+?)\s*$") +RE_KEYWORDS = re.compile(r"(#\S+)") + +RE_IMAGE_PATH_IN_MD = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)") + + +def log(msg: str): + line = f"[{datetime.now().strftime('%H:%M:%S')}] {msg}" + print(line, flush=True) + with (LOG_DIR / "step8_generate_report_log.txt").open("a", encoding="utf-8") as f: + f.write(line + "\n") + + +@dataclass +class SubTopic: + title: str + keywords: List[str] + type: str + guide: str + + +@dataclass +class OutlineItem: + number: str + title: str + depth: int + sub_topics: List[SubTopic] = field(default_factory=list) + + +def read_text(p: Path) -> str: + return p.read_text(encoding="utf-8", errors="ignore").strip() + + +def load_domain_prompt() -> str: + if not DOMAIN_PROMPT_PATH.exists(): + raise RuntimeError(f"domain_prompt.txt 없음: {DOMAIN_PROMPT_PATH}") + return read_text(DOMAIN_PROMPT_PATH) + + +def load_outline() -> Tuple[str, List[OutlineItem]]: + if not OUTLINE_PATH.exists(): + raise RuntimeError("목차 파일이 없습니다.") + raw = OUTLINE_PATH.read_text(encoding="utf-8", errors="ignore").splitlines() + if not raw: + return "", [] + + report_title = raw[0].strip() + items: List[OutlineItem] = [] + current_l3 = None + + # 꼭지 파싱용 정규식 + re_l3_head = re.compile(r"^\s*(\d+\.\d+\.\d+)\s+(.+)$") + re_l3_topic = re.compile(r"^\s*[\-\*]\s+(.+?)\s*\|\s*(.+?)\s*\|\s*(\[.+?\])\s*\|\s*(.+)$") + + for ln in raw[1:]: + line = ln.strip() + if not line: + continue + + m3h = re_l3_head.match(line) + if m3h: + current_l3 = OutlineItem(number=m3h.group(1), 
title=m3h.group(2), depth=3) + items.append(current_l3) + continue + + m3t = re_l3_topic.match(line) + if m3t and current_l3: + kws = [k.lstrip("#").strip() for k in RE_KEYWORDS.findall(m3t.group(2))] + current_l3.sub_topics.append(SubTopic( + title=m3t.group(1), keywords=kws, type=m3t.group(3), guide=m3t.group(4) + )) + continue + + m2 = RE_L2.match(line) + if m2: + items.append(OutlineItem(number=m2.group(1), title=m2.group(2), depth=2)) + current_l3 = None + continue + m1 = RE_L1.match(line) + if m1: + items.append(OutlineItem(number=m1.group(1), title=m1.group(2), depth=1)) + current_l3 = None + continue + + return report_title, items + + +def load_image_metadata() -> Dict[str, Dict[str, Any]]: + """image_metadata.json이 있으면 image_file 기준으로 맵을 만든다.""" + if not IMAGE_META_PATH.exists(): + return {} + try: + data = json.loads(IMAGE_META_PATH.read_text(encoding="utf-8", errors="ignore")) + out: Dict[str, Dict[str, Any]] = {} + for it in data: + fn = (it.get("image_file") or "").strip() + if fn: + out[fn] = it + return out + except Exception as e: + log(f"[WARN] image_metadata.json 로드 실패: {e}") + return {} + + +def iter_rag_items() -> List[Dict[str, Any]]: + """rag 폴더의 *_chunks.json 모두 로드""" + items: List[Dict[str, Any]] = [] + files = sorted(RAG_DIR.glob("*_chunks.json")) + if not files: + raise RuntimeError(f"rag 폴더에 *_chunks.json 없음: {RAG_DIR}") + + for f in files: + try: + data = json.loads(f.read_text(encoding="utf-8", errors="ignore")) + if isinstance(data, list): + for it in data: + if isinstance(it, dict): + items.append(it) + except Exception as e: + log(f"[WARN] RAG 파일 로드 실패: {f.name} {e}") + + return items + + +def normalize_ws(s: str) -> str: + return " ".join((s or "").split()) + + +def make_evidence_snippet(text: str, max_chars: int) -> str: + t = normalize_ws(text) + if len(t) <= max_chars: + return t + return t[:max_chars] + "..." 
+ + +def get_item_key(it: Dict[str, Any]) -> Tuple[str, int]: + src = (it.get("source") or "").strip() + ch = int(it.get("chunk") or 0) + return (src, ch) + + +def build_item_index(items: List[Dict[str, Any]]) -> Dict[Tuple[str, int], Dict[str, Any]]: + m: Dict[Tuple[str, int], Dict[str, Any]] = {} + for it in items: + m[get_item_key(it)] = it + return m + + +def try_load_faiss(): + """faiss.index, meta.json, vectors.npy가 모두 있고 faiss 모듈이 있으면 사용""" + if faiss is None: + log("[INFO] faiss 모듈 없음 - 키워드 검색 사용") + return None + if not (FAISS_INDEX_PATH.exists() and FAISS_META_PATH.exists() and FAISS_VECTORS_PATH.exists()): + log("[INFO] FAISS 파일 없음 - 키워드 검색 사용") + return None + try: + index = faiss.read_index(str(FAISS_INDEX_PATH)) + metas = json.loads(FAISS_META_PATH.read_text(encoding="utf-8", errors="ignore")) + vecs = np.load(str(FAISS_VECTORS_PATH)) + log(f"[INFO] FAISS 로드 성공 - 인덱스 차원: {index.d}, 메타 수: {len(metas)}") + return index, metas, vecs + except Exception as e: + log(f"[WARN] FAISS 로드 실패: {e}") + return None + + +def embed_query_openai(q: str) -> np.ndarray: + """OpenAI 임베딩 (기존 FAISS 인덱스와 호환)""" + try: + resp = openai_client.embeddings.create(model=EMBED_MODEL, input=[q]) + v = np.array(resp.data[0].embedding, dtype="float32") + n = np.linalg.norm(v) + 1e-12 + return v / n + except Exception as e: + log(f"[WARN] OpenAI 임베딩 실패: {e}") + return np.zeros(1536, dtype="float32") # OpenAI 차원 + + +def retrieve_with_faiss( + index, + metas: List[Dict[str, Any]], + item_map: Dict[Tuple[str, int], Dict[str, Any]], + query: str, + top_k: int +) -> List[Dict[str, Any]]: + qv = embed_query_openai(query).reshape(1, -1).astype("float32") + D, I = index.search(qv, top_k) + out: List[Dict[str, Any]] = [] + for idx in I[0]: + if idx < 0 or idx >= len(metas): + continue + meta = metas[idx] + src = (meta.get("source") or "").strip() + ch = int(meta.get("chunk") or 0) + it = item_map.get((src, ch)) + if it: + out.append(it) + return out + + +def tokenize_simple(s: str) -> 
List[str]: + s = normalize_ws(s).lower() + return [t for t in re.split(r"\s+", s) if t] + + +def retrieve_with_keywords( + all_items: List[Dict[str, Any]], + query: str, + keywords: List[str], + top_k: int +) -> List[Dict[str, Any]]: + q_tokens = set(tokenize_simple(query)) + k_tokens = set([kw.lower() for kw in keywords if kw]) + + scored: List[Tuple[float, Dict[str, Any]]] = [] + for it in all_items: + txt = " ".join([ + str(it.get("title") or ""), + str(it.get("keywords") or ""), + str(it.get("summary") or ""), + str(it.get("text") or ""), + str(it.get("folder_context") or ""), + str(it.get("source_path") or ""), + ]) + t = normalize_ws(txt).lower() + + score = 0.0 + for tok in q_tokens: + if tok and tok in t: + score += 1.0 + for tok in k_tokens: + if tok and tok in t: + score += 2.0 + + if it.get("has_images"): + score += 0.5 + + if score > 0: + scored.append((score, it)) + + scored.sort(key=lambda x: x[0], reverse=True) + return [it for _, it in scored[:top_k]] + + +def select_images_for_section( + evidences: List[Dict[str, Any]], + image_meta_by_file: Dict[str, Dict[str, Any]], + max_images: int +) -> List[Dict[str, Any]]: + """근거 청크에서 images를 모아 섹션 이미지 후보를 만들고 상한으로 자른다.""" + seen = set() + out: List[Dict[str, Any]] = [] + + def infer_image_file(p: str) -> str: + p = p.replace("\\", "/") + return p.split("/")[-1] + + for ev in evidences: + imgs = ev.get("images") or [] + if not isinstance(imgs, list): + continue + for img in imgs: + if not isinstance(img, dict): + continue + rel_path = (img.get("path") or "").strip() + if not rel_path: + continue + key = rel_path.replace("\\", "/") + if key in seen: + continue + seen.add(key) + + img_file = infer_image_file(key) + meta = image_meta_by_file.get(img_file, {}) + + caption = "" + if meta: + caption = (meta.get("caption") or "").strip() + if not caption: + caption = (img.get("alt") or "").strip() or img_file + + out.append({ + "image_id": "", + "rel_path": key, + "image_file": img_file, + "caption": caption, + 
"source_path": ev.get("source_path") or ev.get("source") or "", + "page": meta.get("page", None) if meta else None, + "type": meta.get("type", None) if meta else None, + }) + if len(out) >= max_images: + return out + + return out + + +def make_image_placeholders(section_number: str, images: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """섹션번호 기반으로 이미지아이디를 만들고 placeholder를 만든다.""" + sec_key = section_number.replace(".", "_") + out = [] + for i, img in enumerate(images, start=1): + img_id = f"{sec_key}_img{i:02d}" + out.append({**img, "image_id": img_id, "placeholder": f"{{{{IMG:{img_id}}}}}"}) + return out + + +# ★ 추가: 이미지 파일을 assets 폴더로 복사하는 함수 +def copy_images_to_assets(image_info_list: List[Dict[str, Any]]) -> None: + """선택된 이미지들을 generated/assets/로 복사""" + for img in image_info_list: + # 원본 경로 찾기 (여러 경로 시도) + rel_path = img.get('rel_path', '') + src_path = None + + # 1차: DATA_ROOT 기준 상대경로 + candidate1 = DATA_ROOT / rel_path + if candidate1.exists(): + src_path = candidate1 + + # 2차: IMAGES_ROOT에서 파일명으로 검색 + if src_path is None: + candidate2 = IMAGES_ROOT / img.get('image_file', '') + if candidate2.exists(): + src_path = candidate2 + + # 3차: DATA_ROOT 전체에서 파일명 검색 (재귀) + if src_path is None: + img_file = img.get('image_file', '') + if img_file: + for found in DATA_ROOT.rglob(img_file): + src_path = found + break + + if src_path and src_path.exists(): + # image_id 기반으로 새 파일명 생성 (확장자 유지) + ext = src_path.suffix or '.png' + dst_filename = f"{img['image_id']}{ext}" + dst_path = ASSETS_DIR / dst_filename + + try: + shutil.copy2(src_path, dst_path) + img['asset_path'] = f"assets/{dst_filename}" + log(f" [IMG] {img['image_id']} → {dst_filename}") + except Exception as e: + log(f" [WARN] 이미지 복사 실패: {img['image_id']} - {e}") + img['asset_path'] = None + else: + log(f" [WARN] 이미지 없음: {rel_path} ({img.get('image_file', '')})") + img['asset_path'] = None + + +# ===== Gemini 프롬프트 구성 (자율성 통제 강화) ===== + +def build_system_instruction(domain_prompt: str) -> str: + """ + 
Gemini 시스템 지시문 (v4 - 최종) + """ + return f"""{domain_prompt} + +═══════════════════════════════════════════════════════════════ + ★★★ 절대 준수 규칙 ★★★ +═══════════════════════════════════════════════════════════════ + +[금지 사항] +1. 원문의 수치, 용어, 표현을 임의로 변경 금지 +2. 제공되지 않은 정보 추론/창작 금지 +3. 추측성 표현 금지 ("~로 보인다", "~일 것이다") +4. 중복 내용 작성 금지 +5. 마크다운 헤딩(#, ##, ###, ####) 사용 금지 +6. ★ "꼭지", "항목 1", "Topic" 등 내부 분류 용어 출력 금지 +7. ★ "1. 2. 3." 형태 번호 사용 금지 (반드시 "1) 2) 3)" 사용) + +[필수 사항] +1. 원문 최대 보존 +2. 수치는 원본 그대로 +3. 전문 용어 변경 없이 사용 +4. 보고서 형식으로 전문적 작성 + +═══════════════════════════════════════════════════════════════ + ★★★ 번호 체계 및 서식 규칙 (필수) ★★★ +═══════════════════════════════════════════════════════════════ + +【레벨별 번호와 서식】 + +■ 1단계: 1), 2), 3) +■ 2단계: (1), (2), (3) +■ 3단계: ①, ②, ③ 또는 -, * + +【핵심 서식 규칙】 + +★ 모든 번호의 제목은 반드시 **볼드** 처리 +★ 제목과 본문 사이에 반드시 빈 줄(엔터) 삽입 +★ 본문과 다음 번호 사이에 반드시 빈 줄(엔터) 삽입 + +【올바른 예시】 +``` +1) **VRS GNSS 측량의 개요** + +인공위성과 위성기준점을 이용한 위치 측량 방식이다. 실시간 보정을 통해 높은 정확도를 확보할 수 있다. + +2) **UAV 사진측량의 특징** + +무인항공기를 활용한 광역 측량 방식이다. 목적에 따라 다음과 같이 구분된다. + + (1) **맵핑측량** + + 정사영상 제작에 특화된 촬영 방식이다. + + (2) **모델측량** + + 3D 모델 생성에 특화된 촬영 방식이다. +``` + +【잘못된 예시 - 절대 금지】 +``` +꼭지 1 VRS GNSS 측량 ← "꼭지" 용어 금지! +1. VRS GNSS 측량 ← "1." 형태 금지! +1) VRS GNSS 측량 인공위성을... ← 제목+본문 한줄 금지! +1) VRS GNSS 측량 ← 볼드 없음 금지! 
import re

# A numbered line ending with one of these is treated as a running sentence,
# not as a heading, and is therefore left unbolded.
_SENTENCE_ENDINGS = ('.', '다', '요', '음', '함')

# "1. text" at the start of a (left-stripped) line: a markdown ordered-list item.
_ORDERED_ITEM_RE = re.compile(r'^(\d+)\.\s+(.+)$')

# Markdown heading (1-4 '#') that starts with a section number such as "1.2.3 ".
_NUMBERED_HEADING_RE = re.compile(r'^#{1,4}\s*\d+(\.\d+)*\s+')
# A line consisting of nothing but a "꼭지 N" label (optionally bold / with ':' or '.').
_TOPIC_ONLY_RE = re.compile(r'^\**\s*꼭지\s*\d+\s*[:.]?\s*\**\s*$')
# A bold label followed by more text: "**꼭지 N** rest".
_TOPIC_BOLD_LABEL_RE = re.compile(r'\*\*꼭지\s*\d+\s*[:.]?\s*\*\*\s*[:.]?\s*')
# A label fused into a bold title: "**꼭지 N title**".
_TOPIC_IN_BOLD_RE = re.compile(r'\*\*꼭지\s*\d+\s*[:.]?\s*')
# A plain label at the start of a line: "꼭지 N: title".
_TOPIC_PLAIN_RE = re.compile(r'^꼭지\s*\d+\s*[:.]?\s*')
_H4_RE = re.compile(r'^####\s+(.+)$')

# "1) **title** body" or "(1) **title** body" with a body of 20+ characters.
_TITLE_WITH_BODY_RE = re.compile(r'^(\d+\)|\(\d+\))\s+(\*\*[^*]+\*\*)\s+(.{20,})$')
# "1) title" or "(1) title": a short (3-40 chars) unbolded candidate heading.
_PLAIN_TITLE_RE = re.compile(r'^(\d+\)|\(\d+\))\s+([^*\n]{3,40})$')


def build_user_prompt(
    report_title: str,
    item,  # OutlineItem
    evidences,
    image_info_list,
    previous_sections_summary: str = ""
) -> str:
    """
    Build the per-section user prompt (v4).

    Args:
        report_title: Title of the whole report, shown in the prompt header.
        item: Outline entry for the section; ``number``, ``title`` and
            ``sub_topics`` are read. Each sub-topic provides ``title``,
            ``type``, ``keywords`` and ``guide``.
        evidences: Iterable of dicts; the keys ``source_path``/``source``,
            ``text``, ``title`` and ``keywords`` are read. ``None`` is treated
            as no evidence.
        image_info_list: Dicts with ``placeholder`` and ``caption``; when empty
            the image guide is omitted.
        previous_sections_summary: Text describing already-written sections;
            when non-empty it is included as a "do not repeat" guide.

    Returns:
        The complete prompt string.
    """
    # Evidence blocks. Each text is capped at 1500 characters; a missing or
    # None ``text`` becomes an empty string instead of raising on the slice.
    ev_parts = []
    for i, ev in enumerate(evidences or [], 1):
        src = ev.get('source_path') or ev.get('source', '내부자료')
        text = (ev.get('text') or '')[:1500]
        title = ev.get('title', '')
        keywords = ev.get('keywords', '')
        ev_parts.append(f"""
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
[데이터 {i}] 출처: {src}
제목: {title}
키워드: {keywords}
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
{text}
""")
    ev_text = "".join(ev_parts)

    # One numbered guide per sub-topic. The label shown to the model avoids
    # the word "꼭지", which the prompt forbids in the output.
    topic_parts = []
    for idx, st in enumerate(item.sub_topics, 1):
        kw_text = ', '.join('#' + k for k in st.keywords) if st.keywords else '없음'
        topic_parts.append(f"""
【작성할 내용 {idx}】 {st.title}
 - 유형: {st.type}
 - 핵심 키워드: {kw_text}
 - 참고 지침: {st.guide}
 - ★ 출력 시 "{idx}) **{st.title}**" 형태로 시작할 것
""")
    topic_guides = "".join(topic_parts)

    # Image guide: only present when there is at least one candidate image.
    img_guide = ""
    if image_info_list:
        img_lines = ["\n【삽입 가능 이미지】\n"]
        for img in image_info_list:
            img_lines.append(f" - {img['placeholder']}: {img['caption']}\n")
        img_lines.append(" → 문맥에 맞는 위치에 삽입\n")
        img_guide = "".join(img_lines)

    # Duplicate-avoidance guide.
    dup_guide = ""
    if previous_sections_summary:
        dup_guide = f"""
【중복 방지 - 이미 다룬 내용이므로 제외】
{previous_sections_summary}
"""

    # Formatting reminder, repeated in the user prompt.
    format_reminder = """
═══════════════════════════════════════════════════════════════
 ★★★ 출력 서식 필수 준수 ★★★
═══════════════════════════════════════════════════════════════
1) **제목은 반드시 볼드**

본문은 제목 다음 줄에 작성

2) **다음 제목도 볼드**

본문...

 (1) **하위 제목도 볼드**

 하위 본문...

★ "꼭지", "항목", "Topic" 등 내부 용어 절대 출력 금지!
★ 제목과 본문 사이 반드시 빈 줄!
═══════════════════════════════════════════════════════════════
"""

    return f"""
╔═══════════════════════════════════════════════════════════════╗
║ 보고서: {report_title}
║ 작성 섹션: {item.number} {item.title}
╚═══════════════════════════════════════════════════════════════╝

{dup_guide}

【이 섹션에서 다룰 내용】
{topic_guides}

{img_guide}

{format_reminder}

【참고 데이터】
{ev_text}

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
지시: '{item.number} {item.title}' 섹션 본문을 작성하라.

★ 번호: 1), 2) → (1), (2) → -, *
★ 제목: 반드시 **볼드**
★ 줄바꿈: 제목↔본문 사이 빈 줄 필수
★ 금지어: "꼭지", "항목", "Topic" 출력 금지
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
"""


def generate_section_text_gemini(
    system_instruction: str,
    user_prompt: str
) -> str:
    """
    Generate the body text of one section through the Gemini client.

    The request uses ``GEMINI_MODEL`` with ``temperature=0.3``.

    Returns:
        The stripped response text ('' when the response has no text). If the
        call raises, the error is logged and a "[생성 실패: ...]" placeholder
        string is returned instead of propagating the exception.
    """
    try:
        response = gemini_client.models.generate_content(
            model=GEMINI_MODEL,
            contents=user_prompt,
            config=types.GenerateContentConfig(
                system_instruction=system_instruction,
                temperature=0.3,  # low temperature to limit creative drift
            )
        )
        return (response.text or "").strip()
    except Exception as e:
        # Boundary with an external service: report and degrade so that the
        # remaining sections can still be generated.
        log(f"[ERROR] Gemini API 호출 실패: {e}")
        return f"[생성 실패: {e}]"


def extract_section_summary(text: str, max_chars: int = 200) -> str:
    """
    Return a short excerpt of a section body, used for duplicate avoidance.

    Blank lines and lines starting with '#' are skipped; the remaining lines
    are collected until at least ``max_chars`` characters are gathered, joined
    with single spaces and cut to ``max_chars``.
    """
    summary_parts = []
    char_count = 0

    for line in text.split('\n'):
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        summary_parts.append(line)
        char_count += len(line)
        if char_count >= max_chars:
            break

    return ' '.join(summary_parts)[:max_chars]


def fix_numbering_format(text: str) -> str:
    """
    Convert markdown ordered-list numbering ("1. ") to the report's scheme.

    Rules, decided by the number of leading whitespace characters:
        - 0      : "1. text" -> "1) text"
        - 1 to 4 : "1. text" -> "(1) text" (indent kept as spaces)
        - 5+     : "1. text" -> "- text"   (indent kept as spaces)

    Lines that are not ordered-list items are returned unchanged.
    """
    result = []

    for line in text.split('\n'):
        stripped = line.lstrip()
        indent = len(line) - len(stripped)

        match = _ORDERED_ITEM_RE.match(stripped)
        if not match:
            result.append(line)
            continue

        num, content = match.group(1), match.group(2)
        if indent == 0:
            result.append(f"{num}) {content}")
        elif indent <= 4:
            result.append(" " * indent + f"({num}) {content}")
        else:
            result.append(" " * indent + f"- {content}")

    return '\n'.join(result)


def clean_generated_text_final(section_number: str, text: str) -> str:
    """
    Post-process the model output for one section.

    Steps:
        1. Drop markdown headings that start with a section number.
        2. Remove "꼭지 N" labels: a line holding only the label is dropped,
           a label attached to a title is stripped and the title is kept.
        3. Turn "#### title" headings (not starting with a digit) into bold.
        4. Collapse runs of three or more blank lines to two.
        5. Convert "1." numbering (``fix_numbering_format``).
        6. Split "title body" lines and bold bare titles (``fix_title_format``).

    Args:
        section_number: Section number of the text; accepted for interface
            compatibility and not used by the current rules.
        text: Raw generated text.

    Returns:
        The cleaned text with surrounding whitespace stripped.
    """
    cleaned = []

    for line in text.split('\n'):
        stripped = line.strip()

        # Duplicate section heading such as "### 1.2.3 Title".
        if _NUMBERED_HEADING_RE.match(stripped):
            continue

        # Only a line that is *nothing but* the label is discarded. Anchoring
        # at the end keeps titled lines like "**꼭지 1 제목**" alive for the
        # label-stripping below.
        if _TOPIC_ONLY_RE.match(stripped):
            continue

        cleaned_line = _TOPIC_BOLD_LABEL_RE.sub('', stripped)
        cleaned_line = _TOPIC_IN_BOLD_RE.sub('**', cleaned_line)
        cleaned_line = _TOPIC_PLAIN_RE.sub('', cleaned_line)
        if cleaned_line != stripped and not cleaned_line:
            # Nothing but label text was on the line.
            continue

        h4_match = _H4_RE.match(cleaned_line)
        if h4_match:
            title = h4_match.group(1).strip()
            if not re.match(r'^\d+', title):
                cleaned.append(f"\n**{title}**\n")
                continue

        # Allow at most two consecutive blank lines.
        if not stripped:
            if len(cleaned) >= 2 and not cleaned[-1].strip() and not cleaned[-2].strip():
                continue

        # Untouched lines keep their original indentation.
        cleaned.append(cleaned_line if cleaned_line != stripped else line)

    result = '\n'.join(cleaned)
    result = fix_numbering_format(result)
    result = fix_title_format(result)

    return result.strip()


def fix_title_format(text: str) -> str:
    """
    Enforce "numbered bold title, blank line, body" formatting.

    Handled line shapes (N is a number, marker is "N)" or "(N)"):
        - "marker **title** body" (body of 20+ characters) is split into the
          title line, a blank line, the body line and another blank line.
        - "marker title" (3-40 characters, no '*') gets the title bolded and a
          blank line appended, unless the title ends like a sentence
          ('.', '다', '요', '음', '함'). A trailing ':' is removed.

    Leading indentation is preserved (as spaces); trailing whitespace on a
    line does not affect it. Runs of three or more blank lines in the result
    are collapsed to two.
    """
    result = []

    for line in text.split('\n'):
        left_stripped = line.lstrip()
        # Indent is measured on the left side only; trailing spaces (e.g.
        # markdown hard breaks) must not shift the heading to the right.
        indent_str = " " * (len(line) - len(left_stripped))
        stripped = left_stripped.rstrip()

        with_body = _TITLE_WITH_BODY_RE.match(stripped)
        if with_body:
            marker = with_body.group(1)
            title = with_body.group(2)
            body = with_body.group(3).strip()
            result.append(f"{indent_str}{marker} {title}")
            result.append("")
            result.append(f"{indent_str}{body}")
            result.append("")
            continue

        plain = _PLAIN_TITLE_RE.match(stripped)
        if plain:
            title = plain.group(2).strip().rstrip(':').rstrip()
            if title and not title.endswith(_SENTENCE_ENDINGS):
                result.append(f"{indent_str}{plain.group(1)} **{title}**")
                result.append("")
                continue

        result.append(line)

    # Allow at most two consecutive blank lines.
    final = []
    for line in result:
        if not line.strip():
            if len(final) >= 2 and not final[-1].strip() and not final[-2].strip():
                continue
        final.append(line)

    return '\n'.join(final)


def main():
    """
    Generate the report draft section by section and write the outputs.

    For every outline item with depth >= 3 the function retrieves evidence,
    selects and copies images, builds the prompt, generates and cleans the
    section text, and finally writes the markdown draft and the JSON section
    data to ``REPORT_MD_PATH`` and ``REPORT_JSON_PATH``. Items with depth < 3
    only contribute a heading to the markdown.
    """
    log("=== step8 Gemini 기반 보고서 생성 시작 ===")

    domain_prompt = load_domain_prompt()
    report_title, outline_items = load_outline()

    log(f"보고서 제목: {report_title}")
    log(f"목차 항목 수: {len(outline_items)}")

    # Load RAG data and image metadata.
    image_meta_by_file = load_image_metadata()
    all_rag_items = iter_rag_items()
    item_map = build_item_index(all_rag_items)
    faiss_pack = try_load_faiss()
    use_faiss = faiss_pack is not None

    log(f"RAG 청크 수: {len(all_rag_items)}")
    log(f"FAISS 사용: {use_faiss}")

    # The system instruction is built once and reused for every section.
    system_instruction = build_system_instruction(domain_prompt)

    md_lines = [f"# {report_title}", ""]
    report_json_sections = []

    # Accumulated excerpts of finished sections, fed back into later prompts.
    previous_sections_summary = ""

    total_images_copied = 0

    for it in outline_items:
        # Top- and mid-level outline entries only emit a heading.
        if it.depth < 3:
            prefix = "## " if it.depth == 1 else "### "
            md_lines.append(f"\n{prefix}{it.number} {it.title}\n")
            continue

        log(f"집필 중: {it.number} {it.title} (꼭지 {len(it.sub_topics)}개)")

        # The search query is the section title plus all sub-topic keywords.
        all_kws = [kw for st in it.sub_topics for kw in st.keywords]
        query = f"{it.title} " + " ".join(all_kws)

        if use_faiss:
            evidences = retrieve_with_faiss(faiss_pack[0], faiss_pack[1], item_map, query, 12)
        else:
            evidences = retrieve_with_keywords(all_rag_items, query, all_kws, 12)

        log(f" → 검색된 근거 청크: {len(evidences)}개")

        # Pick images for the section and create their placeholders.
        section_images = select_images_for_section(evidences, image_meta_by_file, MAX_IMAGES_PER_SECTION)
        image_info_list = make_image_placeholders(it.number, section_images)

        # Copy image files into the assets folder; entries that end up with an
        # 'asset_path' are counted as copied.
        copy_images_to_assets(image_info_list)
        copied_count = sum(1 for img in image_info_list if img.get('asset_path'))
        total_images_copied += copied_count

        user_prompt = build_user_prompt(
            report_title=report_title,
            item=it,
            evidences=evidences,
            image_info_list=image_info_list,
            previous_sections_summary=previous_sections_summary
        )

        section_text = generate_section_text_gemini(system_instruction, user_prompt)
        section_text = clean_generated_text_final(it.number, section_text)

        md_lines.append(f"\n#### {it.number} {it.title}\n")
        md_lines.append(section_text + "\n")

        # Remember what this section covered for duplicate avoidance.
        section_summary = extract_section_summary(section_text)
        if section_summary:
            previous_sections_summary += f"\n- {it.number}: {section_summary[:100]}..."

        report_json_sections.append({
            "section_id": it.number,
            "section_title": it.title,
            "generated_text": section_text,
            "sub_topics": [vars(st) for st in it.sub_topics],
            "evidence_count": len(evidences),
            "assets": [
                {
                    "type": "image",
                    "image_id": img["image_id"],
                    "filename": img["image_file"],
                    "caption": img["caption"],
                    "placeholder": img["placeholder"],
                    "source_path": img.get("source_path", ""),
                    "page": img.get("page"),
                    "asset_path": img.get("asset_path"),
                }
                for img in image_info_list
            ]
        })

        log(f" → 생성 완료 ({len(section_text)} 자)")

    # 1. Markdown draft.
    REPORT_MD_PATH.write_text("\n".join(md_lines), encoding="utf-8")

    # 2. JSON section data.
    REPORT_JSON_PATH.write_text(
        json.dumps({
            "generated_at": datetime.now().isoformat(),
            "report_title": report_title,
            "model": GEMINI_MODEL,
            "sections": report_json_sections
        }, ensure_ascii=False, indent=2),
        encoding="utf-8"
    )

    log("")
    log("═══════════════════════════════════════════════════")
    log("파일 저장 완료:")
    log(f" 1. {REPORT_MD_PATH}")
    log(f" 2. {REPORT_JSON_PATH}")
    log(f" 3. {ASSETS_DIR} (이미지 {total_images_copied}개 복사)")
    log("═══════════════════════════════════════════════════")
    log("=== step8 보고서 생성 종료 ===")


if __name__ == "__main__":
    main()
변환 +- 목차(TOC) 자동 생성 + +사용법: + python 9_md_to_html_publisher.py + python 9_md_to_html_publisher.py --md report_draft.md --json report_sections.json --output report.html + python 9_md_to_html_publisher.py --no-toc --no-summary +""" + +import os +import re +import json +import argparse +from pathlib import Path +from datetime import datetime +from typing import List, Dict, Any, Tuple, Optional +from dataclasses import dataclass, field + +# ===== 경로 설정 ===== +OUTPUT_ROOT = Path(r"D:\for python\geulbeot-light\geulbeot-light\00.test\hwpx\out\out") # 출력 위치 +GEN_DIR = OUTPUT_ROOT / "generated" +ASSETS_DIR = GEN_DIR / "assets" +LOG_DIR = OUTPUT_ROOT / "logs" + +# 기본 입출력 파일 +DEFAULT_MD_PATH = GEN_DIR / "report_draft.md" +DEFAULT_JSON_PATH = GEN_DIR / "report_sections.json" +DEFAULT_OUTPUT_PATH = GEN_DIR / "report.html" + +for d in [GEN_DIR, ASSETS_DIR, LOG_DIR]: + d.mkdir(parents=True, exist_ok=True) + + +def log(msg: str): + """로깅 함수""" + line = f"[{datetime.now().strftime('%H:%M:%S')}] {msg}" + print(line, flush=True) + with (LOG_DIR / "step9_html_publish_log.txt").open("a", encoding="utf-8") as f: + f.write(line + "\n") + + +# ===== 데이터 클래스 ===== +@dataclass +class ImageAsset: + """이미지 자산 정보""" + image_id: str + filename: str + caption: str + placeholder: str + source_path: str = "" + page: Optional[int] = None + asset_path: Optional[str] = None + + +@dataclass +class Section: + """섹션 정보""" + section_id: str + section_title: str + generated_text: str + assets: List[ImageAsset] = field(default_factory=list) + + +@dataclass +class TocItem: + """목차 항목""" + number: str + title: str + level: int # 1, 2, 3 + + +# ===== 파일 로더 ===== +def load_json_meta(json_path: Path) -> Tuple[str, List[Section]]: + """JSON 파일에서 메타정보와 섹션 로드""" + if not json_path.exists(): + raise FileNotFoundError(f"JSON 파일 없음: {json_path}") + + data = json.loads(json_path.read_text(encoding="utf-8")) + report_title = data.get("report_title", "보고서") + + sections = [] + for sec in data.get("sections", []): + 
assets = [] + for asset in sec.get("assets", []): + assets.append(ImageAsset( + image_id=asset.get("image_id", ""), + filename=asset.get("filename", ""), + caption=asset.get("caption", ""), + placeholder=asset.get("placeholder", ""), + source_path=asset.get("source_path", ""), + page=asset.get("page"), + asset_path=asset.get("asset_path") + )) + + sections.append(Section( + section_id=sec.get("section_id", ""), + section_title=sec.get("section_title", ""), + generated_text=sec.get("generated_text", ""), + assets=assets + )) + + return report_title, sections + + +def load_markdown(md_path: Path) -> str: + """마크다운 파일 로드""" + if not md_path.exists(): + raise FileNotFoundError(f"MD 파일 없음: {md_path}") + return md_path.read_text(encoding="utf-8") + + +# ===== 이미지 맵 생성 ===== +def build_image_map(sections: List[Section]) -> Dict[str, ImageAsset]: + """placeholder → ImageAsset 매핑 생성""" + img_map = {} + for sec in sections: + for asset in sec.assets: + if asset.placeholder: + # {{IMG:xxx}} 형태에서 xxx 추출 + img_map[asset.image_id] = asset + return img_map + + +# ===== 목차 생성 ===== +def extract_toc_from_md(md_content: str) -> List[TocItem]: + """마크다운에서 목차 구조 추출""" + toc_items = [] + + # 헤딩 패턴 + patterns = [ + (re.compile(r'^##\s+(\d+)\s+(.+)$', re.MULTILINE), 1), # ## 1 대목차 + (re.compile(r'^###\s+(\d+\.\d+)\s+(.+)$', re.MULTILINE), 2), # ### 1.1 중목차 + (re.compile(r'^####\s+(\d+\.\d+\.\d+)\s+(.+)$', re.MULTILINE), 3), # #### 1.1.1 소목차 + ] + + for pattern, level in patterns: + for match in pattern.finditer(md_content): + number = match.group(1) + title = match.group(2).strip() + toc_items.append(TocItem(number=number, title=title, level=level)) + + # 번호순 정렬 + def sort_key(item: TocItem) -> tuple: + parts = item.number.split('.') + return tuple(int(p) for p in parts) + + toc_items.sort(key=sort_key) + return toc_items + + +def generate_toc_html(toc_items: List[TocItem]) -> str: + """목차 HTML 생성""" + if not toc_items: + return "" + + lines = ['
    '] + + current_l1 = None + for item in toc_items: + if item.level == 1: + # 새로운 대목차 그룹 + if current_l1 is not None: + lines.append('') # 이전 그룹 닫기 + lines.append('
    ') + lines.append(f'
  • {item.number}. {item.title}
  • ') + current_l1 = item.number + elif item.level == 2: + lines.append(f'
  • {item.number} {item.title}
  • ') + elif item.level == 3: + lines.append(f'
  • {item.number} {item.title}
  • ') + + if current_l1 is not None: + lines.append('
    ') # 마지막 그룹 닫기 + + lines.append('
') + return '\n'.join(lines) + + +# ===== 마크다운 → HTML 변환 ===== +class MarkdownToHtmlConverter: + """마크다운을 HTML로 변환하는 클래스""" + + def __init__(self, image_map: Dict[str, ImageAsset]): + self.image_map = image_map + self.table_counter = {} # chapter -> count + self.figure_counter = {} # chapter -> count + + def get_chapter(self, context: str = "1") -> str: + """현재 챕터 번호 추출""" + return context.split('.')[0] if context else "1" + + def next_table_num(self, chapter: str) -> str: + """다음 표 번호""" + if chapter not in self.table_counter: + self.table_counter[chapter] = 0 + self.table_counter[chapter] += 1 + return f"{chapter}-{self.table_counter[chapter]}" + + def next_figure_num(self, chapter: str) -> str: + """다음 그림 번호""" + if chapter not in self.figure_counter: + self.figure_counter[chapter] = 0 + self.figure_counter[chapter] += 1 + return f"{chapter}-{self.figure_counter[chapter]}" + + def convert_table(self, md_table: str, caption: str = "", chapter: str = "1") -> str: + """마크다운 테이블 → HTML 테이블""" + lines = [l.strip() for l in md_table.strip().split('\n') if l.strip()] + if len(lines) < 2: + return "" + + # 헤더 행 + header_cells = [c.strip() for c in lines[0].split('|') if c.strip()] + + # 구분선 건너뛰기 (|---|---|) + data_start = 1 + if len(lines) > 1 and re.match(r'^[\|\s\-:]+$', lines[1]): + data_start = 2 + + # 데이터 행 + data_rows = [] + for line in lines[data_start:]: + cells = [c.strip() for c in line.split('|') if c.strip()] + if cells: + data_rows.append(cells) + + # HTML 생성 + html_lines = [''] + + # thead + html_lines.append('') + for cell in header_cells: + # **text** → text + cell = re.sub(r'\*\*(.+?)\*\*', r'\1', cell) + html_lines.append(f'') + html_lines.append('') + + # tbody + html_lines.append('') + for row in data_rows: + html_lines.append('') + for cell in row: + # **text** 처리 + cell = re.sub(r'\*\*(.+?)\*\*', r'\1', cell) + #
처리 + cell = cell.replace('
', '
') + html_lines.append(f'') + html_lines.append('') + html_lines.append('') + html_lines.append('
{cell}
{cell}
') + + # 캡션 추가 + if caption: + html_lines.append(f'
{caption}
') + + return '\n'.join(html_lines) + + def convert_image_placeholder(self, placeholder: str, chapter: str = "1") -> str: + """{{IMG:xxx}} →
변환""" + # {{IMG:1_1_1_img01}} 에서 ID 추출 + match = re.match(r'\{\{IMG:(.+?)\}\}', placeholder) + if not match: + return placeholder + + image_id = match.group(1) + asset = self.image_map.get(image_id) + + if asset and asset.asset_path: + fig_num = self.next_figure_num(chapter) + caption = asset.caption if asset.caption and asset.caption != "Photo" else "" + caption_text = f"[그림 {fig_num}] {caption}" if caption else f"[그림 {fig_num}]" + + return f'''
+ {caption} +
{caption_text}
+
''' + else: + # 이미지 파일이 없는 경우 플레이스홀더 주석으로 + return f'' + + def convert_list(self, md_list: str) -> str: + """마크다운 리스트 → HTML 리스트""" + lines = md_list.strip().split('\n') + html_lines = [] + in_list = False + list_type = 'ul' + + for line in lines: + line = line.strip() + if not line: + continue + + # 순서 없는 리스트 + ul_match = re.match(r'^[\*\-]\s+(.+)$', line) + # 순서 있는 리스트 + ol_match = re.match(r'^(\d+)\.\s+(.+)$', line) + + if ul_match: + if not in_list: + html_lines.append('