feat(step2+step3): slide-level rich ContentObject trace (IMP-03 #3)
- Add extract_rich_content_objects(normalized_assets, mdx_id) in
phase_z2_content_extractor.py emitting slide-level rich ContentObjects
for SPEC v1 §1.2 types: details (popups), image, table
- Extend ContentObject dataclass with optional scope/mdx_id/section_id
metadata fields (additive, default None — v0 unchanged)
- _stage0_chained_adapter() returns 5-tuple adding normalized_assets
({popups, images, tables}); empty on env=OFF / hard fallback
- Step 2 artifact gains additive stage0_normalized_assets nested field
(env=OFF / fallback → empty lists). Existing 7 fields preserved.
- Step 3 emits root-level rich_content_objects once at slide scope
with rich_content_objects_enabled / scope / source / disabled_reason /
skips / invariant_warnings. per_zone list still references v0 only.
- PHASE_Z_STEP3_RICH_OBJECTS_ENABLED env flag, default OFF (canary,
matches PHASE_Z_STAGE0_ADAPTER_ENABLED / PHASE_Z_B4_*). Enable
requires flag=1 AND non-empty normalized_assets; otherwise records
disabled_reason = FLAG_OFF or NO_NORMALIZED_ASSETS.
- transform_table dedup: arrow glyph detection in normalized table
rows/headers → skip with reason=skipped_transform_table_duplicate.
v0 _capture_3col_transform_table remains the sole transform_table
source; generic table only for non-transform tables.
- ID pattern {mdx_id}.{details,image,table}-N (slide-level namespace).
- plan_placement() input unchanged (v0 content_objects only) — rich
list never feeds placement/region planning in this issue.
- self-test extended with 5 rich extractor cases (popup/image/table
/arrow-skip/empty); v0 self-test unchanged and still PASS.
- mapper / V4 / composition / Step 6+ / AI/Kei / pipeline_path_connected
unchanged. trace fidelity only.
env OFF + rich OFF: legacy PASS, no regression
env OFF + rich=1 : disabled_reason=NO_NORMALIZED_ASSETS, rich list empty
env=1 + rich=1 : Step 2 stage0_normalized_assets populated (1 table on
MDX 03, invariant match adapter_counts). Step 3 write
blocked by inherited IMP-02 composition_planner abort
(downstream gap, not IMP-03 scope).
Refs Gitea #3 (IMP-03 A-1 popup/image/table trace)
This commit is contained in:
@@ -48,13 +48,16 @@ class ContentObject:
|
|||||||
|
|
||||||
Fields :
|
Fields :
|
||||||
id : section 내 unique id (예: '03-2.transform-1' / '03-2.text-1')
|
id : section 내 unique id (예: '03-2.transform-1' / '03-2.text-1')
|
||||||
type : "text_block" | "transform_table"
|
type : "text_block" | "transform_table" | "details" | "image" | "table"
|
||||||
role : v0 = "summary" 만 (정밀화는 별 axis)
|
role : v0 = "summary" 만 (정밀화는 별 axis)
|
||||||
raw_payload : 원본 markdown (자름 / 변형 X — 원문 보존 룰)
|
raw_payload : 원본 markdown (자름 / 변형 X — 원문 보존 룰)
|
||||||
size_estimate : type 별 (line_count / rows 등)
|
size_estimate : type 별 (line_count / rows 등)
|
||||||
type_specific : type 별 detail (SPEC v1 §1.2)
|
type_specific : type 별 detail (SPEC v1 §1.2)
|
||||||
source_shape_index : positional index within source_shape (Option 1, optional)
|
source_shape_index : positional index within source_shape (Option 1, optional)
|
||||||
source_shape_kind : "top_bullets" | "h3_subsections" | ... (Option 1, optional)
|
source_shape_kind : "top_bullets" | "h3_subsections" | ... (Option 1, optional)
|
||||||
|
scope : "section" (default for v0) | "slide" (IMP-03 rich objects)
|
||||||
|
mdx_id : 2-digit MDX id (예: '03') — slide-level rich objects 용
|
||||||
|
section_id : section 매핑 — slide-level rich objects 는 None
|
||||||
"""
|
"""
|
||||||
|
|
||||||
id: str
|
id: str
|
||||||
@@ -65,6 +68,9 @@ class ContentObject:
|
|||||||
type_specific: dict = field(default_factory=dict)
|
type_specific: dict = field(default_factory=dict)
|
||||||
source_shape_index: Optional[int] = None
|
source_shape_index: Optional[int] = None
|
||||||
source_shape_kind: Optional[str] = None
|
source_shape_kind: Optional[str] = None
|
||||||
|
scope: Optional[str] = None
|
||||||
|
mdx_id: Optional[str] = None
|
||||||
|
section_id: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
# ─── Transform table extraction ─────────────────────────────────
|
# ─── Transform table extraction ─────────────────────────────────
|
||||||
@@ -274,6 +280,169 @@ def extract_content_objects(section, source_shape: Optional[str] = None) -> list
|
|||||||
return objects
|
return objects
|
||||||
|
|
||||||
|
|
||||||
|
# ─── IMP-03 (Step 3) — rich ContentObject extractor (slide-level) ─
|
||||||
|
|
||||||
|
# scope-lock 16 조건 (Gitea #3) :
|
||||||
|
# - SPEC v1 §1.2 의 table / image / details 3 type 추가 (diagram 제외)
|
||||||
|
# - 별 함수 분리 — v0 `extract_content_objects` signature/behavior 미터치
|
||||||
|
# - slide-level attribution — section param 없음, id = `{mdx_id}.{type}-N`,
|
||||||
|
# ContentObject.scope='slide' / mdx_id=<id> / section_id=None
|
||||||
|
# - transform_table dedup — arrow row 감지 시 skip (v0 가 단독 source)
|
||||||
|
# - asset row shape contract (mdx_normalizer SoT) :
|
||||||
|
# popup = {title:str, content:str}
|
||||||
|
# image = {alt:str, path:str}
|
||||||
|
# table = {headers:list[str], rows:list[list[str]]}
|
||||||
|
# - render path 미연결 — Step 3 artifact trace only
|
||||||
|
# - plan_placement() 는 *v0 list 만* 받음 (B4 회귀 X)
|
||||||
|
|
||||||
|
|
||||||
|
def _looks_like_transform_table(table: dict) -> bool:
|
||||||
|
"""normalize_mdx_content 의 table 가 AS-IS / arrow / TO-BE 구조인지 감지.
|
||||||
|
|
||||||
|
arrow row 가 *어떤 column 이든* 1 개 이상 등장 → transform 으로 분류 (v0 가 처리).
|
||||||
|
|
||||||
|
Args :
|
||||||
|
table : {"headers": list[str], "rows": list[list[str]]}
|
||||||
|
Returns :
|
||||||
|
True = transform_table 후보 (rich extractor 는 skip)
|
||||||
|
False = 일반 table
|
||||||
|
"""
|
||||||
|
rows = table.get("rows") or []
|
||||||
|
for row in rows:
|
||||||
|
for cell in row:
|
||||||
|
cell_s = str(cell) if cell is not None else ""
|
||||||
|
if any(g in cell_s for g in _ARROW_GLYPHS):
|
||||||
|
return True
|
||||||
|
headers = table.get("headers") or []
|
||||||
|
for h in headers:
|
||||||
|
h_s = str(h) if h is not None else ""
|
||||||
|
if any(g in h_s for g in _ARROW_GLYPHS):
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def _reconstruct_markdown_table(headers: list, rows: list) -> str:
|
||||||
|
"""headers / rows → markdown table string (raw_md / raw_payload 용)."""
|
||||||
|
if not headers and not rows:
|
||||||
|
return ""
|
||||||
|
out_lines: list[str] = []
|
||||||
|
if headers:
|
||||||
|
out_lines.append("| " + " | ".join(str(h) for h in headers) + " |")
|
||||||
|
out_lines.append("|" + "|".join("---" for _ in headers) + "|")
|
||||||
|
for row in rows:
|
||||||
|
out_lines.append("| " + " | ".join(str(c) for c in row) + " |")
|
||||||
|
return "\n".join(out_lines)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_rich_content_objects(
    normalized_assets: Optional[dict],
    mdx_id: str,
) -> "tuple[list[ContentObject], list[dict]]":
    """IMP-03 — build slide-level rich ContentObjects from normalized assets.

    Reads the flat "popups" / "images" / "tables" lists of
    `normalized_assets` and emits one ContentObject per usable entry, with
    `scope='slide'`, `mdx_id=<mdx_id>` and `section_id=None`.

    Ids follow `{mdx_id}.{details|image|table}-N`, where N is the 1-based
    position of the entry in its source list. Skipped entries leave a gap,
    so N always matches the `index` reported in the skip diagnostics.

    Entries that are not dicts are skipped with a diagnostic, for all three
    kinds. Tables for which `_looks_like_transform_table()` is true are
    skipped with reason `skipped_transform_table_duplicate`.

    Args:
        normalized_assets: {"popups": [{title, content}],
            "images": [{alt, path}], "tables": [{headers, rows}]} or None.
            Missing or falsy lists are treated as empty.
        mdx_id: MDX id used as the id prefix (e.g. '03').

    Returns:
        (rich_objects, skip_diagnostics)
            rich_objects     : list[ContentObject], slide-level
            skip_diagnostics : list[dict], each with "index" and "reason"
                               (transform-table skips also carry "headers")
    """
    if not normalized_assets:
        return [], []

    def _text(row: dict, key: str) -> str:
        # Falsy values collapse to ""; anything else is stringified first so
        # a non-string value cannot crash on .strip().
        return str(row.get(key) or "").strip()

    out: list = []
    skips: list[dict] = []

    # details (popups) — sequence 1..N
    for i, popup in enumerate(normalized_assets.get("popups") or [], start=1):
        if not isinstance(popup, dict):
            skips.append({"index": i, "reason": "invalid_popup_shape"})
            continue
        title = _text(popup, "title")
        body = _text(popup, "content")
        line_count = body.count("\n") + (1 if body else 0)
        out.append(ContentObject(
            id=f"{mdx_id}.details-{i}",
            type="details",
            role="summary",
            raw_payload=body,
            # NOTE(review): "bytes" is len() of the str (code points), not
            # the encoded size — confirm this matches the v0 convention.
            size_estimate={"line_count": line_count, "bytes": len(body)},
            type_specific={
                "summary": title,
                "body_raw": body,
                "display_hint": "popup",
            },
            scope="slide",
            mdx_id=mdx_id,
            section_id=None,
        ))

    # image
    for i, img in enumerate(normalized_assets.get("images") or [], start=1):
        if not isinstance(img, dict):
            skips.append({"index": i, "reason": "invalid_image_shape"})
            continue
        src = _text(img, "path")
        alt = _text(img, "alt")
        out.append(ContentObject(
            id=f"{mdx_id}.image-{i}",
            type="image",
            role="summary",
            raw_payload=src,
            size_estimate={"bytes": len(src)},
            type_specific={
                "src": src,
                "alt": alt,
                # Intrinsic geometry is not derived here.
                "aspect_ratio": None,
                "intrinsic_width_px": None,
                "intrinsic_height_px": None,
            },
            scope="slide",
            mdx_id=mdx_id,
            section_id=None,
        ))

    # table — skipped when an arrow glyph marks it as a transform table
    for i, table in enumerate(normalized_assets.get("tables") or [], start=1):
        if not isinstance(table, dict):
            skips.append({"index": i, "reason": "invalid_table_shape"})
            continue
        if _looks_like_transform_table(table):
            skips.append({
                "index": i,
                "reason": "skipped_transform_table_duplicate",
                "headers": table.get("headers") or [],
            })
            continue
        headers = table.get("headers") or []
        rows = table.get("rows") or []
        raw_md = _reconstruct_markdown_table(headers, rows)
        # A header-less table still has columns: use the widest row.
        col_count = len(headers) if headers else max(
            (len(row) for row in rows), default=0
        )
        out.append(ContentObject(
            id=f"{mdx_id}.table-{i}",
            type="table",
            role="summary",
            raw_payload=raw_md,
            size_estimate={"rows": len(rows), "bytes": len(raw_md)},
            type_specific={
                "rows": len(rows),
                "cols": col_count,
                "header_present": bool(headers),
                "is_transform": False,
                "raw_md": raw_md,
            },
            scope="slide",
            mdx_id=mdx_id,
            section_id=None,
        ))

    return out, skips
|
||||||
|
|
||||||
|
|
||||||
# ─── Self-test (B1 v0 correctness 검증) ─────────────────────────
|
# ─── Self-test (B1 v0 correctness 검증) ─────────────────────────
|
||||||
|
|
||||||
|
|
||||||
@@ -354,5 +523,89 @@ def _run_self_test():
|
|||||||
print("\n=== B1 v0 self-test PASS ===")
|
print("\n=== B1 v0 self-test PASS ===")
|
||||||
|
|
||||||
|
|
||||||
|
def _run_rich_self_test():
    """IMP-03 (Step 3) — self-test for the rich ContentObject extractor.

    Runs five cases against `extract_rich_content_objects` and raises
    AssertionError on the first mismatch:

        1. popup                 → details ContentObject
        2. image                 → image ContentObject
        3. table (non-transform) → table ContentObject
        4. table (arrow)         → skipped (transform_table dedup)
        5. None / empty dict     → empty result, no skips
    """

    # ─── Test 1 : popup → details ───
    assets1 = {
        "popups": [{"title": "F13 안", "content": "정책 사례 정리 ...\n2 번째 줄"}],
        "images": [],
        "tables": [],
    }
    rich1, skips1 = extract_rich_content_objects(assets1, mdx_id="03")
    assert len(rich1) == 1 and not skips1, f"popup → 1 obj, got rich={len(rich1)} skips={len(skips1)}"
    o = rich1[0]
    assert o.id == "03.details-1"
    assert o.type == "details" and o.role == "summary"
    assert o.scope == "slide" and o.mdx_id == "03" and o.section_id is None
    assert o.type_specific["summary"] == "F13 안"
    assert o.type_specific["display_hint"] == "popup"
    assert "정책 사례" in o.type_specific["body_raw"]
    # The popup body spans two lines.
    assert o.size_estimate["line_count"] == 2
    print("[OK] Rich Test 1 (popup → details) passed.")

    # ─── Test 2 : image ───
    assets2 = {
        "popups": [],
        "images": [{"alt": "BIM 모델", "path": "img/bim.png"}],
        "tables": [],
    }
    rich2, skips2 = extract_rich_content_objects(assets2, mdx_id="03")
    assert len(rich2) == 1 and not skips2
    o = rich2[0]
    assert o.id == "03.image-1"
    assert o.type == "image" and o.scope == "slide"
    assert o.type_specific["src"] == "img/bim.png"
    assert o.type_specific["alt"] == "BIM 모델"
    assert o.type_specific["aspect_ratio"] is None
    print("[OK] Rich Test 2 (image) passed.")

    # ─── Test 3 : non-transform table ───
    assets3 = {
        "popups": [],
        "images": [],
        "tables": [{"headers": ["분류", "내용"], "rows": [["기술", "BIM"], ["인력", "전문가"]]}],
    }
    rich3, skips3 = extract_rich_content_objects(assets3, mdx_id="03")
    assert len(rich3) == 1 and not skips3
    o = rich3[0]
    assert o.id == "03.table-1"
    assert o.type == "table" and o.scope == "slide"
    assert o.type_specific["rows"] == 2 and o.type_specific["cols"] == 2
    assert o.type_specific["header_present"] is True
    assert o.type_specific["is_transform"] is False
    assert "기술" in o.type_specific["raw_md"]
    print("[OK] Rich Test 3 (non-transform table) passed.")

    # ─── Test 4 : arrow transform table → skip ───
    assets4 = {
        "popups": [],
        "images": [],
        "tables": [{"headers": ["AS-IS", "➜", "TO-BE"],
                    "rows": [["도면 중심", "➜", "BIM 중심"]]}],
    }
    rich4, skips4 = extract_rich_content_objects(assets4, mdx_id="03")
    assert len(rich4) == 0, f"arrow table → 0 rich obj 기대, got {len(rich4)}"
    assert len(skips4) == 1
    assert skips4[0]["reason"] == "skipped_transform_table_duplicate"
    # The diagnostic index is the 1-based position in the source list.
    assert skips4[0]["index"] == 1
    print("[OK] Rich Test 4 (arrow table → skip) passed.")

    # ─── Test 5 : empty normalized_assets → empty ───
    rich5, skips5 = extract_rich_content_objects(None, mdx_id="03")
    assert rich5 == [] and skips5 == []
    rich6, skips6 = extract_rich_content_objects({}, mdx_id="03")
    assert rich6 == [] and skips6 == []
    print("[OK] Rich Test 5 (empty) passed.")

    print("\n=== IMP-03 rich extractor self-test PASS ===")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
_run_self_test()
|
_run_self_test()
|
||||||
|
_run_rich_self_test()
|
||||||
|
|||||||
@@ -64,7 +64,7 @@ from phase_z2_failure_router import enrich_retry_trace_with_failure_classificati
|
|||||||
|
|
||||||
# trace-only runtime 연결 v0 — B1 → B4 chain.
|
# trace-only runtime 연결 v0 — B1 → B4 chain.
|
||||||
# final.html / mapper / render path 미영향. debug_zones[i].placement_trace 만 기록.
|
# final.html / mapper / render path 미영향. debug_zones[i].placement_trace 만 기록.
|
||||||
from phase_z2_content_extractor import extract_content_objects
|
from phase_z2_content_extractor import extract_content_objects, extract_rich_content_objects
|
||||||
from phase_z2_placement_planner import plan_placement
|
from phase_z2_placement_planner import plan_placement
|
||||||
|
|
||||||
|
|
||||||
@@ -223,7 +223,7 @@ def _stage0_chained_adapter(
|
|||||||
legacy_slide_title: str,
|
legacy_slide_title: str,
|
||||||
legacy_sections: list[MdxSection],
|
legacy_sections: list[MdxSection],
|
||||||
legacy_footer: Optional[str],
|
legacy_footer: Optional[str],
|
||||||
) -> tuple[str, list[MdxSection], Optional[str], dict]:
|
) -> tuple[str, list[MdxSection], Optional[str], dict, dict]:
|
||||||
"""IMP-02 — chained adapter for Stage 0 normalize → Phase Z Step 2 input.
|
"""IMP-02 — chained adapter for Stage 0 normalize → Phase Z Step 2 input.
|
||||||
|
|
||||||
Chain: mdx_normalizer.normalize_mdx_content + section_parser.extract_major_sections
|
Chain: mdx_normalizer.normalize_mdx_content + section_parser.extract_major_sections
|
||||||
@@ -233,7 +233,9 @@ def _stage0_chained_adapter(
|
|||||||
output with diagnostics indicating disabled. When ON, runs adapter chain; on any
|
output with diagnostics indicating disabled. When ON, runs adapter chain; on any
|
||||||
hard contract failure or exception, falls back to legacy and records fallback_reason.
|
hard contract failure or exception, falls back to legacy and records fallback_reason.
|
||||||
|
|
||||||
Returns (slide_title, sections, footer, diagnostics).
|
Returns (slide_title, sections, footer, diagnostics, normalized_assets).
|
||||||
|
normalized_assets = {"popups": [...], "images": [...], "tables": [...]}
|
||||||
|
— IMP-03 Step 3 handoff. env=OFF or hard fallback 시 빈 list.
|
||||||
"""
|
"""
|
||||||
diagnostics: dict = {
|
diagnostics: dict = {
|
||||||
"enabled": False,
|
"enabled": False,
|
||||||
@@ -243,12 +245,14 @@ def _stage0_chained_adapter(
|
|||||||
"adapter_counts": None,
|
"adapter_counts": None,
|
||||||
"legacy_counts": {"sections": len(legacy_sections)},
|
"legacy_counts": {"sections": len(legacy_sections)},
|
||||||
}
|
}
|
||||||
|
# IMP-03 — Step 3 handoff. env=OFF / fallback 시 모든 list 가 비어 있음.
|
||||||
|
normalized_assets: dict = {"popups": [], "images": [], "tables": []}
|
||||||
|
|
||||||
raw_flag = os.environ.get("PHASE_Z_STAGE0_ADAPTER_ENABLED", "").strip().lower()
|
raw_flag = os.environ.get("PHASE_Z_STAGE0_ADAPTER_ENABLED", "").strip().lower()
|
||||||
enabled = raw_flag in {"1", "true", "yes"}
|
enabled = raw_flag in {"1", "true", "yes"}
|
||||||
diagnostics["enabled"] = enabled
|
diagnostics["enabled"] = enabled
|
||||||
if not enabled:
|
if not enabled:
|
||||||
return legacy_slide_title, legacy_sections, legacy_footer, diagnostics
|
return legacy_slide_title, legacy_sections, legacy_footer, diagnostics, normalized_assets
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Defer imports — legacy path must not depend on these modules.
|
# Defer imports — legacy path must not depend on these modules.
|
||||||
@@ -259,12 +263,12 @@ def _stage0_chained_adapter(
|
|||||||
normalized = normalize_mdx_content(raw_mdx)
|
normalized = normalize_mdx_content(raw_mdx)
|
||||||
if not isinstance(normalized, dict) or not isinstance(normalized.get("sections"), list):
|
if not isinstance(normalized, dict) or not isinstance(normalized.get("sections"), list):
|
||||||
diagnostics["fallback_reason"] = "MISSING_INVALID_IDS"
|
diagnostics["fallback_reason"] = "MISSING_INVALID_IDS"
|
||||||
return legacy_slide_title, legacy_sections, legacy_footer, diagnostics
|
return legacy_slide_title, legacy_sections, legacy_footer, diagnostics, normalized_assets
|
||||||
|
|
||||||
majors = extract_major_sections(normalized["sections"])
|
majors = extract_major_sections(normalized["sections"])
|
||||||
if not majors:
|
if not majors:
|
||||||
diagnostics["fallback_reason"] = "NO_USABLE_SECTIONS"
|
diagnostics["fallback_reason"] = "NO_USABLE_SECTIONS"
|
||||||
return legacy_slide_title, legacy_sections, legacy_footer, diagnostics
|
return legacy_slide_title, legacy_sections, legacy_footer, diagnostics, normalized_assets
|
||||||
|
|
||||||
adapter_title = (normalized.get("title") or "").strip() or legacy_slide_title
|
adapter_title = (normalized.get("title") or "").strip() or legacy_slide_title
|
||||||
conclusion = extract_conclusion_text(raw_mdx)
|
conclusion = extract_conclusion_text(raw_mdx)
|
||||||
@@ -304,10 +308,10 @@ def _stage0_chained_adapter(
|
|||||||
|
|
||||||
if section_num <= 0:
|
if section_num <= 0:
|
||||||
diagnostics["fallback_reason"] = "NON_POSITIVE_SECTION_NUM"
|
diagnostics["fallback_reason"] = "NON_POSITIVE_SECTION_NUM"
|
||||||
return legacy_slide_title, legacy_sections, legacy_footer, diagnostics
|
return legacy_slide_title, legacy_sections, legacy_footer, diagnostics, normalized_assets
|
||||||
if section_num in used_nums:
|
if section_num in used_nums:
|
||||||
diagnostics["fallback_reason"] = "DUPLICATE_IDS"
|
diagnostics["fallback_reason"] = "DUPLICATE_IDS"
|
||||||
return legacy_slide_title, legacy_sections, legacy_footer, diagnostics
|
return legacy_slide_title, legacy_sections, legacy_footer, diagnostics, normalized_assets
|
||||||
used_nums.add(section_num)
|
used_nums.add(section_num)
|
||||||
|
|
||||||
diagnostics["id_reconstruction_log"].append({
|
diagnostics["id_reconstruction_log"].append({
|
||||||
@@ -337,12 +341,19 @@ def _stage0_chained_adapter(
|
|||||||
"footer_match": adapter_footer == legacy_footer,
|
"footer_match": adapter_footer == legacy_footer,
|
||||||
}
|
}
|
||||||
diagnostics["used"] = True
|
diagnostics["used"] = True
|
||||||
return adapter_title, adapter_sections, adapter_footer, diagnostics
|
# IMP-03 — populate Step 3 handoff (success path only).
|
||||||
|
# All fallback paths leave normalized_assets as empty lists (defined at fn top).
|
||||||
|
normalized_assets = {
|
||||||
|
"popups": normalized.get("popups", []) or [],
|
||||||
|
"images": normalized.get("images", []) or [],
|
||||||
|
"tables": normalized.get("tables", []) or [],
|
||||||
|
}
|
||||||
|
return adapter_title, adapter_sections, adapter_footer, diagnostics, normalized_assets
|
||||||
|
|
||||||
except Exception as exc: # noqa: BLE001 — adapter must never break legacy path
|
except Exception as exc: # noqa: BLE001 — adapter must never break legacy path
|
||||||
diagnostics["fallback_reason"] = "ADAPTER_EXCEPTION"
|
diagnostics["fallback_reason"] = "ADAPTER_EXCEPTION"
|
||||||
diagnostics["exception"] = repr(exc)
|
diagnostics["exception"] = repr(exc)
|
||||||
return legacy_slide_title, legacy_sections, legacy_footer, diagnostics
|
return legacy_slide_title, legacy_sections, legacy_footer, diagnostics, normalized_assets
|
||||||
|
|
||||||
|
|
||||||
# ─── V4 lookup ──────────────────────────────────────────────────
|
# ─── V4 lookup ──────────────────────────────────────────────────
|
||||||
@@ -1480,7 +1491,14 @@ def run_phase_z2_mvp1(
|
|||||||
# (mdx_normalizer + section_parser) replaces legacy parse_mdx output;
|
# (mdx_normalizer + section_parser) replaces legacy parse_mdx output;
|
||||||
# on any contract failure or exception, falls back to legacy with
|
# on any contract failure or exception, falls back to legacy with
|
||||||
# fallback_reason recorded in stage0_adapter_diagnostics.
|
# fallback_reason recorded in stage0_adapter_diagnostics.
|
||||||
slide_title, sections, slide_footer, stage0_adapter_diagnostics = _stage0_chained_adapter(
|
# IMP-03 — 5-tuple return adds stage0_normalized_assets (Step 3 handoff).
|
||||||
|
(
|
||||||
|
slide_title,
|
||||||
|
sections,
|
||||||
|
slide_footer,
|
||||||
|
stage0_adapter_diagnostics,
|
||||||
|
stage0_normalized_assets,
|
||||||
|
) = _stage0_chained_adapter(
|
||||||
mdx_path, legacy_slide_title, legacy_sections, legacy_footer,
|
mdx_path, legacy_slide_title, legacy_sections, legacy_footer,
|
||||||
)
|
)
|
||||||
_adapter_tag = (
|
_adapter_tag = (
|
||||||
@@ -1516,6 +1534,10 @@ def run_phase_z2_mvp1(
|
|||||||
# IMP-02 — additive only. enabled/used/fallback_reason + id reconstruction
|
# IMP-02 — additive only. enabled/used/fallback_reason + id reconstruction
|
||||||
# trace + count diff. Out of scope: V4 / align / composition.
|
# trace + count diff. Out of scope: V4 / align / composition.
|
||||||
"stage0_adapter_diagnostics": stage0_adapter_diagnostics,
|
"stage0_adapter_diagnostics": stage0_adapter_diagnostics,
|
||||||
|
# IMP-03 — Step 3 handoff (slide-level rich asset list).
|
||||||
|
# env=OFF / fallback 시 모든 list 가 비어 있음. consumer = Step 3
|
||||||
|
# rich extractor (PHASE_Z_STEP3_RICH_OBJECTS_ENABLED canary).
|
||||||
|
"stage0_normalized_assets": stage0_normalized_assets,
|
||||||
},
|
},
|
||||||
step_status="partial",
|
step_status="partial",
|
||||||
pipeline_path_connected=True,
|
pipeline_path_connected=True,
|
||||||
@@ -1526,6 +1548,7 @@ def run_phase_z2_mvp1(
|
|||||||
"heading tree 미생성, orphan / details 감지 미완 (Step 2 ⚠ partial — 별 axis). "
|
"heading tree 미생성, orphan / details 감지 미완 (Step 2 ⚠ partial — 별 axis). "
|
||||||
"orphans / details 필드는 schema lock — 빈 배열이라도 'detection 미수행' marker. "
|
"orphans / details 필드는 schema lock — 빈 배열이라도 'detection 미수행' marker. "
|
||||||
"stage0_adapter_diagnostics = IMP-02 chained adapter trace (default OFF canary). "
|
"stage0_adapter_diagnostics = IMP-02 chained adapter trace (default OFF canary). "
|
||||||
|
"stage0_normalized_assets = IMP-03 Step 3 slide-level handoff (popups/images/tables list)."
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -1891,6 +1914,49 @@ def run_phase_z2_mvp1(
|
|||||||
})
|
})
|
||||||
|
|
||||||
# ─── Step 3: Content Object 추출 (B1, trace-only) ───
|
# ─── Step 3: Content Object 추출 (B1, trace-only) ───
|
||||||
|
# IMP-03 — slide-level rich ContentObject 추출 (default OFF canary).
|
||||||
|
# scope-lock 16 조건 (Gitea #3) :
|
||||||
|
# - 별 함수 (extract_rich_content_objects) — v0 extract_content_objects unchanged
|
||||||
|
# - slide-level — section_id=None, id=`{mdx_id}.{type}-N`, scope='slide'
|
||||||
|
# - root-level once (per-zone duplication X)
|
||||||
|
# - plan_placement() 는 v0 list 만 받음 (B4 회귀 X) — 본 rich 결과는 artifact only
|
||||||
|
# - transform_table dedup : arrow row 감지 시 skip
|
||||||
|
rich_flag = os.environ.get("PHASE_Z_STEP3_RICH_OBJECTS_ENABLED", "").strip().lower()
|
||||||
|
rich_enabled_flag = rich_flag in {"1", "true", "yes"}
|
||||||
|
_assets_total = (
|
||||||
|
len(stage0_normalized_assets.get("popups") or [])
|
||||||
|
+ len(stage0_normalized_assets.get("images") or [])
|
||||||
|
+ len(stage0_normalized_assets.get("tables") or [])
|
||||||
|
)
|
||||||
|
rich_disabled_reason: Optional[str] = None
|
||||||
|
if not rich_enabled_flag:
|
||||||
|
rich_disabled_reason = "FLAG_OFF"
|
||||||
|
elif _assets_total == 0:
|
||||||
|
rich_disabled_reason = "NO_NORMALIZED_ASSETS"
|
||||||
|
|
||||||
|
rich_objects: list = []
|
||||||
|
rich_skips: list = []
|
||||||
|
if rich_disabled_reason is None:
|
||||||
|
mdx_num_match = re.match(r"(\d+)", mdx_path.stem)
|
||||||
|
rich_mdx_id = mdx_num_match.group(1).zfill(2) if mdx_num_match else "00"
|
||||||
|
rich_objects, rich_skips = extract_rich_content_objects(
|
||||||
|
stage0_normalized_assets, mdx_id=rich_mdx_id,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Count/list invariant check (IMP-02 ↔ IMP-03 chain) — soft warning, no fail.
|
||||||
|
invariant_warnings: list[dict] = []
|
||||||
|
_adapter_counts = (stage0_adapter_diagnostics or {}).get("adapter_counts") or {}
|
||||||
|
if _adapter_counts:
|
||||||
|
for key in ("popups", "images", "tables"):
|
||||||
|
expected = _adapter_counts.get(key)
|
||||||
|
actual = len(stage0_normalized_assets.get(key) or [])
|
||||||
|
if expected is not None and expected != actual:
|
||||||
|
invariant_warnings.append({
|
||||||
|
"field": key,
|
||||||
|
"adapter_counts": expected,
|
||||||
|
"stage0_normalized_assets_len": actual,
|
||||||
|
})
|
||||||
|
|
||||||
_write_step_artifact(
|
_write_step_artifact(
|
||||||
run_dir, 3, "content_objects",
|
run_dir, 3, "content_objects",
|
||||||
data={
|
data={
|
||||||
@@ -1902,12 +1968,25 @@ def run_phase_z2_mvp1(
|
|||||||
}
|
}
|
||||||
for dz in debug_zones
|
for dz in debug_zones
|
||||||
],
|
],
|
||||||
|
# IMP-03 — slide-level rich trace (additive, trace-only).
|
||||||
|
"rich_content_objects": [asdict(o) for o in rich_objects],
|
||||||
|
"rich_content_objects_enabled": rich_disabled_reason is None,
|
||||||
|
"rich_content_objects_scope": "slide",
|
||||||
|
"rich_content_objects_source": "stage0_normalized_assets",
|
||||||
|
"rich_content_objects_disabled_reason": rich_disabled_reason,
|
||||||
|
"rich_content_objects_skips": rich_skips,
|
||||||
|
"rich_content_objects_invariant_warnings": invariant_warnings,
|
||||||
},
|
},
|
||||||
step_status="trace-only",
|
step_status="trace-only",
|
||||||
pipeline_path_connected=False,
|
pipeline_path_connected=False,
|
||||||
inputs=["step02_normalized.json"],
|
inputs=["step02_normalized.json"],
|
||||||
outputs=["step03_content_objects.json"],
|
outputs=["step03_content_objects.json"],
|
||||||
note="현재는 trace 로 기록되지만 render payload 를 직접 만들지는 않음. mapper.py 가 별도로 MDX 직접 파싱.",
|
note=(
|
||||||
|
"현재는 trace 로 기록되지만 render payload 를 직접 만들지는 않음. "
|
||||||
|
"mapper.py 가 별도로 MDX 직접 파싱. "
|
||||||
|
"IMP-03 rich_content_objects = slide-level popup/image/table trace "
|
||||||
|
"(PHASE_Z_STEP3_RICH_OBJECTS_ENABLED canary, default OFF)."
|
||||||
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
# ─── Step 4: Section Internal Composition (B2, trace-only) ───
|
# ─── Step 4: Section Internal Composition (B2, trace-only) ───
|
||||||
|
|||||||
Reference in New Issue
Block a user