From 0fa8e26f2d1f6f97de85d410822611fcf58a1b36 Mon Sep 17 00:00:00 2001 From: "Dana K. Williams" Date: Fri, 3 Jul 2026 01:04:04 +0000 Subject: [PATCH] fix(tree_renderer): embed extracted images in the rendered summary MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For long documents (doc_type: pageindex), images are correctly extracted to wiki/sources/images// and referenced with correct wiki-relative paths in wiki/sources/.json — but nothing in the actual Obsidian-rendered wiki (summaries, concept pages, entity pages, index.md) ever surfaced them. That raw JSON file isn't rendered as a wiki page, so extracted images were effectively invisible anywhere a human browses the vault. render_summary_md now accepts the same per-page `pages` list already written to wiki/sources/.json, builds a page-number -> image-path map, and embeds each node's page-range images inline — de-duped the same way duplicate summaries are (a page split across many sibling nodes doesn't repeat the same figure at each one). _write_long_doc_artifacts already had `pages` in scope when calling render_summary_md; it just wasn't passing it through. Fixes #166. Verified against a real 31-page manual: 35/35 auto-extracted images now appear exactly once each in the rendered summary, correctly positioned by page range, with zero duplicates. --- openkb/indexer.py | 7 +++- openkb/tree_renderer.py | 74 ++++++++++++++++++++++++++++++++++--- tests/test_indexer.py | 12 ++++++ tests/test_tree_renderer.py | 47 +++++++++++++++++++++++ 4 files changed, 132 insertions(+), 8 deletions(-) diff --git a/openkb/indexer.py b/openkb/indexer.py index 0f58bef3..820667a4 100644 --- a/openkb/indexer.py +++ b/openkb/indexer.py @@ -135,7 +135,9 @@ def _write_long_doc_artifacts( Returns the summary path. Shared by :func:`index_long_document` (local) and :func:`import_cloud_document` (cloud) so both produce identical artifacts. Page images, when present, are written separately by the - caller's page extractor — this helper only persists page text + summary. + caller's page extractor — this helper persists page text + summary, and + passes ``pages`` through to ``render_summary_md`` so each node's images + get embedded in the summary too, not just the raw page JSON. """ sources_dir = kb_dir / "wiki" / "sources" sources_dir.mkdir(parents=True, exist_ok=True) @@ -148,7 +150,8 @@ def _write_long_doc_artifacts( summaries_dir.mkdir(parents=True, exist_ok=True) summary_path = summaries_dir / f"{doc_name}.md" summary_path.write_text( - render_summary_md(tree, doc_name, doc_id, description=description), encoding="utf-8" + render_summary_md(tree, doc_name, doc_id, description=description, pages=pages), + encoding="utf-8", ) return summary_path diff --git a/openkb/tree_renderer.py b/openkb/tree_renderer.py index 2424ba10..89587d5b 100644 --- a/openkb/tree_renderer.py +++ b/openkb/tree_renderer.py @@ -15,8 +15,46 @@ def _yaml_frontmatter(source_name: str, doc_id: str, description: str = "") -> s return "---\n" + "\n".join(lines) + "\n---\n" -def _render_nodes_summary(nodes: list[dict], depth: int) -> str: - """Recursively render nodes for the *summary* view (summaries only).""" +def _build_page_images(pages: list[dict] | None) -> dict[int, list[str]]: + """Map page number -> ordered, de-duplicated list of image paths. + + ``pages`` is the same per-page list written to ``wiki/sources/{doc}.json`` + (each item: ``{"page": int, "content": str, "images": [{"path": str}]}``). + Extracted images live there and nowhere else reachable from the rendered + summary — see ``_render_nodes_summary`` for why that alone left them + invisible in the actual Obsidian vault. + """ + page_images: dict[int, list[str]] = {} + for page in pages or []: + page_num_raw = page.get("page") + if page_num_raw is None: + continue + try: + page_num = int(page_num_raw) + except (TypeError, ValueError): + continue + paths: list[str] = [] + for img in page.get("images") or []: + path = img.get("path") if isinstance(img, dict) else None + if isinstance(path, str): + paths.append(path) + if paths: + page_images.setdefault(page_num, []).extend(paths) + return page_images + + +def _render_nodes_summary( + nodes: list[dict], + depth: int, + page_images: dict[int, list[str]], + seen_images: set[str], +) -> str: + """Recursively render nodes for the *summary* view (summaries only). + + A page's images are attached to whichever node covers that page first + (``seen_images`` tracks paths already emitted), so a page split across + many sibling nodes doesn't repeat the same figure at every one of them. + """ lines: list[str] = [] heading_prefix = "#" * min(depth, 6) for node in nodes: @@ -27,22 +65,46 @@ def _render_nodes_summary(nodes: list[dict], depth: int) -> str: children = node.get("nodes", []) lines.append(f"{heading_prefix} {title} (pages {start}–{end})\n") + + new_images: list[str] = [] + try: + page_range = range(int(start), int(end) + 1) + except (TypeError, ValueError): + page_range = range(0) + for page_num in page_range: + for path in page_images.get(page_num, []): + if path not in seen_images: + seen_images.add(path) + new_images.append(path) + for path in new_images: + lines.append(f"![image]({path})\n") + if summary: lines.append(f"Summary: {summary}\n") if children: - lines.append(_render_nodes_summary(children, depth + 1)) + lines.append(_render_nodes_summary(children, depth + 1, page_images, seen_images)) return "\n".join(lines) -def render_summary_md(tree: dict, source_name: str, doc_id: str, description: str = "") -> str: +def render_summary_md( + tree: dict, + source_name: str, + doc_id: str, + description: str = "", + pages: list[dict] | None = None, +) -> str: """Render the summary Markdown page for a PageIndex tree. Renders each node as a heading with page range and its summary text. Includes a YAML frontmatter block with ``type: "Summary"`` and an - optional ``description`` field. + optional ``description`` field. ``pages`` (the same per-page list written + to ``wiki/sources/{doc}.json``) is used to embed each node's page-range + images inline — without it, extracted images are only ever referenced + from that raw JSON file, never from anything the Obsidian vault renders. """ frontmatter = _yaml_frontmatter(source_name, doc_id, description) structure = tree.get("structure", []) - body = _render_nodes_summary(structure, depth=1) + page_images = _build_page_images(pages) + body = _render_nodes_summary(structure, depth=1, page_images=page_images, seen_images=set()) return frontmatter + "\n" + body diff --git a/tests/test_indexer.py b/tests/test_indexer.py index 3af85b04..df847618 100644 --- a/tests/test_indexer.py +++ b/tests/test_indexer.py @@ -370,6 +370,18 @@ def test_write_long_doc_artifacts_writes_json_and_summary(kb_dir, sample_tree): assert "doc_type: pageindex" in summary_path.read_text(encoding="utf-8") +def test_write_long_doc_artifacts_embeds_page_images_in_summary(kb_dir, sample_tree): + # sample_tree's "Introduction" node spans pages 0-120; a page in that + # range should have its image embedded in the written summary, not just + # referenced from the sibling wiki/sources/.json file. + from openkb.indexer import _write_long_doc_artifacts + + pages = [{"page": 5, "content": "...", "images": [{"path": "sources/images/my-doc/p5.png"}]}] + summary_path = _write_long_doc_artifacts(sample_tree, pages, "my-doc", "doc-1", kb_dir) + + assert "![image](sources/images/my-doc/p5.png)" in summary_path.read_text(encoding="utf-8") + + def test_fetch_cloud_pages_windows_over_1000_cap(): """get_page_content's range filter is capped at 1000 pages by parse_pages, so _fetch_cloud_pages must request fixed 1000-page windows (never a wider range) diff --git a/tests/test_tree_renderer.py b/tests/test_tree_renderer.py index 3786cfe4..6f2cd228 100644 --- a/tests/test_tree_renderer.py +++ b/tests/test_tree_renderer.py @@ -52,6 +52,53 @@ def test_summary_md_has_type_and_description(): assert 'full_text: "sources/my-doc.json"' in md +def test_node_images_are_embedded_from_pages(sample_tree): + # "Introduction" spans pages 0-120; page 5 is inside that range. + pages = [{"page": 5, "content": "...", "images": [{"path": "sources/images/doc/p5.png"}]}] + md = render_summary_md(sample_tree, "Sample Document", "doc-abc", pages=pages) + assert "![image](sources/images/doc/p5.png)" in md + + +def test_no_pages_argument_means_no_images(sample_tree): + md = render_summary_md(sample_tree, "Sample Document", "doc-abc") + assert "![image]" not in md + + +def test_image_on_a_page_shared_by_two_nodes_is_not_duplicated(): + # Two sibling nodes both cover page 3 (a "no TOC" fallback can split one + # physical page across several nodes) — the image on that page should + # only be attached to the first of them, not repeated at both. + tree = { + "structure": [ + {"title": "A", "start_index": 3, "end_index": 3, "summary": "", "nodes": []}, + {"title": "B", "start_index": 3, "end_index": 3, "summary": "", "nodes": []}, + ] + } + pages = [{"page": 3, "content": "...", "images": [{"path": "sources/images/doc/p3.png"}]}] + md = render_summary_md(tree, "my-doc", "doc-123", pages=pages) + assert md.count("![image](sources/images/doc/p3.png)") == 1 + + +def test_multiple_images_on_one_page_all_render_in_order(): + tree = { + "structure": [{"title": "A", "start_index": 1, "end_index": 1, "summary": "", "nodes": []}] + } + pages = [ + { + "page": 1, + "content": "...", + "images": [ + {"path": "sources/images/doc/a.png"}, + {"path": "sources/images/doc/b.png"}, + ], + } + ] + md = render_summary_md(tree, "my-doc", "doc-123", pages=pages) + a_idx = md.index("a.png") + b_idx = md.index("b.png") + assert a_idx < b_idx + + def test_summary_full_text_quoted_yaml_safe(): import yaml