From 0fa8e26f2d1f6f97de85d410822611fcf58a1b36 Mon Sep 17 00:00:00 2001
From: "Dana K. Williams" <dana_w@designcomputer.com>
Date: Fri, 3 Jul 2026 01:04:04 +0000
Subject: [PATCH] fix(tree_renderer): embed extracted images in the rendered
 summary
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For long documents (doc_type: pageindex), images are correctly
extracted to wiki/sources/images/<doc>/ and referenced with correct
wiki-relative paths in wiki/sources/<doc>.json — but nothing in the
actual Obsidian-rendered wiki (summaries, concept pages, entity pages,
index.md) ever surfaced them. That raw JSON file isn't rendered as a
wiki page, so extracted images were effectively invisible anywhere a
human browses the vault.

render_summary_md now accepts the same per-page `pages` list already
written to wiki/sources/<doc>.json, builds a page-number -> image-path
map, and embeds each node's page-range images inline — de-duped the
same way duplicate summaries are (a page split across many sibling
nodes doesn't repeat the same figure at each one).
_write_long_doc_artifacts already had `pages` in scope when calling
render_summary_md; it just wasn't passing it through.

Fixes #166. Verified against a real 31-page manual: 35/35
auto-extracted images now appear exactly once each in the rendered
summary, correctly positioned by page range, with zero duplicates.
---
 openkb/indexer.py           |  7 +++-
 openkb/tree_renderer.py     | 74 ++++++++++++++++++++++++++++++++++---
 tests/test_indexer.py       | 12 ++++++
 tests/test_tree_renderer.py | 47 +++++++++++++++++++++++
 4 files changed, 132 insertions(+), 8 deletions(-)
diff --git a/openkb/indexer.py b/openkb/indexer.py
index 0f58bef3..820667a4 100644
--- a/openkb/indexer.py
+++ b/openkb/indexer.py
@@ -135,7 +135,9 @@ def _write_long_doc_artifacts(
     Returns the summary path. Shared by :func:`index_long_document` (local)
     and :func:`import_cloud_document` (cloud) so both produce identical
     artifacts. Page images, when present, are written separately by the
-    caller's page extractor — this helper only persists page text + summary.
+    caller's page extractor — this helper persists page text + summary, and
+    passes ``pages`` through to ``render_summary_md`` so each node's images
+    get embedded in the summary too, not just the raw page JSON.
     """
     sources_dir = kb_dir / "wiki" / "sources"
     sources_dir.mkdir(parents=True, exist_ok=True)
@@ -148,7 +150,8 @@ def _write_long_doc_artifacts(
     summaries_dir.mkdir(parents=True, exist_ok=True)
     summary_path = summaries_dir / f"{doc_name}.md"
     summary_path.write_text(
-        render_summary_md(tree, doc_name, doc_id, description=description), encoding="utf-8"
+        render_summary_md(tree, doc_name, doc_id, description=description, pages=pages),
+        encoding="utf-8",
     )
     return summary_path
 
diff --git a/openkb/tree_renderer.py b/openkb/tree_renderer.py
index 2424ba10..89587d5b 100644
--- a/openkb/tree_renderer.py
+++ b/openkb/tree_renderer.py
@@ -15,8 +15,46 @@ def _yaml_frontmatter(source_name: str, doc_id: str, description: str = "") -> s
     return "---\n" + "\n".join(lines) + "\n---\n"
 
 
-def _render_nodes_summary(nodes: list[dict], depth: int) -> str:
-    """Recursively render nodes for the *summary* view (summaries only)."""
+def _build_page_images(pages: list[dict] | None) -> dict[int, list[str]]:
+    """Map page number -> ordered, de-duplicated list of image paths.
+
+    ``pages`` is the same per-page list written to ``wiki/sources/{doc}.json``
+    (each item: ``{"page": int, "content": str, "images": [{"path": str}]}``).
+    Extracted images live there and nowhere else reachable from the rendered
+    summary — see ``_render_nodes_summary`` for why that alone left them
+    invisible in the actual Obsidian vault.
+    """
+    page_images: dict[int, list[str]] = {}
+    for page in pages or []:
+        page_num_raw = page.get("page")
+        if page_num_raw is None:
+            continue
+        try:
+            page_num = int(page_num_raw)
+        except (TypeError, ValueError):
+            continue
+        paths: list[str] = []
+        for img in page.get("images") or []:
+            path = img.get("path") if isinstance(img, dict) else None
+            if isinstance(path, str):
+                paths.append(path)
+        if paths:
+            page_images.setdefault(page_num, []).extend(paths)
+    return page_images
+
+
+def _render_nodes_summary(
+    nodes: list[dict],
+    depth: int,
+    page_images: dict[int, list[str]],
+    seen_images: set[str],
+) -> str:
+    """Recursively render nodes for the *summary* view (summaries only).
+
+    A page's images are attached to whichever node covers that page first
+    (``seen_images`` tracks paths already emitted), so a page split across
+    many sibling nodes doesn't repeat the same figure at every one of them.
+    """
     lines: list[str] = []
     heading_prefix = "#" * min(depth, 6)
     for node in nodes:
@@ -27,22 +65,46 @@ def _render_nodes_summary(nodes: list[dict], depth: int) -> str:
         children = node.get("nodes", [])
 
         lines.append(f"{heading_prefix} {title} (pages {start}–{end})\n")
+
+        new_images: list[str] = []
+        try:
+            page_range = range(int(start), int(end) + 1)
+        except (TypeError, ValueError):
+            page_range = range(0)
+        for page_num in page_range:
+            for path in page_images.get(page_num, []):
+                if path not in seen_images:
+                    seen_images.add(path)
+                    new_images.append(path)
+        for path in new_images:
+            lines.append(f"![image]({path})\n")
+
         if summary:
             lines.append(f"Summary: {summary}\n")
         if children:
-            lines.append(_render_nodes_summary(children, depth + 1))
+            lines.append(_render_nodes_summary(children, depth + 1, page_images, seen_images))
 
     return "\n".join(lines)
 
 
-def render_summary_md(tree: dict, source_name: str, doc_id: str, description: str = "") -> str:
+def render_summary_md(
+    tree: dict,
+    source_name: str,
+    doc_id: str,
+    description: str = "",
+    pages: list[dict] | None = None,
+) -> str:
     """Render the summary Markdown page for a PageIndex tree.
 
     Renders each node as a heading with page range and its summary text.
     Includes a YAML frontmatter block with ``type: "Summary"`` and an
-    optional ``description`` field.
+    optional ``description`` field. ``pages`` (the same per-page list written
+    to ``wiki/sources/{doc}.json``) is used to embed each node's page-range
+    images inline — without it, extracted images are only ever referenced
+    from that raw JSON file, never from anything the Obsidian vault renders.
     """
     frontmatter = _yaml_frontmatter(source_name, doc_id, description)
     structure = tree.get("structure", [])
-    body = _render_nodes_summary(structure, depth=1)
+    page_images = _build_page_images(pages)
+    body = _render_nodes_summary(structure, depth=1, page_images=page_images, seen_images=set())
     return frontmatter + "\n" + body
diff --git a/tests/test_indexer.py b/tests/test_indexer.py
index 3af85b04..df847618 100644
--- a/tests/test_indexer.py
+++ b/tests/test_indexer.py
@@ -370,6 +370,18 @@ def test_write_long_doc_artifacts_writes_json_and_summary(kb_dir, sample_tree):
     assert "doc_type: pageindex" in summary_path.read_text(encoding="utf-8")
 
 
+def test_write_long_doc_artifacts_embeds_page_images_in_summary(kb_dir, sample_tree):
+    # sample_tree's "Introduction" node spans pages 0-120; a page in that
+    # range should have its image embedded in the written summary, not just
+    # referenced from the sibling wiki/sources/<doc>.json file.
+    from openkb.indexer import _write_long_doc_artifacts
+
+    pages = [{"page": 5, "content": "...", "images": [{"path": "sources/images/my-doc/p5.png"}]}]
+    summary_path = _write_long_doc_artifacts(sample_tree, pages, "my-doc", "doc-1", kb_dir)
+
+    assert "![image](sources/images/my-doc/p5.png)" in summary_path.read_text(encoding="utf-8")
+
+
 def test_fetch_cloud_pages_windows_over_1000_cap():
     """get_page_content's range filter is capped at 1000 pages by parse_pages, so
     _fetch_cloud_pages must request fixed 1000-page windows (never a wider range)
diff --git a/tests/test_tree_renderer.py b/tests/test_tree_renderer.py
index 3786cfe4..6f2cd228 100644
--- a/tests/test_tree_renderer.py
+++ b/tests/test_tree_renderer.py
@@ -52,6 +52,53 @@ def test_summary_md_has_type_and_description():
     assert 'full_text: "sources/my-doc.json"' in md
 
 
+def test_node_images_are_embedded_from_pages(sample_tree):
+    # "Introduction" spans pages 0-120; page 5 is inside that range.
+    pages = [{"page": 5, "content": "...", "images": [{"path": "sources/images/doc/p5.png"}]}]
+    md = render_summary_md(sample_tree, "Sample Document", "doc-abc", pages=pages)
+    assert "![image](sources/images/doc/p5.png)" in md
+
+
+def test_no_pages_argument_means_no_images(sample_tree):
+    md = render_summary_md(sample_tree, "Sample Document", "doc-abc")
+    assert "![image]" not in md
+
+
+def test_image_on_a_page_shared_by_two_nodes_is_not_duplicated():
+    # Two sibling nodes both cover page 3 (a "no TOC" fallback can split one
+    # physical page across several nodes) — the image on that page should
+    # only be attached to the first of them, not repeated at both.
+    tree = {
+        "structure": [
+            {"title": "A", "start_index": 3, "end_index": 3, "summary": "", "nodes": []},
+            {"title": "B", "start_index": 3, "end_index": 3, "summary": "", "nodes": []},
+        ]
+    }
+    pages = [{"page": 3, "content": "...", "images": [{"path": "sources/images/doc/p3.png"}]}]
+    md = render_summary_md(tree, "my-doc", "doc-123", pages=pages)
+    assert md.count("![image](sources/images/doc/p3.png)") == 1
+
+
+def test_multiple_images_on_one_page_all_render_in_order():
+    tree = {
+        "structure": [{"title": "A", "start_index": 1, "end_index": 1, "summary": "", "nodes": []}]
+    }
+    pages = [
+        {
+            "page": 1,
+            "content": "...",
+            "images": [
+                {"path": "sources/images/doc/a.png"},
+                {"path": "sources/images/doc/b.png"},
+            ],
+        }
+    ]
+    md = render_summary_md(tree, "my-doc", "doc-123", pages=pages)
+    a_idx = md.index("a.png")
+    b_idx = md.index("b.png")
+    assert a_idx < b_idx
+
+
 def test_summary_full_text_quoted_yaml_safe():
     import yaml