Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions openkb/indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,9 @@ def _write_long_doc_artifacts(
Returns the summary path. Shared by :func:`index_long_document` (local)
and :func:`import_cloud_document` (cloud) so both produce identical
artifacts. Page images, when present, are written separately by the
caller's page extractor — this helper only persists page text + summary.
caller's page extractor — this helper persists page text + summary, and
passes ``pages`` through to ``render_summary_md`` so each node's images
get embedded in the summary too, not just the raw page JSON.
"""
sources_dir = kb_dir / "wiki" / "sources"
sources_dir.mkdir(parents=True, exist_ok=True)
Expand All @@ -148,7 +150,8 @@ def _write_long_doc_artifacts(
summaries_dir.mkdir(parents=True, exist_ok=True)
summary_path = summaries_dir / f"{doc_name}.md"
summary_path.write_text(
render_summary_md(tree, doc_name, doc_id, description=description), encoding="utf-8"
render_summary_md(tree, doc_name, doc_id, description=description, pages=pages),
encoding="utf-8",
)
return summary_path

Expand Down
74 changes: 68 additions & 6 deletions openkb/tree_renderer.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,46 @@ def _yaml_frontmatter(source_name: str, doc_id: str, description: str = "") -> s
return "---\n" + "\n".join(lines) + "\n---\n"


def _render_nodes_summary(nodes: list[dict], depth: int) -> str:
"""Recursively render nodes for the *summary* view (summaries only)."""
def _build_page_images(pages: list[dict] | None) -> dict[int, list[str]]:
"""Map page number -> ordered, de-duplicated list of image paths.

``pages`` is the same per-page list written to ``wiki/sources/{doc}.json``
(each item: ``{"page": int, "content": str, "images": [{"path": str}]}``).
Extracted images live there and nowhere else reachable from the rendered
summary — see ``_render_nodes_summary`` for why that alone left them
invisible in the actual Obsidian vault.
"""
page_images: dict[int, list[str]] = {}
for page in pages or []:
page_num_raw = page.get("page")
if page_num_raw is None:
continue
try:
page_num = int(page_num_raw)
except (TypeError, ValueError):
continue
paths: list[str] = []
for img in page.get("images") or []:
path = img.get("path") if isinstance(img, dict) else None
if isinstance(path, str):
paths.append(path)
if paths:
page_images.setdefault(page_num, []).extend(paths)
return page_images


def _render_nodes_summary(
nodes: list[dict],
depth: int,
page_images: dict[int, list[str]],
seen_images: set[str],
) -> str:
"""Recursively render nodes for the *summary* view (summaries only).

A page's images are attached to whichever node covers that page first
(``seen_images`` tracks paths already emitted), so a page split across
many sibling nodes doesn't repeat the same figure at every one of them.
"""
lines: list[str] = []
heading_prefix = "#" * min(depth, 6)
for node in nodes:
Expand All @@ -27,22 +65,46 @@ def _render_nodes_summary(nodes: list[dict], depth: int) -> str:
children = node.get("nodes", [])

lines.append(f"{heading_prefix} {title} (pages {start}–{end})\n")

new_images: list[str] = []
try:
page_range = range(int(start), int(end) + 1)
except (TypeError, ValueError):
page_range = range(0)
for page_num in page_range:
for path in page_images.get(page_num, []):
if path not in seen_images:
seen_images.add(path)
new_images.append(path)
for path in new_images:
lines.append(f"![image]({path})\n")

if summary:
lines.append(f"Summary: {summary}\n")
if children:
lines.append(_render_nodes_summary(children, depth + 1))
lines.append(_render_nodes_summary(children, depth + 1, page_images, seen_images))

return "\n".join(lines)


def render_summary_md(
    tree: dict,
    source_name: str,
    doc_id: str,
    description: str = "",
    pages: list[dict] | None = None,
) -> str:
    """Build the summary Markdown page for a PageIndex tree.

    The output is the YAML frontmatter produced by ``_yaml_frontmatter``, a
    blank line, then the rendering of ``tree["structure"]`` (an empty list
    when the key is absent) by ``_render_nodes_summary`` starting at heading
    depth 1.

    ``pages`` is the optional per-page list; it is converted to a
    page-number -> image-paths mapping and handed to the node renderer,
    together with a fresh "already emitted" set, so images can be embedded
    inline. With ``pages=None`` the mapping is empty.
    """
    header = _yaml_frontmatter(source_name, doc_id, description)
    top_level_nodes = tree.get("structure", [])
    images_by_page = _build_page_images(pages)
    rendered_nodes = _render_nodes_summary(
        top_level_nodes,
        depth=1,
        page_images=images_by_page,
        # Fresh set per call: image de-duplication is scoped to one document.
        seen_images=set(),
    )
    return header + "\n" + rendered_nodes
12 changes: 12 additions & 0 deletions tests/test_indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,18 @@ def test_write_long_doc_artifacts_writes_json_and_summary(kb_dir, sample_tree):
assert "doc_type: pageindex" in summary_path.read_text(encoding="utf-8")


def test_write_long_doc_artifacts_embeds_page_images_in_summary(kb_dir, sample_tree):
    """An image on a page covered by a tree node ends up in the summary file.

    Per the original test's note, the ``sample_tree`` fixture has an
    "Introduction" node spanning pages 0-120, so page 5 falls inside it. The
    image must be embedded in the written summary itself, not merely listed
    in the sibling ``wiki/sources/<doc>.json`` file.
    """
    from openkb.indexer import _write_long_doc_artifacts

    page_five = {
        "page": 5,
        "content": "...",
        "images": [{"path": "sources/images/my-doc/p5.png"}],
    }

    written_path = _write_long_doc_artifacts(sample_tree, [page_five], "my-doc", "doc-1", kb_dir)
    summary_text = written_path.read_text(encoding="utf-8")

    assert "![image](sources/images/my-doc/p5.png)" in summary_text


def test_fetch_cloud_pages_windows_over_1000_cap():
"""get_page_content's range filter is capped at 1000 pages by parse_pages, so
_fetch_cloud_pages must request fixed 1000-page windows (never a wider range)
Expand Down
47 changes: 47 additions & 0 deletions tests/test_tree_renderer.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,53 @@ def test_summary_md_has_type_and_description():
assert 'full_text: "sources/my-doc.json"' in md


def test_node_images_are_embedded_from_pages(sample_tree):
    """A page image inside a node's page range is embedded in the Markdown.

    Per the original test's note, the fixture's "Introduction" node spans
    pages 0-120, which contains page 5.
    """
    page_five = {
        "page": 5,
        "content": "...",
        "images": [{"path": "sources/images/doc/p5.png"}],
    }
    rendered = render_summary_md(sample_tree, "Sample Document", "doc-abc", pages=[page_five])
    assert "![image](sources/images/doc/p5.png)" in rendered


def test_no_pages_argument_means_no_images(sample_tree):
    """Omitting ``pages`` must not produce any image embeds."""
    rendered = render_summary_md(sample_tree, "Sample Document", "doc-abc")
    has_image_embed = "![image]" in rendered
    assert not has_image_embed


def test_image_on_a_page_shared_by_two_nodes_is_not_duplicated():
    """An image on a page covered by two siblings is emitted exactly once.

    Sibling nodes "A" and "B" both cover page 3 (the original note says a
    "no TOC" fallback can split one physical page across several nodes).
    The figure should be attached to the first node only.
    """
    siblings = [
        {"title": name, "start_index": 3, "end_index": 3, "summary": "", "nodes": []}
        for name in ("A", "B")
    ]
    shared_page = {
        "page": 3,
        "content": "...",
        "images": [{"path": "sources/images/doc/p3.png"}],
    }

    rendered = render_summary_md({"structure": siblings}, "my-doc", "doc-123", pages=[shared_page])

    occurrences = rendered.count("![image](sources/images/doc/p3.png)")
    assert occurrences == 1


def test_multiple_images_on_one_page_all_render_in_order():
    """Several images on one page are all rendered, in their listed order."""
    only_node = {"title": "A", "start_index": 1, "end_index": 1, "summary": "", "nodes": []}
    image_entries = [
        {"path": "sources/images/doc/a.png"},
        {"path": "sources/images/doc/b.png"},
    ]
    page_one = {"page": 1, "content": "...", "images": image_entries}

    rendered = render_summary_md({"structure": [only_node]}, "my-doc", "doc-123", pages=[page_one])

    # ``str.index`` raises ValueError if either image is missing, so this
    # checks presence as well as relative order.
    assert rendered.index("a.png") < rendered.index("b.png")


def test_summary_full_text_quoted_yaml_safe():
import yaml

Expand Down