Skip to content
This repository was archived by the owner on May 20, 2026. It is now read-only.

Commit 1d26378

Browse files
authored
Inline summarization: summarize within the agent loop for maximum prompt cache hits (#4956)
* Add inline summarization feature for agent conversation history
  - Introduced configuration option for inline summarization in package.json and configurationService.ts.
  - Updated agentIntent.ts to handle inline summarization logic during conversation.
  - Modified summarizedConversationHistory.tsx to support inline summarization instructions.
  - Enhanced tests to cover inline summarization scenarios and extraction of inline summaries.
* Remove cache-friendly summarization prompt and related configurations
* Refactor inline summarization handling in ToolCallingLoop and add summary application method
* Add failure telemetry, deferred cleanup, and debugName tracking for inline summarization
* Address PR review: fix empty string check, telemetry counts, cache token reporting, and test naming
1 parent 9c143e2 commit 1d26378

9 files changed

Lines changed: 597 additions & 86 deletions

File tree

package.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4387,10 +4387,10 @@
43874387
"experimental"
43884388
]
43894389
},
4390-
"github.copilot.chat.agentHistorySummarizationCacheFriendly": {
4390+
"github.copilot.chat.agentHistorySummarizationInline": {
43914391
"type": "boolean",
43924392
"default": false,
4393-
"markdownDescription": "%github.copilot.config.agentHistorySummarizationCacheFriendly%",
4393+
"markdownDescription": "%github.copilot.config.agentHistorySummarizationInline%",
43944394
"tags": [
43954395
"advanced",
43964396
"experimental",

package.nls.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -391,7 +391,7 @@
391391
"github.copilot.config.summarizeAgentConversationHistoryThreshold": "Threshold for compacting agent conversation history.",
392392
"github.copilot.config.agentHistorySummarizationMode": "Mode for agent history summarization.",
393393
"github.copilot.config.backgroundCompaction": "Enable background compaction of conversation history.",
394-
"github.copilot.config.agentHistorySummarizationCacheFriendly": "Use a cache-friendly summarization prompt that shares the agent prefix for prompt cache hits.",
394+
"github.copilot.config.agentHistorySummarizationInline": "Summarize conversation inline within the agent loop instead of a separate LLM call, maximizing prompt cache hits.",
395395

396396
"github.copilot.config.useResponsesApiTruncation": "Use Responses API for truncation.",
397397
"github.copilot.config.enableReadFileV2": "Enable version 2 of the read file tool.",

src/extension/intents/node/agentIntent.ts

Lines changed: 55 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,9 @@ import { IEnvService } from '../../../platform/env/common/envService';
1717
import { ILogService } from '../../../platform/log/common/logService';
1818
import { IEditLogService } from '../../../platform/multiFileEdit/common/editLogService';
1919
import { CUSTOM_TOOL_SEARCH_NAME, isAnthropicCustomToolSearchEnabled, isAnthropicToolSearchEnabled } from '../../../platform/networking/common/anthropic';
20-
import { IToolDeferralService } from '../../../platform/networking/common/toolDeferralService';
2120
import { IChatEndpoint } from '../../../platform/networking/common/networking';
2221
import { modelsWithoutResponsesContextManagement } from '../../../platform/networking/common/openai';
22+
import { IToolDeferralService } from '../../../platform/networking/common/toolDeferralService';
2323
import { INotebookService } from '../../../platform/notebook/common/notebookService';
2424
import { GenAiMetrics } from '../../../platform/otel/common/genAiMetrics';
2525
import { IOTelService } from '../../../platform/otel/common/otelService';
@@ -64,6 +64,8 @@ import { getAgentMaxRequests } from '../common/agentConfig';
6464
import { addCacheBreakpoints } from './cacheBreakpoints';
6565
import { EditCodeIntent, EditCodeIntentInvocation, EditCodeIntentInvocationOptions, mergeMetadata, toNewChatReferences } from './editCodeIntent';
6666

67+
/**
 * Multiplier applied to `endpoint.modelMaxPromptTokens` when cloning the endpoint
 * (via `cloneWithTokenOverride`) for a render that has `inlineSummarization: true`.
 * A value of 1.15 raises the prompt-token limit for that render to 115% of the base.
 * NOTE(review): presumably this headroom exists so the appended summarize
 * instruction still fits — confirm against the prompt implementation.
 */
const INLINE_SUMMARIZATION_BUDGET_EXPANSION = 1.15;
68+
6769
function isResponsesCompactionContextManagementEnabled(endpoint: IChatEndpoint, configurationService: IConfigurationService, experimentationService: IExperimentationService): boolean {
6870
return endpoint.apiType === 'responses'
6971
&& configurationService.getExperimentBasedConfig(ConfigKey.ResponsesApiContextManagementEnabled, experimentationService)
@@ -425,7 +427,9 @@ export class AgentIntentInvocation extends EditCodeIntentInvocation implements I
425427
const useTruncation = this.endpoint.apiType === 'responses' && this.configurationService.getConfig(ConfigKey.Advanced.UseResponsesApiTruncation);
426428
const responsesCompactionContextManagementEnabled = isResponsesCompactionContextManagementEnabled(this.endpoint, this.configurationService, this.expService);
427429
const summarizationEnabled = this.configurationService.getConfig(ConfigKey.SummarizeAgentConversationHistory) && this.prompt === AgentPrompt && !responsesCompactionContextManagementEnabled;
428-
const backgroundCompactionEnabled = summarizationEnabled && this.configurationService.getExperimentBasedConfig(ConfigKey.BackgroundCompaction, this.expService);
430+
const inlineSummarizationEnabled = summarizationEnabled && this.configurationService.getExperimentBasedConfig(ConfigKey.Advanced.AgentHistorySummarizationInline, this.expService);
431+
// Disable background compaction when inline summarization is active — they solve the same problem
432+
const backgroundCompactionEnabled = summarizationEnabled && !inlineSummarizationEnabled && this.configurationService.getExperimentBasedConfig(ConfigKey.BackgroundCompaction, this.expService);
429433

430434
// When tools are present, apply a 10% safety margin on the message portion
431435
// to account for tokenizer discrepancies between our tool-token counter and
@@ -472,6 +476,26 @@ export class AgentIntentInvocation extends EditCodeIntentInvocation implements I
472476
? (this._lastRenderTokenCount + toolTokens) / baseBudget
473477
: 0;
474478

479+
// ── Proactive inline summarization: pre-render check ──────────────
480+
// Use _lastRenderTokenCount (from the previous iteration) to decide
481+
// whether to append the summarize instruction *before* the main
482+
// render, avoiding a wasteful double-render.
483+
// Guard: skip when a summary was already stored on the current or
484+
// most-recent history turn — _lastRenderTokenCount is stale from the
485+
// summarization render and would falsely re-trigger.
486+
let proactiveInlineSummarization = false;
487+
if (inlineSummarizationEnabled && baseBudget > 0) {
488+
const hasRecentSummary = promptContext.toolCallRounds?.some(r => r.summary)
489+
|| promptContext.history.at(-1)?.rounds.some(r => r.summary);
490+
if (!hasRecentSummary) {
491+
const preRenderRatio = (this._lastRenderTokenCount + toolTokens) / baseBudget;
492+
if (preRenderRatio >= 0.85) {
493+
this.logService.debug(`[Agent] pre-render at ${(preRenderRatio * 100).toFixed(0)}% — proactively enabling inline summarization`);
494+
proactiveInlineSummarization = true;
495+
}
496+
}
497+
}
498+
475499
// Track whether we applied a summary in this iteration so we don't
476500
// immediately re-trigger background compaction in the post-render check.
477501
let summaryAppliedThisIteration = false;
@@ -594,10 +618,36 @@ export class AgentIntentInvocation extends EditCodeIntentInvocation implements I
594618
}
595619
};
596620

621+
// Helper function for inline summarization — appends summarize instruction
622+
// as a user message in the agent loop instead of making a separate LLM call.
623+
// Returns the render result with InlineSummarizationRequestedMetadata set.
624+
// `reason` is only used in log / fallback messages; `renderProps` defaults to the
// enclosing scope's `props` and is forwarded unchanged to the fallback path.
const renderWithInlineSummarization = async (reason: string, renderProps: AgentPromptProps = props): Promise<RenderPromptResult> => {
625+
this.logService.debug(`[Agent] ${reason}, triggering inline summarization`);
626+
try {
627+
// Expand from the *base* endpoint (not renderProps.endpoint which may already be expanded)
628+
const expandedEndpoint = endpoint.cloneWithTokenOverride(endpoint.modelMaxPromptTokens * INLINE_SUMMARIZATION_BUDGET_EXPANSION);
629+
// renderProps is spread first, so the `endpoint` and `inlineSummarization`
// entries below take precedence over any same-named properties it carries.
const renderer = PromptRenderer.create(this.instantiationService, expandedEndpoint, this.prompt, {
630+
...renderProps,
631+
endpoint: expandedEndpoint,
632+
inlineSummarization: true,
633+
});
634+
return await renderer.render(progress, token);
635+
} catch (e) {
636+
// Any error from the clone/create/render above is logged, then the render is
// retried through renderWithSummarization with the same renderProps.
this.logService.error(e, `[Agent] inline summarization render failed, falling back to separate-call summarization`);
637+
return await renderWithSummarization(`inline summarization failed (${e instanceof Error ? e.message : e}), falling back`, renderProps);
638+
}
639+
};
640+
597641
const contextLengthBefore = this._lastRenderTokenCount;
598642

599643
try {
600-
const renderer = PromptRenderer.create(this.instantiationService, endpoint, this.prompt, props);
644+
const renderEndpoint = proactiveInlineSummarization
645+
? endpoint.cloneWithTokenOverride(endpoint.modelMaxPromptTokens * INLINE_SUMMARIZATION_BUDGET_EXPANSION)
646+
: endpoint;
647+
const renderProps: AgentPromptProps = proactiveInlineSummarization
648+
? { ...props, endpoint: renderEndpoint, inlineSummarization: true }
649+
: props;
650+
const renderer = PromptRenderer.create(this.instantiationService, renderEndpoint, this.prompt, renderProps);
601651
result = await renderer.render(progress, token);
602652
} catch (e) {
603653
if (e instanceof BudgetExceededError && summarizationEnabled) {
@@ -645,6 +695,8 @@ export class AgentIntentInvocation extends EditCodeIntentInvocation implements I
645695
// Background compaction failed — fall back to synchronous summarization
646696
result = await renderWithSummarization(`budget exceeded(${e.message}), background compaction failed`);
647697
}
698+
} else if (inlineSummarizationEnabled) {
699+
result = await renderWithInlineSummarization(`budget exceeded(${e.message})`);
648700
} else {
649701
result = await renderWithSummarization(`budget exceeded(${e.message})`);
650702
}

0 commit comments

Comments
 (0)