microsoft
diff --git a/‎package.json‎
Lines changed: 2 additions & 2 deletions b/‎package.json‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎package.nls.json‎
Lines changed: 1 addition & 1 deletion b/‎package.nls.json‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/extension/intents/node/agentIntent.ts‎
Lines changed: 55 additions & 3 deletions b/‎src/extension/intents/node/agentIntent.ts‎
Lines changed: 55 additions & 3 deletions
@@ -4387,10 +4387,10 @@
 							"experimental"
 						]
 					},
-					"github.copilot.chat.agentHistorySummarizationCacheFriendly": {
+					"github.copilot.chat.agentHistorySummarizationInline": {
 						"type": "boolean",
 						"default": false,
-						"markdownDescription": "%github.copilot.config.agentHistorySummarizationCacheFriendly%",
+						"markdownDescription": "%github.copilot.config.agentHistorySummarizationInline%",
 						"tags": [
 							"advanced",
 							"experimental",
 
@@ -391,7 +391,7 @@
 	"github.copilot.config.summarizeAgentConversationHistoryThreshold": "Threshold for compacting agent conversation history.",
 	"github.copilot.config.agentHistorySummarizationMode": "Mode for agent history summarization.",
 	"github.copilot.config.backgroundCompaction": "Enable background compaction of conversation history.",
-	"github.copilot.config.agentHistorySummarizationCacheFriendly": "Use a cache-friendly summarization prompt that shares the agent prefix for prompt cache hits.",
+	"github.copilot.config.agentHistorySummarizationInline": "Summarize conversation inline within the agent loop instead of a separate LLM call, maximizing prompt cache hits.",
 
 	"github.copilot.config.useResponsesApiTruncation": "Use Responses API for truncation.",
 	"github.copilot.config.enableReadFileV2": "Enable version 2 of the read file tool.",
 
@@ -17,9 +17,9 @@ import { IEnvService } from '../../../platform/env/common/envService';
 import { ILogService } from '../../../platform/log/common/logService';
 import { IEditLogService } from '../../../platform/multiFileEdit/common/editLogService';
 import { CUSTOM_TOOL_SEARCH_NAME, isAnthropicCustomToolSearchEnabled, isAnthropicToolSearchEnabled } from '../../../platform/networking/common/anthropic';
-import { IToolDeferralService } from '../../../platform/networking/common/toolDeferralService';
 import { IChatEndpoint } from '../../../platform/networking/common/networking';
 import { modelsWithoutResponsesContextManagement } from '../../../platform/networking/common/openai';
+import { IToolDeferralService } from '../../../platform/networking/common/toolDeferralService';
 import { INotebookService } from '../../../platform/notebook/common/notebookService';
 import { GenAiMetrics } from '../../../platform/otel/common/genAiMetrics';
 import { IOTelService } from '../../../platform/otel/common/otelService';
@@ -64,6 +64,8 @@ import { getAgentMaxRequests } from '../common/agentConfig';
 import { addCacheBreakpoints } from './cacheBreakpoints';
 import { EditCodeIntent, EditCodeIntentInvocation, EditCodeIntentInvocationOptions, mergeMetadata, toNewChatReferences } from './editCodeIntent';
 
+const INLINE_SUMMARIZATION_BUDGET_EXPANSION = 1.15;
+
 function isResponsesCompactionContextManagementEnabled(endpoint: IChatEndpoint, configurationService: IConfigurationService, experimentationService: IExperimentationService): boolean {
 	return endpoint.apiType === 'responses'
 		&& configurationService.getExperimentBasedConfig(ConfigKey.ResponsesApiContextManagementEnabled, experimentationService)
@@ -425,7 +427,9 @@ export class AgentIntentInvocation extends EditCodeIntentInvocation implements I
 		const useTruncation = this.endpoint.apiType === 'responses' && this.configurationService.getConfig(ConfigKey.Advanced.UseResponsesApiTruncation);
 		const responsesCompactionContextManagementEnabled = isResponsesCompactionContextManagementEnabled(this.endpoint, this.configurationService, this.expService);
 		const summarizationEnabled = this.configurationService.getConfig(ConfigKey.SummarizeAgentConversationHistory) && this.prompt === AgentPrompt && !responsesCompactionContextManagementEnabled;
-		const backgroundCompactionEnabled = summarizationEnabled && this.configurationService.getExperimentBasedConfig(ConfigKey.BackgroundCompaction, this.expService);
+		const inlineSummarizationEnabled = summarizationEnabled && this.configurationService.getExperimentBasedConfig(ConfigKey.Advanced.AgentHistorySummarizationInline, this.expService);
+		// Disable background compaction when inline summarization is active — they solve the same problem
+		const backgroundCompactionEnabled = summarizationEnabled && !inlineSummarizationEnabled && this.configurationService.getExperimentBasedConfig(ConfigKey.BackgroundCompaction, this.expService);
 
 		// When tools are present, apply a 10% safety margin on the message portion
 		// to account for tokenizer discrepancies between our tool-token counter and
@@ -472,6 +476,26 @@ export class AgentIntentInvocation extends EditCodeIntentInvocation implements I
 			? (this._lastRenderTokenCount + toolTokens) / baseBudget
 			: 0;
 
+		// ── Proactive inline summarization: pre-render check ──────────────
+		// Use _lastRenderTokenCount (from the previous iteration) to decide
+		// whether to append the summarize instruction *before* the main
+		// render, avoiding a wasteful double-render.
+		// Guard: skip when a summary was already stored on the current or
+		// most-recent history turn — _lastRenderTokenCount is stale from the
+		// summarization render and would falsely re-trigger.
+		let proactiveInlineSummarization = false;
+		if (inlineSummarizationEnabled && baseBudget > 0) {
+			const hasRecentSummary = promptContext.toolCallRounds?.some(r => r.summary)
+				|| promptContext.history.at(-1)?.rounds.some(r => r.summary);
+			if (!hasRecentSummary) {
+				const preRenderRatio = (this._lastRenderTokenCount + toolTokens) / baseBudget;
+				if (preRenderRatio >= 0.85) {
+					this.logService.debug(`[Agent] pre-render at ${(preRenderRatio * 100).toFixed(0)}% — proactively enabling inline summarization`);
+					proactiveInlineSummarization = true;
+				}
+			}
+		}
+
 		// Track whether we applied a summary in this iteration so we don't
 		// immediately re-trigger background compaction in the post-render check.
 		let summaryAppliedThisIteration = false;
@@ -594,10 +618,36 @@ export class AgentIntentInvocation extends EditCodeIntentInvocation implements I
 			}
 		};
 
+		// Helper function for inline summarization — appends summarize instruction
+		// as a user message in the agent loop instead of making a separate LLM call.
+		// Returns the render result with InlineSummarizationRequestedMetadata set.
+		const renderWithInlineSummarization = async (reason: string, renderProps: AgentPromptProps = props): Promise<RenderPromptResult> => {
+			this.logService.debug(`[Agent] ${reason}, triggering inline summarization`);
+			try {
+				// Expand from the *base* endpoint (not renderProps.endpoint which may already be expanded)
+				const expandedEndpoint = endpoint.cloneWithTokenOverride(endpoint.modelMaxPromptTokens * INLINE_SUMMARIZATION_BUDGET_EXPANSION);
+				const renderer = PromptRenderer.create(this.instantiationService, expandedEndpoint, this.prompt, {
+					...renderProps,
+					endpoint: expandedEndpoint,
+					inlineSummarization: true,
+				});
+				return await renderer.render(progress, token);
+			} catch (e) {
+				this.logService.error(e, `[Agent] inline summarization render failed, falling back to separate-call summarization`);
+				return await renderWithSummarization(`inline summarization failed (${e instanceof Error ? e.message : e}), falling back`, renderProps);
+			}
+		};
+
 		const contextLengthBefore = this._lastRenderTokenCount;
 
 		try {
-			const renderer = PromptRenderer.create(this.instantiationService, endpoint, this.prompt, props);
+			const renderEndpoint = proactiveInlineSummarization
+				? endpoint.cloneWithTokenOverride(endpoint.modelMaxPromptTokens * INLINE_SUMMARIZATION_BUDGET_EXPANSION)
+				: endpoint;
+			const renderProps: AgentPromptProps = proactiveInlineSummarization
+				? { ...props, endpoint: renderEndpoint, inlineSummarization: true }
+				: props;
+			const renderer = PromptRenderer.create(this.instantiationService, renderEndpoint, this.prompt, renderProps);
 			result = await renderer.render(progress, token);
 		} catch (e) {
 			if (e instanceof BudgetExceededError && summarizationEnabled) {
@@ -645,6 +695,8 @@ export class AgentIntentInvocation extends EditCodeIntentInvocation implements I
 						// Background compaction failed — fall back to synchronous summarization
 						result = await renderWithSummarization(`budget exceeded(${e.message}), background compaction failed`);
 					}
+				} else if (inlineSummarizationEnabled) {
+					result = await renderWithInlineSummarization(`budget exceeded(${e.message})`);
 				} else {
 					result = await renderWithSummarization(`budget exceeded(${e.message})`);
 				}