
Commit dd89a8e

marcominerva and dluc authored
Add token usage tracking (#947)
## Motivation and Context (Why the change? What's the scenario?)

Adds a new TokenUsage property to MemoryAnswer to hold information about token usage.

## High level description (Approach, Design)

* Include token count provided by the internal tokenizer
* Include token count provided by the service, if available
* Support streaming and multiple services, if needed

---------

Co-authored-by: Devis Lucato <[email protected]>
1 parent f21cc53 commit dd89a8e

File tree

25 files changed: +338 / -80 lines changed

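For reference while reading the diffs below: the TokenUsage class itself is not among the changed files. A minimal sketch of its shape, inferred from the properties used throughout this commit (member names match the call sites below; mutability and nullability are assumptions):

    // Inferred sketch of the new TokenUsage type; the actual definition is not part of this diff.
    public class TokenUsage
    {
        public DateTimeOffset Timestamp { get; set; }
        public string? ServiceType { get; set; }         // e.g. "Azure OpenAI", "OpenAI"
        public string? ModelType { get; set; }           // e.g. Constants.ModelType.TextGeneration
        public string? ModelName { get; set; }           // deployment or model name
        public int? TokenizerTokensIn { get; set; }      // counted by the KM tokenizer
        public int? TokenizerTokensOut { get; set; }     // counted by the KM tokenizer
        public int? ServiceTokensIn { get; set; }        // reported by the remote service
        public int? ServiceTokensOut { get; set; }       // reported by the remote service
        public int? ServiceReasoningTokens { get; set; } // reported by the remote service, if any
    }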

examples/001-dotnet-WebClient/Program.cs

Lines changed: 35 additions & 8 deletions
@@ -253,31 +253,58 @@ private static async Task AskSimpleQuestionStreamingTheAnswer()
     {
         var question = "What's E = m*c^2?";
         Console.WriteLine($"Question: {question}");
-        Console.WriteLine($"Expected result: formula explanation using the information loaded");
+        Console.WriteLine("Expected result: formula explanation using the information loaded");
 
         Console.Write("\nAnswer: ");
+        var tokenUsage = new List<TokenUsage>();
         var answerStream = s_memory.AskStreamingAsync(question, options: new SearchOptions { Stream = true });
 
         await foreach (var answer in answerStream)
         {
             // Print token received by LLM
             Console.Write(answer.Result);
+
+            // Collect token usage
+            if (answer.TokenUsage?.Count > 0)
+            {
+                tokenUsage = tokenUsage.Union(answer.TokenUsage).ToList();
+            }
+
             // Slow down the stream for demo purpose
             await Task.Delay(25);
         }
 
+        Console.WriteLine("\n\nToken usage report:");
+        foreach (var report in tokenUsage)
+        {
+            Console.WriteLine($"{report.ServiceType}: {report.ModelName} [{report.ModelType}]");
+            Console.WriteLine($"- Input : {report.TokenizerTokensIn} tokens (measured by KM tokenizer)");
+            Console.WriteLine($"- Input : {report.ServiceTokensIn} tokens (measured by remote service)");
+            Console.WriteLine($"- Output: {report.ServiceTokensOut} tokens (measured by remote service)");
+            Console.WriteLine($"- Output: {report.TokenizerTokensOut} tokens (measured by KM tokenizer)");
+            Console.WriteLine();
+        }
+
         Console.WriteLine("\n\n====================================\n");
 
         /* OUTPUT
 
         Question: What's E = m*c^2?
-
-        Answer: E = m*c^2 is the formula representing the principle of mass-energy equivalence, which was introduced by Albert Einstein. In this equation,
-        E stands for energy, m represents mass, and c is the speed of light in a vacuum, which is approximately 299,792,458 meters per second (m/s).
-        The equation states that the energy (E) of a system in its rest frame is equal to its mass (m) multiplied by the square of the speed of light (c^2).
-        This implies that mass and energy are interchangeable; a small amount of mass can be converted into a large amount of energy and vice versa,
-        due to the speed of light being a very large number when squared. This concept is a fundamental principle in physics and has important implications
-        in various fields, including nuclear physics and cosmology.
+        Expected result: formula explanation using the information loaded
+
+        Answer: E = m*c^2 is a formula derived by the physicist Albert Einstein, which describes the principle of
+        mass–energy equivalence. In this equation, E represents energy, m represents mass, and c represents the
+        speed of light in a vacuum (approximately 3 x 10^8 meters per second). The formula indicates that mass and
+        energy are interchangeable; they are different forms of the same thing and can be converted into each other.
+        This principle is fundamental in physics and has significant implications in various fields, including nuclear
+        physics and cosmology.
+
+        Token usage report:
+        Azure OpenAI: gpt-4o [TextGeneration]
+        - Input : 15657 tokens (measured by KM tokenizer)
+        - Input : 15664 tokens (measured by remote service)
+        - Output: 110 tokens (measured by remote service)
+        - Output: 110 tokens (measured by KM tokenizer)
 
         */
     }
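The streaming example above accumulates usage reports chunk by chunk. As a point of comparison, a minimal sketch of the non-streaming path, assuming the plain AskAsync call also populates the MemoryAnswer.TokenUsage property that this PR introduces (not shown in this diff):

    // Hypothetical non-streaming variant: one MemoryAnswer, usage already collected.
    var answer = await s_memory.AskAsync("What's E = m*c^2?");
    foreach (var report in answer.TokenUsage ?? new List<TokenUsage>())
    {
        Console.WriteLine($"{report.ServiceType}: {report.ModelName} [{report.ModelType}]");
        Console.WriteLine($"- Input : {report.ServiceTokensIn} tokens (measured by remote service)");
        Console.WriteLine($"- Output: {report.ServiceTokensOut} tokens (measured by remote service)");
    }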

examples/002-dotnet-Serverless/Program.cs

Lines changed: 34 additions & 8 deletions
@@ -311,31 +311,57 @@ private static async Task AskSimpleQuestionStreamingTheAnswer()
     {
         var question = "What's E = m*c^2?";
         Console.WriteLine($"Question: {question}");
-        Console.WriteLine($"Expected result: formula explanation using the information loaded");
+        Console.WriteLine("Expected result: formula explanation using the information loaded");
 
         Console.Write("\nAnswer: ");
+        var tokenUsage = new List<TokenUsage>();
         var answerStream = s_memory.AskStreamingAsync(question, options: new SearchOptions { Stream = true });
 
         await foreach (var answer in answerStream)
         {
             // Print token received by LLM
             Console.Write(answer.Result);
+
+            // Collect token usage
+            if (answer.TokenUsage?.Count > 0)
+            {
+                tokenUsage = tokenUsage.Union(answer.TokenUsage).ToList();
+            }
+
             // Slow down the stream for demo purpose
             await Task.Delay(25);
         }
 
+        Console.WriteLine("\n\nToken usage report:");
+        foreach (var report in tokenUsage)
+        {
+            Console.WriteLine($"{report.ServiceType}: {report.ModelName} [{report.ModelType}]");
+            Console.WriteLine($"- Input : {report.TokenizerTokensIn} tokens (measured by KM tokenizer)");
+            Console.WriteLine($"- Input : {report.ServiceTokensIn} tokens (measured by remote service)");
+            Console.WriteLine($"- Output: {report.ServiceTokensOut} tokens (measured by remote service)");
+            Console.WriteLine($"- Output: {report.TokenizerTokensOut} tokens (measured by KM tokenizer)");
+            Console.WriteLine();
+        }
+
         Console.WriteLine("\n\n====================================\n");
 
         /* OUTPUT
 
         Question: What's E = m*c^2?
-
-        Answer: E = m*c^2 is the formula representing the principle of mass-energy equivalence, which was introduced by Albert Einstein. In this equation,
-        E stands for energy, m represents mass, and c is the speed of light in a vacuum, which is approximately 299,792,458 meters per second (m/s).
-        The equation states that the energy (E) of a system in its rest frame is equal to its mass (m) multiplied by the square of the speed of light (c^2).
-        This implies that mass and energy are interchangeable; a small amount of mass can be converted into a large amount of energy and vice versa,
-        due to the speed of light being a very large number when squared. This concept is a fundamental principle in physics and has important implications
-        in various fields, including nuclear physics and cosmology.
+        Expected result: formula explanation using the information loaded
+
+        Answer: E = m*c^2 is a formula derived by physicist Albert Einstein, which expresses the principle of
+        mass–energy equivalence. In this equation, E represents energy, m represents mass, and c represents the
+        speed of light in a vacuum (approximately 3 x 10^8 meters per second). The formula indicates that mass and
+        energy are interchangeable; a small amount of mass can be converted into a large amount of energy, and vice
+        versa, differing only by a multiplicative constant (c^2).
+
+        Token usage report:
+        Azure OpenAI: gpt-4o [TextGeneration]
+        - Input : 24349 tokens (measured by KM tokenizer)
+        - Input : 24356 tokens (measured by remote service)
+        - Output: 103 tokens (measured by remote service)
+        - Output: 103 tokens (measured by KM tokenizer)
 
         */
     }

examples/104-dotnet-custom-LLM/Program.cs

Lines changed: 1 addition & 1 deletion
@@ -68,7 +68,7 @@ public IReadOnlyList<string> GetTokens(string text)
     }
 
     /// <inheritdoc />
-    public async IAsyncEnumerable<string> GenerateTextAsync(
+    public async IAsyncEnumerable<GeneratedTextContent> GenerateTextAsync(
         string prompt,
         TextGenerationOptions options,
         [EnumeratorCancellation] CancellationToken cancellationToken = default)

extensions/Anthropic/AnthropicTextGeneration.cs

Lines changed: 1 addition & 1 deletion
@@ -97,7 +97,7 @@ public IReadOnlyList<string> GetTokens(string text)
     }
 
     /// <inheritdoc />
-    public async IAsyncEnumerable<string> GenerateTextAsync(
+    public async IAsyncEnumerable<GeneratedTextContent> GenerateTextAsync(
         string prompt,
         TextGenerationOptions options,
         [EnumeratorCancellation] CancellationToken cancellationToken = default)

extensions/AzureOpenAI/AzureOpenAI/AzureOpenAITextGenerator.cs

Lines changed: 33 additions & 4 deletions
@@ -1,5 +1,6 @@
 // Copyright (c) Microsoft. All rights reserved.
 
+using System;
 using System.Collections.Generic;
 using System.Diagnostics.CodeAnalysis;
 using System.Net.Http;
@@ -12,6 +13,7 @@
 using Microsoft.KernelMemory.Diagnostics;
 using Microsoft.SemanticKernel;
 using Microsoft.SemanticKernel.Connectors.AzureOpenAI;
+using OpenAI.Chat;
 
 namespace Microsoft.KernelMemory.AI.AzureOpenAI;
 
@@ -28,6 +30,8 @@ public sealed class AzureOpenAITextGenerator : ITextGenerator
     private readonly ITextTokenizer _textTokenizer;
     private readonly ILogger<AzureOpenAITextGenerator> _log;
 
+    private readonly string _deployment;
+
     /// <inheritdoc/>
     public int MaxTokenTotal { get; }
 
@@ -87,6 +91,7 @@ public AzureOpenAITextGenerator(
     {
         this._client = skClient;
         this._log = (loggerFactory ?? DefaultLogger.Factory).CreateLogger<AzureOpenAITextGenerator>();
+        this._deployment = config.Deployment;
         this.MaxTokenTotal = config.MaxTokenTotal;
 
         textTokenizer ??= TokenizerFactory.GetTokenizerForEncoding(config.Tokenizer);
@@ -114,7 +119,7 @@ public IReadOnlyList<string> GetTokens(string text)
     }
 
     /// <inheritdoc/>
-    public async IAsyncEnumerable<string> GenerateTextAsync(
+    public async IAsyncEnumerable<GeneratedTextContent> GenerateTextAsync(
         string prompt,
         TextGenerationOptions options,
         [EnumeratorCancellation] CancellationToken cancellationToken = default)
@@ -153,9 +158,33 @@ public async IAsyncEnumerable<string> GenerateTextAsync(
 
         await foreach (StreamingTextContent x in result.WithCancellation(cancellationToken))
         {
-            if (x.Text == null) { continue; }
-
-            yield return x.Text;
+            TokenUsage? tokenUsage = null;
+
+            // The last message includes tokens usage metadata.
+            // https://platform.openai.com/docs/api-reference/chat/create#chat-create-stream_options
+            if (x.Metadata?["Usage"] is ChatTokenUsage usage)
+            {
+                this._log.LogTrace("Usage report: input tokens: {InputTokenCount}, output tokens: {OutputTokenCount}, output reasoning tokens: {ReasoningTokenCount}",
+                    usage.InputTokenCount, usage.OutputTokenCount, usage.OutputTokenDetails?.ReasoningTokenCount ?? 0);
+
+                tokenUsage = new TokenUsage
+                {
+                    Timestamp = (DateTimeOffset?)x.Metadata["CreatedAt"] ?? DateTimeOffset.UtcNow,
+                    ServiceType = "Azure OpenAI",
+                    ModelType = Constants.ModelType.TextGeneration,
+                    ModelName = this._deployment,
+                    ServiceTokensIn = usage.InputTokenCount,
+                    ServiceTokensOut = usage.OutputTokenCount,
+                    ServiceReasoningTokens = usage.OutputTokenDetails?.ReasoningTokenCount
+                };
+            }
+
+            // NOTE: as stated at https://platform.openai.com/docs/api-reference/chat/streaming#chat/streaming-choices,
+            // the Choice can also be empty for the last chunk if we set stream_options: { "include_usage": true} to get token counts, so it is possible that
+            // x.Text is null, but tokenUsage is not (token usage statistics for the entire request are included in the last chunk).
+            if (x.Text is null && tokenUsage is null) { continue; }
+
+            yield return new(x.Text ?? string.Empty, tokenUsage);
         }
     }
 }
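One consequence of the change above worth noting: with stream_options include_usage enabled, the final streamed chunk carries the usage totals but no text, so callers of GenerateTextAsync now receive one trailing item with an empty Text. A minimal consumer sketch, assuming GeneratedTextContent exposes Text and TokenUsage properties (its definition is not part of this diff):

    // Drains the generator, separating text from the trailing usage-only chunk.
    var text = new StringBuilder();
    TokenUsage? usage = null;
    await foreach (var chunk in generator.GenerateTextAsync(prompt, options, cancellationToken))
    {
        text.Append(chunk.Text);    // empty string for the usage-only chunk
        usage ??= chunk.TokenUsage; // non-null only on the last chunk
    }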

extensions/LlamaSharp/LlamaSharp.FunctionalTests/LlamaSharpTextGeneratorTest.cs

Lines changed: 2 additions & 3 deletions
@@ -40,7 +40,7 @@ public void ItCountsTokens()
 
         // Assert
         Console.WriteLine("Phi3 token count: " + tokenCount);
-        Console.WriteLine("GPT4 token count: " + (new CL100KTokenizer()).CountTokens(text));
+        Console.WriteLine("GPT4 token count: " + new CL100KTokenizer().CountTokens(text));
         Console.WriteLine($"Time: {this._timer.ElapsedMilliseconds / 1000} secs");
 
         // Expected result with Phi-3-mini-4k-instruct-q4.gguf, without BoS (https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf)
@@ -90,9 +90,8 @@ public async Task ItGeneratesText()
         this._timer.Restart();
         var tokens = this._target.GenerateTextAsync(prompt, options);
         var result = new StringBuilder();
-        await foreach (string token in tokens)
+        await foreach (var token in tokens)
         {
-            // Console.WriteLine(token);
             result.Append(token);
         }

extensions/LlamaSharp/LlamaSharp/LlamaSharpTextGenerator.cs

Lines changed: 3 additions & 3 deletions
@@ -74,7 +74,7 @@ public IReadOnlyList<string> GetTokens(string text)
     }
 
     /// <inheritdoc/>
-    public IAsyncEnumerable<string> GenerateTextAsync(
+    public IAsyncEnumerable<GeneratedTextContent> GenerateTextAsync(
         string prompt,
         TextGenerationOptions options,
         CancellationToken cancellationToken = default)
@@ -85,7 +85,7 @@ public IAsyncEnumerable<string> GenerateTextAsync(
             ? options.TokenSelectionBiases.ToDictionary(pair => (LLamaToken)pair.Key, pair => pair.Value)
             : [];
 
-        var samplingPipeline = new DefaultSamplingPipeline()
+        var samplingPipeline = new DefaultSamplingPipeline
         {
             Temperature = (float)options.Temperature,
             TopP = (float)options.NucleusSampling,
@@ -103,7 +103,7 @@ public IAsyncEnumerable<string> GenerateTextAsync(
         };
 
         this._log.LogTrace("Generating text, temperature {0}, max tokens {1}", samplingPipeline.Temperature, settings.MaxTokens);
-        return executor.InferAsync(prompt, settings, cancellationToken);
+        return executor.InferAsync(prompt, settings, cancellationToken).Select(x => new GeneratedTextContent(x));
     }
 
     /// <inheritdoc/>
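The Select call above presumably relies on an async LINQ extension for IAsyncEnumerable (e.g. System.Linq.Async). Local backends like LlamaSharp have no service-side usage report, so each token is wrapped with no TokenUsage attached, via the single-argument GeneratedTextContent constructor seen here. An equivalent sketch without the LINQ dependency, for illustration only:

    // Hypothetical adapter: wraps a plain token stream in GeneratedTextContent.
    static async IAsyncEnumerable<GeneratedTextContent> WrapAsync(
        IAsyncEnumerable<string> tokens,
        [EnumeratorCancellation] CancellationToken ct = default)
    {
        await foreach (var token in tokens.WithCancellation(ct))
        {
            yield return new GeneratedTextContent(token); // no usage metadata locally
        }
    }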

extensions/ONNX/Onnx.FunctionalTests/OnnxTextGeneratorTest.cs

Lines changed: 1 addition & 1 deletion
@@ -45,7 +45,7 @@ public async Task ItGeneratesText()
         this._timer.Restart();
         var tokens = this._target.GenerateTextAsync(prompt, options);
         var result = new StringBuilder();
-        await foreach (string token in tokens)
+        await foreach (var token in tokens)
         {
             result.Append(token);
         }

extensions/ONNX/Onnx/OnnxTextGenerator.cs

Lines changed: 1 addition & 1 deletion
@@ -85,7 +85,7 @@ public OnnxTextGenerator(
     }
 
     /// <inheritdoc/>
-    public async IAsyncEnumerable<string> GenerateTextAsync(
+    public async IAsyncEnumerable<GeneratedTextContent> GenerateTextAsync(
         string prompt,
         TextGenerationOptions? options = null,
         [EnumeratorCancellation] CancellationToken cancellationToken = default)

extensions/Ollama/Ollama/OllamaTextGenerator.cs

Lines changed: 1 addition & 1 deletion
@@ -91,7 +91,7 @@ public IReadOnlyList<string> GetTokens(string text)
         return this._textTokenizer.GetTokens(text);
     }
 
-    public async IAsyncEnumerable<string> GenerateTextAsync(
+    public async IAsyncEnumerable<GeneratedTextContent> GenerateTextAsync(
         string prompt,
         TextGenerationOptions options,
         [EnumeratorCancellation] CancellationToken cancellationToken = default)

extensions/OpenAI/OpenAI/OpenAITextGenerator.cs

Lines changed: 33 additions & 12 deletions
@@ -1,5 +1,6 @@
 // Copyright (c) Microsoft. All rights reserved.
 
+using System;
 using System.Collections.Generic;
 using System.Diagnostics.CodeAnalysis;
 using System.Net.Http;
@@ -12,6 +13,7 @@
 using Microsoft.SemanticKernel;
 using Microsoft.SemanticKernel.Connectors.OpenAI;
 using OpenAI;
+using OpenAI.Chat;
 
 namespace Microsoft.KernelMemory.AI.OpenAI;
 
@@ -29,6 +31,8 @@ public sealed class OpenAITextGenerator : ITextGenerator
     private readonly ITextTokenizer _textTokenizer;
     private readonly ILogger<OpenAITextGenerator> _log;
 
+    private readonly string _textModel;
+
     /// <inheritdoc/>
     public int MaxTokenTotal { get; }
 
@@ -87,6 +91,7 @@ public OpenAITextGenerator(
     {
         this._client = skClient;
         this._log = (loggerFactory ?? DefaultLogger.Factory).CreateLogger<OpenAITextGenerator>();
+        this._textModel = config.TextModel;
         this.MaxTokenTotal = config.TextModelMaxTokenTotal;
 
         if (textTokenizer == null && !string.IsNullOrEmpty(config.TextModelTokenizer))
@@ -119,7 +124,7 @@ public IReadOnlyList<string> GetTokens(string text)
     }
 
     /// <inheritdoc/>
-    public async IAsyncEnumerable<string> GenerateTextAsync(
+    public async IAsyncEnumerable<GeneratedTextContent> GenerateTextAsync(
         string prompt,
         TextGenerationOptions options,
         [EnumeratorCancellation] CancellationToken cancellationToken = default)
@@ -159,17 +164,33 @@ public async IAsyncEnumerable<string> GenerateTextAsync(
 
         await foreach (StreamingTextContent x in result.WithCancellation(cancellationToken))
         {
-            // TODO: try catch
-            // if (x.Metadata?["Usage"] is not null)
-            // {
-            //     var usage = x.Metadata["Usage"] as ChatTokenUsage;
-            //     this._log.LogTrace("Usage report: input tokens {0}, output tokens {1}, output reasoning tokens {2}",
-            //         usage?.InputTokenCount, usage?.OutputTokenCount, usage?.OutputTokenDetails.ReasoningTokenCount);
-            // }
-
-            if (x.Text == null) { continue; }
-
-            yield return x.Text;
+            TokenUsage? tokenUsage = null;
+
+            // The last message in the chunk has the usage metadata.
+            // https://platform.openai.com/docs/api-reference/chat/create#chat-create-stream_options
+            if (x.Metadata?["Usage"] is ChatTokenUsage { } usage)
+            {
+                this._log.LogTrace("Usage report: input tokens {0}, output tokens {1}, output reasoning tokens {2}",
+                    usage.InputTokenCount, usage.OutputTokenCount, usage.OutputTokenDetails?.ReasoningTokenCount ?? 0);
+
+                tokenUsage = new TokenUsage
+                {
+                    Timestamp = (DateTimeOffset?)x.Metadata["CreatedAt"] ?? DateTimeOffset.UtcNow,
+                    ServiceType = "OpenAI",
+                    ModelType = Constants.ModelType.TextGeneration,
+                    ModelName = this._textModel,
+                    ServiceTokensIn = usage!.InputTokenCount,
+                    ServiceTokensOut = usage.OutputTokenCount,
+                    ServiceReasoningTokens = usage.OutputTokenDetails?.ReasoningTokenCount
+                };
+            }
+
+            // NOTE: as stated at https://platform.openai.com/docs/api-reference/chat/streaming#chat/streaming-choices,
+            // The Choice can also be empty for the last chunk if we set stream_options: { "include_usage": true} to get token counts, so it is possible that
+            // x.Text is null, but tokenUsage is not (token usage statistics for the entire request are included in the last chunk).
+            if (x.Text is null && tokenUsage is null) { continue; }
+
+            yield return new(x.Text ?? string.Empty, tokenUsage);
        }
    }
 }

service/Abstractions/AI/ITextGenerator.cs

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@ public interface ITextGenerator : ITextTokenizer
     /// <param name="options">Options for the LLM request</param>
     /// <param name="cancellationToken">Async task cancellation token</param>
     /// <returns>Text generated, returned as a stream of strings/tokens</returns>
-    public IAsyncEnumerable<string> GenerateTextAsync(
+    public IAsyncEnumerable<GeneratedTextContent> GenerateTextAsync(
         string prompt,
         TextGenerationOptions options,
         CancellationToken cancellationToken = default);
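The GeneratedTextContent type that replaces string across these signatures is likewise not among the changed files. A sketch inferred from the call sites in this commit (new(x.Text ?? string.Empty, tokenUsage), new GeneratedTextContent(x), and StringBuilder.Append(token) in the tests); member names and the ToString override are assumptions:

    // Inferred sketch; the actual definition is not part of this diff.
    public class GeneratedTextContent
    {
        public string Text { get; }
        public TokenUsage? TokenUsage { get; }

        public GeneratedTextContent(string text, TokenUsage? tokenUsage = null)
        {
            this.Text = text;
            this.TokenUsage = tokenUsage;
        }

        // Would let StringBuilder.Append(token) in the tests append the text.
        public override string ToString() => this.Text;
    }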
