Skip to content

Commit

Permalink
Add token usage tracking (#947)
Browse files Browse the repository at this point in the history
## Motivation and Context (Why the change? What's the scenario?)

Adds a new TokenUsage property to MemoryAnswer to hold information about token usage.

## High level description (Approach, Design)

* Include token count provided by the internal tokenizer
* Include token count provided by the service, if available
* Support streaming and multiple services, if needed

---------

Co-authored-by: Devis Lucato <[email protected]>
  • Loading branch information
marcominerva and dluc authored Jan 15, 2025
1 parent f21cc53 commit dd89a8e
Show file tree
Hide file tree
Showing 25 changed files with 338 additions and 80 deletions.
43 changes: 35 additions & 8 deletions examples/001-dotnet-WebClient/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -253,31 +253,58 @@ private static async Task AskSimpleQuestionStreamingTheAnswer()
{
var question = "What's E = m*c^2?";
Console.WriteLine($"Question: {question}");
Console.WriteLine($"Expected result: formula explanation using the information loaded");
Console.WriteLine("Expected result: formula explanation using the information loaded");

Console.Write("\nAnswer: ");
var tokenUsage = new List<TokenUsage>();
var answerStream = s_memory.AskStreamingAsync(question, options: new SearchOptions { Stream = true });

await foreach (var answer in answerStream)
{
// Print the token received from the LLM
Console.Write(answer.Result);

// Collect token usage
if (answer.TokenUsage?.Count > 0)
{
tokenUsage = tokenUsage.Union(answer.TokenUsage).ToList();
}

// Slow down the stream for demo purposes
await Task.Delay(25);
}

Console.WriteLine("\n\nToken usage report:");
foreach (var report in tokenUsage)
{
Console.WriteLine($"{report.ServiceType}: {report.ModelName} [{report.ModelType}]");
Console.WriteLine($"- Input : {report.TokenizerTokensIn} tokens (measured by KM tokenizer)");
Console.WriteLine($"- Input : {report.ServiceTokensIn} tokens (measured by remote service)");
Console.WriteLine($"- Output: {report.ServiceTokensOut} tokens (measured by remote service)");
Console.WriteLine($"- Output: {report.TokenizerTokensOut} tokens (measured by KM tokenizer)");
Console.WriteLine();
}

Console.WriteLine("\n\n====================================\n");

/* OUTPUT
Question: What's E = m*c^2?
Answer: E = m*c^2 is the formula representing the principle of mass-energy equivalence, which was introduced by Albert Einstein. In this equation,
E stands for energy, m represents mass, and c is the speed of light in a vacuum, which is approximately 299,792,458 meters per second (m/s).
The equation states that the energy (E) of a system in its rest frame is equal to its mass (m) multiplied by the square of the speed of light (c^2).
This implies that mass and energy are interchangeable; a small amount of mass can be converted into a large amount of energy and vice versa,
due to the speed of light being a very large number when squared. This concept is a fundamental principle in physics and has important implications
in various fields, including nuclear physics and cosmology.
Expected result: formula explanation using the information loaded
Answer: E = m*c^2 is a formula derived by the physicist Albert Einstein, which describes the principle of
mass–energy equivalence. In this equation, E represents energy, m represents mass, and c represents the
speed of light in a vacuum (approximately 3 x 10^8 meters per second). The formula indicates that mass and
energy are interchangeable; they are different forms of the same thing and can be converted into each other.
This principle is fundamental in physics and has significant implications in various fields, including nuclear
physics and cosmology.
Token usage report:
Azure OpenAI: gpt-4o [TextGeneration]
- Input : 15657 tokens (measured by KM tokenizer)
- Input : 15664 tokens (measured by remote service)
- Output: 110 tokens (measured by remote service)
- Output: 110 tokens (measured by KM tokenizer)
*/
}
Expand Down
42 changes: 34 additions & 8 deletions examples/002-dotnet-Serverless/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -311,31 +311,57 @@ private static async Task AskSimpleQuestionStreamingTheAnswer()
{
var question = "What's E = m*c^2?";
Console.WriteLine($"Question: {question}");
Console.WriteLine($"Expected result: formula explanation using the information loaded");
Console.WriteLine("Expected result: formula explanation using the information loaded");

Console.Write("\nAnswer: ");
var tokenUsage = new List<TokenUsage>();
var answerStream = s_memory.AskStreamingAsync(question, options: new SearchOptions { Stream = true });

await foreach (var answer in answerStream)
{
// Print the token received from the LLM
Console.Write(answer.Result);

// Collect token usage
if (answer.TokenUsage?.Count > 0)
{
tokenUsage = tokenUsage.Union(answer.TokenUsage).ToList();
}

// Slow down the stream for demo purposes
await Task.Delay(25);
}

Console.WriteLine("\n\nToken usage report:");
foreach (var report in tokenUsage)
{
Console.WriteLine($"{report.ServiceType}: {report.ModelName} [{report.ModelType}]");
Console.WriteLine($"- Input : {report.TokenizerTokensIn} tokens (measured by KM tokenizer)");
Console.WriteLine($"- Input : {report.ServiceTokensIn} tokens (measured by remote service)");
Console.WriteLine($"- Output: {report.ServiceTokensOut} tokens (measured by remote service)");
Console.WriteLine($"- Output: {report.TokenizerTokensOut} tokens (measured by KM tokenizer)");
Console.WriteLine();
}

Console.WriteLine("\n\n====================================\n");

/* OUTPUT
Question: What's E = m*c^2?
Answer: E = m*c^2 is the formula representing the principle of mass-energy equivalence, which was introduced by Albert Einstein. In this equation,
E stands for energy, m represents mass, and c is the speed of light in a vacuum, which is approximately 299,792,458 meters per second (m/s).
The equation states that the energy (E) of a system in its rest frame is equal to its mass (m) multiplied by the square of the speed of light (c^2).
This implies that mass and energy are interchangeable; a small amount of mass can be converted into a large amount of energy and vice versa,
due to the speed of light being a very large number when squared. This concept is a fundamental principle in physics and has important implications
in various fields, including nuclear physics and cosmology.
Expected result: formula explanation using the information loaded
Answer: E = m*c^2 is a formula derived by physicist Albert Einstein, which expresses the principle of
mass–energy equivalence. In this equation, E represents energy, m represents mass, and c represents the
speed of light in a vacuum (approximately 3 x 10^8 meters per second). The formula indicates that mass and
energy are interchangeable; a small amount of mass can be converted into a large amount of energy, and vice
versa, differing only by a multiplicative constant (c^2).
Token usage report:
Azure OpenAI: gpt-4o [TextGeneration]
- Input : 24349 tokens (measured by KM tokenizer)
- Input : 24356 tokens (measured by remote service)
- Output: 103 tokens (measured by remote service)
- Output: 103 tokens (measured by KM tokenizer)
*/
}
Expand Down
2 changes: 1 addition & 1 deletion examples/104-dotnet-custom-LLM/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ public IReadOnlyList<string> GetTokens(string text)
}

/// <inheritdoc />
public async IAsyncEnumerable<string> GenerateTextAsync(
public async IAsyncEnumerable<GeneratedTextContent> GenerateTextAsync(
string prompt,
TextGenerationOptions options,
[EnumeratorCancellation] CancellationToken cancellationToken = default)
Expand Down
2 changes: 1 addition & 1 deletion extensions/Anthropic/AnthropicTextGeneration.cs
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ public IReadOnlyList<string> GetTokens(string text)
}

/// <inheritdoc />
public async IAsyncEnumerable<string> GenerateTextAsync(
public async IAsyncEnumerable<GeneratedTextContent> GenerateTextAsync(
string prompt,
TextGenerationOptions options,
[EnumeratorCancellation] CancellationToken cancellationToken = default)
Expand Down
37 changes: 33 additions & 4 deletions extensions/AzureOpenAI/AzureOpenAI/AzureOpenAITextGenerator.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
// Copyright (c) Microsoft. All rights reserved.

using System;
using System.Collections.Generic;
using System.Diagnostics.CodeAnalysis;
using System.Net.Http;
Expand All @@ -12,6 +13,7 @@
using Microsoft.KernelMemory.Diagnostics;
using Microsoft.SemanticKernel;
using Microsoft.SemanticKernel.Connectors.AzureOpenAI;
using OpenAI.Chat;

namespace Microsoft.KernelMemory.AI.AzureOpenAI;

Expand All @@ -28,6 +30,8 @@ public sealed class AzureOpenAITextGenerator : ITextGenerator
private readonly ITextTokenizer _textTokenizer;
private readonly ILogger<AzureOpenAITextGenerator> _log;

private readonly string _deployment;

/// <inheritdoc/>
public int MaxTokenTotal { get; }

Expand Down Expand Up @@ -87,6 +91,7 @@ public AzureOpenAITextGenerator(
{
this._client = skClient;
this._log = (loggerFactory ?? DefaultLogger.Factory).CreateLogger<AzureOpenAITextGenerator>();
this._deployment = config.Deployment;
this.MaxTokenTotal = config.MaxTokenTotal;

textTokenizer ??= TokenizerFactory.GetTokenizerForEncoding(config.Tokenizer);
Expand Down Expand Up @@ -114,7 +119,7 @@ public IReadOnlyList<string> GetTokens(string text)
}

/// <inheritdoc/>
public async IAsyncEnumerable<string> GenerateTextAsync(
public async IAsyncEnumerable<GeneratedTextContent> GenerateTextAsync(
string prompt,
TextGenerationOptions options,
[EnumeratorCancellation] CancellationToken cancellationToken = default)
Expand Down Expand Up @@ -153,9 +158,33 @@ public async IAsyncEnumerable<string> GenerateTextAsync(

await foreach (StreamingTextContent x in result.WithCancellation(cancellationToken))
{
if (x.Text == null) { continue; }

yield return x.Text;
TokenUsage? tokenUsage = null;

// The last message includes token usage metadata.
// https://platform.openai.com/docs/api-reference/chat/create#chat-create-stream_options
if (x.Metadata?["Usage"] is ChatTokenUsage usage)
{
this._log.LogTrace("Usage report: input tokens: {InputTokenCount}, output tokens: {OutputTokenCount}, output reasoning tokens: {ReasoningTokenCount}",
usage.InputTokenCount, usage.OutputTokenCount, usage.OutputTokenDetails?.ReasoningTokenCount ?? 0);

tokenUsage = new TokenUsage
{
Timestamp = (DateTimeOffset?)x.Metadata["CreatedAt"] ?? DateTimeOffset.UtcNow,
ServiceType = "Azure OpenAI",
ModelType = Constants.ModelType.TextGeneration,
ModelName = this._deployment,
ServiceTokensIn = usage.InputTokenCount,
ServiceTokensOut = usage.OutputTokenCount,
ServiceReasoningTokens = usage.OutputTokenDetails?.ReasoningTokenCount
};
}

// NOTE: as stated at https://platform.openai.com/docs/api-reference/chat/streaming#chat/streaming-choices,
// the Choice can also be empty for the last chunk if we set stream_options: { "include_usage": true} to get token counts, so it is possible that
// x.Text is null, but tokenUsage is not (token usage statistics for the entire request are included in the last chunk).
if (x.Text is null && tokenUsage is null) { continue; }

yield return new(x.Text ?? string.Empty, tokenUsage);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ public void ItCountsTokens()

// Assert
Console.WriteLine("Phi3 token count: " + tokenCount);
Console.WriteLine("GPT4 token count: " + (new CL100KTokenizer()).CountTokens(text));
Console.WriteLine("GPT4 token count: " + new CL100KTokenizer().CountTokens(text));
Console.WriteLine($"Time: {this._timer.ElapsedMilliseconds / 1000} secs");

// Expected result with Phi-3-mini-4k-instruct-q4.gguf, without BoS (https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf)
Expand Down Expand Up @@ -90,9 +90,8 @@ public async Task ItGeneratesText()
this._timer.Restart();
var tokens = this._target.GenerateTextAsync(prompt, options);
var result = new StringBuilder();
await foreach (string token in tokens)
await foreach (var token in tokens)
{
// Console.WriteLine(token);
result.Append(token);
}

Expand Down
6 changes: 3 additions & 3 deletions extensions/LlamaSharp/LlamaSharp/LlamaSharpTextGenerator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ public IReadOnlyList<string> GetTokens(string text)
}

/// <inheritdoc/>
public IAsyncEnumerable<string> GenerateTextAsync(
public IAsyncEnumerable<GeneratedTextContent> GenerateTextAsync(
string prompt,
TextGenerationOptions options,
CancellationToken cancellationToken = default)
Expand All @@ -85,7 +85,7 @@ public IAsyncEnumerable<string> GenerateTextAsync(
? options.TokenSelectionBiases.ToDictionary(pair => (LLamaToken)pair.Key, pair => pair.Value)
: [];

var samplingPipeline = new DefaultSamplingPipeline()
var samplingPipeline = new DefaultSamplingPipeline
{
Temperature = (float)options.Temperature,
TopP = (float)options.NucleusSampling,
Expand All @@ -103,7 +103,7 @@ public IAsyncEnumerable<string> GenerateTextAsync(
};

this._log.LogTrace("Generating text, temperature {0}, max tokens {1}", samplingPipeline.Temperature, settings.MaxTokens);
return executor.InferAsync(prompt, settings, cancellationToken);
return executor.InferAsync(prompt, settings, cancellationToken).Select(x => new GeneratedTextContent(x));
}

/// <inheritdoc/>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ public async Task ItGeneratesText()
this._timer.Restart();
var tokens = this._target.GenerateTextAsync(prompt, options);
var result = new StringBuilder();
await foreach (string token in tokens)
await foreach (var token in tokens)
{
result.Append(token);
}
Expand Down
2 changes: 1 addition & 1 deletion extensions/ONNX/Onnx/OnnxTextGenerator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ public OnnxTextGenerator(
}

/// <inheritdoc/>
public async IAsyncEnumerable<string> GenerateTextAsync(
public async IAsyncEnumerable<GeneratedTextContent> GenerateTextAsync(
string prompt,
TextGenerationOptions? options = null,
[EnumeratorCancellation] CancellationToken cancellationToken = default)
Expand Down
2 changes: 1 addition & 1 deletion extensions/Ollama/Ollama/OllamaTextGenerator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ public IReadOnlyList<string> GetTokens(string text)
return this._textTokenizer.GetTokens(text);
}

public async IAsyncEnumerable<string> GenerateTextAsync(
public async IAsyncEnumerable<GeneratedTextContent> GenerateTextAsync(
string prompt,
TextGenerationOptions options,
[EnumeratorCancellation] CancellationToken cancellationToken = default)
Expand Down
45 changes: 33 additions & 12 deletions extensions/OpenAI/OpenAI/OpenAITextGenerator.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
// Copyright (c) Microsoft. All rights reserved.

using System;
using System.Collections.Generic;
using System.Diagnostics.CodeAnalysis;
using System.Net.Http;
Expand All @@ -12,6 +13,7 @@
using Microsoft.SemanticKernel;
using Microsoft.SemanticKernel.Connectors.OpenAI;
using OpenAI;
using OpenAI.Chat;

namespace Microsoft.KernelMemory.AI.OpenAI;

Expand All @@ -29,6 +31,8 @@ public sealed class OpenAITextGenerator : ITextGenerator
private readonly ITextTokenizer _textTokenizer;
private readonly ILogger<OpenAITextGenerator> _log;

private readonly string _textModel;

/// <inheritdoc/>
public int MaxTokenTotal { get; }

Expand Down Expand Up @@ -87,6 +91,7 @@ public OpenAITextGenerator(
{
this._client = skClient;
this._log = (loggerFactory ?? DefaultLogger.Factory).CreateLogger<OpenAITextGenerator>();
this._textModel = config.TextModel;
this.MaxTokenTotal = config.TextModelMaxTokenTotal;

if (textTokenizer == null && !string.IsNullOrEmpty(config.TextModelTokenizer))
Expand Down Expand Up @@ -119,7 +124,7 @@ public IReadOnlyList<string> GetTokens(string text)
}

/// <inheritdoc/>
public async IAsyncEnumerable<string> GenerateTextAsync(
public async IAsyncEnumerable<GeneratedTextContent> GenerateTextAsync(
string prompt,
TextGenerationOptions options,
[EnumeratorCancellation] CancellationToken cancellationToken = default)
Expand Down Expand Up @@ -159,17 +164,33 @@ public async IAsyncEnumerable<string> GenerateTextAsync(

await foreach (StreamingTextContent x in result.WithCancellation(cancellationToken))
{
// TODO: try catch
// if (x.Metadata?["Usage"] is not null)
// {
// var usage = x.Metadata["Usage"] as ChatTokenUsage;
// this._log.LogTrace("Usage report: input tokens {0}, output tokens {1}, output reasoning tokens {2}",
// usage?.InputTokenCount, usage?.OutputTokenCount, usage?.OutputTokenDetails.ReasoningTokenCount);
// }

if (x.Text == null) { continue; }

yield return x.Text;
TokenUsage? tokenUsage = null;

// The last chunk in the stream has the usage metadata.
// https://platform.openai.com/docs/api-reference/chat/create#chat-create-stream_options
if (x.Metadata?["Usage"] is ChatTokenUsage { } usage)
{
this._log.LogTrace("Usage report: input tokens {0}, output tokens {1}, output reasoning tokens {2}",
usage.InputTokenCount, usage.OutputTokenCount, usage.OutputTokenDetails?.ReasoningTokenCount ?? 0);

tokenUsage = new TokenUsage
{
Timestamp = (DateTimeOffset?)x.Metadata["CreatedAt"] ?? DateTimeOffset.UtcNow,
ServiceType = "OpenAI",
ModelType = Constants.ModelType.TextGeneration,
ModelName = this._textModel,
ServiceTokensIn = usage!.InputTokenCount,
ServiceTokensOut = usage.OutputTokenCount,
ServiceReasoningTokens = usage.OutputTokenDetails?.ReasoningTokenCount
};
}

// NOTE: as stated at https://platform.openai.com/docs/api-reference/chat/streaming#chat/streaming-choices,
// the Choice can also be empty for the last chunk if we set stream_options: { "include_usage": true} to get token counts, so it is possible that
// x.Text is null, but tokenUsage is not (token usage statistics for the entire request are included in the last chunk).
if (x.Text is null && tokenUsage is null) { continue; }

yield return new(x.Text ?? string.Empty, tokenUsage);
}
}
}
2 changes: 1 addition & 1 deletion service/Abstractions/AI/ITextGenerator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ public interface ITextGenerator : ITextTokenizer
/// <param name="options">Options for the LLM request</param>
/// <param name="cancellationToken">Async task cancellation token</param>
/// <returns>Text generated, returned as a stream of strings/tokens</returns>
public IAsyncEnumerable<string> GenerateTextAsync(
public IAsyncEnumerable<GeneratedTextContent> GenerateTextAsync(
string prompt,
TextGenerationOptions options,
CancellationToken cancellationToken = default);
Expand Down
Loading

0 comments on commit dd89a8e

Please sign in to comment.