
Commit ce08807

Make MaxNumTokens work with newer models
Most models accept both `max_tokens` and `max_completion_tokens`; the newer reasoning models only accept `max_completion_tokens`, while GPT-3.5 deployments error when given `max_completion_tokens`. Since we cannot tell which model sits behind a given deployment name, we first try the newer name and retry with the old name if we get the corresponding error message. The test point also works with o1-mini; gpt-4o is just cheaper.
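The retry strategy described above can be sketched language-neutrally: send the request with `max_completion_tokens`, and only if the service rejects that exact argument, rename the field to `max_tokens` and retry once. This is an illustrative Python sketch, not the MATLAB implementation in this commit; `send_request` is a hypothetical callable standing in for `llms.internal.sendRequestWrapper`, while the error string is the one the Azure service actually returns.

```python
def send_with_token_fallback(send_request, params):
    """Try the newer parameter name first; fall back once on the known error.

    `send_request` is a hypothetical callable returning (status, body).
    """
    status, body = send_request(params)
    if (status == 400
            and body.get("error", {}).get("message")
            == "Unrecognized request argument supplied: max_completion_tokens"):
        # Older deployments (e.g. GPT-3.5) only accept `max_tokens`:
        # rename the field and retry the identical request once.
        params = dict(params)
        params["max_tokens"] = params.pop("max_completion_tokens")
        status, body = send_request(params)
    return status, body
```

At most one extra round trip is paid, and only for old deployments hit with the new parameter name; all other errors pass through unchanged.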
1 parent fd757c3 commit ce08807

File tree: 3 files changed, +40 −8 lines

Diff for: +llms/+internal/callAzureChatAPI.m (+19 −3)

```diff
@@ -37,7 +37,7 @@
 % % Send a request
 % [text, message] = llms.internal.callAzureChatAPI(messages, functions, APIKey=apiKey)
 
-% Copyright 2023-2024 The MathWorks, Inc.
+% Copyright 2023-2025 The MathWorks, Inc.
 
 arguments
     endpoint
@@ -66,6 +66,17 @@
 
 [response, streamedText] = llms.internal.sendRequestWrapper(parameters,nvp.APIKey, URL, nvp.TimeOut, nvp.StreamFun);
 
+% For old models like GPT-3.5, we may have to change the request sent a
+% little. Since we cannot detect the model used other than trying to send a
+% request, we have to analyze the response instead.
+if response.StatusCode=="BadRequest" && ...
+        isfield(response.Body.Data,"error") && ...
+        isfield(response.Body.Data.error,"message") && ...
+        response.Body.Data.error.message == "Unrecognized request argument supplied: max_completion_tokens"
+    parameters = renameStructField(parameters,'max_completion_tokens','max_tokens');
+    [response, streamedText] = llms.internal.sendRequestWrapper(parameters,nvp.APIKey, URL, nvp.TimeOut, nvp.StreamFun);
+end
+
 % If call errors, "choices" will not be part of response.Body.Data, instead
 % we get response.Body.Data.error
 if response.StatusCode=="OK"
@@ -136,10 +147,15 @@
 
 nvpOptions = keys(dict);
 for opt = nvpOptions.'
-    if isfield(nvp, opt)
+    if isfield(nvp, opt) && ~isempty(nvp.(opt))
         parameters.(dict(opt)) = nvp.(opt);
     end
 end
+
+if nvp.MaxNumTokens == Inf
+    parameters = rmfield(parameters,dict("MaxNumTokens"));
+end
+
 end
 
 function dict = mapNVPToParameters()
@@ -148,7 +164,7 @@
 dict("TopP") = "top_p";
 dict("NumCompletions") = "n";
 dict("StopSequences") = "stop";
-dict("MaxNumTokens") = "max_tokens";
+dict("MaxNumTokens") = "max_completion_tokens";
 dict("PresencePenalty") = "presence_penalty";
 dict("FrequencyPenalty") = "frequency_penalty";
 end
```

Diff for: azureChat.m (+3 −1)

```diff
@@ -276,7 +276,9 @@
 
 if isfield(response.Body.Data,"error")
     err = response.Body.Data.error.message;
-    if startsWith(err,"'json_schema' is not one of ['json_object', 'text']")
+    if startsWith(err,"'json_schema' is not one of ['json_object', 'text']") || ...
+            startsWith(replace(err,newline," "),...
+            "Invalid parameter: 'response_format' of type 'json_schema' is not supported with this model.")
         error("llms:noStructuredOutputForAzureDeployment", ...
             llms.utils.errorMessageCatalog.getMessage( ...
             "llms:noStructuredOutputForAzureDeployment",this.DeploymentID));
```

Diff for: tests/tazureChat.m (+18 −4)

```diff
@@ -1,7 +1,7 @@
 classdef tazureChat < hopenAIChat
 % Tests for azureChat
 
-% Copyright 2024 The MathWorks, Inc.
+% Copyright 2024-2025 The MathWorks, Inc.
 
 properties(TestParameter)
     ValidConstructorInput = iGetValidConstructorInput();
@@ -69,6 +69,20 @@ function responseFormatRequiresNewAPI(testCase)
         "llms:structuredOutputRequiresAPI");
 end
 
+function maxNumTokensWithReasoningModel(testCase)
+    % Unlike OpenAI, Azure requires different parameter names for
+    % different models (max_tokens vs max_completion_tokens). Since
+    % we do not even know what model some deployment uses (us naming
+    % them after the model deployed is not a guarantee), that is a
+    % somewhat painful distinction.
+    testCase.verifyWarningFree(@() generate( ...
+        azureChat(DeploymentID="gpt-35-turbo-16k-0613"), ...
+        "What is object oriented design?", MaxNumTokens=23));
+    testCase.verifyWarningFree(@() generate( ...
+        azureChat(DeploymentID="o1-mini"), ...
+        "What is object oriented design?", MaxNumTokens=23));
+end
+
 function generateWithImage(testCase)
     chat = azureChat(DeploymentID="gpt-4o");
     image_path = "peppers.png";
@@ -123,10 +137,10 @@ function canUseAPIVersions(testCase, APIVersions)
 end
 
 function specialErrorForUnsupportedResponseFormat(testCase)
-    testCase.assumeFail("Disabled until `llms.internal.callAzureChat` is updated to use `max_completion_tokens` instead of the deprecated `max_tokens` in the OpenAI API.")
-
+    % Our "gpt-4o" deployment has the model version 2024-05-13,
+    % which does not support structured output
     testCase.verifyError(@() generate(...
-        azureChat(DeploymentID="o1-mini"), ...
+        azureChat(DeploymentID="gpt-4o"), ...
         "What is the smallest prime?", ...
         ResponseFormat=struct("number",1)), ...
         "llms:noStructuredOutputForAzureDeployment");
```
