Skip to content

Commit dd06513

Browse files
committed
feat: add OpenAI transcribe and TTS models
1 parent d25d37a commit dd06513

File tree

9 files changed

+67
-23
lines changed

9 files changed

+67
-23
lines changed

models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioSpeechModel.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ public class OpenAiAudioSpeechModel implements SpeechModel, StreamingSpeechModel
7979
public OpenAiAudioSpeechModel(OpenAiAudioApi audioApi) {
8080
this(audioApi,
8181
OpenAiAudioSpeechOptions.builder()
82-
.model(OpenAiAudioApi.TtsModel.TTS_1.getValue())
82+
.model(OpenAiAudioApi.TtsModel.GPT_4O_MINI_TTS.getValue())
8383
.responseFormat(AudioResponseFormat.MP3)
8484
.voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY)
8585
.speed(SPEED)

models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioTranscriptionModel.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ public class OpenAiAudioTranscriptionModel implements Model<AudioTranscriptionPr
6363
public OpenAiAudioTranscriptionModel(OpenAiAudioApi audioApi) {
6464
this(audioApi,
6565
OpenAiAudioTranscriptionOptions.builder()
66-
.model(OpenAiAudioApi.WhisperModel.WHISPER_1.getValue())
66+
.model(OpenAiAudioApi.TranscriptionModels.WHISPER_1.getValue())
6767
.responseFormat(OpenAiAudioApi.TranscriptResponseFormat.JSON)
6868
.temperature(0.7f)
6969
.build());

models/spring-ai-openai/src/main/java/org/springframework/ai/openai/api/OpenAiAudioApi.java

+53-9
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
import reactor.core.publisher.Mono;
2727

2828
import org.springframework.ai.model.ApiKey;
29+
import org.springframework.ai.model.ChatModelDescription;
2930
import org.springframework.ai.model.NoopApiKey;
3031
import org.springframework.ai.model.SimpleApiKey;
3132
import org.springframework.ai.openai.api.common.OpenAiApiConstants;
@@ -212,22 +213,25 @@ public String getFilename() {
212213
* different model variants, tts-1 is optimized for real time text to speech use cases
213214
* and tts-1-hd is optimized for quality. These models can be used with the Speech
214215
* endpoint in the Audio API. Reference:
215-
* <a href="https://platform.openai.com/docs/models/tts">TTS</a>
216+
* <a href="https://platform.openai.com/docs/models#tts">TTS</a>
216217
*/
217218
public enum TtsModel {
218219

219-
// @formatter:off
220220
/**
221-
* The latest text to speech model, optimized for speed.
221+
* Text-to-speech model optimized for speed
222222
*/
223223
@JsonProperty("tts-1")
224224
TTS_1("tts-1"),
225225
/**
226-
* The latest text to speech model, optimized for quality.
226+
* Text-to-speech model optimized for quality.
227227
*/
228228
@JsonProperty("tts-1-hd")
229-
TTS_1_HD("tts-1-hd");
230-
// @formatter:on
229+
TTS_1_HD("tts-1-hd"),
230+
/**
231+
* Text-to-speech model powered by GPT-4o mini
232+
*/
233+
@JsonProperty("gpt-4o-mini-tts")
234+
GPT_4O_MINI_TTS("gpt-4o-mini-tts");
231235

232236
public final String value;
233237

@@ -249,6 +253,7 @@ public String getValue() {
249253
* v2-large model is currently available through our API with the whisper-1 model
250254
* name.
251255
*/
256+
@Deprecated
252257
public enum WhisperModel {
253258

254259
// @formatter:off
@@ -268,6 +273,45 @@ public String getValue() {
268273

269274
}
270275

276+
/**
277+
* The available models for the transcriptions API. Reference:
278+
* <a href="https://platform.openai.com/docs/models#transcription">Transcription</a>
279+
*/
280+
public enum TranscriptionModels implements ChatModelDescription {
281+
282+
/**
283+
* Speech-to-text model powered by GPT-4o
284+
*/
285+
@JsonProperty("gpt-4o-transcribe")
286+
GPT_4O_TRANSCRIBE("gpt-4o-transcribe"),
287+
/**
288+
* Speech-to-text model powered by GPT-4o mini
289+
*/
290+
@JsonProperty("gpt-4o-mini-transcribe")
291+
GPT_4O_MINI_TRANSCRIBE("gpt-4o-mini-transcribe"),
292+
/**
293+
* General-purpose speech recognition model
294+
*/
295+
@JsonProperty("whisper-1")
296+
WHISPER_1("whisper-1");
297+
298+
public final String value;
299+
300+
TranscriptionModels(String value) {
301+
this.value = value;
302+
}
303+
304+
public String getValue() {
305+
return this.value;
306+
}
307+
308+
@Override
309+
public String getName() {
310+
return this.value;
311+
}
312+
313+
}
314+
271315
/**
272316
* The format of the transcript and translation outputs, in one of these options:
273317
* json, text, srt, verbose_json, or vtt. Defaults to json.
@@ -411,7 +455,7 @@ public String getValue() {
411455
*/
412456
public static class Builder {
413457

414-
private String model = TtsModel.TTS_1.getValue();
458+
private String model = TtsModel.GPT_4O_MINI_TTS.getValue();
415459

416460
private String input;
417461

@@ -521,7 +565,7 @@ public static class Builder {
521565

522566
private byte[] file;
523567

524-
private String model = WhisperModel.WHISPER_1.getValue();
568+
private String model = TranscriptionModels.WHISPER_1.getValue();
525569

526570
private String language;
527571

@@ -614,7 +658,7 @@ public static class Builder {
614658

615659
private byte[] file;
616660

617-
private String model = WhisperModel.WHISPER_1.getValue();
661+
private String model = TranscriptionModels.WHISPER_1.getValue();
618662

619663
private String prompt;
620664

models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/api/OpenAiAudioApiIT.java

+4-4
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,8 @@
2929
import org.springframework.ai.openai.api.OpenAiAudioApi.StructuredResponse;
3030
import org.springframework.ai.openai.api.OpenAiAudioApi.TranscriptionRequest;
3131
import org.springframework.ai.openai.api.OpenAiAudioApi.TranslationRequest;
32+
import org.springframework.ai.openai.api.OpenAiAudioApi.TranscriptionModels;
3233
import org.springframework.ai.openai.api.OpenAiAudioApi.TtsModel;
33-
import org.springframework.ai.openai.api.OpenAiAudioApi.WhisperModel;
3434
import org.springframework.util.FileCopyUtils;
3535

3636
import static org.assertj.core.api.Assertions.assertThat;
@@ -51,7 +51,7 @@ void speechTranscriptionAndTranslation() throws IOException {
5151

5252
byte[] speech = this.audioApi
5353
.createSpeech(SpeechRequest.builder()
54-
.model(TtsModel.TTS_1_HD.getValue())
54+
.model(TtsModel.GPT_4O_MINI_TTS.getValue())
5555
.input("Hello, my name is Chris and I love Spring A.I.")
5656
.voice(Voice.ONYX)
5757
.build())
@@ -63,15 +63,15 @@ void speechTranscriptionAndTranslation() throws IOException {
6363

6464
StructuredResponse translation = this.audioApi
6565
.createTranslation(
66-
TranslationRequest.builder().model(WhisperModel.WHISPER_1.getValue()).file(speech).build(),
66+
TranslationRequest.builder().model(TranscriptionModels.WHISPER_1.getValue()).file(speech).build(),
6767
StructuredResponse.class)
6868
.getBody();
6969

7070
assertThat(translation.text().replaceAll(",", "")).isEqualTo("Hello my name is Chris and I love Spring AI.");
7171

7272
StructuredResponse transcriptionEnglish = this.audioApi
7373
.createTranscription(
74-
TranscriptionRequest.builder().model(WhisperModel.WHISPER_1.getValue()).file(speech).build(),
74+
TranscriptionRequest.builder().model(TranscriptionModels.WHISPER_1.getValue()).file(speech).build(),
7575
StructuredResponse.class)
7676
.getBody();
7777

models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/api/OpenAiAudioModelNoOpApiKeysIT.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ void checkNoOpKey() {
4444
assertThatThrownBy(() -> {
4545
this.audioApi
4646
.createSpeech(OpenAiAudioApi.SpeechRequest.builder()
47-
.model(OpenAiAudioApi.TtsModel.TTS_1_HD.getValue())
47+
.model(OpenAiAudioApi.TtsModel.GPT_4O_MINI_TTS.getValue())
4848
.input("Hello, my name is Chris and I love Spring A.I.")
4949
.voice(OpenAiAudioApi.SpeechRequest.Voice.ONYX)
5050
.build())

models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/speech/OpenAiSpeechModelIT.java

+4-4
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ void shouldGenerateNonEmptyMp3AudioFromSpeechPrompt() {
6060
.voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY)
6161
.speed(SPEED)
6262
.responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3)
63-
.model(OpenAiAudioApi.TtsModel.TTS_1.value)
63+
.model(OpenAiAudioApi.TtsModel.GPT_4O_MINI_TTS.value)
6464
.build();
6565
SpeechPrompt speechPrompt = new SpeechPrompt("Today is a wonderful day to build something people love!",
6666
speechOptions);
@@ -78,7 +78,7 @@ void speechRateLimitTest() {
7878
.voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY)
7979
.speed(SPEED)
8080
.responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3)
81-
.model(OpenAiAudioApi.TtsModel.TTS_1.value)
81+
.model(OpenAiAudioApi.TtsModel.GPT_4O_MINI_TTS.value)
8282
.build();
8383
SpeechPrompt speechPrompt = new SpeechPrompt("Today is a wonderful day to build something people love!",
8484
speechOptions);
@@ -98,7 +98,7 @@ void shouldStreamNonEmptyResponsesForValidSpeechPrompts() {
9898
.voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY)
9999
.speed(SPEED)
100100
.responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3)
101-
.model(OpenAiAudioApi.TtsModel.TTS_1.value)
101+
.model(OpenAiAudioApi.TtsModel.GPT_4O_MINI_TTS.value)
102102
.build();
103103

104104
SpeechPrompt speechPrompt = new SpeechPrompt("Today is a wonderful day to build something people love!",
@@ -120,7 +120,7 @@ void speechVoicesTest(String voice) {
120120
.voice(OpenAiAudioApi.SpeechRequest.Voice.valueOf(voice.toUpperCase()))
121121
.speed(SPEED)
122122
.responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3)
123-
.model(OpenAiAudioApi.TtsModel.TTS_1.value)
123+
.model(OpenAiAudioApi.TtsModel.GPT_4O_MINI_TTS.value)
124124
.build();
125125
SpeechPrompt speechPrompt = new SpeechPrompt("Today is a wonderful day to build something people love!",
126126
speechOptions);

models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/speech/OpenAiSpeechModelWithSpeechResponseMetadataTests.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ void aiResponseContainsImageResponseMetadata() {
7474
.voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY)
7575
.speed(SPEED)
7676
.responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3)
77-
.model(OpenAiAudioApi.TtsModel.TTS_1.value)
77+
.model(OpenAiAudioApi.TtsModel.GPT_4O_MINI_TTS.value)
7878
.build();
7979

8080
SpeechPrompt speechPrompt = new SpeechPrompt("Today is a wonderful day to build something people love!",

spring-ai-spring-boot-autoconfigure/src/main/java/org/springframework/ai/autoconfigure/openai/OpenAiAudioSpeechProperties.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ public class OpenAiAudioSpeechProperties extends OpenAiParentProperties {
3535

3636
public static final String CONFIG_PREFIX = "spring.ai.openai.audio.speech";
3737

38-
public static final String DEFAULT_SPEECH_MODEL = OpenAiAudioApi.TtsModel.TTS_1.getValue();
38+
public static final String DEFAULT_SPEECH_MODEL = OpenAiAudioApi.TtsModel.GPT_4O_MINI_TTS.getValue();
3939

4040
private static final Float SPEED = 1.0f;
4141

spring-ai-spring-boot-autoconfigure/src/main/java/org/springframework/ai/autoconfigure/openai/OpenAiAudioTranscriptionProperties.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ public class OpenAiAudioTranscriptionProperties extends OpenAiParentProperties {
2626

2727
public static final String CONFIG_PREFIX = "spring.ai.openai.audio.transcription";
2828

29-
public static final String DEFAULT_TRANSCRIPTION_MODEL = OpenAiAudioApi.WhisperModel.WHISPER_1.getValue();
29+
public static final String DEFAULT_TRANSCRIPTION_MODEL = OpenAiAudioApi.TranscriptionModels.WHISPER_1.getValue();
3030

3131
private static final Double DEFAULT_TEMPERATURE = 0.7;
3232

0 commit comments

Comments
 (0)