Skip to content

Commit f245832

Browse files
feat(stt): add recognize enrichments, add new function detectLanguage
1 parent 4a304da commit f245832

24 files changed

+1032
-62
lines changed

speech-to-text/src/main/java/com/ibm/watson/speech_to_text/v1/SpeechToText.java

Lines changed: 51 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* (C) Copyright IBM Corp. 2016, 2025.
2+
* (C) Copyright IBM Corp. 2016, 2026.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
55
* the License. You may obtain a copy of the License at
@@ -51,6 +51,7 @@
5151
import com.ibm.watson.speech_to_text.v1.model.DeleteLanguageModelOptions;
5252
import com.ibm.watson.speech_to_text.v1.model.DeleteUserDataOptions;
5353
import com.ibm.watson.speech_to_text.v1.model.DeleteWordOptions;
54+
import com.ibm.watson.speech_to_text.v1.model.DetectLanguageOptions;
5455
import com.ibm.watson.speech_to_text.v1.model.GetAcousticModelOptions;
5556
import com.ibm.watson.speech_to_text.v1.model.GetAudioOptions;
5657
import com.ibm.watson.speech_to_text.v1.model.GetCorpusOptions;
@@ -60,6 +61,7 @@
6061
import com.ibm.watson.speech_to_text.v1.model.GetWordOptions;
6162
import com.ibm.watson.speech_to_text.v1.model.Grammar;
6263
import com.ibm.watson.speech_to_text.v1.model.Grammars;
64+
import com.ibm.watson.speech_to_text.v1.model.LanguageDetectionResults;
6365
import com.ibm.watson.speech_to_text.v1.model.LanguageModel;
6466
import com.ibm.watson.speech_to_text.v1.model.LanguageModels;
6567
import com.ibm.watson.speech_to_text.v1.model.ListAcousticModelsOptions;
@@ -447,6 +449,9 @@ public ServiceCall<SpeechRecognitionResults> recognize(RecognizeOptions recogniz
447449
if (recognizeOptions.speechBeginEvent() != null) {
448450
builder.query("speech_begin_event", String.valueOf(recognizeOptions.speechBeginEvent()));
449451
}
452+
if (recognizeOptions.enrichments() != null) {
453+
builder.query("enrichments", String.valueOf(recognizeOptions.enrichments()));
454+
}
450455
if (recognizeOptions.languageCustomizationId() != null) {
451456
builder.query(
452457
"language_customization_id", String.valueOf(recognizeOptions.languageCustomizationId()));
@@ -776,6 +781,12 @@ public ServiceCall<RecognitionJob> createJob(CreateJobOptions createJobOptions)
776781
if (createJobOptions.resultsTtl() != null) {
777782
builder.query("results_ttl", String.valueOf(createJobOptions.resultsTtl()));
778783
}
784+
if (createJobOptions.speechBeginEvent() != null) {
785+
builder.query("speech_begin_event", String.valueOf(createJobOptions.speechBeginEvent()));
786+
}
787+
if (createJobOptions.enrichments() != null) {
788+
builder.query("enrichments", String.valueOf(createJobOptions.enrichments()));
789+
}
779790
if (createJobOptions.languageCustomizationId() != null) {
780791
builder.query(
781792
"language_customization_id", String.valueOf(createJobOptions.languageCustomizationId()));
@@ -2801,4 +2812,43 @@ public ServiceCall<Void> deleteUserData(DeleteUserDataOptions deleteUserDataOpti
28012812
ResponseConverter<Void> responseConverter = ResponseConverterUtils.getVoid();
28022813
return createServiceCall(builder.build(), responseConverter);
28032814
}
2815+
2816+
/**
2817+
* Spoken language identification.
2818+
*
2819+
* <p>Detects the spoken language in audio streams. The endpoint is `/v1/detect_language` and user
2820+
* can optionally include `lid_confidence` parameter to set a custom confidence threshold for
2821+
* detection. The model continuously processes incoming audio and returns the identified language
2822+
* when it reaches a confidence level higher than the specified threshold (0.99 by default). See
2823+
* [Spoken language
2824+
* identification](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-speech-language-identification).
2825+
*
2826+
* @param detectLanguageOptions the {@link DetectLanguageOptions} containing the options for the
2827+
* call
2828+
* @return a {@link ServiceCall} with a result of type {@link LanguageDetectionResults}
2829+
*/
2830+
public ServiceCall<LanguageDetectionResults> detectLanguage(
2831+
DetectLanguageOptions detectLanguageOptions) {
2832+
com.ibm.cloud.sdk.core.util.Validator.notNull(
2833+
detectLanguageOptions, "detectLanguageOptions cannot be null");
2834+
RequestBuilder builder =
2835+
RequestBuilder.post(
2836+
RequestBuilder.resolveRequestUrl(getServiceUrl(), "/v1/detect_language"));
2837+
Map<String, String> sdkHeaders =
2838+
SdkCommon.getSdkHeaders("speech_to_text", "v1", "detectLanguage");
2839+
for (Entry<String, String> header : sdkHeaders.entrySet()) {
2840+
builder.header(header.getKey(), header.getValue());
2841+
}
2842+
builder.header("Accept", "application/json");
2843+
if (detectLanguageOptions.contentType() != null) {
2844+
builder.header("Content-Type", detectLanguageOptions.contentType());
2845+
}
2846+
builder.query("lid_confidence", String.valueOf(detectLanguageOptions.lidConfidence()));
2847+
builder.bodyContent(
2848+
detectLanguageOptions.contentType(), null, null, detectLanguageOptions.audio());
2849+
ResponseConverter<LanguageDetectionResults> responseConverter =
2850+
ResponseConverterUtils.getValue(
2851+
new com.google.gson.reflect.TypeToken<LanguageDetectionResults>() {}.getType());
2852+
return createServiceCall(builder.build(), responseConverter);
2853+
}
28042854
}

speech-to-text/src/main/java/com/ibm/watson/speech_to_text/v1/model/CreateJobOptions.java

Lines changed: 66 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* (C) Copyright IBM Corp. 2018, 2025.
2+
* (C) Copyright IBM Corp. 2018, 2026.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
55
* the License. You may obtain a copy of the License at
@@ -247,6 +247,8 @@ public interface Events {
247247
protected String events;
248248
protected String userToken;
249249
protected Long resultsTtl;
250+
protected Boolean speechBeginEvent;
251+
protected String enrichments;
250252
protected String languageCustomizationId;
251253
protected String acousticCustomizationId;
252254
protected String baseModelVersion;
@@ -284,6 +286,8 @@ public static class Builder {
284286
private String events;
285287
private String userToken;
286288
private Long resultsTtl;
289+
private Boolean speechBeginEvent;
290+
private String enrichments;
287291
private String languageCustomizationId;
288292
private String acousticCustomizationId;
289293
private String baseModelVersion;
@@ -325,6 +329,8 @@ private Builder(CreateJobOptions createJobOptions) {
325329
this.events = createJobOptions.events;
326330
this.userToken = createJobOptions.userToken;
327331
this.resultsTtl = createJobOptions.resultsTtl;
332+
this.speechBeginEvent = createJobOptions.speechBeginEvent;
333+
this.enrichments = createJobOptions.enrichments;
328334
this.languageCustomizationId = createJobOptions.languageCustomizationId;
329335
this.acousticCustomizationId = createJobOptions.acousticCustomizationId;
330336
this.baseModelVersion = createJobOptions.baseModelVersion;
@@ -467,6 +473,28 @@ public Builder resultsTtl(long resultsTtl) {
467473
return this;
468474
}
469475

476+
/**
477+
* Set the speechBeginEvent.
478+
*
479+
* @param speechBeginEvent the speechBeginEvent
480+
* @return the CreateJobOptions builder
481+
*/
482+
public Builder speechBeginEvent(Boolean speechBeginEvent) {
483+
this.speechBeginEvent = speechBeginEvent;
484+
return this;
485+
}
486+
487+
/**
488+
* Set the enrichments.
489+
*
490+
* @param enrichments the enrichments
491+
* @return the CreateJobOptions builder
492+
*/
493+
public Builder enrichments(String enrichments) {
494+
this.enrichments = enrichments;
495+
return this;
496+
}
497+
470498
/**
471499
* Set the languageCustomizationId.
472500
*
@@ -788,6 +816,8 @@ protected CreateJobOptions(Builder builder) {
788816
events = builder.events;
789817
userToken = builder.userToken;
790818
resultsTtl = builder.resultsTtl;
819+
speechBeginEvent = builder.speechBeginEvent;
820+
enrichments = builder.enrichments;
791821
languageCustomizationId = builder.languageCustomizationId;
792822
acousticCustomizationId = builder.acousticCustomizationId;
793823
baseModelVersion = builder.baseModelVersion;
@@ -940,6 +970,41 @@ public Long resultsTtl() {
940970
return resultsTtl;
941971
}
942972

973+
/**
974+
* Gets the speechBeginEvent.
975+
*
976+
* <p>If `true`, the service returns a response object `SpeechActivity` which contains the time
977+
* when a speech activity is detected in the stream. This can be used both in standard and low
978+
* latency mode. This feature enables client applications to know that some words/speech has been
979+
* detected and the service is in the process of decoding. This can be used in lieu of interim
980+
* results in standard mode. Use `sad_module: 2` to increase accuracy and performance in detecting
981+
* speech boundaries within the audio stream. See [Using speech recognition
982+
* parameters](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-service-features#features-parameters).
983+
*
984+
* @return the speechBeginEvent
985+
*/
986+
public Boolean speechBeginEvent() {
987+
return speechBeginEvent;
988+
}
989+
990+
/**
991+
* Gets the enrichments.
992+
*
993+
* <p>Speech transcript enrichment improves readability of raw ASR transcripts by adding
994+
* punctuation (periods, commas, question marks, exclamation points) and intelligent
995+
* capitalization (sentence beginnings, proper nouns, acronyms, brand names). To enable
996+
* enrichment, add the `enrichments=punctuation` parameter to your recognition request. Supported
997+
* languages include English (US, UK, Australia, India), French (France, Canada), German, Italian,
998+
* Portuguese (Brazil, Portugal), Spanish (Spain, Latin America, Argentina, Chile, Colombia,
999+
* Mexico, Peru), and Japanese. See [Speech transcript
1000+
* enrichment](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-speech-transcript-enrichment).
1001+
*
1002+
* @return the enrichments
1003+
*/
1004+
public String enrichments() {
1005+
return enrichments;
1006+
}
1007+
9431008
/**
9441009
* Gets the languageCustomizationId.
9451010
*
Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
/*
2+
* (C) Copyright IBM Corp. 2026.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
5+
* the License. You may obtain a copy of the License at
6+
*
7+
* http://www.apache.org/licenses/LICENSE-2.0
8+
*
9+
* Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
10+
* an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
11+
* specific language governing permissions and limitations under the License.
12+
*/
13+
14+
package com.ibm.watson.speech_to_text.v1.model;
15+
16+
import com.ibm.cloud.sdk.core.service.model.GenericModel;
17+
import java.io.File;
18+
import java.io.FileInputStream;
19+
import java.io.FileNotFoundException;
20+
import java.io.InputStream;
21+
22+
/** The detectLanguage options. */
23+
public class DetectLanguageOptions extends GenericModel {
24+
25+
protected Float lidConfidence;
26+
protected InputStream audio;
27+
protected String contentType;
28+
29+
/** Builder. */
30+
public static class Builder {
31+
private Float lidConfidence;
32+
private InputStream audio;
33+
private String contentType;
34+
35+
/**
36+
* Instantiates a new Builder from an existing DetectLanguageOptions instance.
37+
*
38+
* @param detectLanguageOptions the instance to initialize the Builder with
39+
*/
40+
private Builder(DetectLanguageOptions detectLanguageOptions) {
41+
this.lidConfidence = detectLanguageOptions.lidConfidence;
42+
this.audio = detectLanguageOptions.audio;
43+
this.contentType = detectLanguageOptions.contentType;
44+
}
45+
46+
/** Instantiates a new builder. */
47+
public Builder() {}
48+
49+
/**
50+
* Instantiates a new builder with required properties.
51+
*
52+
* @param lidConfidence the lidConfidence
53+
* @param audio the audio
54+
*/
55+
public Builder(Float lidConfidence, InputStream audio) {
56+
this.lidConfidence = lidConfidence;
57+
this.audio = audio;
58+
}
59+
60+
/**
61+
* Builds a DetectLanguageOptions.
62+
*
63+
* @return the new DetectLanguageOptions instance
64+
*/
65+
public DetectLanguageOptions build() {
66+
return new DetectLanguageOptions(this);
67+
}
68+
69+
/**
70+
* Set the lidConfidence.
71+
*
72+
* @param lidConfidence the lidConfidence
73+
* @return the DetectLanguageOptions builder
74+
*/
75+
public Builder lidConfidence(Float lidConfidence) {
76+
this.lidConfidence = lidConfidence;
77+
return this;
78+
}
79+
80+
/**
81+
* Set the audio.
82+
*
83+
* @param audio the audio
84+
* @return the DetectLanguageOptions builder
85+
*/
86+
public Builder audio(InputStream audio) {
87+
this.audio = audio;
88+
return this;
89+
}
90+
91+
/**
92+
* Set the contentType.
93+
*
94+
* @param contentType the contentType
95+
* @return the DetectLanguageOptions builder
96+
*/
97+
public Builder contentType(String contentType) {
98+
this.contentType = contentType;
99+
return this;
100+
}
101+
102+
/**
103+
* Set the audio.
104+
*
105+
* @param audio the audio
106+
* @return the DetectLanguageOptions builder
107+
* @throws FileNotFoundException if the file could not be found
108+
*/
109+
public Builder audio(File audio) throws FileNotFoundException {
110+
this.audio = new FileInputStream(audio);
111+
return this;
112+
}
113+
}
114+
115+
protected DetectLanguageOptions() {}
116+
117+
protected DetectLanguageOptions(Builder builder) {
118+
com.ibm.cloud.sdk.core.util.Validator.notNull(
119+
builder.lidConfidence, "lidConfidence cannot be null");
120+
com.ibm.cloud.sdk.core.util.Validator.notNull(builder.audio, "audio cannot be null");
121+
lidConfidence = builder.lidConfidence;
122+
audio = builder.audio;
123+
contentType = builder.contentType;
124+
}
125+
126+
/**
127+
* New builder.
128+
*
129+
* @return a DetectLanguageOptions builder
130+
*/
131+
public Builder newBuilder() {
132+
return new Builder(this);
133+
}
134+
135+
/**
136+
* Gets the lidConfidence.
137+
*
138+
* <p>Set a custom confidence threshold for detection.
139+
*
140+
* @return the lidConfidence
141+
*/
142+
public Float lidConfidence() {
143+
return lidConfidence;
144+
}
145+
146+
/**
147+
* Gets the audio.
148+
*
149+
* <p>The audio to transcribe.
150+
*
151+
* @return the audio
152+
*/
153+
public InputStream audio() {
154+
return audio;
155+
}
156+
157+
/**
158+
* Gets the contentType.
159+
*
160+
* <p>The type of the input.
161+
*
162+
* @return the contentType
163+
*/
164+
public String contentType() {
165+
return contentType;
166+
}
167+
}

0 commit comments

Comments
 (0)