@@ -36,7 +36,7 @@ public class AudioOps(
36
36
/* *
37
37
* Get the parent [KotlinOps] object.
38
38
*/
39
- public val ops : KotlinOps ,
39
+ public val ops : KotlinOps
40
40
) {
41
41
public val java: org.tensorflow.op.AudioOps = ops.java.audio
42
42
@@ -47,29 +47,24 @@ public class AudioOps(
47
47
48
48
/* *
49
49
* Produces a visualization of audio data over time.
50
- *
51
50
* Spectrograms are a standard way of representing audio information as a series of
52
51
* slices of frequency information, one slice for each window of time. By joining
53
52
* these together into a sequence, they form a distinctive fingerprint of the sound
54
53
* over time.
55
- *
56
54
* This op expects to receive audio data as an input, stored as floats in the range
57
55
* -1 to 1, together with a window width in samples, and a stride specifying how
58
56
* far to move the window between slices. From this it generates a three
59
57
* dimensional output. The first dimension is for the channels in the input, so a
60
58
* stereo audio input would have two here for example. The second dimension is time,
61
59
* with successive frequency slices. The third dimension has an amplitude value for
62
60
* each frequency during that time slice.
63
- *
64
61
* This means the layout when converted and saved as an image is rotated 90 degrees
65
62
* clockwise from a typical spectrogram. Time is descending down the Y axis, and
66
63
* the frequency decreases from left to right.
67
- *
68
64
* Each value in the result represents the square root of the sum of the squares of the real and
69
65
* imaginary parts of an FFT on the current window of samples. In this way, the
70
66
* lowest dimension represents the power of each frequency in the current window,
71
67
* and adjacent windows are concatenated in the next dimension.
72
- *
73
68
* To get a more intuitive and visual look at what this operation does, you can run
74
69
* tensorflow/examples/wav_to_spectrogram to read in an audio file and save out the
75
70
* resulting spectrogram as a PNG image.
@@ -78,17 +73,20 @@ public class AudioOps(
78
73
* @param windowSize How wide the input window is in samples. For the highest efficiency
79
74
* this should be a power of two, but other values are accepted.
80
75
* @param stride How widely apart the center of adjacent sample windows should be.
81
- * @param options carries optional attributes values
76
+ * @param options carries optional attribute values
82
77
* @return a new instance of AudioSpectrogram
83
78
* @see org.tensorflow.op.AudioOps.audioSpectrogram
79
+ * @param magnitudeSquared Sets the magnitudeSquared option.
80
+ *
84
81
* @param magnitudeSquared Whether to return the squared magnitude or just the
85
82
* magnitude. Using squared magnitude can avoid extra calculations.
83
+ * @return this Options instance.
86
84
*/
87
85
public fun audioSpectrogram (
88
86
input : Operand <TFloat32 >,
89
87
windowSize : Long ,
90
88
stride : Long ,
91
- magnitudeSquared : Boolean? = null,
89
+ magnitudeSquared : Boolean? = null
92
90
): AudioSpectrogram = java.audioSpectrogram(
93
91
input,
94
92
windowSize,
@@ -100,33 +98,35 @@ public class AudioOps(
100
98
101
99
/* *
102
100
* Decode a 16-bit PCM WAV file to a float tensor.
103
- *
104
101
* The -32768 to 32767 signed 16-bit values will be scaled to -1.0 to 1.0 in float.
105
- *
106
102
* When desired_channels is set, if the input contains fewer channels than this
107
103
* then the last channel will be duplicated to give the requested number, else if
108
104
* the input has more channels than requested then the additional channels will be
109
105
* ignored.
110
- *
111
106
* If desired_samples is set, then the audio will be cropped or padded with zeroes
112
107
* to the requested length.
113
- *
114
108
* The first output contains a Tensor with the content of the audio samples. The
115
109
* lowest dimension will be the number of channels, and the second will be the
116
110
* number of samples. For example, a ten-sample-long stereo WAV file should give an
117
111
* output shape of [10, 2].
118
112
*
119
113
* @param contents The WAV-encoded audio, usually from a file.
120
- * @param options carries optional attributes values
114
+ * @param options carries optional attribute values
121
115
* @return a new instance of DecodeWav
122
116
* @see org.tensorflow.op.AudioOps.decodeWav
117
+ * @param desiredChannels Sets the desiredChannels option.
118
+ *
123
119
* @param desiredChannels Number of sample channels wanted.
120
+ * @return this Options instance.
121
+ * @param desiredSamples Sets the desiredSamples option.
122
+ *
124
123
* @param desiredSamples Length of audio requested.
124
+ * @return this Options instance.
125
125
*/
126
126
public fun decodeWav (
127
127
contents : Operand <TString >,
128
128
desiredChannels : Long? = null,
129
- desiredSamples : Long? = null,
129
+ desiredSamples : Long? = null
130
130
): DecodeWav = java.decodeWav(
131
131
contents,
132
132
* listOfNotNull(
@@ -137,16 +137,14 @@ public class AudioOps(
137
137
138
138
/* *
139
139
* Encode audio data using the WAV file format.
140
- *
141
140
* This operation will generate a string suitable to be saved out to create a .wav
142
141
* audio file. It will be encoded in the 16-bit PCM format. It takes in float
143
142
* values in the range -1.0f to 1.0f, and any outside that value will be clamped to
144
143
* that range.
144
+ * `audio` is a 2-D float Tensor of shape `[length, channels]`.
145
+ * `sample_rate` is a scalar Tensor holding the rate to use (e.g. 44100).
145
146
*
146
- * `audio` is a 2-D float Tensor of shape `[length, channels]`.
147
- * `sample_rate` is a scalar Tensor holding the rate to use (e.g. 44100).
148
- *
149
- * @param audio 2-D with shape `[length, channels]`.
147
+ * @param audio 2-D with shape `[length, channels]`.
150
148
* @param sampleRate Scalar containing the sample frequency.
151
149
* @return a new instance of EncodeWav
152
150
* @see org.tensorflow.op.AudioOps.encodeWav
@@ -159,7 +157,6 @@ public class AudioOps(
159
157
160
158
/* *
161
159
* Transforms a spectrogram into a form that's useful for speech recognition.
162
- *
163
160
* Mel Frequency Cepstral Coefficients are a way of representing audio data that's
164
161
* been effective as an input feature for machine learning. They are created by
165
162
* taking the spectrum of a spectrogram (a 'cepstrum'), and discarding some of the
@@ -171,23 +168,35 @@ public class AudioOps(
171
168
* @param spectrogram Typically produced by the Spectrogram op, with magnitude_squared
172
169
* set to true.
173
170
* @param sampleRate How many samples per second the source audio used.
174
- * @param options carries optional attributes values
171
+ * @param options carries optional attribute values
175
172
* @return a new instance of Mfcc
176
173
* @see org.tensorflow.op.AudioOps.mfcc
174
+ * @param upperFrequencyLimit Sets the upperFrequencyLimit option.
175
+ *
177
176
* @param upperFrequencyLimit The highest frequency to use when calculating the
178
177
* cepstrum.
178
+ * @return this Options instance.
179
+ * @param lowerFrequencyLimit Sets the lowerFrequencyLimit option.
180
+ *
179
181
* @param lowerFrequencyLimit The lowest frequency to use when calculating the
180
182
* cepstrum.
183
+ * @return this Options instance.
184
+ * @param filterbankChannelCount Sets the filterbankChannelCount option.
185
+ *
181
186
* @param filterbankChannelCount Resolution of the Mel bank used internally.
187
+ * @return this Options instance.
188
+ * @param dctCoefficientCount Sets the dctCoefficientCount option.
189
+ *
182
190
* @param dctCoefficientCount How many output channels to produce per time slice.
191
+ * @return this Options instance.
183
192
*/
184
193
public fun mfcc (
185
194
spectrogram : Operand <TFloat32 >,
186
195
sampleRate : Operand <TInt32 >,
187
196
upperFrequencyLimit : Float? = null,
188
197
lowerFrequencyLimit : Float? = null,
189
198
filterbankChannelCount : Long? = null,
190
- dctCoefficientCount : Long? = null,
199
+ dctCoefficientCount : Long? = null
191
200
): Mfcc = java.mfcc(
192
201
spectrogram,
193
202
sampleRate,
0 commit comments