tensorflow
diff --git a/‎research/deep_speech/__init__.py b/‎research/deep_speech/__init__.py
diff --git a/‎research/deep_speech/data/__init__.py b/‎research/deep_speech/data/__init__.py
diff --git a/‎research/deep_speech/data/dataset.py
Lines changed: 222 additions & 0 deletions b/‎research/deep_speech/data/dataset.py
Lines changed: 222 additions & 0 deletions
diff --git a/‎research/deep_speech/data/featurizer.py
Lines changed: 122 additions & 0 deletions b/‎research/deep_speech/data/featurizer.py
Lines changed: 122 additions & 0 deletions
diff --git a/‎research/deep_speech/data/vocabulary.txt
Lines changed: 33 additions & 0 deletions b/‎research/deep_speech/data/vocabulary.txt
Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,222 @@
+#  Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+# ==============================================================================
+"""Generate tf.data.Dataset object for deep speech training/evaluation."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import scipy.io.wavfile as wavfile
+from six.moves import xrange  # pylint: disable=redefined-builtin
+import tensorflow as tf
+
+# pylint: disable=g-bad-import-order
+from data.featurizer import AudioFeaturizer
+from data.featurizer import TextFeaturizer
+
+
+class AudioConfig(object):
+  """Container for the settings that drive spectrogram extraction."""
+
+  def __init__(self,
+               sample_rate,
+               frame_length,
+               frame_step,
+               fft_length=None,
+               normalize=False,
+               spect_type="linear"):
+    """Store the audio configuration values on the instance.
+
+    Args:
+      sample_rate: an integer, the sample rate of the input waveform.
+      frame_length: an integer, the spectrogram frame length in ms.
+      frame_step: an integer, the frame stride in ms.
+      fft_length: an integer, the number of fft bins (defaults to None).
+      normalize: a boolean, whether to normalize the audio tensor.
+      spect_type: a string naming the type of spectrogram to extract.
+    """
+    # Framing parameters.
+    self.sample_rate, self.frame_length, self.frame_step = (
+        sample_rate, frame_length, frame_step)
+    # Spectrogram parameters.
+    self.fft_length, self.normalize, self.spect_type = (
+        fft_length, normalize, spect_type)
+
+
+class DatasetConfig(object):
+  """Config class for generating the DeepSpeechDataset."""
+
+  def __init__(self, audio_config, data_path, vocab_file_path):
+    """Initialize the configs for deep speech dataset.
+
+    Args:
+      audio_config: AudioConfig object specifying the audio-related configs.
+      data_path: a string denoting the full path of a manifest file.
+      vocab_file_path: a string specifying the vocabulary file path.
+
+    Raises:
+      RuntimeError: if `data_path` or `vocab_file_path` does not exist.
+    """
+
+    self.audio_config = audio_config
+    # Raise explicitly instead of using `assert`: asserts are stripped when
+    # Python runs with -O, and the documented contract is a RuntimeError.
+    if not tf.gfile.Exists(data_path):
+      raise RuntimeError(
+          "Manifest file does not exist: {}".format(data_path))
+    if not tf.gfile.Exists(vocab_file_path):
+      raise RuntimeError(
+          "Vocabulary file does not exist: {}".format(vocab_file_path))
+    self.data_path = data_path
+    self.vocab_file_path = vocab_file_path
+
+
+class DeepSpeechDataset(object):
+  """Dataset class for training/evaluation of DeepSpeech model."""
+
+  def __init__(self, dataset_config):
+    """Initialize the class.
+
+    Each dataset file contains three tab-separated columns: "wav_filename",
+    "wav_filesize", and "transcript". This function parses the csv file and
+    stores each example by the increasing order of audio length (indicated by
+    wav_filesize).
+
+    Args:
+      dataset_config: DatasetConfig object.
+    """
+    self.config = dataset_config
+    # Instantiate audio feature extractor.
+    self.audio_featurizer = AudioFeaturizer(
+        sample_rate=self.config.audio_config.sample_rate,
+        frame_length=self.config.audio_config.frame_length,
+        frame_step=self.config.audio_config.frame_step,
+        fft_length=self.config.audio_config.fft_length,
+        spect_type=self.config.audio_config.spect_type)
+    # Instantiate text feature extractor.
+    self.text_featurizer = TextFeaturizer(
+        vocab_file=self.config.vocab_file_path)
+
+    self.speech_labels = self.text_featurizer.speech_labels
+    self.features, self.labels = self._preprocess_data(self.config.data_path)
+    # Second dimension of a spectrogram; None when the manifest has no rows.
+    self.num_feature_bins = (
+        self.features[0].shape[1] if self.features else None)
+
+  def _preprocess_data(self, file_path):
+    """Generate a list of waveform, transcript pair.
+
+    Note that the waveforms are ordered in increasing length, so that audio
+    samples in a mini-batch have similar length.
+
+    Args:
+      file_path: a string specifying the csv file path for a data set.
+
+    Returns:
+      A (features, labels) tuple of parallel lists: one spectrogram array and
+      one list of integer token ids per manifest row.
+    """
+
+    with tf.gfile.Open(file_path, "r") as f:
+      lines = f.read().splitlines()
+    # Skip the csv header (first line). Blank rows are dropped so that they do
+    # not crash the size-based sort below.
+    rows = [line.split("\t") for line in lines[1:] if line.strip()]
+    # Sort input data by the length of waveform.
+    rows.sort(key=lambda item: int(item[1]))
+    features = [self._preprocess_audio(row[0]) for row in rows]
+    labels = [self._preprocess_transcript(row[2]) for row in rows]
+    return features, labels
+
+  def _normalize_audio_tensor(self, audio_tensor):
+    """Perform mean and variance normalization on the spectrogram tensor.
+
+    Args:
+      audio_tensor: a tensor for the spectrogram feature.
+
+    Returns:
+      a tensor for the normalized spectrogram.
+    """
+    mean, var = tf.nn.moments(audio_tensor, axes=[0])
+    # The small epsilon guards against division by zero.
+    normalized = (audio_tensor - mean) / (tf.sqrt(var) + 1e-6)
+    return normalized
+
+  def _preprocess_audio(self, audio_file_path):
+    """Load an audio file and compute its spectrogram as a numpy array.
+
+    Args:
+      audio_file_path: a string path of the wav file to read.
+
+    Returns:
+      a numpy array holding the (optionally normalized) spectrogram.
+
+    Raises:
+      AssertionError: if the file's sample rate differs from the configured
+        sample rate.
+    """
+    tf.logging.info(
+        "Extracting spectrogram feature for {}".format(audio_file_path))
+    sample_rate, data = wavfile.read(audio_file_path)
+    assert sample_rate == self.config.audio_config.sample_rate, (
+        "Sample rate of {} is {}, expected {}".format(
+            audio_file_path, sample_rate,
+            self.config.audio_config.sample_rate))
+    if data.dtype not in [np.float32, np.float64]:
+      # Scale integer PCM samples by the maximum value of their dtype.
+      data = data.astype(np.float32) / np.iinfo(data.dtype).max
+    # Build the ops in a throwaway graph so the default graph does not grow
+    # with every audio file, and close the session once the value is fetched
+    # instead of leaking one session per file.
+    with tf.Graph().as_default():
+      feature = self.audio_featurizer.featurize(data)
+      if self.config.audio_config.normalize:
+        feature = self._normalize_audio_tensor(feature)
+      with tf.Session() as sess:
+        # Return a numpy array rather than a tensor.
+        return sess.run(feature)
+
+  def _preprocess_transcript(self, transcript):
+    """Convert a transcript string into a list of integer token ids."""
+    return self.text_featurizer.featurize(transcript)
+
+
+def input_fn(batch_size, deep_speech_dataset, repeat=1):
+  """Build the padded, batched input pipeline for training and evaluation.
+
+  Args:
+    batch_size: an integer denoting the size of a batch.
+    deep_speech_dataset: DeepSpeechDataset object providing `features`,
+      `labels` and `num_feature_bins`.
+    repeat: an integer for how many times to repeat the dataset.
+
+  Returns:
+    a tf.data.Dataset object for model to consume.
+  """
+  features = deep_speech_dataset.features
+  labels = deep_speech_dataset.labels
+  num_feature_bins = deep_speech_dataset.num_feature_bins
+
+  def _gen_data():
+    """Yield one example dict per (spectrogram, label) pair."""
+    for idx, spectrogram in enumerate(features):
+      yield {
+          # Append a trailing axis of size 1 to the spectrogram.
+          "features": np.expand_dims(spectrogram, axis=2),
+          "labels": labels[idx],
+          "input_length": [spectrogram.shape[0]],
+          "label_length": [len(labels[idx])]
+      }
+
+  element_types = {
+      "features": tf.float32,
+      "labels": tf.int32,
+      "input_length": tf.int32,
+      "label_length": tf.int32
+  }
+  # The same shapes describe a single example and the padding target.
+  element_shapes = {
+      "features": tf.TensorShape([None, num_feature_bins, 1]),
+      "labels": tf.TensorShape([None]),
+      "input_length": tf.TensorShape([1]),
+      "label_length": tf.TensorShape([1])
+  }
+
+  dataset = tf.data.Dataset.from_generator(
+      _gen_data, output_types=element_types, output_shapes=element_shapes)
+
+  # Repeat, pad each batch to its longest example, then prefetch one batch to
+  # improve speed of input pipeline.
+  return dataset.repeat(repeat).padded_batch(
+      batch_size=batch_size, padded_shapes=element_shapes).prefetch(1)
@@ -0,0 +1,122 @@
+#  Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+# ==============================================================================
+"""Utility class for extracting features from the text and audio input."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import codecs
+import functools
+import numpy as np
+import tensorflow as tf
+
+
+class AudioFeaturizer(object):
+  """Turns raw audio waveforms into spectrogram features."""
+
+  def __init__(self,
+               sample_rate=16000,
+               frame_length=25,
+               frame_step=10,
+               fft_length=None,
+               window_fn=functools.partial(
+                   tf.contrib.signal.hann_window, periodic=True),
+               spect_type="linear"):
+    """Set up the featurizer from the given configuration.
+
+    Args:
+      sample_rate: an integer specifying the sample rate of the input waveform.
+      frame_length: an integer for the length of a spectrogram frame, in ms.
+      frame_step: an integer for the frame stride, in ms.
+      fft_length: an integer for the number of fft bins. When falsy, the
+        smallest power of two not below the frame length (in samples) is used.
+      window_fn: windowing function.
+      spect_type: a string for the type of spectrogram to be extracted.
+        Only 'linear' is supported.
+
+    Raises:
+      ValueError: In case of invalid arguments for `spect_type`.
+    """
+    if spect_type != "linear":
+      raise ValueError("Unsupported spectrogram type: %s" % spect_type)
+
+    def _ms_to_samples(duration_ms):
+      """Convert a duration in milliseconds to a number of samples."""
+      return int(sample_rate * duration_ms / 1e3)
+
+    self.window_fn = window_fn
+    self.frame_length = _ms_to_samples(frame_length)
+    self.frame_step = _ms_to_samples(frame_step)
+    if fft_length:
+      self.fft_length = fft_length
+    else:
+      # Round the frame length up to the next power of two.
+      self.fft_length = int(2**(np.ceil(np.log2(self.frame_length))))
+
+  def featurize(self, waveform):
+    """Return the spectrogram feature tensor for the given waveform."""
+    return self._compute_linear_spectrogram(waveform)
+
+  def _compute_linear_spectrogram(self, waveform):
+    """Compute the linear-scale, magnitude spectrogram of the input waveform.
+
+    Args:
+      waveform: a float32 audio tensor.
+    Returns:
+      a float32 tensor with shape [len, num_bins]
+    """
+    # The Short-time Fourier Transform is complex valued, with shape
+    # [?, fft_unique_bins] where fft_unique_bins = fft_length // 2 + 1
+    # (e.g. 257 bins when fft_length is 512).
+    # The magnitude spectrogram is the absolute value of that STFT.
+    return tf.abs(
+        tf.contrib.signal.stft(
+            waveform,
+            frame_length=self.frame_length,
+            frame_step=self.frame_step,
+            fft_length=self.fft_length,
+            window_fn=self.window_fn,
+            pad_end=True))
+
+  def _compute_mel_filterbank_features(self, waveform):
+    """Placeholder for mel filterbank features; always raises."""
+    raise NotImplementedError("MFCC feature extraction not supported yet.")
+
+
+class TextFeaturizer(object):
+  """Extract text feature based on char-level granularity.
+
+  By looking up the vocabulary table, each input string (one line of transcript)
+  will be converted to a sequence of integer indexes.
+  """
+
+  def __init__(self, vocab_file):
+    """Build the token lookup tables from a vocabulary file.
+
+    Every line that does not start with '#' is taken verbatim as one token
+    (so a line holding a single space is the space token) and is assigned the
+    next consecutive index, starting from 0.
+
+    Args:
+      vocab_file: a string path of the utf-8 encoded vocabulary file.
+    """
+    with codecs.open(vocab_file, "r", "utf-8") as fin:
+      # splitlines() removes only the line terminator. Unlike chopping the
+      # last character, this keeps the final token intact when the file has
+      # no trailing newline, and leaves no stray '\r' on CRLF files.
+      lines = fin.read().splitlines()
+    self.token_to_idx = {}
+    self.idx_to_token = {}
+    tokens = []
+    for line in lines:
+      if line.startswith("#"):
+        # Skip from reading comment line.
+        continue
+      idx = len(tokens)
+      self.token_to_idx[line] = idx
+      self.idx_to_token[idx] = line
+      tokens.append(line)
+    # All tokens concatenated in index order.
+    self.speech_labels = "".join(tokens)
+
+  def featurize(self, text):
+    """Convert string to a list of integers.
+
+    Args:
+      text: a string transcript; it is stripped and lower-cased before lookup.
+
+    Returns:
+      a list with the vocabulary index of every character in the text.
+
+    Raises:
+      KeyError: if a character is not in the vocabulary.
+    """
+    return [self.token_to_idx[token] for token in text.strip().lower()]
@@ -0,0 +1,33 @@
+# List of vocabulary characters, one per line (utf-8 encoded). Lines starting
+# with '#' are comments and are ignored by the parser.
+# begin of vocabulary
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+v
+w
+x
+y
+z
+'
+ 
+-
+# end of vocabulary