@@ -60,6 +60,7 @@ def get_config(self):
         }

     def build(self, inputs_shape):
+
         # Transformation for linearly projecting the queries, keys, and values.
         self.q_transformation = self._get_weights(
             "q_project", shape=(self.hidden_size, self.hidden_size), init=tf.initializers.get('glorot_uniform')
@@ -75,20 +76,7 @@ def build(self, inputs_shape):
         )

     def split_heads(self, x):
-        """Split x into different heads, and transpose the resulting value.
-
-        The tensor is transposed to insure the inner dimensions hold the correct
-        values during the matrix multiplication.

-        Parameters
-        -----------
-
-        x: A tensor with shape [batch_size, length, hidden_size]
-
-        Returns:
-        -----------
-        A tensor with shape [batch_size, num_heads, length, hidden_size/num_heads]
-        """
         with tf.name_scope("split_heads"):
             batch_size = tf.shape(x)[0]
             length = tf.shape(x)[1]
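As a standalone illustration of what the hunks above set up (the glorot-initialised query projection created in build(), with the k and v projections presumably built the same way, and the head split whose docstring was trimmed), here is a plain-TensorFlow sketch. The sizes and variable names are assumptions for the example, not the layer's own _get_weights API:

import tensorflow as tf

batch_size, length, num_heads, hidden_size = 2, 6, 8, 512   # illustrative sizes
depth = hidden_size // num_heads                             # per-head width

# A square projection weight, analogous to the "q_project" weight in build().
init = tf.initializers.get('glorot_uniform')
q_project = tf.Variable(init(shape=(hidden_size, hidden_size)), name="q_project")

x = tf.random.normal([batch_size, length, hidden_size])      # [batch, length, hidden_size]
q = tf.einsum('bld,dh->blh', x, q_project)                   # per-position linear projection

# split_heads: [batch, length, hidden_size] -> [batch, num_heads, length, depth]
q = tf.reshape(q, [batch_size, length, num_heads, depth])
q = tf.transpose(q, [0, 2, 1, 3])
print(q.shape)                                               # (2, 8, 6, 64)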
@@ -103,40 +91,15 @@ def split_heads(self, x):
             return tf.transpose(x, [0, 2, 1, 3])

     def combine_heads(self, x):
-        """Combine tensor that has been split.
-
-        Args:
-        x: A tensor [batch_size, num_heads, length, hidden_size/num_heads]

-        Returns:
-        -----------
-        A tensor with shape [batch_size, length, hidden_size]
-        """
         with tf.name_scope("combine_heads"):
             batch_size = tf.shape(x)[0]
             length = tf.shape(x)[2]
             x = tf.transpose(x, [0, 2, 1, 3])  # --> [batch, length, num_heads, depth]
             return tf.reshape(x, [batch_size, length, self.hidden_size])

     def forward(self, x, y, mask, cache=None):
-        """Apply attention mechanism to x and y.
-
-        Args:
-        x: a tensor with shape [batch_size, length_x, hidden_size]
-        y: a tensor with shape [batch_size, length_y, hidden_size]
-        mask: attention bias that will be added to the result of the dot product.
-        training: boolean, whether in training mode or not.
-        cache: (Used during prediction) dictionary with tensors containing results
-            of previous attentions. The dictionary must have the items:
-            {"k": tensor with shape [batch_size, i, key_channels],
-             "v": tensor with shape [batch_size, i, value_channels]}
-            where i is the current decoded length.
-
-        Returns:
-        -----------
-        Attention layer output with shape [batch_size, length_x, hidden_size]
-        Attention weights with shape [batch_size, number_of_head, length_x, length_y]
-        """
+        """Apply attention mechanism to x and y."""
         # Linearly project the query (q), key (k) and value (v) using different
         # learned projections. This is in preparation of splitting them into
         # multiple heads. Multi-head attention uses multiple queries, keys, and
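The removed forward docstring spelled out the contract: project q/k/v, split heads, apply masked scaled dot-product attention, recombine heads, and return the output together with the per-head attention weights. The sketch below reproduces that flow on already-projected inputs; the function and helper names are assumptions for illustration, and the decoding cache described in the removed docstring is omitted:

import tensorflow as tf

def attention_sketch(q, k, v, mask, num_heads):
    """Masked scaled dot-product attention over already-projected q, k, v (illustrative)."""
    hidden_size = q.shape[-1]
    depth = hidden_size // num_heads

    def split(t):                                    # [batch, len, hidden] -> [batch, heads, len, depth]
        b, l = tf.shape(t)[0], tf.shape(t)[1]
        return tf.transpose(tf.reshape(t, [b, l, num_heads, depth]), [0, 2, 1, 3])

    q, k, v = split(q), split(k), split(v)
    q *= depth ** -0.5                               # scale queries before the dot product
    logits = tf.matmul(q, k, transpose_b=True)       # [batch, heads, len_q, len_k]
    logits += mask                                   # additive attention bias, broadcast over heads
    weights = tf.nn.softmax(logits, axis=-1)
    out = tf.matmul(weights, v)                      # [batch, heads, len_q, depth]

    # combine_heads: [batch, heads, len_q, depth] -> [batch, len_q, hidden_size]
    b, l = tf.shape(out)[0], tf.shape(out)[2]
    out = tf.reshape(tf.transpose(out, [0, 2, 1, 3]), [b, l, hidden_size])
    return out, weights                              # output plus attention weights, as in the removed docstring

# Self-attention usage with a no-op mask (illustrative shapes):
q = k = v = tf.random.normal([2, 6, 512])
out, w = attention_sketch(q, k, v, mask=tf.zeros([2, 1, 1, 6]), num_heads=8)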