
Commit e2662c2

Author: Lingjun Liu
Commit message: documentation
1 parent 90d536e, commit e2662c2

File tree: 10 files changed, +248 -204 lines changed


CHANGELOG.md (+3 -2)
@@ -95,6 +95,7 @@ To release a new version, please update the changelog as followed:
 - Support string dtype in InputLayer (#PR 1017)
 - Support Dynamic RNN in RNN (#PR 1023)
 - Add ResNet50 static model (#PR 1030)
+- Add Transformer model (#PR 1027)

 ### Changed

@@ -125,8 +126,8 @@ To release a new version, please update the changelog as followed:
 - @zsdonghao
 - @ChrisWu1997: #1010 #1015 #1025 #1030
 - @warshallrho: #1017 #1021 #1026 #1029 #1032
-- @ArnoldLIULJ: #1023
-- @JingqingZ: #1023
+- @ArnoldLIULJ: #1023 #1027
+- @JingqingZ: #1023 #1027

 ## [2.1.0]

examples/translation_task/tutorial_transformer.py (+37 -48)
@@ -8,25 +8,24 @@
 from tensorlayer.models.transformer.utils import metrics
 from tensorlayer.models.transformer.utils import attention_visualisation
 import tensorlayer as tl
-
-
 """ Translation from Portugese to English by Transformer model
 This tutorial provides basic instructions on how to define and train Transformer model on Tensorlayer for
 Translation task. You can also learn how to visualize the attention block via this tutorial.
 """

+
 def set_up_dataset():
     # Set up dataset for Portugese-English translation from the TED Talks Open Translation Project.
     # This dataset contains approximately 50000 training examples, 1100 validation examples, and 2000 test examples.
     # https://www.ted.com/participate/translate

-    examples, metadata = tfds.load('ted_hrlr_translate/pt_to_en', with_info=True,
-                                   as_supervised=True)
+    examples, metadata = tfds.load('ted_hrlr_translate/pt_to_en', with_info=True, as_supervised=True)
     train_examples, val_examples = examples['train'], examples['validation']

     # Set up tokenizer and save the tokenizer
     tokenizer = tfds.features.text.SubwordTextEncoder.build_from_corpus(
-        (en.numpy() and pt.numpy() for pt, en in train_examples), target_vocab_size=2**14)
+        (en.numpy() and pt.numpy() for pt, en in train_examples), target_vocab_size=2**14
+    )

     tokenizer.save_to_file("tokenizer")
     tokenizer = tfds.features.text.SubwordTextEncoder.load_from_file("tokenizer")
@@ -38,44 +37,42 @@ def test_tokenizer_success(tokenizer):
     sample_string = 'TensorLayer is awesome.'

     tokenized_string = tokenizer.encode(sample_string)
-    print ('Tokenized string is {}'.format(tokenized_string))
+    print('Tokenized string is {}'.format(tokenized_string))

     original_string = tokenizer.decode(tokenized_string)
-    print ('The original string: {}'.format(original_string))
+    print('The original string: {}'.format(original_string))
     assert original_string == sample_string


-
 def generate_training_dataset(train_examples, tokenizer):
+
     def encode(lang1, lang2):
-        lang1 = tokenizer.encode(
-            lang1.numpy()) + [tokenizer.vocab_size+1]
+        lang1 = tokenizer.encode(lang1.numpy()) + [tokenizer.vocab_size + 1]
+
+        lang2 = tokenizer.encode(lang2.numpy()) + [tokenizer.vocab_size + 1]

-        lang2 = tokenizer.encode(
-            lang2.numpy()) + [tokenizer.vocab_size+1]
-
         return lang1, lang2
+
     MAX_LENGTH = 50
+
     def filter_max_length(x, y, max_length=MAX_LENGTH):
-        return tf.logical_and(tf.size(x) <= max_length,
-                              tf.size(y) <= max_length)
+        return tf.logical_and(tf.size(x) <= max_length, tf.size(y) <= max_length)
+
     def tf_encode(pt, en):
         return tf.py_function(encode, [pt, en], [tf.int64, tf.int64])
+
     train_dataset = train_examples.map(tf_encode)
     train_dataset = train_dataset.filter(filter_max_length)
     # cache the dataset to memory to get a speedup while reading from it.
     train_dataset = train_dataset.cache()
     BUFFER_SIZE = 20000
     BATCH_SIZE = 64
-    train_dataset = train_dataset.shuffle(BUFFER_SIZE).padded_batch(
-        BATCH_SIZE, padded_shapes=([-1], [-1]))
+    train_dataset = train_dataset.shuffle(BUFFER_SIZE).padded_batch(BATCH_SIZE, padded_shapes=([-1], [-1]))
     train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)

     return train_dataset


-
-
 def model_setup(tokenizer):
     # define Hyper parameters for transformer
     class HYPER_PARAMS(object):
@@ -91,16 +88,14 @@ class HYPER_PARAMS(object):
         extra_decode_length = 50
         beam_size = 5
         alpha = 0.6  # used to calculate length normalization in beam search
-
-
-        label_smoothing=0.1
-        learning_rate=2.0
-        learning_rate_decay_rate=1.0
-        learning_rate_warmup_steps=4000
-
-        sos_id = 0
-        eos_id = tokenizer.vocab_size+1

+        label_smoothing = 0.1
+        learning_rate = 2.0
+        learning_rate_decay_rate = 1.0
+        learning_rate_warmup_steps = 4000
+
+        sos_id = 0
+        eos_id = tokenizer.vocab_size + 1

     model = Transformer(HYPER_PARAMS)

@@ -112,20 +107,20 @@ class HYPER_PARAMS(object):

     # Use the Adam optimizer with a custom learning rate scheduler according to the formula in the Paper "Attention is All you need"
     class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
-        def __init__(self, d_model, warmup_steps=5):
-            super(CustomSchedule, self).__init__()
-
-            self.d_model = d_model
-            self.d_model = tf.cast(self.d_model, tf.float32)

-            self.warmup_steps = warmup_steps
-
-        def __call__(self, step):
-            arg1 = tf.math.rsqrt(step)
-            arg2 = step * (self.warmup_steps ** -1.5)
-
-            return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)
+        def __init__(self, d_model, warmup_steps=5):
+            super(CustomSchedule, self).__init__()
+
+            self.d_model = d_model
+            self.d_model = tf.cast(self.d_model, tf.float32)
+
+            self.warmup_steps = warmup_steps

+        def __call__(self, step):
+            arg1 = tf.math.rsqrt(step)
+            arg2 = step * (self.warmup_steps**-1.5)
+
+            return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)


 def tutorial_transformer():
@@ -146,23 +141,17 @@ def tutorial_transformer():
             if (batch % 50 == 0):
                 print('Batch ID {} at Epoch [{}/{}]: loss {:.4f}'.format(batch, epoch + 1, num_epochs, loss))

-
-
     model.eval()
     sentence_en = tokenizer.encode('TensorLayer is awesome.')
     [prediction, weights_decoder], weights_encoder = model(inputs=[sentence_en])

-    predicted_sentence = tokenizer.decode([i for i in prediction["outputs"][0]
-                                           if i < tokenizer.vocab_size])
+    predicted_sentence = tokenizer.decode([i for i in prediction["outputs"][0] if i < tokenizer.vocab_size])
     print("Translated: ", predicted_sentence)

-
-        # visualize the self attention
+    # visualize the self attention
     tokenizer_str = [tokenizer.decode([ts]) for ts in (sentence_en)]
     attention_visualisation.plot_attention_weights(weights_encoder["layer_0"], tokenizer_str, tokenizer_str)

-
-

 if __name__ == "__main__":
     tutorial_transformer()
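
Note on the schedule in the diff above: CustomSchedule implements the warmup formula from "Attention Is All You Need", lrate = d_model**-0.5 * min(step**-0.5, step * warmup_steps**-1.5). Below is a minimal sketch of how such a schedule is typically attached to an Adam optimizer; it is not part of this commit, and the d_model value, the warmup_steps=4000 default, and the explicit float32 cast of step are illustrative choices.

import tensorflow as tf


class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    # Same formula as the schedule in the diff above; warmup_steps defaults to 4000 here
    # to mirror HYPER_PARAMS.learning_rate_warmup_steps (the diff's class default is 5).
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        step = tf.cast(step, tf.float32)  # keep the math in float32 for rsqrt
        arg1 = tf.math.rsqrt(step)  # decay term: step ** -0.5
        arg2 = step * (self.warmup_steps**-1.5)  # linear warmup term
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)


# d_model=512 here is an arbitrary illustrative value; the Adam betas/epsilon follow the paper's settings.
learning_rate = CustomSchedule(d_model=512)
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)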

tensorlayer/models/transformer/attention_layer.py (+3)
@@ -83,6 +83,7 @@ def split_heads(self, x):
         x: A tensor with shape [batch_size, length, hidden_size]

         Returns:
+        -----------
         A tensor with shape [batch_size, num_heads, length, hidden_size/num_heads]
         """
         with tf.name_scope("split_heads"):
@@ -105,6 +106,7 @@ def combine_heads(self, x):
         x: A tensor [batch_size, num_heads, length, hidden_size/num_heads]

         Returns:
+        -----------
         A tensor with shape [batch_size, length, hidden_size]
         """
         with tf.name_scope("combine_heads"):
@@ -128,6 +130,7 @@ def forward(self, x, y, mask, cache=None):
             where i is the current decoded length.

         Returns:
+        -----------
         Attention layer output with shape [batch_size, length_x, hidden_size]
         Attention weights with shape [batch_size, number_of_head, length_x, length_y]
         """

tensorlayer/models/transformer/beamsearchHelper/beam_search.py (+31 -24)
@@ -39,41 +39,45 @@ def search(self, initial_ids, initial_cache):
         finished_scores = finished_state[_StateKeys.FINISHED_SCORES]
         finished_flags = finished_state[_StateKeys.FINISHED_FLAGS]

-        # # Account for corner case where there are no finished sequences for a
-        # # particular batch item. In that case, return alive sequences for that batch
-        # # item.
-        # finished_seq = tf.where(tf.reduce_any(finished_flags, 1), finished_seq, alive_seq)
-        # finished_scores = tf.where(tf.reduce_any(finished_flags, 1), finished_scores, alive_log_probs)
         return finished_seq, finished_scores


 def sequence_beam_search(
     symbols_to_logits_fn, initial_ids, initial_cache, vocab_size, beam_size, alpha, max_decode_length, eos_id
 ):
     """Search for sequence of subtoken ids with the largest probability.
-
-    Args:
-      symbols_to_logits_fn: A function that takes in ids, index, and cache as
-        arguments. The passed in arguments will have shape:
+
+    Parameters
+    -----------
+    symbols_to_logits_fn : A function with ids, index, and cache as arguments.
+        The passed in arguments will have shape:
         ids -> [batch_size * beam_size, index]
         index -> [] (scalar)
         cache -> nested dictionary of tensors [batch_size * beam_size, ...]
         The function must return logits and new cache.
         logits -> [batch * beam_size, vocab_size]
         new cache -> same shape/structure as inputted cache
-      initial_ids: Starting ids for each batch item.
-        int32 tensor with shape [batch_size]
-      initial_cache: dict containing starting decoder variables information
-      vocab_size: int size of tokens
-      beam_size: int number of beams
-      alpha: float defining the strength of length normalization
-      max_decode_length: maximum length to decoded sequence
-      eos_id: int id of eos token, used to determine when a sequence has finished
-
-    Returns:
-      Top decoded sequences [batch_size, beam_size, max_decode_length]
-      sequence scores [batch_size, beam_size]
-    """
+    initial_ids : int with shape [batch_size]
+        Starting ids for each batch item.
+    initial_cache: dict
+        contain starting decoder variables information
+    vocab_size: int
+        size of tokens
+    beam_size: int
+        number of beams
+    alpha: float
+        strength of length normalization
+    max_decode_length: int
+        maximum length to decoded sequence
+    eos_id: int
+        id of eos token, used to determine when a sequence has finished
+
+    Returns
+    -------
+    Top decoded sequences [batch_size, beam_size, max_decode_length]
+    sequence scores [batch_size, beam_size]
+    """
+
     batch_size = tf.shape(initial_ids)[0]

     sbs = SequenceBeamSearchV2(
@@ -85,11 +89,14 @@ def sequence_beam_search(
 def _expand_to_same_rank(tensor, target):
     """Expands a given tensor to target's rank to be broadcastable.

-    Args:
+    Parameters
+    -----------
+
     tensor: input tensor to tile. Shape: [b, d1, ..., da]
     target: target tensor. Shape: [b, d1, ..., da, ..., dn]

-    Returns:
+    Returns:
+    -----------
     Tiled tensor of shape [b, d1, ..., da, 1, ..., 1] with same rank of target.

     Raises:
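
The reworked docstring above lists everything a caller of sequence_beam_search supplies. Below is a shape-level usage sketch based only on that docstring: the toy symbols_to_logits_fn (uniform logits), the vocabulary size, the eos id, and the empty initial cache are hypothetical stand-ins, and a real decoder would pass its per-layer key/value tensors in the cache, so treat this as an illustration of the documented shapes rather than a working translation example.

import tensorflow as tf

from tensorlayer.models.transformer.beamsearchHelper.beam_search import sequence_beam_search

VOCAB_SIZE = 64  # hypothetical token count
EOS_ID = 1  # hypothetical end-of-sequence id


def symbols_to_logits_fn(ids, index, cache):
    # ids -> [batch_size * beam_size, index], index -> scalar, cache -> nested dict of tensors
    logits = tf.zeros([tf.shape(ids)[0], VOCAB_SIZE])  # uniform scores, for illustration only
    return logits, cache


initial_ids = tf.zeros([2], dtype=tf.int32)  # batch_size = 2, start-of-sequence ids
initial_cache = {}  # a real decoder supplies per-layer key/value tensors here

decoded, scores = sequence_beam_search(
    symbols_to_logits_fn, initial_ids, initial_cache, vocab_size=VOCAB_SIZE, beam_size=4, alpha=0.6,
    max_decode_length=10, eos_id=EOS_ID
)
# decoded -> [batch_size, beam_size, max_decode_length], scores -> [batch_size, beam_size]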
