
Commit e2662c2

Author: Lingjun Liu
Commit message: documentation
1 parent 90d536e, commit e2662c2

File tree: 10 files changed, +248 -204 lines changed


CHANGELOG.md (+3 -2)
@@ -95,6 +95,7 @@ To release a new version, please update the changelog as followed:
 - Support string dtype in InputLayer (#PR 1017)
 - Support Dynamic RNN in RNN (#PR 1023)
 - Add ResNet50 static model (#PR 1030)
+- Add Transformer model (#PR 1027)

 ### Changed

@@ -125,8 +126,8 @@ To release a new version, please update the changelog as followed:
 - @zsdonghao
 - @ChrisWu1997: #1010 #1015 #1025 #1030
 - @warshallrho: #1017 #1021 #1026 #1029 #1032
-- @ArnoldLIULJ: #1023
-- @JingqingZ: #1023
+- @ArnoldLIULJ: #1023 #1027
+- @JingqingZ: #1023 #1027

 ## [2.1.0]

examples/translation_task/tutorial_transformer.py (+37 -48)
@@ -8,25 +8,24 @@
 from tensorlayer.models.transformer.utils import metrics
 from tensorlayer.models.transformer.utils import attention_visualisation
 import tensorlayer as tl
-
-
 """ Translation from Portugese to English by Transformer model
 This tutorial provides basic instructions on how to define and train Transformer model on Tensorlayer for
 Translation task. You can also learn how to visualize the attention block via this tutorial.
 """

+
 def set_up_dataset():
     # Set up dataset for Portugese-English translation from the TED Talks Open Translation Project.
     # This dataset contains approximately 50000 training examples, 1100 validation examples, and 2000 test examples.
     # https://www.ted.com/participate/translate

-    examples, metadata = tfds.load('ted_hrlr_translate/pt_to_en', with_info=True,
-                                   as_supervised=True)
+    examples, metadata = tfds.load('ted_hrlr_translate/pt_to_en', with_info=True, as_supervised=True)
     train_examples, val_examples = examples['train'], examples['validation']

     # Set up tokenizer and save the tokenizer
     tokenizer = tfds.features.text.SubwordTextEncoder.build_from_corpus(
-        (en.numpy() and pt.numpy() for pt, en in train_examples), target_vocab_size=2**14)
+        (en.numpy() and pt.numpy() for pt, en in train_examples), target_vocab_size=2**14
+    )

     tokenizer.save_to_file("tokenizer")
     tokenizer = tfds.features.text.SubwordTextEncoder.load_from_file("tokenizer")
@@ -38,44 +37,42 @@ def test_tokenizer_success(tokenizer):
     sample_string = 'TensorLayer is awesome.'

     tokenized_string = tokenizer.encode(sample_string)
-    print ('Tokenized string is {}'.format(tokenized_string))
+    print('Tokenized string is {}'.format(tokenized_string))

     original_string = tokenizer.decode(tokenized_string)
-    print ('The original string: {}'.format(original_string))
+    print('The original string: {}'.format(original_string))
     assert original_string == sample_string


-
 def generate_training_dataset(train_examples, tokenizer):
+
     def encode(lang1, lang2):
-        lang1 = tokenizer.encode(
-            lang1.numpy()) + [tokenizer.vocab_size+1]
+        lang1 = tokenizer.encode(lang1.numpy()) + [tokenizer.vocab_size + 1]
+
+        lang2 = tokenizer.encode(lang2.numpy()) + [tokenizer.vocab_size + 1]

-        lang2 = tokenizer.encode(
-            lang2.numpy()) + [tokenizer.vocab_size+1]
-
         return lang1, lang2
+
     MAX_LENGTH = 50
+
     def filter_max_length(x, y, max_length=MAX_LENGTH):
-        return tf.logical_and(tf.size(x) <= max_length,
-                              tf.size(y) <= max_length)
+        return tf.logical_and(tf.size(x) <= max_length, tf.size(y) <= max_length)
+
     def tf_encode(pt, en):
         return tf.py_function(encode, [pt, en], [tf.int64, tf.int64])
+
     train_dataset = train_examples.map(tf_encode)
     train_dataset = train_dataset.filter(filter_max_length)
     # cache the dataset to memory to get a speedup while reading from it.
     train_dataset = train_dataset.cache()
     BUFFER_SIZE = 20000
     BATCH_SIZE = 64
-    train_dataset = train_dataset.shuffle(BUFFER_SIZE).padded_batch(
-        BATCH_SIZE, padded_shapes=([-1], [-1]))
+    train_dataset = train_dataset.shuffle(BUFFER_SIZE).padded_batch(BATCH_SIZE, padded_shapes=([-1], [-1]))
     train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)

     return train_dataset


-
-
 def model_setup(tokenizer):
     # define Hyper parameters for transformer
     class HYPER_PARAMS(object):
@@ -91,16 +88,14 @@ class HYPER_PARAMS(object):
         extra_decode_length = 50
         beam_size = 5
         alpha = 0.6  # used to calculate length normalization in beam search
-
-
-        label_smoothing=0.1
-        learning_rate=2.0
-        learning_rate_decay_rate=1.0
-        learning_rate_warmup_steps=4000
-
-        sos_id = 0
-        eos_id = tokenizer.vocab_size+1

+        label_smoothing = 0.1
+        learning_rate = 2.0
+        learning_rate_decay_rate = 1.0
+        learning_rate_warmup_steps = 4000
+
+        sos_id = 0
+        eos_id = tokenizer.vocab_size + 1

     model = Transformer(HYPER_PARAMS)

@@ -112,20 +107,20 @@ class HYPER_PARAMS(object):

     # Use the Adam optimizer with a custom learning rate scheduler according to the formula in the Paper "Attention is All you need"
     class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
-        def __init__(self, d_model, warmup_steps=5):
-            super(CustomSchedule, self).__init__()
-
-            self.d_model = d_model
-            self.d_model = tf.cast(self.d_model, tf.float32)

-            self.warmup_steps = warmup_steps
-
-        def __call__(self, step):
-            arg1 = tf.math.rsqrt(step)
-            arg2 = step * (self.warmup_steps ** -1.5)
-
-            return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)
+        def __init__(self, d_model, warmup_steps=5):
+            super(CustomSchedule, self).__init__()
+
+            self.d_model = d_model
+            self.d_model = tf.cast(self.d_model, tf.float32)
+
+            self.warmup_steps = warmup_steps

+        def __call__(self, step):
+            arg1 = tf.math.rsqrt(step)
+            arg2 = step * (self.warmup_steps**-1.5)
+
+            return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)


 def tutorial_transformer():
@@ -146,23 +141,17 @@ def tutorial_transformer():
             if (batch % 50 == 0):
                 print('Batch ID {} at Epoch [{}/{}]: loss {:.4f}'.format(batch, epoch + 1, num_epochs, loss))

-
-
     model.eval()
     sentence_en = tokenizer.encode('TensorLayer is awesome.')
     [prediction, weights_decoder], weights_encoder = model(inputs=[sentence_en])

-    predicted_sentence = tokenizer.decode([i for i in prediction["outputs"][0]
-                                           if i < tokenizer.vocab_size])
+    predicted_sentence = tokenizer.decode([i for i in prediction["outputs"][0] if i < tokenizer.vocab_size])
     print("Translated: ", predicted_sentence)

-
-        # visualize the self attention
+    # visualize the self attention
     tokenizer_str = [tokenizer.decode([ts]) for ts in (sentence_en)]
     attention_visualisation.plot_attention_weights(weights_encoder["layer_0"], tokenizer_str, tokenizer_str)

-
-

 if __name__ == "__main__":
     tutorial_transformer()
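
Note on the schedule in the diff above: CustomSchedule implements the warmup formula from "Attention Is All You Need", lrate = d_model**-0.5 * min(step**-0.5, step * warmup_steps**-1.5). Below is a minimal sketch of how such a schedule is typically attached to an Adam optimizer; it is not part of this commit, and the d_model value, the warmup_steps=4000 default, and the explicit float32 cast of step are illustrative choices.

import tensorflow as tf


class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    # Same formula as the schedule in the diff above; warmup_steps defaults to 4000 here
    # to mirror HYPER_PARAMS.learning_rate_warmup_steps (the diff's class default is 5).
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        step = tf.cast(step, tf.float32)  # keep the math in float32 for rsqrt
        arg1 = tf.math.rsqrt(step)  # decay term: step ** -0.5
        arg2 = step * (self.warmup_steps**-1.5)  # linear warmup term
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)


# d_model=512 here is an arbitrary illustrative value; the Adam betas/epsilon follow the paper's settings.
learning_rate = CustomSchedule(d_model=512)
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)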

tensorlayer/models/transformer/attention_layer.py (+3)
@@ -83,6 +83,7 @@ def split_heads(self, x):
         x: A tensor with shape [batch_size, length, hidden_size]

         Returns:
+        -----------
         A tensor with shape [batch_size, num_heads, length, hidden_size/num_heads]
         """
         with tf.name_scope("split_heads"):
@@ -105,6 +106,7 @@ def combine_heads(self, x):
         x: A tensor [batch_size, num_heads, length, hidden_size/num_heads]

         Returns:
+        -----------
         A tensor with shape [batch_size, length, hidden_size]
         """
         with tf.name_scope("combine_heads"):
@@ -128,6 +130,7 @@ def forward(self, x, y, mask, cache=None):
             where i is the current decoded length.

         Returns:
+        -----------
         Attention layer output with shape [batch_size, length_x, hidden_size]
         Attention weights with shape [batch_size, number_of_head, length_x, length_y]
         """

tensorlayer/models/transformer/beamsearchHelper/beam_search.py (+31 -24)
@@ -39,41 +39,45 @@ def search(self, initial_ids, initial_cache):
         finished_scores = finished_state[_StateKeys.FINISHED_SCORES]
         finished_flags = finished_state[_StateKeys.FINISHED_FLAGS]

-        # # Account for corner case where there are no finished sequences for a
-        # # particular batch item. In that case, return alive sequences for that batch
-        # # item.
-        # finished_seq = tf.where(tf.reduce_any(finished_flags, 1), finished_seq, alive_seq)
-        # finished_scores = tf.where(tf.reduce_any(finished_flags, 1), finished_scores, alive_log_probs)
         return finished_seq, finished_scores


 def sequence_beam_search(
     symbols_to_logits_fn, initial_ids, initial_cache, vocab_size, beam_size, alpha, max_decode_length, eos_id
 ):
     """Search for sequence of subtoken ids with the largest probability.
-
-    Args:
-      symbols_to_logits_fn: A function that takes in ids, index, and cache as
-        arguments. The passed in arguments will have shape:
+
+    Parameters
+    -----------
+    symbols_to_logits_fn : A function with ids, index, and cache as arguments.
+        The passed in arguments will have shape:
         ids -> [batch_size * beam_size, index]
         index -> [] (scalar)
         cache -> nested dictionary of tensors [batch_size * beam_size, ...]
         The function must return logits and new cache.
         logits -> [batch * beam_size, vocab_size]
         new cache -> same shape/structure as inputted cache
-      initial_ids: Starting ids for each batch item.
-        int32 tensor with shape [batch_size]
-      initial_cache: dict containing starting decoder variables information
-      vocab_size: int size of tokens
-      beam_size: int number of beams
-      alpha: float defining the strength of length normalization
-      max_decode_length: maximum length to decoded sequence
-      eos_id: int id of eos token, used to determine when a sequence has finished
-
-    Returns:
-      Top decoded sequences [batch_size, beam_size, max_decode_length]
-      sequence scores [batch_size, beam_size]
-    """
+    initial_ids : int with shape [batch_size]
+        Starting ids for each batch item.
+    initial_cache: dict
+        contain starting decoder variables information
+    vocab_size: int
+        size of tokens
+    beam_size: int
+        number of beams
+    alpha: float
+        strength of length normalization
+    max_decode_length: int
+        maximum length to decoded sequence
+    eos_id: int
+        id of eos token, used to determine when a sequence has finished
+
+    Returns
+    -------
+    Top decoded sequences [batch_size, beam_size, max_decode_length]
+    sequence scores [batch_size, beam_size]
+    """
+
     batch_size = tf.shape(initial_ids)[0]

     sbs = SequenceBeamSearchV2(
@@ -85,11 +89,14 @@ def sequence_beam_search(
 def _expand_to_same_rank(tensor, target):
     """Expands a given tensor to target's rank to be broadcastable.

-    Args:
+    Parameters
+    -----------
+
     tensor: input tensor to tile. Shape: [b, d1, ..., da]
     target: target tensor. Shape: [b, d1, ..., da, ..., dn]

-    Returns:
+    Returns:
+    -----------
     Tiled tensor of shape [b, d1, ..., da, 1, ..., 1] with same rank of target.

     Raises:
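
The reworked docstring above lists everything a caller of sequence_beam_search supplies. Below is a shape-level usage sketch based only on that docstring: the toy symbols_to_logits_fn (uniform logits), the vocabulary size, the eos id, and the empty initial cache are hypothetical stand-ins, and a real decoder would pass its per-layer key/value tensors in the cache, so treat this as an illustration of the documented shapes rather than a working translation example.

import tensorflow as tf

from tensorlayer.models.transformer.beamsearchHelper.beam_search import sequence_beam_search

VOCAB_SIZE = 64  # hypothetical token count
EOS_ID = 1  # hypothetical end-of-sequence id


def symbols_to_logits_fn(ids, index, cache):
    # ids -> [batch_size * beam_size, index], index -> scalar, cache -> nested dict of tensors
    logits = tf.zeros([tf.shape(ids)[0], VOCAB_SIZE])  # uniform scores, for illustration only
    return logits, cache


initial_ids = tf.zeros([2], dtype=tf.int32)  # batch_size = 2, start-of-sequence ids
initial_cache = {}  # a real decoder supplies per-layer key/value tensors here

decoded, scores = sequence_beam_search(
    symbols_to_logits_fn, initial_ids, initial_cache, vocab_size=VOCAB_SIZE, beam_size=4, alpha=0.6,
    max_decode_length=10, eos_id=EOS_ID
)
# decoded -> [batch_size, beam_size, max_decode_length], scores -> [batch_size, beam_size]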
