import os
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
import tensorflow as tf
import numpy as np
# Attributes of the `Transformer` Class
### 1. `num_layers`
# Defines the number of layers (or "blocks") in both the encoder and decoder stacks of the Transformer. Each layer consists of multi-head attention followed by a feed-forward network.
### 2. `d_model`
# The dimensionality of the input and output embeddings, as well as the hidden states in each layer. It represents the size of the vectors that flow through the model.
### 3. `num_heads`
# Specifies how many attention heads are used in each multi-head attention block. The queries, keys, and values are split across the heads, allowing the model to attend to different parts of the input sequence in parallel.
# Example: In the standard Transformer, `num_heads=8` means 8 parallel attention heads.
### 4. `d_ff`
# The dimensionality of the position-wise feed-forward network within each layer. After the multi-head attention, the output goes through a feed-forward network with a hidden layer of size `d_ff`.
# Example: If `d_ff=2048`, the feed-forward hidden layer has 2048 units.
### 5. `input_vocab_size`
# Defines the size of the input vocabulary. It is the total number of distinct tokens or words that can appear in the input data. The input sequences will be converted into embeddings based on this vocabulary size.
# For a language model with a vocabulary of 10,000 unique words, `input_vocab_size=10000`.
### 6. `target_vocab_size`
# Similar to `input_vocab_size`, this defines the size of the target vocabulary. It is the total number of tokens or words that can appear in the output data (e.g., translations or sequences to be generated).
# For a translation task with 10,000 target words, `target_vocab_size=10000`.
### 7. `max_len`
# Defines the maximum length of the input and output sequences. It is used to compute positional encodings, which are necessary because the model doesn't have any inherent sense of the order of tokens.
# Example: If `max_len=5000`, the model can handle sequences of up to 5,000 tokens.
### **Overall Purpose of Attributes**:
# `num_layers`, `d_model`, `num_heads`, and `d_ff` define the complexity and capacity of the Transformer model. Larger values typically allow the model to capture more intricate patterns but increase computational cost.
# `input_vocab_size`, `target_vocab_size`, and `max_len` define the nature of the input/output data and how the model processes sequences.
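# Example instantiation (hypothetical values; the first four match the base model in "Attention Is All You Need",
# the vocabulary sizes and max_len reuse the examples above):
#   Transformer(num_layers=6, d_model=512, num_heads=8, d_ff=2048,
#               input_vocab_size=10000, target_vocab_size=10000, max_len=5000)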
class Transformer:
def __init__(self, num_layers, d_model, num_heads, d_ff, input_vocab_size, target_vocab_size, max_len=10000):
self.num_layers = num_layers
self.d_model = d_model
self.num_heads = num_heads
self.d_ff = d_ff
self.input_vocab_size = input_vocab_size
        self.target_vocab_size = target_vocab_size
self.max_len = max_len
# Scaled Dot-Product Attention Layer
def scaled_dot_product_attention(self, Q, K, V, mask=None):
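        # Computes Attention(Q, K, V) = softmax(Q K^T / sqrt(d_k)) V, where d_k is the
        # dimensionality of the key vectors. Note: this standalone implementation is kept for
        # reference; multi_head_attention below relies on tf.keras.layers.MultiHeadAttention,
        # which performs the same scaled dot-product attention internally.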
# Compute the dot products of the query (Q) and key (K) matrices.
matmul_qk = tf.matmul(Q, K, transpose_b=True)
# Scale the dot products by the square root of the dimensionality of the key vectors.
dk = tf.cast(tf.shape(K)[-1], tf.float32)
scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
# Apply the mask to the scaled attention logits (if provided) to ignore certain positions.
if mask is not None:
scaled_attention_logits += (mask * -1e9)
# Apply softmax to the scaled attention logits to get the attention weights.
attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
# Multiply the attention weights with the value (V) matrix to get the final output.
output = tf.matmul(attention_weights, V)
return output, attention_weights
# Multi-Head Attention Layer
def multi_head_attention(self, Q, K, V, mask=None):
# Ensure that the dimensionality of the model is divisible by the number of heads.
assert self.d_model % self.num_heads == 0
# Define the multi-head attention layer
mha_layer = tf.keras.layers.MultiHeadAttention(
num_heads=self.num_heads,
key_dim=self.d_model // self.num_heads,
dropout=0.1 # You can adjust dropout if needed
)
# Apply the multi-head attention layer
attention_output = mha_layer(query=Q, key=K, value=V, attention_mask=mask)
# Define a final dense layer for the output
dense = tf.keras.layers.Dense(self.d_model)
# Apply the final dense layer to the attention output
output = dense(attention_output)
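        # The Keras layer call above does not expose the attention weights here, so None is
        # returned in their place to keep the same (output, weights) interface as
        # scaled_dot_product_attention.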
return output, None
# Feed Forward Network applied at each position
def positionwise_feed_forward(self, x):
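        # Implements FFN(x) = max(0, x W1 + b1) W2 + b2: a Dense layer that expands to d_ff
        # units with ReLU, followed by a Dense layer that projects back to d_model.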
# Define the first dense layer with ReLU activation.
dense1 = tf.keras.layers.Dense(self.d_ff, activation='relu')
# Define the second dense layer to project back to d_model dimension.
dense2 = tf.keras.layers.Dense(self.d_model)
# Apply the first dense layer
x = dense1(x)
# Apply the second dense layer
return dense2(x)
# Positional Encoding Layer to inject position information into the embeddings
def positional_encoding(self):
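        # Implements the sinusoidal encoding from the original Transformer paper:
        #   PE(pos, 2i)   = sin(pos / 10000^(2i / d_model))
        #   PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))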
def get_angles(position, i, d_model):
# Compute the angle rates for positional encoding.
angles = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
return position * angles
# Calculate the angles for all positions and dimensions.
angle_rads = get_angles(np.arange(self.max_len)[:, np.newaxis],
np.arange(self.d_model)[np.newaxis, :],
self.d_model)
# Apply sine to even indices and cosine to odd indices.
angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
# Add an extra batch dimension and cast to float32.
pos_encoding = angle_rads[np.newaxis, ...]
return tf.cast(pos_encoding, dtype=tf.float32)
# Encoder Layer which consists of multi-head attention and feed forward network
    def encoder_layer(self, x, dropout_rate=0.1, mask=None):
        # Apply multi-head self-attention.
        mha_output, _ = self.multi_head_attention(x, x, x, mask)
        # Apply dropout, then a residual connection and layer normalization (Add & Norm block).
        # The training flag is left to Keras so dropout is active during training only.
        mha_output = tf.keras.layers.Dropout(dropout_rate)(mha_output)
        out1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)(x + mha_output)
        # Apply the position-wise feed-forward network.
        ffn_output = self.positionwise_feed_forward(out1)
        ffn_output = tf.keras.layers.Dropout(dropout_rate)(ffn_output)
        # Apply another residual connection and normalization.
        return tf.keras.layers.LayerNormalization(epsilon=1e-6)(out1 + ffn_output)
# Decoder Layer which includes self-attention, encoder-decoder attention, and feed-forward network
    def decoder_layer(self, x, enc_output, dropout_rate=0.1, tgt_mask=None, src_mask=None):
        # Apply multi-head self-attention over the decoder input (with the target mask, if provided).
        mha1_output, _ = self.multi_head_attention(x, x, x, tgt_mask)
        # Apply dropout, then a residual connection and layer normalization.
        mha1_output = tf.keras.layers.Dropout(dropout_rate)(mha1_output)
        out1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)(x + mha1_output)
        # Apply multi-head attention over the encoder output (encoder-decoder attention).
        mha2_output, _ = self.multi_head_attention(out1, enc_output, enc_output, src_mask)
        mha2_output = tf.keras.layers.Dropout(dropout_rate)(mha2_output)
        out2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)(out1 + mha2_output)
        # Apply the position-wise feed-forward network.
        ffn_output = self.positionwise_feed_forward(out2)
        ffn_output = tf.keras.layers.Dropout(dropout_rate)(ffn_output)
        return tf.keras.layers.LayerNormalization(epsilon=1e-6)(out2 + ffn_output)
# Encoder Stack: Stacks multiple Encoder layers
    def encoder(self, seq_len, dropout_rate=0.1):
        # Token IDs of the source sequence.
        inputs = tf.keras.layers.Input(shape=(seq_len,), dtype=tf.int64)
        embeddings = tf.keras.layers.Embedding(self.input_vocab_size, self.d_model)(inputs)
        # Add positional encodings, truncated to the actual sequence length.
        pos_encodings = self.positional_encoding()[:, :seq_len, :]
        x = embeddings + pos_encodings
        x = tf.keras.layers.Dropout(dropout_rate)(x)
        for _ in range(self.num_layers):
            x = self.encoder_layer(x, dropout_rate)
        return tf.keras.Model(inputs=inputs, outputs=x)
# Decoder Stack: Stacks multiple Decoder layers
    def decoder(self, seq_len, enc_seq_len=None, dropout_rate=0.1):
        # Token IDs of the target sequence and the encoder's output states.
        targets = tf.keras.layers.Input(shape=(seq_len,), dtype=tf.int64)
        enc_output = tf.keras.layers.Input(shape=(enc_seq_len, self.d_model), dtype=tf.float32)
        embeddings = tf.keras.layers.Embedding(self.target_vocab_size, self.d_model)(targets)
        # Add positional encodings, truncated to the actual sequence length.
        pos_encodings = self.positional_encoding()[:, :seq_len, :]
        x = embeddings + pos_encodings
        x = tf.keras.layers.Dropout(dropout_rate)(x)
        for _ in range(self.num_layers):
            x = self.decoder_layer(x, enc_output, dropout_rate)
        # Final projection onto the target vocabulary.
        x = tf.keras.layers.Dense(
            self.target_vocab_size,
            activation='softmax'  # softmax activation for classification over the vocabulary
        )(x)
        return tf.keras.Model(inputs=[targets, enc_output], outputs=x)
    # Transformer Model: Combines the encoder and decoder stacks into the full model
    def predict(self, input_seq, target_seq):
        # The example sequences are only used to fix the input and target lengths.
        input_len = len(input_seq)
        target_len = len(target_seq)
        encoder_inputs = tf.keras.layers.Input(shape=(input_len,), dtype=tf.int64)
        decoder_inputs = tf.keras.layers.Input(shape=(target_len,), dtype=tf.int64)
        # Build the encoder and decoder sub-models and wire them together.
        encoder_model = self.encoder(input_len)
        decoder_model = self.decoder(target_len, enc_seq_len=input_len)
        enc_output = encoder_model(encoder_inputs)
        # The decoder already applies the final projection onto the target vocabulary.
        final_output = decoder_model([decoder_inputs, enc_output])
        return tf.keras.Model(inputs=[encoder_inputs, decoder_inputs], outputs=final_output)
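

# A minimal smoke test of the model builder. The hyperparameters and token sequences below
# are hypothetical small values chosen for illustration, not taken from any particular
# configuration; it assumes a TF 2.x environment where tf.keras builds functional graphs.
if __name__ == "__main__":
    transformer = Transformer(num_layers=2, d_model=64, num_heads=4, d_ff=128,
                              input_vocab_size=1000, target_vocab_size=1000, max_len=128)
    # Example token-ID sequences; only their lengths matter when building the model.
    src_tokens = [5, 42, 7, 9, 3, 0, 0, 0]   # source length 8
    tgt_tokens = [1, 17, 23, 4, 2, 0]        # target length 6
    model = transformer.predict(src_tokens, tgt_tokens)
    model.summary()
    # Forward pass on a batch containing a single source/target pair.
    src_batch = np.array([src_tokens], dtype=np.int64)
    tgt_batch = np.array([tgt_tokens], dtype=np.int64)
    probs = model([src_batch, tgt_batch])
    print(probs.shape)  # Expected: (1, 6, 1000), one distribution over the vocabulary per target position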