
Commit 1a193c3

Author: Ajith Thomas
Commit message: Initial commit
0 parents, commit 1a193c3

File tree

7 files changed: +438 -0 lines changed


.gitignore

+4
# Directories to be ignored
/.idea/
/notebooks/
/__pycache__/

addressing.py

+121
#!/usr/bin/python3

import tensorflow as tf
import numpy as np

from tensorflow.keras import Model
from tensorflow.keras.layers import Dense


class Addressing(Model):

    def __init__(self, memory_locations=128, memory_vector_size=20, maximum_shifts=3, reading=True):
        super(Addressing, self).__init__()

        self.memory_locations = memory_locations  # N locations
        self.memory_vector_size = memory_vector_size  # M vector size
        self.maximum_shifts = maximum_shifts
        self.reading = reading

        # Split sizes for the emitted head parameters: k_t, beta_t, g_t, s_t, gamma_t (+ e_t, a_t when writing)
        self.read_split = [self.memory_vector_size, 1, 1, self.maximum_shifts, 1]
        self.write_split = [self.memory_vector_size, 1, 1, self.maximum_shifts, 1,
                            self.memory_vector_size, self.memory_vector_size]

        if self.reading:
            self.emit_len = np.sum(self.read_split)
        else:
            self.emit_len = np.sum(self.write_split)

        self.fc_addr = Dense(units=self.emit_len, activation='sigmoid', name="emit_params", trainable=False,
                             kernel_initializer='glorot_uniform', bias_initializer='glorot_normal')

        self.k_t = None
        self.beta_t = None
        self.g_t = None
        self.s_t = None
        self.gamma_t = None

        self.e_t = None
        self.a_t = None

        # All of the below are the weights over N locations produced by the addressing mechanism
        # [Batch size, N]
        self.w_c_t = None
        self.w_g_t = None
        self.w_tilde_t = None
        self.w_t = None

    def emit_head_params(self, fc_output):

        if self.reading:
            k_t, beta_t, g_t, s_t, gamma_t = tf.split(fc_output, self.read_split, axis=-1)

            self.k_t = tf.identity(k_t)
            self.beta_t = tf.nn.softplus(beta_t)
            self.g_t = tf.nn.sigmoid(g_t)
            self.s_t = tf.nn.softmax(s_t, axis=1)
            self.gamma_t = 1.0 + tf.nn.softplus(gamma_t)

        else:
            k_t, beta_t, g_t, s_t, gamma_t, e_t, a_t = tf.split(fc_output, self.write_split, axis=-1)

            self.k_t = tf.identity(k_t)
            self.beta_t = tf.nn.softplus(beta_t)
            self.g_t = tf.nn.sigmoid(g_t)
            self.s_t = tf.nn.softmax(s_t, axis=1)
            self.gamma_t = 1.0 + tf.nn.softplus(gamma_t)

            self.e_t = tf.nn.sigmoid(e_t)
            self.a_t = tf.identity(a_t)

    @staticmethod
    def cosine_similarity(k, m):
        # k: [Batch size, 1, M], m: [Batch size, N, M] -> similarity: [Batch size, N]
        k_mag = tf.sqrt(tf.reduce_sum(tf.square(k), axis=-1))
        m_mag = tf.sqrt(tf.reduce_sum(tf.square(m), axis=-1))
        mag_prod = tf.multiply(k_mag, m_mag)
        dot = tf.squeeze(tf.keras.layers.dot([k, m], axes=(-1, -1)), axis=1)
        return tf.divide(dot, mag_prod)

    @staticmethod
    def circular_convolution(w, s):
        kernels = tf.TensorArray(dtype=s.dtype, size=s.shape[0])

        for i in range(0, s.shape[0]):
            # TensorArray.write returns the updated array; capture it so the writes are kept
            kernels = kernels.write(i, tf.roll(w, shift=i - (s.shape[0] // 2), axis=0))

        w_circ_conv = tf.transpose(kernels.stack())
        return tf.reduce_sum(w_circ_conv * s, axis=1)

    def content_addressing(self, M_t):
        k_t = tf.expand_dims(self.k_t, axis=1)
        self.w_c_t = tf.nn.softmax(self.beta_t * self.cosine_similarity(k_t, M_t), axis=1)

    def interpolation(self, w_t_prev):
        self.w_g_t = (self.g_t * self.w_c_t) + ((1 - self.g_t) * w_t_prev)

    def convolutional_shift(self):
        convolved_weights = tf.TensorArray(dtype=self.w_g_t.dtype, size=self.w_g_t.shape[0])

        for i in range(self.s_t.shape[0]):
            cc = self.circular_convolution(self.w_g_t[i], self.s_t[i])
            convolved_weights = convolved_weights.write(i, cc)

        self.w_tilde_t = convolved_weights.stack()

    def sharpening(self):
        self.w_t = tf.nn.softmax(tf.pow(self.w_tilde_t, self.gamma_t), axis=1)

    def call(self, controller_output, w_t_prev, M_t):
        # Controller outputs used for addressing
        self.emit_head_params(self.fc_addr(controller_output))

        # Addressing mechanism
        self.content_addressing(M_t)
        self.interpolation(w_t_prev)
        self.convolutional_shift()
        self.sharpening()

        if self.reading:
            return self.w_t  # The new weights over the N locations of the memory matrix
        else:
            return self.w_t, self.e_t, self.a_t
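
A quick shape sanity check for the addressing pipeline; this is a minimal sketch, and the batch size, controller width, and random tensors below are illustrative assumptions, not part of the commit:

import tensorflow as tf
from addressing import Addressing

batch, N, M = 2, 128, 20
addr = Addressing(memory_locations=N, memory_vector_size=M, maximum_shifts=3, reading=True)

controller_output = tf.random.normal([batch, 100])            # stand-in controller features
w_prev = tf.nn.softmax(tf.random.normal([batch, N]), axis=1)  # previous weights over N locations
memory = tf.random.uniform([batch, N, M])                     # stand-in memory matrix

w_t = addr(controller_output, w_prev, memory)
print(w_t.shape)  # (2, 128)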

controller.py

+26
#!/usr/bin/python3

import tensorflow as tf

from tensorflow.keras import Model
from tensorflow.keras.layers import Dense


class Controller(Model):

    def __init__(self, controller_size=100):
        super(Controller, self).__init__()

        # 3-layer feedforward controller
        self.d1 = Dense(units=controller_size, activation=tf.nn.sigmoid, name="controller_d1",
                        kernel_initializer='glorot_uniform', bias_initializer='glorot_normal')
        self.d2 = Dense(units=controller_size, activation=tf.nn.sigmoid, name="controller_d2",
                        kernel_initializer='glorot_uniform', bias_initializer='glorot_normal')
        self.d3 = Dense(units=controller_size, activation=tf.nn.sigmoid, name="controller_d3",
                        kernel_initializer='glorot_uniform', bias_initializer='glorot_normal')

    def call(self, controller_input):
        out = self.d1(controller_input)
        out = self.d2(out)
        return self.d3(out)
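
Usage is a plain feedforward mapping; a minimal sketch with illustrative shapes (the 28-feature input below is an assumption, standing in for the external input concatenated with the previous read vector):

import tensorflow as tf
from controller import Controller

ctrl = Controller(controller_size=100)
x = tf.random.normal([4, 28])  # [Batch size, Features + M] in the NTM wiring
h = ctrl(x)
print(h.shape)  # (4, 100)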

heads.py

+44
#!/usr/bin/python3

import tensorflow as tf

from tensorflow.keras import Model

from addressing import Addressing


class ReadHead(Model):

    def __init__(self, memory_locations=128, memory_vector_size=20, maximum_shifts=3):
        super(ReadHead, self).__init__()

        self.addr_mech = Addressing(memory_locations, memory_vector_size, maximum_shifts, reading=True)

    def call(self, controller_output, w_t_1, M_t):
        w_t = self.addr_mech(controller_output, w_t_1, M_t)
        r_t = tf.squeeze(tf.matmul(tf.expand_dims(w_t, axis=1), M_t), axis=1)
        return r_t, w_t


class WriteHead(Model):

    def __init__(self, memory_locations=128, memory_vector_size=20, maximum_shifts=3):
        super(WriteHead, self).__init__()

        self.memory_vector_size = memory_vector_size
        self.addr_mech = Addressing(memory_locations, memory_vector_size, maximum_shifts, reading=False)

    def call(self, controller_output, w_t_1, M_t_1):
        w_t, e_t, a_t = self.addr_mech(controller_output, w_t_1, M_t_1)
        w_t = tf.expand_dims(w_t, axis=1)

        # Erase
        e_t = tf.expand_dims(e_t, axis=1)
        M_tilde_t = tf.multiply(M_t_1, (1.0 - tf.matmul(w_t, e_t, transpose_a=True)))

        # Add
        a_t = tf.expand_dims(a_t, axis=1)
        M_t = M_tilde_t + tf.matmul(w_t, a_t, transpose_a=True)

        return M_t, tf.squeeze(e_t, axis=1), tf.squeeze(a_t, axis=1), tf.squeeze(w_t, axis=1)
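
A read/write round trip can be sketched as below; the shapes and random tensors are illustrative assumptions, not from the commit:

import tensorflow as tf
from heads import ReadHead, WriteHead

batch, N, M = 2, 128, 20
read_head = ReadHead(N, M, maximum_shifts=3)
write_head = WriteHead(N, M, maximum_shifts=3)

controller_out = tf.random.normal([batch, 100])
w_prev = tf.nn.softmax(tf.random.normal([batch, N]), axis=1)
memory = tf.random.uniform([batch, N, M])

r_t, w_r = read_head(controller_out, w_prev, memory)             # [2, 20], [2, 128]
M_t, e_t, a_t, w_w = write_head(controller_out, w_prev, memory)  # [2, 128, 20], [2, 20], [2, 20], [2, 128]
print(r_t.shape, M_t.shape)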

ntm.py

+97
#!/usr/bin/python3

import tensorflow as tf
import numpy as np

from controller import Controller
from heads import ReadHead, WriteHead

from tensorflow.keras import Model
from tensorflow.keras.layers import Dense


class NTM(Model):

    def __init__(self, controller_size=100, memory_locations=128, memory_vector_size=20, maximum_shifts=3, output_size=8):
        super(NTM, self).__init__()

        self.memory_locations = memory_locations  # N locations
        self.memory_vector_size = memory_vector_size  # M size memory vectors
        self.maximum_shifts = maximum_shifts

        self.controller = Controller(controller_size)
        self.read_head = ReadHead(self.memory_locations, self.memory_vector_size, self.maximum_shifts)
        self.write_head = WriteHead(self.memory_locations, self.memory_vector_size, self.maximum_shifts)

        self.final_fc = Dense(units=output_size, activation=tf.nn.sigmoid, name="final_fc",
                              kernel_initializer='glorot_uniform', bias_initializer='glorot_normal')

        self.stddev = 1.0 / (np.sqrt(self.memory_locations + self.memory_vector_size))

        # Bias values used to initialize the state (constants here, not learned)
        self.r_bias = tf.constant(tf.random.normal([1, self.memory_vector_size]) * 0.01)  # Bias for previous reads
        self.M_bias = tf.constant(tf.random.uniform([1, self.memory_locations, self.memory_vector_size],
                                                    minval=-self.stddev, maxval=self.stddev))  # Bias for the memory matrix

        # States of the NTM
        self.r_t_1 = None  # Previous read vector variable [Batch size, M]
        self.w_t_1 = None  # Previous weights over the memory matrix [Batch size, N]
        self.M_t = None  # The memory matrix [Batch size, N, M]

        # Extra outputs that are tracked
        self.e_t = None
        self.a_t = None

    def create_new_state(self, batch_size):  # Creates a new NTM state
        # This has to be called manually if stateful is set to True
        if self.r_t_1 is None:
            self.r_t_1 = tf.Variable(tf.tile(self.r_bias, [batch_size, 1]), trainable=False)
        else:
            self.r_t_1.assign(tf.tile(self.r_bias, [batch_size, 1]))

        if self.w_t_1 is None:
            self.w_t_1 = tf.Variable(tf.zeros([batch_size, self.memory_locations]), trainable=False)
        else:
            self.w_t_1.assign(tf.zeros([batch_size, self.memory_locations]))

        if self.M_t is None:
            self.M_t = tf.Variable(tf.tile(self.M_bias, [batch_size, 1, 1]), trainable=False)
        else:
            self.M_t.assign(tf.tile(self.M_bias, [batch_size, 1, 1]))

    def call(self, inputs, stateful=False):
        # Convert from [Batch, Timesteps, Features] to [Timesteps, Batch, Features]
        inputs = tf.transpose(inputs, [1, 0, 2])
        outputs = tf.TensorArray(dtype=inputs.dtype, size=inputs.shape[0])

        if not stateful:  # A new state is created at the start of each batch unless stateful is True
            self.create_new_state(inputs.shape[1])

        for i in range(inputs.shape[0]):
            # Concatenate the input and the previous read vector [Batch, Features + M]
            controller_inputs = tf.concat([inputs[i], self.r_t_1], axis=1)
            controller_outputs = self.controller(controller_inputs)  # [Batch size, Controller size]

            r_t, w_t = self.read_head(controller_outputs, tf.identity(self.w_t_1), tf.identity(self.M_t))  # [Batch size, M], [Batch size, N]
            self.r_t_1.assign(r_t)
            self.w_t_1.assign(w_t)

            # [Batch size, N, M], [Batch size, M], [Batch size, M], [Batch size, N]
            M_t, self.e_t, self.a_t, w_t = self.write_head(controller_outputs, tf.identity(self.w_t_1), tf.identity(self.M_t))
            self.M_t.assign(M_t)
            self.w_t_1.assign(w_t)

            fc_input = tf.concat([controller_outputs, self.r_t_1], axis=1)  # [Batch size, Controller size + M]
            output_t = self.final_fc(fc_input)  # [Batch size, Output size]
            outputs = outputs.write(i, output_t)  # TensorArray.write returns the updated array; capture it

        outputs = tf.transpose(outputs.stack(), [1, 0, 2])  # [Batch size, Timesteps, Output size]
        return outputs


# ntm = NTM(controller_size=100, memory_locations=10, memory_vector_size=5, output_size=3)

# # [Batch, Timesteps, Features]
# inp = tf.Variable(tf.reshape(tf.range(0.0, 4.0, 0.1), [2, 5, 4]))
# out = ntm(inp)
# print(out)
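
For stateful use, the comment in create_new_state says the state must be reset by hand; a minimal sketch under that assumption, with the same illustrative shapes as the commented example above:

import tensorflow as tf
from ntm import NTM

ntm = NTM(controller_size=100, memory_locations=10, memory_vector_size=5, output_size=3)
inp = tf.reshape(tf.range(0.0, 4.0, 0.1), [2, 5, 4])  # [Batch, Timesteps, Features]

ntm.create_new_state(batch_size=2)  # manual reset, required when stateful=True
out_1 = ntm(inp, stateful=True)     # memory and read state carry over
out_2 = ntm(inp, stateful=True)     # continues from the state left by the first call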

seqgen.py

+51
#!/usr/bin/python3

import numpy as np


# Function to generate sequences of different lengths for the copy task
def generate_patterns(batch_size=100,
                      max_sequence=20,
                      min_sequence=1,
                      in_bits=8,
                      out_bits=8,
                      pad=1e-12,
                      low_tol=1e-12,
                      high_tol=1.0,
                      fixed_seq_len=False):

    ti = []
    to = []

    for _ in range(batch_size):

        if not fixed_seq_len:
            seq_len_row = np.random.randint(low=min_sequence, high=max_sequence + 1)
        else:
            seq_len_row = max_sequence

        pat = np.random.randint(low=0, high=2, size=(seq_len_row, out_bits))
        pat = pat.astype(np.float32)

        # Applying tolerance (so that values don't go to zero and cause NaN errors)
        pat[pat < 1] = low_tol
        pat[pat >= 1] = high_tol

        # Padding can be added if needed
        x = np.ones(((max_sequence * 2) + 2, in_bits + 2), dtype=pat.dtype) * pad  # Input pattern has two extra side tracks
        y = np.ones(((max_sequence * 2) + 2, out_bits), dtype=pat.dtype) * pad  # Side tracks are not produced

        # Creates a delayed output (target delay)
        x[1:seq_len_row + 1, 2:] = pat
        y[seq_len_row + 2:(2 * seq_len_row) + 2, :] = pat  # No side tracks needed for the output

        x[1:seq_len_row + 1, 0:2] = low_tol
        x[0, :] = low_tol
        x[0, 1] = 1.0  # Start-of-sequence marker
        x[seq_len_row + 1, :] = low_tol
        x[seq_len_row + 1, 0] = 1.0  # End-of-sequence marker

        ti.append(x)
        to.append(y)

    return np.array(ti), np.array(to)
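
Putting the pieces together, a minimal hypothetical training sketch for the copy task; the hyperparameters, optimizer, and loss below are illustrative assumptions, not part of the commit. Note that the state updates in NTM.call use Variable.assign, which is not differentiable, so gradients only flow through the within-timestep path in this setup.

import tensorflow as tf

from ntm import NTM
from seqgen import generate_patterns

ntm = NTM(controller_size=100, memory_locations=32, memory_vector_size=10, output_size=8)
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
bce = tf.keras.losses.BinaryCrossentropy()

for step in range(100):
    # x: [Batch, (2 * max_sequence) + 2, in_bits + 2], y: [Batch, (2 * max_sequence) + 2, out_bits]
    x, y = generate_patterns(batch_size=8, max_sequence=5, in_bits=8, out_bits=8)

    with tf.GradientTape() as tape:
        y_pred = ntm(tf.constant(x))  # [Batch, Timesteps, out_bits]
        loss = bce(y, y_pred)

    grads = tape.gradient(loss, ntm.trainable_variables)
    optimizer.apply_gradients(zip(grads, ntm.trainable_variables))

    if step % 10 == 0:
        print(f"step {step}: loss {loss.numpy():.4f}")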
