diff --git a/deep_learning4e.py b/deep_learning4e.py
index 64aa49e90..734a9307c 100644
--- a/deep_learning4e.py
+++ b/deep_learning4e.py
@@ -9,14 +9,14 @@
 from keras.preprocessing import sequence
 
 from utils4e import (Sigmoid, dot_product, softmax1D, conv1D, gaussian_kernel, element_wise_product, vector_add,
-                     random_weights, scalar_vector_product, matrix_multiplication, map_vector, mse_loss)
+                     random_weights, scalar_vector_product, matrix_multiplication, map_vector, mean_squared_error_loss)
 
 
 class Node:
     """
     A node in a computational graph contains the pointer to all its parents.
-    :param val: value of current node.
-    :param parents: a container of all parents of current node.
+    :param val: value of current node
+    :param parents: a container of all parents of current node
     """
 
     def __init__(self, val=None, parents=None):
@@ -55,40 +55,40 @@ def forward(self, inputs):
         raise NotImplementedError
 
 
-class OutputLayer(Layer):
-    """1D softmax output layer in 19.3.2"""
+class InputLayer(Layer):
+    """1D input layer. Layer size is the same as input vector size."""
 
     def __init__(self, size=3):
         super().__init__(size)
 
     def forward(self, inputs):
+        """Take each value of the inputs to each unit in the layer."""
         assert len(self.nodes) == len(inputs)
-        res = softmax1D(inputs)
-        for node, val in zip(self.nodes, res):
-            node.val = val
-        return res
+        for node, inp in zip(self.nodes, inputs):
+            node.val = inp
+        return inputs
 
 
-class InputLayer(Layer):
-    """1D input layer. Layer size is the same as input vector size."""
+class OutputLayer(Layer):
+    """1D softmax output layer in 19.3.2."""
 
     def __init__(self, size=3):
         super().__init__(size)
 
     def forward(self, inputs):
-        """Take each value of the inputs to each unit in the layer."""
         assert len(self.nodes) == len(inputs)
-        for node, inp in zip(self.nodes, inputs):
-            node.val = inp
-        return inputs
+        res = softmax1D(inputs)
+        for node, val in zip(self.nodes, res):
+            node.val = val
+        return res
 
 
 class DenseLayer(Layer):
     """
     1D dense layer in a neural network.
-    :param in_size: input vector size, int.
-    :param out_size: output vector size, int.
-    :param activation: activation function, Activation object.
+    :param in_size: (int) input vector size
+    :param out_size: (int) output vector size
+    :param activation: (Activation object) activation function
     """
 
     def __init__(self, in_size=3, out_size=3, activation=None):
@@ -124,7 +124,7 @@ def __init__(self, size=3, kernel_size=3):
             node.weights = gaussian_kernel(kernel_size)
 
     def forward(self, features):
-        # each node in layer takes a channel in the features.
+        # each node in layer takes a channel in the features
         assert len(self.nodes) == len(features)
         res = []
         # compute the convolution output of each channel, store it in node.val
@@ -154,7 +154,8 @@ def forward(self, features):
         for i in range(len(self.nodes)):
             feature = features[i]
             # get the max value in a kernel_size * kernel_size area
-            out = [max(feature[i:i + self.kernel_size]) for i in range(len(feature) - self.kernel_size + 1)]
+            out = [max(feature[i:i + self.kernel_size])
+                   for i in range(len(feature) - self.kernel_size + 1)]
             res.append(out)
             self.nodes[i].val = out
         return res
@@ -270,13 +271,13 @@ def adam(dataset, net, loss, epochs=1000, rho=(0.9, 0.999), delta=1 / 10 ** 8,
 
 def BackPropagation(inputs, targets, theta, net, loss):
     """
-    The back-propagation algorithm for multilayer networks in only one epoch, to calculate gradients of theta
-    :param inputs: a batch of inputs in an array. Each input is an iterable object.
-    :param targets: a batch of targets in an array. Each target is an iterable object.
-    :param theta: parameters to be updated.
-    :param net: a list of predefined layer objects representing their linear sequence.
-    :param loss: a predefined loss function taking array of inputs and targets.
-    :return: gradients of theta, loss of the input batch.
+    The back-propagation algorithm for multilayer networks, run for a single epoch to calculate the gradients of theta.
+    :param inputs: a batch of inputs in an array. Each input is an iterable object
+    :param targets: a batch of targets in an array. Each target is an iterable object
+    :param theta: parameters to be updated
+    :param net: a list of predefined layer objects representing their linear sequence
+    :param loss: a predefined loss function taking array of inputs and targets
+    :return: gradients of theta, loss of the input batch
     """
 
     assert len(inputs) == len(targets)
@@ -325,9 +326,9 @@ def BackPropagation(inputs, targets, theta, net, loss):
 class BatchNormalizationLayer(Layer):
     """Batch normalization layer."""
 
-    def __init__(self, size, epsilon=0.001):
+    def __init__(self, size, eps=0.001):
         super().__init__(size)
-        self.epsilon = epsilon
+        self.eps = eps
         # self.weights = [beta, gamma]
         self.weights = [0, 0]
         self.inputs = None
@@ -341,7 +342,7 @@ def forward(self, inputs):
         res = []
         # get normalized value of each input
         for i in range(len(self.nodes)):
-            val = [(inputs[i] - mu) * self.weights[0] / np.sqrt(self.epsilon + stderr ** 2) + self.weights[1]]
+            val = [(inputs[i] - mu) * self.weights[0] / np.sqrt(self.eps + stderr ** 2) + self.weights[1]]
             res.append(val)
             self.nodes[i].val = val
         return res
@@ -375,7 +376,7 @@ def NeuralNetLearner(dataset, hidden_layer_sizes=None, learning_rate=0.01, epoch
     raw_net.append(DenseLayer(hidden_input_size, output_size))
 
     # update parameters of the network
-    learned_net = optimizer(dataset, raw_net, mse_loss, epochs, l_rate=learning_rate,
+    learned_net = optimizer(dataset, raw_net, mean_squared_error_loss, epochs, l_rate=learning_rate,
                             batch_size=batch_size, verbose=verbose)
 
     def predict(example):
@@ -394,7 +395,7 @@ def predict(example):
     return predict
 
 
-def PerceptronLearner(dataset, learning_rate=0.01, epochs=100, verbose=None):
+def PerceptronLearner(dataset, learning_rate=0.01, epochs=100, optimizer=gradient_descent, batch_size=1, verbose=None):
     """
     Simple perceptron neural network.
     """
@@ -405,7 +406,8 @@ def PerceptronLearner(dataset, learning_rate=0.01, epochs=100, verbose=None):
     raw_net = [InputLayer(input_size), DenseLayer(input_size, output_size)]
 
     # update the network
-    learned_net = gradient_descent(dataset, raw_net, mse_loss, epochs, l_rate=learning_rate, verbose=verbose)
+    learned_net = optimizer(dataset, raw_net, mean_squared_error_loss, epochs, l_rate=learning_rate,
+                            batch_size=batch_size, verbose=verbose)
 
     def predict(example):
         layer_out = learned_net[1].forward(example)
@@ -419,7 +421,7 @@ def SimpleRNNLearner(train_data, val_data, epochs=2):
     RNN example for text sentimental analysis.
     :param train_data: a tuple of (training data, targets)
             Training data: ndarray taking training examples, while each example is coded by embedding
-            Targets: ndarray taking targets of each example. Each target is mapped to an integer.
+            Targets: ndarray taking targets of each example. Each target is mapped to an integer
     :param val_data: a tuple of (validation data, targets)
     :param epochs: number of epochs
     :return: a keras model
diff --git a/learning.py b/learning.py
index 99ef8abc2..764392c7d 100644
--- a/learning.py
+++ b/learning.py
@@ -968,7 +968,7 @@ def ada_boost(dataset, L, K):
         h.append(h_k)
         error = sum(weight for example, weight in zip(examples, w) if example[target] != h_k(example))
         # avoid divide-by-0 from either 0% or 100% error rates
-        error = clip(error, eps, 1 - eps)
+        error = np.clip(error, eps, 1 - eps)
         for j, example in enumerate(examples):
             if example[target] == h_k(example):
                 w[j] *= error / (1 - error)
diff --git a/learning4e.py b/learning4e.py
index f581b9ec1..7dba31cfa 100644
--- a/learning4e.py
+++ b/learning4e.py
@@ -742,7 +742,7 @@ def ada_boost(dataset, L, K):
         h.append(h_k)
         error = sum(weight for example, weight in zip(examples, w) if example[target] != h_k(example))
         # avoid divide-by-0 from either 0% or 100% error rates
-        error = clip(error, eps, 1 - eps)
+        error = np.clip(error, eps, 1 - eps)
         for j, example in enumerate(examples):
             if example[target] == h_k(example):
                 w[j] *= error / (1 - error)
diff --git a/tests/test_deep_learning4e.py b/tests/test_deep_learning4e.py
index ed8979a0a..305c2e65c 100644
--- a/tests/test_deep_learning4e.py
+++ b/tests/test_deep_learning4e.py
@@ -11,8 +11,8 @@ def test_neural_net():
     iris = DataSet(name='iris')
     classes = ['setosa', 'versicolor', 'virginica']
     iris.classes_to_numbers(classes)
-    nnl_adam = NeuralNetLearner(iris, [4], learning_rate=0.001, epochs=200, optimizer=adam)
     nnl_gd = NeuralNetLearner(iris, [4], learning_rate=0.15, epochs=100, optimizer=gradient_descent)
+    nnl_adam = NeuralNetLearner(iris, [4], learning_rate=0.001, epochs=200, optimizer=adam)
     tests = [([5.0, 3.1, 0.9, 0.1], 0),
              ([5.1, 3.5, 1.0, 0.0], 0),
              ([4.9, 3.3, 1.1, 0.1], 0),
@@ -22,25 +22,28 @@ def test_neural_net():
              ([7.5, 4.1, 6.2, 2.3], 2),
              ([7.3, 4.0, 6.1, 2.4], 2),
              ([7.0, 3.3, 6.1, 2.5], 2)]
-    assert grade_learner(nnl_adam, tests) >= 1 / 3
     assert grade_learner(nnl_gd, tests) >= 1 / 3
-    assert err_ratio(nnl_adam, iris) < 0.21
     assert err_ratio(nnl_gd, iris) < 0.21
+    assert grade_learner(nnl_adam, tests) >= 1 / 3
+    assert err_ratio(nnl_adam, iris) < 0.21
 
 
 def test_perceptron():
     iris = DataSet(name='iris')
     classes = ['setosa', 'versicolor', 'virginica']
     iris.classes_to_numbers(classes)
-    pl = PerceptronLearner(iris, learning_rate=0.01, epochs=100)
+    pl_gd = PerceptronLearner(iris, learning_rate=0.01, epochs=100, optimizer=gradient_descent)
+    pl_adam = PerceptronLearner(iris, learning_rate=0.01, epochs=100, optimizer=adam)
     tests = [([5, 3, 1, 0.1], 0),
              ([5, 3.5, 1, 0], 0),
              ([6, 3, 4, 1.1], 1),
              ([6, 2, 3.5, 1], 1),
              ([7.5, 4, 6, 2], 2),
              ([7, 3, 6, 2.5], 2)]
-    assert grade_learner(pl, tests) > 1 / 2
-    assert err_ratio(pl, iris) < 0.4
+    assert grade_learner(pl_gd, tests) > 1 / 2
+    assert err_ratio(pl_gd, iris) < 0.4
+    assert grade_learner(pl_adam, tests) > 1 / 2
+    assert err_ratio(pl_adam, iris) < 0.4
 
 
 def test_rnn():
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 31b5848f0..6c2a50808 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -173,10 +173,6 @@ def test_normalize():
     assert normalize([1, 2, 1]) == [0.25, 0.5, 0.25]
 
 
-def test_clip():
-    assert [clip(x, 0, 1) for x in [-1, 0.5, 10]] == [0, 0.5, 1]
-
-
 def test_gaussian():
     assert gaussian(1, 0.5, 0.7) == 0.6664492057835993
     assert gaussian(5, 2, 4.5) == 0.19333405840142462
@@ -201,10 +197,6 @@ def test_distance_squared():
     assert distance_squared((1, 2), (5, 5)) == 25.0
 
 
-def test_vector_clip():
-    assert vector_clip((-1, 10), (0, 0), (9, 9)) == (0, 9)
-
-
 def test_turn_heading():
     assert turn_heading((0, 1), 1) == (-1, 0)
     assert turn_heading((0, 1), -1) == (1, 0)
diff --git a/utils.py b/utils.py
index 4bf29a9a3..fd683d34a 100644
--- a/utils.py
+++ b/utils.py
@@ -233,8 +233,20 @@ def euclidean_distance(x, y):
     return np.sqrt(sum((_x - _y) ** 2 for _x, _y in zip(x, y)))
 
 
+def manhattan_distance(x, y):
+    return sum(abs(_x - _y) for _x, _y in zip(x, y))
+
+
+def hamming_distance(x, y):
+    return sum(_x != _y for _x, _y in zip(x, y))
+
+
 def cross_entropy_loss(x, y):
-    return (-1.0 / len(x)) * sum(x * np.log(y) + (1 - x) * np.log(1 - y) for x, y in zip(x, y))
+    return (-1.0 / len(x)) * sum(_x * np.log(_y) + (1 - _x) * np.log(1 - _y) for _x, _y in zip(x, y))
+
+
+def mean_squared_error_loss(x, y):
+    return (1.0 / len(x)) * sum((_x - _y) ** 2 for _x, _y in zip(x, y))
 
 
 def rms_error(x, y):
@@ -242,25 +254,17 @@ def rms_error(x, y):
 
 
 def ms_error(x, y):
-    return mean((x - y) ** 2 for x, y in zip(x, y))
+    return mean((_x - _y) ** 2 for _x, _y in zip(x, y))
 
 
 def mean_error(x, y):
-    return mean(abs(x - y) for x, y in zip(x, y))
-
-
-def manhattan_distance(x, y):
-    return sum(abs(_x - _y) for _x, _y in zip(x, y))
+    return mean(abs(_x - _y) for _x, _y in zip(x, y))
 
 
 def mean_boolean_error(x, y):
     return mean(_x != _y for _x, _y in zip(x, y))
 
 
-def hamming_distance(x, y):
-    return sum(_x != _y for _x, _y in zip(x, y))
-
-
 def normalize(dist):
     """Multiply each number by a constant such that the sum is 1.0"""
     if isinstance(dist, dict):
@@ -277,20 +281,15 @@ def random_weights(min_value, max_value, num_weights):
     return [random.uniform(min_value, max_value) for _ in range(num_weights)]
 
 
-def clip(x, lowest, highest):
-    """Return x clipped to the range [lowest..highest]."""
-    return max(lowest, min(x, highest))
+def sigmoid(x):
+    """Return activation value of x with sigmoid function."""
+    return 1 / (1 + np.exp(-x))
 
 
 def sigmoid_derivative(value):
     return value * (1 - value)
 
 
-def sigmoid(x):
-    """Return activation value of x with sigmoid function."""
-    return 1 / (1 + np.exp(-x))
-
-
 def elu(x, alpha=0.01):
     return x if x > 0 else alpha * (np.exp(x) - 1)
 
@@ -389,13 +388,6 @@ def distance_squared(a, b):
     return (xA - xB) ** 2 + (yA - yB) ** 2
 
 
-def vector_clip(vector, lowest, highest):
-    """Return vector, except if any element is less than the corresponding
-    value of lowest or more than the corresponding value of highest, clip to
-    those values."""
-    return type(vector)(map(clip, vector, lowest, highest))
-
-
 # ______________________________________________________________________________
 # Misc Functions
 
@@ -484,7 +476,6 @@ def failure_test(algorithm, tests):
     to check for correctness. On the other hand, a lot of algorithms output something
     particular on fail (for example, False, or None).
     tests is a list with each element in the form: (values, failure_output)."""
-    from statistics import mean
     return mean(int(algorithm(x) != y) for x, y in tests)
 
 
diff --git a/utils4e.py b/utils4e.py
index 1c376066e..b0fbf8df8 100644
--- a/utils4e.py
+++ b/utils4e.py
@@ -319,6 +319,14 @@ def euclidean_distance(x, y):
     return np.sqrt(sum((_x - _y) ** 2 for _x, _y in zip(x, y)))
 
 
+def manhattan_distance(x, y):
+    return sum(abs(_x - _y) for _x, _y in zip(x, y))
+
+
+def hamming_distance(x, y):
+    return sum(_x != _y for _x, _y in zip(x, y))
+
+
 def rms_error(x, y):
     return np.sqrt(ms_error(x, y))
 
@@ -331,28 +339,20 @@ def mean_error(x, y):
     return mean(abs(x - y) for x, y in zip(x, y))
 
 
-def manhattan_distance(x, y):
-    return sum(abs(_x - _y) for _x, _y in zip(x, y))
-
-
 def mean_boolean_error(x, y):
     return mean(_x != _y for _x, _y in zip(x, y))
 
 
-def hamming_distance(x, y):
-    return sum(_x != _y for _x, _y in zip(x, y))
-
-
-# 19.2 Common Loss Functions
+# loss functions
 
 
 def cross_entropy_loss(x, y):
-    """Example of cross entropy loss. x and y are 1D iterable objects."""
-    return (-1.0 / len(x)) * sum(x * np.log(y) + (1 - x) * np.log(1 - y) for x, y in zip(x, y))
+    """Cross entropy loss function. x and y are 1D iterable objects."""
+    return (-1.0 / len(x)) * sum(_x * np.log(_y) + (1 - _x) * np.log(1 - _y) for _x, _y in zip(x, y))
 
 
-def mse_loss(x, y):
-    """Example of min square loss. x and y are 1D iterable objects."""
+def mean_squared_error_loss(x, y):
+    """Min square loss function. x and y are 1D iterable objects."""
     return (1.0 / len(x)) * sum((_x - _y) ** 2 for _x, _y in zip(x, y))
 
 
@@ -395,29 +395,21 @@ def gaussian_kernel_2D(size=3, sigma=0.5):
     return g / g.sum()
 
 
-# ______________________________________________________________________________
-# loss and activation functions
+# activation functions
 
 
 class Activation:
 
     def f(self, x):
-        pass
+        raise NotImplementedError
 
     def derivative(self, x):
-        pass
-
-
-def clip(x, lowest, highest):
-    """Return x clipped to the range [lowest..highest]."""
-    return max(lowest, min(x, highest))
+        raise NotImplementedError
 
 
 def softmax1D(x):
     """Return the softmax vector of input vector x."""
-    exps = [np.exp(_x) for _x in x]
-    sum_exps = sum(exps)
-    return [exp / sum_exps for exp in exps]
+    return np.exp(x) / sum(np.exp(x))
 
 
 class Sigmoid(Activation):
@@ -545,13 +537,6 @@ def distance_squared(a, b):
     return (xA - xB) ** 2 + (yA - yB) ** 2
 
 
-def vector_clip(vector, lowest, highest):
-    """Return vector, except if any element is less than the corresponding
-    value of lowest or more than the corresponding value of highest, clip to
-    those values."""
-    return type(vector)(map(clip, vector, lowest, highest))
-
-
 # ______________________________________________________________________________
 # Misc Functions
 
@@ -642,7 +627,6 @@ def failure_test(algorithm, tests):
     to check for correctness. On the other hand, a lot of algorithms output something
     particular on fail (for example, False, or None).
     tests is a list with each element in the form: (values, failure_output)."""
-    from statistics import mean
     return mean(int(algorithm(x) != y) for x, y in tests)