diff --git a/deep_learning4e.py b/deep_learning4e.py
index 0a0387afc..0e2aec242 100644
--- a/deep_learning4e.py
+++ b/deep_learning4e.py
@@ -8,8 +8,8 @@
 from keras.layers import Embedding, SimpleRNN, Dense
 from keras.preprocessing import sequence
 
-from utils4e import (Sigmoid, dot_product, softmax1D, conv1D, gaussian_kernel, element_wise_product, vector_add,
-                     random_weights, scalar_vector_product, matrix_multiplication, map_vector, mean_squared_error_loss)
+from utils4e import (softmax1D, conv1D, gaussian_kernel, element_wise_product, vector_add, random_weights,
+                     scalar_vector_product, map_vector, mean_squared_error_loss)
 
 
 class Node:
@@ -31,13 +31,67 @@ class Layer:
     """
 
     def __init__(self, size):
-        self.nodes = [Node() for _ in range(size)]
+        self.nodes = np.array([Node() for _ in range(size)])
 
     def forward(self, inputs):
         """Define the operation to get the output of this layer"""
         raise NotImplementedError
 
 
+class Activation:
+
+    def function(self, x):
+        raise NotImplementedError
+
+    def derivative(self, x):
+        raise NotImplementedError
+
+
+class Sigmoid(Activation):
+
+    def function(self, x):
+        return 1 / (1 + np.exp(-x))
+
+    def derivative(self, value):
+        return value * (1 - value)
+
+
+class Relu(Activation):
+
+    def function(self, x):
+        return max(0, x)
+
+    def derivative(self, value):
+        return 1 if value > 0 else 0
+
+
+class Elu(Activation):
+
+    def function(self, x, alpha=0.01):
+        return x if x > 0 else alpha * (np.exp(x) - 1)
+
+    def derivative(self, value, alpha=0.01):
+        return 1 if value > 0 else alpha * np.exp(value)
+
+
+class Tanh(Activation):
+
+    def function(self, x):
+        return np.tanh(x)
+
+    def derivative(self, value):
+        return 1 - (value ** 2)
+
+
+class LeakyRelu(Activation):
+
+    def function(self, x, alpha=0.01):
+        return x if x > 0 else alpha * x
+
+    def derivative(self, value, alpha=0.01):
+        return 1 if value > 0 else alpha
+
+
 class InputLayer(Layer):
     """1D input layer. Layer size is the same as input vector size."""
 
@@ -88,7 +142,7 @@ def forward(self, inputs):
         res = []
         # get the output value of each unit
         for unit in self.nodes:
-            val = self.activation.function(dot_product(unit.weights, inputs))
+            val = self.activation.function(np.dot(unit.weights, inputs))
             unit.value = val
             res.append(val)
         return res
@@ -144,6 +198,31 @@ def forward(self, features):
         return res
 
 
+class BatchNormalizationLayer(Layer):
+    """Batch normalization layer."""
+
+    def __init__(self, size, eps=0.001):
+        super().__init__(size)
+        self.eps = eps
+        # self.weights = [gamma (scale), beta (shift)]; gamma starts at 1 so the layer is initially the identity
+        self.weights = [1, 0]
+        self.inputs = None
+
+    def forward(self, inputs):
+        # mean value of inputs
+        mu = sum(inputs) / len(inputs)
+        # standard deviation of inputs
+        stderr = statistics.stdev(inputs)
+        self.inputs = inputs
+        res = []
+        # get normalized value of each input
+        for i in range(len(self.nodes)):
+            val = [(inputs[i] - mu) * self.weights[0] / np.sqrt(self.eps + stderr ** 2) + self.weights[1]]
+            res.append(val)
+            self.nodes[i].value = val
+        return res
+
+
 def init_examples(examples, idx_i, idx_t, o_units):
     """Init examples from dataset.examples."""
 
@@ -164,7 +243,7 @@ def init_examples(examples, idx_i, idx_t, o_units):
     return inputs, targets
 
 
-def stochastic_gradient_descent(dataset, net, loss, epochs=1000, l_rate=0.01, batch_size=1, verbose=None):
+def stochastic_gradient_descent(dataset, net, loss, epochs=1000, l_rate=0.01, batch_size=1, verbose=False):
     """
     Gradient descent algorithm to update the learnable parameters of a network.
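+    Each batch updates the weights by w <- w + (-l_rate) * g, where g is the
+    gradient returned by BackPropagation. A rough usage sketch (net is
+    assumed to be a layer list such as the one NeuralNetworkLearner builds):
+        net = stochastic_gradient_descent(dataset, net, mean_squared_error_loss,
+                                          epochs=100, l_rate=0.01, batch_size=10)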
     :return: the updated network
@@ -181,23 +260,23 @@ def stochastic_gradient_descent(dataset, net, loss, epochs=1000, l_rate=0.01, ba
             # compute gradients of weights
             gs, batch_loss = BackPropagation(inputs, targets, weights, net, loss)
             # update weights with gradient descent
-            weights = vector_add(weights, scalar_vector_product(-l_rate, gs))
+            weights = [x + y for x, y in zip(weights, [np.array(tg) * -l_rate for tg in gs])]
             total_loss += batch_loss
 
         # update the weights of network each batch
         for i in range(len(net)):
-            if weights[i]:
+            if weights[i].size != 0:
                 for j in range(len(weights[i])):
                     net[i].nodes[j].weights = weights[i][j]
 
-        if verbose and (e + 1) % verbose == 0:
+        if verbose:
             print("epoch:{}, total_loss:{}".format(e + 1, total_loss))
 
     return net
 
 
 def adam(dataset, net, loss, epochs=1000, rho=(0.9, 0.999), delta=1 / 10 ** 8,
-         l_rate=0.001, batch_size=1, verbose=None):
+         l_rate=0.001, batch_size=1, verbose=False):
     """
     [Figure 19.6]
     Adam optimizer to update the learnable parameters of a network.
@@ -247,7 +326,7 @@ def adam(dataset, net, loss, epochs=1000, rho=(0.9, 0.999), delta=1 / 10 ** 8,
             for j in range(len(weights[i])):
                 net[i].nodes[j].weights = weights[i][j]
 
-        if verbose and (e + 1) % verbose == 0:
+        if verbose:
             print("epoch:{}, total_loss:{}".format(e + 1, total_loss))
 
     return net
@@ -288,16 +367,16 @@ def BackPropagation(inputs, targets, theta, net, loss):
 
     # initialize delta
     delta = [[] for _ in range(n_layers)]
 
-    previous = [layer_out[i] - t_val[i] for i in range(o_units)]
+    previous = np.array([layer_out[i] - t_val[i] for i in range(o_units)])
     h_layers = n_layers - 1
 
     # backward pass
     for i in range(h_layers, 0, -1):
         layer = net[i]
-        derivative = [layer.activation.derivative(node.value) for node in layer.nodes]
-        delta[i] = element_wise_product(previous, derivative)
+        derivative = np.array([layer.activation.derivative(node.value) for node in layer.nodes])
+        delta[i] = previous * derivative
         # pass to layer i-1 in the next iteration
-        previous = matrix_multiplication([delta[i]], theta[i])[0]
+        previous = np.matmul([delta[i]], theta[i])[0]
         # compute gradient of layer i
         gradients[i] = [scalar_vector_product(d, net[i].inputs) for d in delta[i]]
 
@@ -307,98 +386,108 @@ def BackPropagation(inputs, targets, theta, net, loss):
     return total_gradients, batch_loss
 
 
-class BatchNormalizationLayer(Layer):
-    """Batch normalization layer."""
-
-    def __init__(self, size, eps=0.001):
-        super().__init__(size)
-        self.eps = eps
-        # self.weights = [beta, gamma]
-        self.weights = [0, 0]
-        self.inputs = None
-
-    def forward(self, inputs):
-        # mean value of inputs
-        mu = sum(inputs) / len(inputs)
-        # standard error of inputs
-        stderr = statistics.stdev(inputs)
-        self.inputs = inputs
-        res = []
-        # get normalized value of each input
-        for i in range(len(self.nodes)):
-            val = [(inputs[i] - mu) * self.weights[0] / np.sqrt(self.eps + stderr ** 2) + self.weights[1]]
-            res.append(val)
-            self.nodes[i].value = val
-        return res
-
-
 def get_batch(examples, batch_size=1):
     """Split examples into multiple batches"""
     for i in range(0, len(examples), batch_size):
         yield examples[i: i + batch_size]
 
 
-def NeuralNetLearner(dataset, hidden_layer_sizes, l_rate=0.01, epochs=1000, batch_size=1,
-                     optimizer=stochastic_gradient_descent, verbose=None):
+class NeuralNetworkLearner:
     """
     Simple dense multilayer neural network.
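+    A rough usage sketch (iris is assumed to be an already-loaded DataSet
+    whose classes have been mapped to numbers, as in the tests below):
+        nnl = NeuralNetworkLearner(iris, [4], l_rate=0.15, epochs=100).fit(X, y)
+        nnl.predict([5.0, 3.1, 0.9, 0.1])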
     :param hidden_layer_sizes: size of hidden layers in the form of a list
     """
 
-    input_size = len(dataset.inputs)
-    output_size = len(dataset.values[dataset.target])
-
-    # initialize the network
-    raw_net = [InputLayer(input_size)]
-    # add hidden layers
-    hidden_input_size = input_size
-    for h_size in hidden_layer_sizes:
-        raw_net.append(DenseLayer(hidden_input_size, h_size))
-        hidden_input_size = h_size
-    raw_net.append(DenseLayer(hidden_input_size, output_size))
-
-    # update parameters of the network
-    learned_net = optimizer(dataset, raw_net, mean_squared_error_loss, epochs, l_rate=l_rate,
-                            batch_size=batch_size, verbose=verbose)
-
-    def predict(example):
-        n_layers = len(learned_net)
+    def __init__(self, dataset, hidden_layer_sizes, l_rate=0.01, epochs=1000, batch_size=10,
+                 optimizer=stochastic_gradient_descent, loss=mean_squared_error_loss, verbose=False, plot=False):
+        self.dataset = dataset
+        self.l_rate = l_rate
+        self.epochs = epochs
+        self.batch_size = batch_size
+        self.optimizer = optimizer
+        self.loss = loss
+        self.verbose = verbose
+        self.plot = plot
+
+        input_size = len(dataset.inputs)
+        output_size = len(dataset.values[dataset.target])
+
+        # initialize the network
+        raw_net = [InputLayer(input_size)]
+        # add hidden layers
+        hidden_input_size = input_size
+        for h_size in hidden_layer_sizes:
+            raw_net.append(DenseLayer(hidden_input_size, h_size))
+            hidden_input_size = h_size
+        raw_net.append(DenseLayer(hidden_input_size, output_size))
+        self.raw_net = raw_net
+
+    def fit(self, X, y):
+        self.learned_net = self.optimizer(self.dataset, self.raw_net, loss=self.loss, epochs=self.epochs,
+                                          l_rate=self.l_rate, batch_size=self.batch_size, verbose=self.verbose)
+        return self
+
+    def predict(self, example):
+        n_layers = len(self.learned_net)
 
         layer_input = example
         layer_out = example
 
         # get the output of each layer by forward passing
         for i in range(1, n_layers):
-            layer_out = learned_net[i].forward(layer_input)
+            layer_out = self.learned_net[i].forward(np.array(layer_input).reshape((-1, 1)))
             layer_input = layer_out
 
         return layer_out.index(max(layer_out))
 
-    return predict
-
 
-def PerceptronLearner(dataset, l_rate=0.01, epochs=1000, batch_size=1,
-                      optimizer=stochastic_gradient_descent, verbose=None):
+class PerceptronLearner:
     """
     Simple perceptron neural network.
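+    The network is a single DenseLayer fed directly by the InputLayer, so it
+    can only separate classes linearly. Usage mirrors NeuralNetworkLearner:
+        pl = PerceptronLearner(iris, l_rate=0.01, epochs=100).fit(X, y)
+        pl.predict([5, 3, 1, 0.1])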
""" - input_size = len(dataset.inputs) - output_size = len(dataset.values[dataset.target]) - # initialize the network, add dense layer - raw_net = [InputLayer(input_size), DenseLayer(input_size, output_size)] - - # update the network - learned_net = optimizer(dataset, raw_net, mean_squared_error_loss, epochs, l_rate=l_rate, - batch_size=batch_size, verbose=verbose) - - def predict(example): - layer_out = learned_net[1].forward(example) + def __init__(self, dataset, l_rate=0.01, epochs=1000, batch_size=10, optimizer=stochastic_gradient_descent, + loss=mean_squared_error_loss, verbose=False, plot=False): + self.dataset = dataset + self.l_rate = l_rate + self.epochs = epochs + self.batch_size = batch_size + self.optimizer = optimizer + self.loss = loss + self.verbose = verbose + self.plot = plot + + input_size = len(dataset.inputs) + output_size = len(dataset.values[dataset.target]) + + # initialize the network, add dense layer + self.raw_net = [InputLayer(input_size), DenseLayer(input_size, output_size)] + + def fit(self, X, y): + self.learned_net = self.optimizer(self.dataset, self.raw_net, loss=self.loss, epochs=self.epochs, + l_rate=self.l_rate, batch_size=self.batch_size, verbose=self.verbose) + return self + + def predict(self, example): + layer_out = self.learned_net[1].forward(np.array(example).reshape((-1, 1))) return layer_out.index(max(layer_out)) - return predict + +def keras_dataset_loader(dataset, max_length=500): + """ + Helper function to load keras datasets. + :param dataset: keras data set type + :param max_length: max length of each input sequence + """ + # init dataset + (X_train, y_train), (X_val, y_val) = dataset + if max_length > 0: + X_train = sequence.pad_sequences(X_train, maxlen=max_length) + X_val = sequence.pad_sequences(X_val, maxlen=max_length) + return (X_train[10:], y_train[10:]), (X_val, y_val), (X_train[:10], y_train[:10]) -def SimpleRNNLearner(train_data, val_data, epochs=2): +def SimpleRNNLearner(train_data, val_data, epochs=2, verbose=False): """ RNN example for text sentimental analysis. :param train_data: a tuple of (training data, targets) @@ -406,6 +495,7 @@ def SimpleRNNLearner(train_data, val_data, epochs=2): Targets: ndarray taking targets of each example. Each target is mapped to an integer :param val_data: a tuple of (validation data, targets) :param epochs: number of epochs + :param verbose: verbosity mode :return: a keras model """ @@ -424,31 +514,18 @@ def SimpleRNNLearner(train_data, val_data, epochs=2): model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) # train the model - model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=epochs, batch_size=128, verbose=2) + model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=epochs, batch_size=128, verbose=verbose) return model -def keras_dataset_loader(dataset, max_length=500): - """ - Helper function to load keras datasets. - :param dataset: keras data set type - :param max_length: max length of each input sequence - """ - # init dataset - (X_train, y_train), (X_val, y_val) = dataset - if max_length > 0: - X_train = sequence.pad_sequences(X_train, maxlen=max_length) - X_val = sequence.pad_sequences(X_val, maxlen=max_length) - return (X_train[10:], y_train[10:]), (X_val, y_val), (X_train[:10], y_train[:10]) - - -def AutoencoderLearner(inputs, encoding_size, epochs=200): +def AutoencoderLearner(inputs, encoding_size, epochs=200, verbose=False): """ Simple example of linear auto encoder learning producing the input itself. 
     :param inputs: a batch of input data in np.ndarray type
     :param encoding_size: int, the size of encoding layer
     :param epochs: number of epochs
+    :param verbose: verbosity mode
     :return: a keras model
     """
 
@@ -466,6 +543,6 @@ def AutoencoderLearner(inputs, encoding_size, epochs=200):
     model.compile(loss='mean_squared_error', optimizer=sgd, metrics=['accuracy'])
 
     # train the model
-    model.fit(inputs, inputs, epochs=epochs, batch_size=10, verbose=2)
+    model.fit(inputs, inputs, epochs=epochs, batch_size=10, verbose=verbose)
 
     return model
diff --git a/learning.py b/learning.py
index 764392c7d..e83467c43 100644
--- a/learning.py
+++ b/learning.py
@@ -201,7 +201,7 @@ def parse_csv(input, delim=','):
     return [list(map(num_or_str, line.split(delim))) for line in lines]
 
 
-def err_ratio(predict, dataset, examples=None, verbose=0):
+def err_ratio(predict, dataset, examples=None):
     """
     Return the proportion of the examples that are NOT correctly predicted.
-    verbose - 0: No output; 1: Output wrong; 2 (or greater): Output correct
@@ -215,10 +215,6 @@
         output = predict(dataset.sanitize(example))
         if output == desired:
             right += 1
-            if verbose >= 2:
-                print('   OK: got {} for {}'.format(desired, example))
-        elif verbose:
-            print('WRONG: got {}, expected {} for {}'.format(output, desired, example))
 
     return 1 - (right / len(examples))
diff --git a/learning4e.py b/learning4e.py
index 3cf41ad1e..4ef022e83 100644
--- a/learning4e.py
+++ b/learning4e.py
@@ -5,7 +5,9 @@
 from statistics import stdev
 
 from qpsolvers import solve_qp
+from scipy.optimize import minimize
 
+from deep_learning4e import Sigmoid
 from probabilistic_learning import NaiveBayesLearner
 from utils4e import *
 
@@ -128,7 +130,7 @@ def update_values(self):
 
     def sanitize(self, example):
         """Return a copy of example, with non-input attributes replaced by None."""
-        return [attr_i if i in self.inputs else None for i, attr_i in enumerate(example)]
+        return [attr_i if i in self.inputs else None for i, attr_i in enumerate(example)][:-1]
 
     def classes_to_numbers(self, classes=None):
         """Converts class names to numbers."""
@@ -201,7 +203,7 @@ def parse_csv(input, delim=','):
     return [list(map(num_or_str, line.split(delim))) for line in lines]
 
 
-def err_ratio(predict, dataset, examples=None, verbose=0):
+def err_ratio(learner, dataset, examples=None):
     """
     Return the proportion of the examples that are NOT correctly predicted.
-    verbose - 0: No output; 1: Output wrong; 2 (or greater): Output correct
@@ -212,22 +214,18 @@
     right = 0
     for example in examples:
         desired = example[dataset.target]
-        output = predict(dataset.sanitize(example))
-        if output == desired:
+        output = learner.predict(dataset.sanitize(example))
+        if np.allclose(output, desired):
             right += 1
-            if verbose >= 2:
-                print('   OK: got {} for {}'.format(desired, example))
-        elif verbose:
-            print('WRONG: got {}, expected {} for {}'.format(output, desired, example))
 
     return 1 - (right / len(examples))
 
 
-def grade_learner(predict, tests):
+def grade_learner(learner, tests):
     """
     Grades the given learner based on how many tests it passes.
     tests is a list with each element in the form: (values, output).
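+    For example, grade_learner(learner, [([5, 3, 1, 0.1], 0)]) returns the
+    fraction of pairs for which learner.predict(values) == output, so a
+    score of 1 means every test passed.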
""" - return mean(int(predict(X) == y) for X, y in tests) + return mean(int(learner.predict(X) == y) for X, y in tests) def train_test_split(dataset, start=None, end=None, test_split=None): @@ -323,18 +321,18 @@ def score(learner, size): return [(size, mean([score(learner, size) for _ in range(trials)])) for size in sizes] -def PluralityLearner(dataset): +class PluralityLearner: """ A very dumb algorithm: always pick the result that was most popular in the training data. Makes a baseline for comparison. """ - most_popular = mode([e[dataset.target] for e in dataset.examples]) - def predict(example): - """Always return same result: the most popular from the training set.""" - return most_popular + def __init__(self, dataset): + self.most_popular = mode([e[dataset.target] for e in dataset.examples]) - return predict + def predict(self, example): + """Always return same result: the most popular from the training set.""" + return self.most_popular class DecisionFork: @@ -390,61 +388,67 @@ def __repr__(self): return repr(self.result) -def DecisionTreeLearner(dataset): +class DecisionTreeLearner: """[Figure 18.5]""" - target, values = dataset.target, dataset.values + def __init__(self, dataset): + self.dataset = dataset + self.tree = self.decision_tree_learning(dataset.examples, dataset.inputs) - def decision_tree_learning(examples, attrs, parent_examples=()): + def decision_tree_learning(self, examples, attrs, parent_examples=()): if len(examples) == 0: - return plurality_value(parent_examples) - if all_same_class(examples): - return DecisionLeaf(examples[0][target]) + return self.plurality_value(parent_examples) + if self.all_same_class(examples): + return DecisionLeaf(examples[0][self.dataset.target]) if len(attrs) == 0: - return plurality_value(examples) - A = choose_attribute(attrs, examples) - tree = DecisionFork(A, dataset.attr_names[A], plurality_value(examples)) - for (v_k, exs) in split_by(A, examples): - subtree = decision_tree_learning(exs, remove_all(A, attrs), examples) + return self.plurality_value(examples) + A = self.choose_attribute(attrs, examples) + tree = DecisionFork(A, self.dataset.attr_names[A], self.plurality_value(examples)) + for (v_k, exs) in self.split_by(A, examples): + subtree = self.decision_tree_learning(exs, remove_all(A, attrs), examples) tree.add(v_k, subtree) return tree - def plurality_value(examples): + def plurality_value(self, examples): """ Return the most popular target value for this set of examples. (If target is binary, this is the majority; otherwise plurality). 
""" - popular = argmax_random_tie(values[target], key=lambda v: count(target, v, examples)) + popular = argmax_random_tie(self.dataset.values[self.dataset.target], + key=lambda v: self.count(self.dataset.target, v, examples)) return DecisionLeaf(popular) - def count(attr, val, examples): + def count(self, attr, val, examples): """Count the number of examples that have example[attr] = val.""" return sum(e[attr] == val for e in examples) - def all_same_class(examples): + def all_same_class(self, examples): """Are all these examples in the same target class?""" - class0 = examples[0][target] - return all(e[target] == class0 for e in examples) + class0 = examples[0][self.dataset.target] + return all(e[self.dataset.target] == class0 for e in examples) - def choose_attribute(attrs, examples): + def choose_attribute(self, attrs, examples): """Choose the attribute with the highest information gain.""" - return argmax_random_tie(attrs, key=lambda a: information_gain(a, examples)) + return argmax_random_tie(attrs, key=lambda a: self.information_gain(a, examples)) - def information_gain(attr, examples): + def information_gain(self, attr, examples): """Return the expected reduction in entropy from splitting by attr.""" def I(examples): - return information_content([count(target, v, examples) for v in values[target]]) + return information_content([self.count(self.dataset.target, v, examples) + for v in self.dataset.values[self.dataset.target]]) n = len(examples) - remainder = sum((len(examples_i) / n) * I(examples_i) for (v, examples_i) in split_by(attr, examples)) + remainder = sum((len(examples_i) / n) * I(examples_i) + for (v, examples_i) in self.split_by(attr, examples)) return I(examples) - remainder - def split_by(attr, examples): + def split_by(self, attr, examples): """Return a list of (val, examples) pairs for each val of attr.""" - return [(v, [e for e in examples if e[attr] == v]) for v in values[attr]] + return [(v, [e for e in examples if e[attr] == v]) for v in self.dataset.values[attr]] - return decision_tree_learning(dataset.examples, dataset.inputs) + def predict(self, x): + return self.tree(x) def information_content(values): @@ -453,136 +457,213 @@ def information_content(values): return sum(-p * np.log2(p) for p in probabilities) -def DecisionListLearner(dataset): +class DecisionListLearner: """ [Figure 18.11] A decision list implemented as a list of (test, value) pairs. """ - def decision_list_learning(examples): + def __init__(self, dataset): + self.predict.decision_list = self.decision_list_learning(set(dataset.examples)) + + def decision_list_learning(self, examples): if not examples: return [(True, False)] - t, o, examples_t = find_examples(examples) + t, o, examples_t = self.find_examples(examples) if not t: raise Exception - return [(t, o)] + decision_list_learning(examples - examples_t) + return [(t, o)] + self.decision_list_learning(examples - examples_t) - def find_examples(examples): + def find_examples(self, examples): """ Find a set of examples that all have the same outcome under some test. Return a tuple of the test, outcome, and examples. 
""" raise NotImplementedError - def passes(example, test): + def passes(self, example, test): """Does the example pass the test?""" raise NotImplementedError - def predict(example): + def predict(self, example): """Predict the outcome for the first passing test.""" - for test, outcome in predict.decision_list: - if passes(example, test): + for test, outcome in self.predict.decision_list: + if self.passes(example, test): return outcome - predict.decision_list = decision_list_learning(set(dataset.examples)) - - return predict - -def NearestNeighborLearner(dataset, k=1): +class NearestNeighborLearner: """k-NearestNeighbor: the k nearest neighbors vote.""" - def predict(example): + def __init__(self, dataset, k=1): + self.dataset = dataset + self.k = k + + def predict(self, example): """Find the k closest items, and have them vote for the best.""" - best = heapq.nsmallest(k, ((dataset.distance(e, example), e) for e in dataset.examples)) - return mode(e[dataset.target] for (d, e) in best) + best = heapq.nsmallest(self.k, ((self.dataset.distance(e, example), e) for e in self.dataset.examples)) + return mode(e[self.dataset.target] for (d, e) in best) - return predict +class LossFunction: + def __init__(self, X, y): + self.X = X + self.y = y.flatten() -def LinearLearner(dataset, learning_rate=0.01, epochs=100): - """ - [Section 18.6.4] - Linear classifier with hard threshold. - """ - idx_i = dataset.inputs - idx_t = dataset.target - examples = dataset.examples - num_examples = len(examples) + @staticmethod + def predict(X, theta): + return NotImplementedError + + def function(self, theta): + return NotImplementedError - # X transpose - X_col = [dataset.values[i] for i in idx_i] # vertical columns of X + def jacobian(self, theta): + return NotImplementedError - # add dummy - ones = [1 for _ in range(len(examples))] - X_col = [ones] + X_col - # initialize random weights - num_weights = len(idx_i) + 1 - w = random_weights(min_value=-0.5, max_value=0.5, num_weights=num_weights) +class MeanSquaredError(LossFunction): + def __init__(self, X, y): + super().__init__(X, y) + self.x_star = np.linalg.inv(X.T.dot(X)).dot(X.T).dot(y) # or np.linalg.lstsq(X, y)[0] - for epoch in range(epochs): - err = [] - # pass over all examples - for example in examples: - x = [1] + example - y = dot_product(w, x) - t = example[idx_t] - err.append(t - y) + @staticmethod + def predict(X, theta): + return np.dot(X, theta) + + def function(self, theta): + return (1 / 2 * self.X.shape[0]) * np.sum(np.square(self.predict(self.X, theta) - self.y)) + + def jacobian(self, theta): + return (1 / self.X.shape[0]) * np.dot(self.X.T, self.predict(self.X, theta) - self.y) + + +class CrossEntropy(LossFunction): + def __init__(self, X, y): + super().__init__(X, y) + + @staticmethod + def predict(X, theta): + return Sigmoid().function(np.dot(X, theta)) + + def function(self, theta): + pred = self.predict(self.X, theta) + return -(1 / self.X.shape[0]) * np.sum(self.y * np.log(pred) + (1 - self.y) * np.log(1 - pred)) + + def jacobian(self, theta): + return (1 / self.X.shape[0]) * np.dot(self.X.T, self.predict(self.X, theta) - self.y) + + +class LinearRegressionLearner: + """ + [Section 18.6.4] + Linear Regressor + """ - # update weights - for i in range(len(w)): - w[i] = w[i] + learning_rate * (dot_product(err, X_col[i]) / num_examples) + def __init__(self, l_rate=0.01, epochs=1000, optimizer='bfgs'): + self.l_rate = l_rate + self.epochs = epochs + self.optimizer = optimizer - def predict(example): - x = [1] + example - return dot_product(w, 
+    def fit(self, X, y):
+        loss = MeanSquaredError(X, y)
+        self.w = minimize(fun=loss.function, x0=np.zeros(X.shape[1]), method=self.optimizer, jac=loss.jacobian).x
+        return self
 
-    return predict
+    def predict(self, example):
+        return np.dot(example, self.w)
 
 
-def LogisticLinearLeaner(dataset, learning_rate=0.01, epochs=100):
+class BinaryLogisticRegressionLearner:
     """
     [Section 18.6.5]
-    Linear classifier with logistic regression.
+    Logistic Regression Classifier
     """
-    idx_i = dataset.inputs
-    idx_t = dataset.target
-    examples = dataset.examples
-    num_examples = len(examples)
 
-    # X transpose
-    X_col = [dataset.values[i] for i in idx_i]  # vertical columns of X
+    def __init__(self, l_rate=0.01, epochs=1000, optimizer='bfgs'):
+        self.l_rate = l_rate
+        self.epochs = epochs
+        self.optimizer = optimizer
 
-    # add dummy
-    ones = [1 for _ in range(len(examples))]
-    X_col = [ones] + X_col
+    def fit(self, X, y):
+        self.labels = np.unique(y)
+        y = np.where(y == self.labels[0], 0, 1)
+        loss = CrossEntropy(X, y)
+        self.w = minimize(fun=loss.function, x0=np.zeros(X.shape[1]), method=self.optimizer, jac=loss.jacobian).x
+        return self
+
+    def predict_score(self, x):
+        return CrossEntropy.predict(x, self.w)
 
-    # initialize random weights
-    num_weights = len(idx_i) + 1
-    w = random_weights(min_value=-0.5, max_value=0.5, num_weights=num_weights)
+    def predict(self, x):
+        return np.where(self.predict_score(x) >= 0.5, self.labels[1], self.labels[0]).astype(int)
 
-    for epoch in range(epochs):
-        err = []
-        h = []
-        # pass over all examples
-        for example in examples:
-            x = [1] + example
-            y = Sigmoid().function(dot_product(w, x))
-            h.append(Sigmoid().derivative(y))
-            t = example[idx_t]
-            err.append(t - y)
 
-        # update weights
-        for i in range(len(w)):
-            buffer = [x * y for x, y in zip(err, h)]
-            w[i] = w[i] + learning_rate * (dot_product(buffer, X_col[i]) / num_examples)
+class MultiLogisticRegressionLearner:
+    def __init__(self, l_rate=0.01, epochs=1000, optimizer='bfgs', decision_function='ovr'):
+        self.l_rate = l_rate
+        self.epochs = epochs
+        self.optimizer = optimizer
+        self.decision_function = decision_function
+        self.n_class, self.classifiers = 0, []
 
-    def predict(example):
-        x = [1] + example
-        return Sigmoid().function(dot_product(w, x))
+    def fit(self, X, y):
+        """
+        Trains n_class or n_class * (n_class - 1) / 2 classifiers
+        according to the training method, ovr or ovo respectively.
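+        With 'ovr', one binary learner separates each label from all others
+        (n_class models); with 'ovo', one learner is fit per unordered pair
+        of labels, and predict() lets the resulting models vote.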
+        :param X: array of size [n_samples, n_features] holding the training samples
+        :param y: array of size [n_samples] holding the class labels
+        :return: array of classifiers
+        """
+        labels = np.unique(y)
+        self.n_class = len(labels)
+        if self.decision_function == 'ovr':  # one-vs-rest method
+            for label in labels:
+                y1 = np.array(y)
+                y1[y1 != label] = -1.0
+                y1[y1 == label] = 1.0
+                clf = BinaryLogisticRegressionLearner(self.l_rate, self.epochs, self.optimizer)
+                clf.fit(X, y1)
+                self.classifiers.append(copy.deepcopy(clf))
+        elif self.decision_function == 'ovo':  # use one-vs-one method
+            n_labels = len(labels)
+            for i in range(n_labels):
+                for j in range(i + 1, n_labels):
+                    neg_id, pos_id = y == labels[i], y == labels[j]
+                    x1, y1 = np.r_[X[neg_id], X[pos_id]], np.r_[y[neg_id], y[pos_id]]
+                    y1[y1 == labels[i]] = -1.0
+                    y1[y1 == labels[j]] = 1.0
+                    clf = BinaryLogisticRegressionLearner(self.l_rate, self.epochs, self.optimizer)
+                    clf.fit(x1, y1)
+                    self.classifiers.append(copy.deepcopy(clf))
+        else:
+            raise ValueError("Decision function must be either 'ovr' or 'ovo'.")
+        return self
 
-    return predict
+    def predict(self, x):
+        """
+        Predicts the class of a given example according to the training method.
+        """
+        n_samples = len(x)
+        if self.decision_function == 'ovr':  # one-vs-rest method
+            assert len(self.classifiers) == self.n_class
+            score = np.zeros((n_samples, self.n_class))
+            for i in range(self.n_class):
+                clf = self.classifiers[i]
+                score[:, i] = clf.predict_score(x)
+            return np.argmax(score, axis=1)
+        elif self.decision_function == 'ovo':  # use one-vs-one method
+            assert len(self.classifiers) == self.n_class * (self.n_class - 1) / 2
+            vote = np.zeros((n_samples, self.n_class))
+            clf_id = 0
+            for i in range(self.n_class):
+                for j in range(i + 1, self.n_class):
+                    res = self.classifiers[clf_id].predict(x)
+                    vote[res < 0, i] += 1.0  # negative sample: class i
+                    vote[res > 0, j] += 1.0  # positive sample: class j
+                    clf_id += 1
+            return np.argmax(vote, axis=1)
+        else:
+            raise ValueError("Decision function must be either 'ovr' or 'ovo'.")
 
 
 class BinarySVM:
@@ -613,6 +694,7 @@ def fit(self, X, y):
         sv_boundary = self.alphas < self.C - self.eps
         self.b = np.mean(self.sv_y[sv_boundary] - np.dot(self.alphas * self.sv_y,
                                                          self.kernel(self.sv_x, self.sv_x[sv_boundary])))
+        return self
 
     def QP(self, X, y):
         """
@@ -687,6 +769,7 @@ def fit(self, X, y):
             self.classifiers.append(copy.deepcopy(clf))
         else:
-            return ValueError("Decision function must be either 'ovr' or 'ovo'.")
+            raise ValueError("Decision function must be either 'ovr' or 'ovo'.")
+        return self
 
     def predict(self, x):
         """
@@ -715,18 +798,17 @@ def predict(self, x):
-        return ValueError("Decision function must be either 'ovr' or 'ovo'.")
+        raise ValueError("Decision function must be either 'ovr' or 'ovo'.")
 
 
-def EnsembleLearner(learners):
+class EnsembleLearner:
     """Given a list of learning algorithms, have them vote."""
 
-    def train(dataset):
-        predictors = [learner(dataset) for learner in learners]
+    def __init__(self, learners):
+        self.learners = learners
 
-        def predict(example):
-            return mode(predictor(example) for predictor in predictors)
+    def train(self, dataset):
+        self.predictors = [learner(dataset) for learner in self.learners]
 
-        return predict
-
-    return train
+    def predict(self, example):
+        return mode(predictor.predict(example) for predictor in self.predictors)
 
 
 def ada_boost(dataset, L, K):
@@ -740,24 +822,26 @@
     for k in range(K):
         h_k = L(dataset, w)
         h.append(h_k)
-        error = sum(weight for example, weight in zip(examples, w) if example[target] != h_k(example))
+        error = sum(weight for example, weight in zip(examples, w) if example[target] != h_k.predict(example[:-1]))
         # avoid divide-by-0 from either 0% or 100% error rates
         error = np.clip(error, eps, 1 - eps)
         for j, example in enumerate(examples):
-            if example[target] == h_k(example):
+            if example[target] == h_k.predict(example[:-1]):
                 w[j] *= error / (1 - error)
         w = normalize(w)
         z.append(np.log((1 - error) / error))
     return weighted_majority(h, z)
 
 
-def weighted_majority(predictors, weights):
-    """Return a predictor that takes a weighted vote."""
+class weighted_majority:
+    """A predictor that takes a weighted vote."""
 
-    def predict(example):
-        return weighted_mode((predictor(example) for predictor in predictors), weights)
+    def __init__(self, predictors, weights):
+        self.predictors = predictors
+        self.weights = weights
 
-    return predict
+    def predict(self, example):
+        return weighted_mode((predictor.predict(example) for predictor in self.predictors), self.weights)
 
 
 def weighted_mode(values, weights):
@@ -772,28 +856,28 @@
     return max(totals, key=totals.__getitem__)
 
 
-def RandomForest(dataset, n=5):
+class RandomForest:
    """An ensemble of Decision Trees trained using bagging and feature bagging."""
 
-    def data_bagging(dataset, m=0):
+    def __init__(self, dataset, n=5):
+        self.dataset = dataset
+        self.n = n
+        self.predictors = [DecisionTreeLearner(DataSet(examples=self.data_bagging(), attrs=self.dataset.attrs,
+                                                       attr_names=self.dataset.attr_names, target=self.dataset.target,
+                                                       inputs=self.feature_bagging())) for _ in range(self.n)]
+
+    def data_bagging(self, m=0):
         """Sample m examples with replacement"""
-        n = len(dataset.examples)
-        return weighted_sample_with_replacement(m or n, dataset.examples, [1] * n)
+        n = len(self.dataset.examples)
+        return weighted_sample_with_replacement(m or n, self.dataset.examples, [1] * n)
 
-    def feature_bagging(dataset, p=0.7):
+    def feature_bagging(self, p=0.7):
         """Feature bagging with probability p to retain an attribute"""
-        inputs = [i for i in dataset.inputs if probability(p)]
-        return inputs or dataset.inputs
-
-    def predict(example):
-        print([predictor(example) for predictor in predictors])
-        return mode(predictor(example) for predictor in predictors)
-
-    predictors = [DecisionTreeLearner(DataSet(examples=data_bagging(dataset), attrs=dataset.attrs,
-                                              attr_names=dataset.attr_names, target=dataset.target,
-                                              inputs=feature_bagging(dataset))) for _ in range(n)]
+        inputs = [i for i in self.dataset.inputs if probability(p)]
+        return inputs or self.dataset.inputs
 
-    return predict
+    def predict(self, example):
+        return mode(predictor.predict(example) for predictor in self.predictors)
 
 
 def WeightedLearner(unweighted_learner):
@@ -804,7 +888,11 @@
     """
 
     def train(dataset, weights):
-        return unweighted_learner(replicated_dataset(dataset, weights))
+        dataset = replicated_dataset(dataset, weights)
+        n_samples, n_features = len(dataset.examples), dataset.target
+        X, y = np.array([x[:n_features] for x in dataset.examples]), \
+               np.array([x[n_features] for x in dataset.examples])
+        return unweighted_learner.fit(X, y)
 
     return train
diff --git a/perception4e.py b/perception4e.py
index d5bc15718..2cb4b3891 100644
--- a/perception4e.py
+++ b/perception4e.py
@@ -392,7 +392,7 @@ def selective_search(image):
 
 # faster RCNN
 def pool_rois(feature_map, rois, pooled_height, pooled_width):
     """
-    Applies ROI pooling for a single image and varios ROIs
+    Applies ROI pooling for a single image and various ROIs
     :param feature_map: ndarray, in shape of (width, height, channel)
     :param rois: list of roi
     :param pooled_height: height of pooled area
diff --git a/pytest.ini b/pytest.ini
index 5b9f41dbc..1561b6fe6 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -1,4 +1,5 @@
 [pytest]
 filterwarnings =
     ignore::DeprecationWarning
+    ignore::UserWarning
     ignore::RuntimeWarning
diff --git a/tests/test_deep_learning4e.py b/tests/test_deep_learning4e.py
index 060e55788..b23f8bcfa 100644
--- a/tests/test_deep_learning4e.py
+++ b/tests/test_deep_learning4e.py
@@ -6,44 +6,45 @@
 random.seed("aima-python")
 
 
+iris_tests = [([5.0, 3.1, 0.9, 0.1], 0),
+              ([5.1, 3.5, 1.0, 0.0], 0),
+              ([4.9, 3.3, 1.1, 0.1], 0),
+              ([6.0, 3.0, 4.0, 1.1], 1),
+              ([6.1, 2.2, 3.5, 1.0], 1),
+              ([5.9, 2.5, 3.3, 1.1], 1),
+              ([7.5, 4.1, 6.2, 2.3], 2),
+              ([7.3, 4.0, 6.1, 2.4], 2),
+              ([7.0, 3.3, 6.1, 2.5], 2)]
+
+
 def test_neural_net():
     iris = DataSet(name='iris')
     classes = ['setosa', 'versicolor', 'virginica']
     iris.classes_to_numbers(classes)
-    nnl_gd = NeuralNetLearner(iris, [4], l_rate=0.15, epochs=100, optimizer=stochastic_gradient_descent)
-    nnl_adam = NeuralNetLearner(iris, [4], l_rate=0.001, epochs=200, optimizer=adam)
-    tests = [([5.0, 3.1, 0.9, 0.1], 0),
-             ([5.1, 3.5, 1.0, 0.0], 0),
-             ([4.9, 3.3, 1.1, 0.1], 0),
-             ([6.0, 3.0, 4.0, 1.1], 1),
-             ([6.1, 2.2, 3.5, 1.0], 1),
-             ([5.9, 2.5, 3.3, 1.1], 1),
-             ([7.5, 4.1, 6.2, 2.3], 2),
-             ([7.3, 4.0, 6.1, 2.4], 2),
-             ([7.0, 3.3, 6.1, 2.5], 2)]
-    assert grade_learner(nnl_gd, tests) >= 1 / 3
-    assert err_ratio(nnl_gd, iris) < 0.21
-    assert grade_learner(nnl_adam, tests) >= 1 / 3
-    assert err_ratio(nnl_adam, iris) < 0.21
+    n_samples, n_features = len(iris.examples), iris.target
+    X, y = np.array([x[:n_features] for x in iris.examples]), \
+           np.array([x[n_features] for x in iris.examples])
+    nnl_gd = NeuralNetworkLearner(iris, [4], l_rate=0.15, epochs=100, optimizer=stochastic_gradient_descent).fit(X, y)
+    assert grade_learner(nnl_gd, iris_tests) > 0.7
+    assert err_ratio(nnl_gd, iris) < 0.08
+    nnl_adam = NeuralNetworkLearner(iris, [4], l_rate=0.001, epochs=200, optimizer=adam).fit(X, y)
+    assert grade_learner(nnl_adam, iris_tests) == 1
+    assert err_ratio(nnl_adam, iris) < 0.08
 
 
 def test_perceptron():
     iris = DataSet(name='iris')
     classes = ['setosa', 'versicolor', 'virginica']
     iris.classes_to_numbers(classes)
-    pl_gd = PerceptronLearner(iris, l_rate=0.01, epochs=100, optimizer=stochastic_gradient_descent)
-    pl_adam = PerceptronLearner(iris, l_rate=0.01, epochs=100, optimizer=adam)
-    tests = [([5, 3, 1, 0.1], 0),
-             ([5, 3.5, 1, 0], 0),
-             ([6, 3, 4, 1.1], 1),
-             ([6, 2, 3.5, 1], 1),
-             ([7.5, 4, 6, 2], 2),
-             ([7, 3, 6, 2.5], 2)]
-    assert grade_learner(pl_gd, tests) > 1 / 2
-    assert err_ratio(pl_gd, iris) < 0.4
-    assert grade_learner(pl_adam, tests) > 1 / 2
-    assert err_ratio(pl_adam, iris) < 0.4
+    n_samples, n_features = len(iris.examples), iris.target
+    X, y = np.array([x[:n_features] for x in iris.examples]), \
+           np.array([x[n_features] for x in iris.examples])
+    pl_gd = PerceptronLearner(iris, l_rate=0.01, epochs=100, optimizer=stochastic_gradient_descent).fit(X, y)
+    assert grade_learner(pl_gd, iris_tests) == 1
+    assert err_ratio(pl_gd, iris) < 0.2
+    pl_adam = PerceptronLearner(iris, l_rate=0.01, epochs=100, optimizer=adam).fit(X, y)
+    assert grade_learner(pl_adam, iris_tests) == 1
+    assert err_ratio(pl_adam, iris) < 0.2
 
 
 def test_rnn():
@@ -52,8 +53,8 @@ def test_rnn():
     train = (train[0][:1000], train[1][:1000])
     val = (val[0][:200], val[1][:200])
     rnn = SimpleRNNLearner(train, val)
-    score = rnn.evaluate(test[0][:200], test[1][:200], verbose=0)
-    assert score[1] >= 0.3
+    score = rnn.evaluate(test[0][:200], test[1][:200], verbose=False)
+    assert score[1] >= 0.2
 
 
 def test_autoencoder():
diff --git a/tests/test_learning.py b/tests/test_learning.py
index fd84d74ed..57d603b86 100644
--- a/tests/test_learning.py
+++ b/tests/test_learning.py
@@ -149,7 +149,7 @@ def test_ada_boost():
              ([6, 2, 3.5, 1], 1),
              ([7.5, 4, 6, 2], 2),
              ([7, 3, 6, 2.5], 2)]
-    assert grade_learner(ab, tests) > 4 / 6
+    assert grade_learner(ab, tests) > 2 / 3
     assert err_ratio(ab, iris) < 0.25
 
diff --git a/tests/test_learning4e.py b/tests/test_learning4e.py
index 3913443b1..f0fc50493 100644
--- a/tests/test_learning4e.py
+++ b/tests/test_learning4e.py
@@ -38,42 +38,68 @@ def test_means_and_deviation():
 
 def test_plurality_learner():
     zoo = DataSet(name='zoo')
     pl = PluralityLearner(zoo)
-    assert pl([1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 4, 1, 0, 1]) == 'mammal'
+    assert pl.predict([1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 4, 1, 0, 1]) == 'mammal'
 
 
 def test_k_nearest_neighbors():
     iris = DataSet(name='iris')
     knn = NearestNeighborLearner(iris, k=3)
-    assert knn([5, 3, 1, 0.1]) == 'setosa'
-    assert knn([6, 5, 3, 1.5]) == 'versicolor'
-    assert knn([7.5, 4, 6, 2]) == 'virginica'
+    assert knn.predict([5, 3, 1, 0.1]) == 'setosa'
+    assert knn.predict([6, 5, 3, 1.5]) == 'versicolor'
+    assert knn.predict([7.5, 4, 6, 2]) == 'virginica'
 
 
 def test_decision_tree_learner():
     iris = DataSet(name='iris')
     dtl = DecisionTreeLearner(iris)
-    assert dtl([5, 3, 1, 0.1]) == 'setosa'
-    assert dtl([6, 5, 3, 1.5]) == 'versicolor'
-    assert dtl([7.5, 4, 6, 2]) == 'virginica'
+    assert dtl.predict([5, 3, 1, 0.1]) == 'setosa'
+    assert dtl.predict([6, 5, 3, 1.5]) == 'versicolor'
+    assert dtl.predict([7.5, 4, 6, 2]) == 'virginica'
+
+
+def test_linear_learner():
+    iris = DataSet(name='iris')
+    classes = ['setosa', 'versicolor', 'virginica']
+    iris.classes_to_numbers(classes)
+    n_samples, n_features = len(iris.examples), iris.target
+    X, y = np.array([x[:n_features] for x in iris.examples]), \
+           np.array([x[n_features] for x in iris.examples])
+    ll = LinearRegressionLearner().fit(X, y)
+    assert np.allclose(ll.w, MeanSquaredError(X, y).x_star)
+
+
+iris_tests = [([[5.0, 3.1, 0.9, 0.1]], 0),
+              ([[5.1, 3.5, 1.0, 0.0]], 0),
+              ([[4.9, 3.3, 1.1, 0.1]], 0),
+              ([[6.0, 3.0, 4.0, 1.1]], 1),
+              ([[6.1, 2.2, 3.5, 1.0]], 1),
+              ([[5.9, 2.5, 3.3, 1.1]], 1),
+              ([[7.5, 4.1, 6.2, 2.3]], 2),
+              ([[7.3, 4.0, 6.1, 2.4]], 2),
+              ([[7.0, 3.3, 6.1, 2.5]], 2)]
+
+
+def test_logistic_learner():
+    iris = DataSet(name='iris')
+    classes = ['setosa', 'versicolor', 'virginica']
+    iris.classes_to_numbers(classes)
+    n_samples, n_features = len(iris.examples), iris.target
+    X, y = np.array([x[:n_features] for x in iris.examples]), \
+           np.array([x[n_features] for x in iris.examples])
+    ll = MultiLogisticRegressionLearner().fit(X, y)
+    assert grade_learner(ll, iris_tests) == 1
+    assert np.allclose(err_ratio(ll, iris), 0.04)
 
 
 def test_svm():
     iris = DataSet(name='iris')
     classes = ['setosa', 'versicolor', 'virginica']
     iris.classes_to_numbers(classes)
-    svm = MultiSVM()
     n_samples, n_features = len(iris.examples), iris.target
     X, y = np.array([x[:n_features] for x in iris.examples]), np.array([x[n_features] for x in iris.examples])
-    svm.fit(X, y)
-    assert svm.predict([[5.0, 3.1, 0.9, 0.1]]) == 0
-    assert svm.predict([[5.1, 3.5, 1.0, 0.0]]) == 0
-    assert svm.predict([[4.9, 3.3, 1.1, 0.1]]) == 0
-    assert svm.predict([[6.0, 3.0, 4.0, 1.1]]) == 1
-    assert svm.predict([[6.1, 2.2, 3.5, 1.0]]) == 1
-    assert svm.predict([[5.9, 2.5, 3.3, 1.1]]) == 1
-    assert svm.predict([[7.5, 4.1, 6.2, 2.3]]) == 2
-    assert svm.predict([[7.3, 4.0, 6.1, 2.4]]) == 2
-    assert svm.predict([[7.0, 3.3, 6.1, 2.5]]) == 2
+    svm = MultiSVM().fit(X, y)
+    assert grade_learner(svm, iris_tests) == 1
+    assert np.isclose(err_ratio(svm, iris), 0.04)
 
 
 def test_information_content():
@@ -109,8 +135,9 @@ def test_random_weights():
 
 def test_ada_boost():
     iris = DataSet(name='iris')
-    iris.classes_to_numbers()
-    wl = WeightedLearner(PerceptronLearner)
+    classes = ['setosa', 'versicolor', 'virginica']
+    iris.classes_to_numbers(classes)
+    wl = WeightedLearner(PerceptronLearner(iris))
     ab = ada_boost(iris, wl, 5)
     tests = [([5, 3, 1, 0.1], 0),
              ([5, 3.5, 1, 0], 0),
@@ -118,7 +145,7 @@ def test_ada_boost():
              ([6, 2, 3.5, 1], 1),
              ([7.5, 4, 6, 2], 2),
              ([7, 3, 6, 2.5], 2)]
-    assert grade_learner(ab, tests) > 4 / 6
+    assert grade_learner(ab, tests) > 2 / 3
     assert err_ratio(ab, iris) < 0.25
 
diff --git a/utils4e.py b/utils4e.py
index 777a88e4a..178e887b4 100644
--- a/utils4e.py
+++ b/utils4e.py
@@ -168,6 +168,7 @@ def extend(s, var, val):
 # ______________________________________________________________________________
 # argmin and argmax
 
+
 identity = lambda x: x
 
@@ -209,11 +210,6 @@ def histogram(values, mode=0, bin_function=None):
     return sorted(bins.items())
 
 
-def dot_product(x, y):
-    """Return the sum of the element-wise product of vectors x and y."""
-    return sum(_x * _y for _x, _y in zip(x, y))
-
-
 def element_wise_product(x, y):
     if hasattr(x, '__iter__') and hasattr(y, '__iter__'):
         assert len(x) == len(y)
@@ -224,16 +220,6 @@
     raise Exception('Inputs must be in the same size!')
 
 
-def matrix_multiplication(x, *y):
-    """Return a matrix as a matrix-multiplication of x and arbitrary number of matrices *y."""
-
-    result = x
-    for _y in y:
-        result = np.matmul(result, _y)
-
-    return result
-
-
 def vector_add(a, b):
     """Component-wise addition of two vectors."""
     if not (a and b):
@@ -343,7 +329,8 @@ def mean_boolean_error(x, y):
     return mean(_x != _y for _x, _y in zip(x, y))
 
 
-# loss functions
+# part3. Neural network util functions
+# ______________________________________________________________________________
 
 
 def cross_entropy_loss(x, y):
@@ -356,10 +343,6 @@ def mean_squared_error_loss(x, y):
     return (1.0 / len(x)) * sum((_x - _y) ** 2 for _x, _y in zip(x, y))
 
 
-# part3. Neural network util functions
-# ______________________________________________________________________________
-
-
 def normalize(dist):
     """Multiply each number by a constant such that the sum is 1.0"""
     if isinstance(dist, dict):
@@ -376,6 +359,11 @@ def random_weights(min_value, max_value, num_weights):
     return [random.uniform(min_value, max_value) for _ in range(num_weights)]
 
 
+def softmax1D(x):
+    """Return the softmax vector of input vector x."""
+    return np.exp(x) / np.sum(np.exp(x))
+
+
 def conv1D(x, k):
     """1D convolution.
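+    (A thin wrapper around np.convolve with mode='same', so the output has
+    the same length as x.)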
     x: input vector; K: kernel vector."""
     return np.convolve(x, k, mode='same')
@@ -395,72 +383,6 @@ def gaussian_kernel_2D(size=3, sigma=0.5):
     return g / g.sum()
 
 
-# activation functions
-
-
-class Activation:
-
-    def function(self, x):
-        return NotImplementedError
-
-    def derivative(self, x):
-        return NotImplementedError
-
-
-def softmax1D(x):
-    """Return the softmax vector of input vector x."""
-    return np.exp(x) / sum(np.exp(x))
-
-
-class Sigmoid(Activation):
-
-    def function(self, x):
-        if x >= 100:
-            return 1
-        if x <= -100:
-            return 0
-        return 1 / (1 + np.exp(-x))
-
-    def derivative(self, value):
-        return value * (1 - value)
-
-
-class Relu(Activation):
-
-    def function(self, x):
-        return max(0, x)
-
-    def derivative(self, value):
-        return 1 if value > 0 else 0
-
-
-class Elu(Activation):
-
-    def function(self, x, alpha=0.01):
-        return x if x > 0 else alpha * (np.exp(x) - 1)
-
-    def derivative(self, value, alpha=0.01):
-        return 1 if value > 0 else alpha * np.exp(value)
-
-
-class Tanh(Activation):
-
-    def function(self, x):
-        return np.tanh(x)
-
-    def derivative(self, value):
-        return 1 - (value ** 2)
-
-
-class LeakyRelu(Activation):
-
-    def function(self, x, alpha=0.01):
-        return x if x > 0 else alpha * x
-
-    def derivative(self, value, alpha=0.01):
-        return 1 if value > 0 else alpha
-
-
 def step(x):
     """Return activation value of x with sign function."""
     return 1 if x >= 0 else 0
@@ -471,15 +393,6 @@ def gaussian(mean, st_dev, x):
     return 1 / (np.sqrt(2 * np.pi) * st_dev) * np.exp(-0.5 * (float(x - mean) / st_dev) ** 2)
 
 
-def gaussian_2D(means, sigma, point):
-    det = sigma[0][0] * sigma[1][1] - sigma[0][1] * sigma[1][0]
-    inverse = np.linalg.inv(sigma)
-    assert det != 0
-    x_u = vector_add(point, scalar_vector_product(-1, means))
-    buff = matrix_multiplication(matrix_multiplication([x_u], inverse), np.array(x_u).T)
-    return 1 / (np.sqrt(det) * 2 * np.pi) * np.exp(-0.5 * buff[0][0])
-
-
 def linear_kernel(x, y=None):
     if y is None:
         y = x
@@ -540,6 +453,7 @@ def distance_squared(a, b):
 # ______________________________________________________________________________
 # Misc Functions
 
+
 class injection:
     """Dependency injection of temporary values for global functions/classes/etc.
     E.g., `with injection(DataBase=MockDataBase): ...`"""
@@ -636,6 +550,7 @@ def failure_test(algorithm, tests):
 # See https://docs.python.org/3/reference/expressions.html#operator-precedence
 # See https://docs.python.org/3/reference/datamodel.html#special-method-names
 
+
 class Expr:
     """A mathematical expression with an operator and 0 or more arguments.
     op is a str like '+' or 'sin'; args are Expressions.
@@ -870,6 +785,8 @@ def __hash__(self):
 # ______________________________________________________________________________
 # Monte Carlo tree node and ucb function
 
+
+
 class MCT_Node:
     """Node in the Monte Carlo search tree, keeps track of the children states."""