Commit 2c5e26c: Add SELU

Author: Thibault de Boissiere
Parent: 6292a42

9 files changed, +490 -0 lines changed

SELU/README.md

# Experiments with MNIST and SELU

PyTorch implementation of some experiments from [Self-Normalizing Networks](https://arxiv.org/pdf/1706.02515.pdf)

## Dependencies

- python (tested on Anaconda python 3.6.1)
- pytorch (tested on 0.1.12_2)
- sklearn (tested on 0.18.1)
- matplotlib (tested on 2.0.1)
- tqdm
- numpy

## Usage

Main command:

    python main.py

Arguments:

    --model MODEL         Model name, RELUNet or SELUNet
    --n_inner_layers N_INNER_LAYERS
                          Number of inner hidden layers
    --hidden_dim HIDDEN_DIM
                          Hidden layer dimension
    --dropout DROPOUT     Dropout rate
    --use_cuda            Use CUDA
    --nb_epoch NB_EPOCH   Number of training epochs
    --batchnorm           Whether to use BN for RELUNet
    --batch_size BATCH_SIZE
                          Batch size
    --optimizer OPTIMIZER
                          Optimizer
    --learning_rate LEARNING_RATE
                          Learning rate

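For example, to train a SELUNet with 16 inner layers using Adam on GPU (the flag values here are illustrative; any of the options above can be combined):

    python main.py --model SELUNet --n_inner_layers 16 --optimizer Adam --learning_rate 1E-5 --nb_epoch 20 --use_cuda
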
## Run a batch of experiments

Modify `run_experiments.sh` as needed, then run:

    bash run_experiments.sh

## Plot results

Run a few experiments. Results are saved in a `results` folder.
Modify `plot_results.py` to select your experiments, then run:

    python plot_results.py

## Notes

- The architecture of the NN is the same as in the original paper.
- We plot the loss curves to give some more perspective.
- I initially had a hard time reproducing results. Inspection of the loss curves shows you just have to train longer, until the Sobolev loss and the MSE loss have similar magnitude, or increase the weight on the Sobolev loss.

SELU/figures/SELU_LR_1E-2.png (89.8 KB)

SELU/figures/SELU_LR_1E-3.png (82.2 KB)

SELU/figures/SELU_LR_1E-5.png (86.5 KB)

SELU/main.py

from __future__ import print_function
import os
import argparse
import torchvision.datasets as dset
import train

# Training settings
parser = argparse.ArgumentParser(description='MNIST SELU experiments')

# Neural net architecture
parser.add_argument('--model', default="RELUNet", type=str, help="Model name, RELUNet or SELUNet")
parser.add_argument('--n_inner_layers', default=4, type=int, help="Number of inner hidden layers")
parser.add_argument('--hidden_dim', default=-1, type=int, help="Hidden layer dimension")
parser.add_argument('--dropout', default=0, type=float, help="Dropout rate")
# Training params
parser.add_argument('--use_cuda', action="store_true", help="Use CUDA")
parser.add_argument('--nb_epoch', default=100, type=int, help="Number of training epochs")
parser.add_argument('--batchnorm', action="store_true", help="Whether to use BN for RELUNet")
parser.add_argument('--batch_size', default=128, type=int, help='Batch size')
parser.add_argument('--optimizer', default="SGD", type=str, help="Optimizer")
parser.add_argument('--learning_rate', default=1E-5, type=float, help="Learning rate")
args = parser.parse_args()


assert args.model in ["RELUNet", "SELUNet"]

# Download MNIST if it does not exist
if not os.path.isfile("processed/training.pt"):
    dset.MNIST(root=".", download=True)

if not os.path.exists("results"):
    os.makedirs("results")

# Launch training
train.train(args)
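
The `train` module imported above is one of the commit's 9 files but is not shown in this view. For orientation, below is a minimal, hypothetical sketch of a compatible `train.train`, written against the same old-style `Variable` API as `models.py`. The model construction mirrors `models.py`, and the output format (a `train_loss` list dumped to `results/<model>_depth_<d>_opt_<opt>_drop_<p>_bn_<bn>.json`, where depth appears to mean `n_inner_layers + 2`, given the 6/10/18/34 values read by `plot_results.py` versus the 4/8/16/32 inner-layer counts in `run_experiments.sh`) is inferred from `plot_results.py`. The real implementation may differ.

# Hypothetical sketch of the (not shown) train module -- not the author's actual code.
import json

import torch
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
import torchvision.datasets as dset
import torchvision.transforms as transforms
from torch.autograd import Variable

import models


def train(args):

    # Normalize MNIST, then flatten each 28x28 image into a 784-dim vector
    transform = transforms.Compose([transforms.ToTensor(),
                                    transforms.Normalize((0.1307,), (0.3081,)),
                                    lambda x: x.view(-1)])
    train_set = dset.MNIST(root=".", train=True, transform=transform)
    loader = torch.utils.data.DataLoader(train_set, batch_size=args.batch_size, shuffle=True)

    # hidden_dim defaults to -1 in main.py, so the real code must pick a value
    # somewhere; 256 here is a guess
    hidden_dim = args.hidden_dim if args.hidden_dim > 0 else 256
    if args.model == "RELUNet":
        model = models.RELUNet(args.n_inner_layers, 784, hidden_dim, 10,
                               dropout=args.dropout, batchnorm=args.batchnorm)
    else:
        model = models.SELUNet(args.n_inner_layers, 784, hidden_dim, 10,
                               dropout=args.dropout)
    if args.use_cuda:
        model.cuda()

    optim_cls = optim.Adam if args.optimizer == "Adam" else optim.SGD
    optimizer = optim_cls(model.parameters(), lr=args.learning_rate)

    d_losses = {"train_loss": []}
    for epoch in range(args.nb_epoch):
        epoch_losses = []
        for data, target in loader:
            if args.use_cuda:
                data, target = data.cuda(), target.cuda()
            data, target = Variable(data), Variable(target)
            optimizer.zero_grad()
            output = model(data, training=True)
            loss = F.cross_entropy(output, target)
            loss.backward()
            optimizer.step()
            epoch_losses.append(loss.data[0])
        d_losses["train_loss"].append(sum(epoch_losses) / len(epoch_losses))

    # File name pattern matches what plot_results.py expects;
    # depth = n_inner_layers + 2 (input and output layers included)
    out = "results/%s_depth_%s_opt_%s_drop_%s_bn_%s.json" % (
        args.model, args.n_inner_layers + 2, args.optimizer, args.dropout, args.batchnorm)
    with open(out, "w") as f:
        json.dump(d_losses, f)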

SELU/models.py

import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as init
from torch.autograd import Variable
import numpy as np


# network
class RELUNet(nn.Module):
    def __init__(self, n_inner_layers, input_dim, hidden_dim, output_dim, dropout=0, batchnorm=True):

        super(RELUNet, self).__init__()

        self.input_dim = input_dim
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.dropout = dropout
        self.batchnorm = batchnorm
        self.n_inner_layers = n_inner_layers

        # FC layers
        self.fc_in = nn.Linear(input_dim, hidden_dim)
        # Hacky way to set inner layers. Ensures they are all registered as
        # submodules of the model (so .parameters() / .cuda() pick them up)
        for k in range(n_inner_layers):
            setattr(self, "fc_%s" % k, nn.Linear(hidden_dim, hidden_dim))
        self.fc_out = nn.Linear(hidden_dim, output_dim)

        # BN layers
        self.bn_in = nn.BatchNorm1d(hidden_dim)

        for k in range(n_inner_layers):
            setattr(self, "bn_%s" % k, nn.BatchNorm1d(hidden_dim))

        # Initialize weights specifically for relu

        # First layer
        init.normal(self.fc_in.weight, std=2. / np.sqrt(np.float32(self.input_dim)))
        init.constant(self.fc_in.bias, 0.)

        # Inner layers
        for i in range(self.n_inner_layers):
            init.normal(getattr(self, "fc_%s" % i).weight, std=2. / np.sqrt(np.float32(self.hidden_dim)))
            init.constant(getattr(self, "fc_%s" % i).bias, 0.)

        # Last layer
        init.normal(self.fc_out.weight, std=2. / np.sqrt(np.float32(self.hidden_dim)))
        init.constant(self.fc_out.bias, 0.)

    def forward(self, x, training=False):

        # First layer
        x = self.fc_in(x)
        if self.batchnorm:
            x = self.bn_in(x)
        x = F.relu(x)
        if self.dropout > 0:
            x = F.dropout(x, p=self.dropout, training=training)

        # Inner layers
        for i in range(self.n_inner_layers):
            x = getattr(self, "fc_%s" % i)(x)
            if self.batchnorm:
                x = getattr(self, "bn_%s" % i)(x)
            x = F.relu(x)
            if self.dropout > 0:
                x = F.dropout(x, p=self.dropout, training=training)

        # Output layer
        x = self.fc_out(x)

        return x


def alpha_dropout(input, p=0.5, training=False):
    """Applies alpha dropout to the input.

    See :class:`~torch.nn.AlphaDropout` for details.

    Args:
        p (float, optional): the drop probability
        training (bool, optional): switch between training and evaluation mode
    """
    if p < 0 or p > 1:
        raise ValueError("dropout probability has to be between 0 and 1, "
                         "but got {}".format(p))

    if p == 0 or not training:
        return input

    # alpha' = -scale * alpha: the saturation value dropped units are set to
    alpha = -1.7580993408473766
    keep_prob = 1 - p
    # TODO avoid casting to byte after resize
    noise = input.data.new().resize_(input.size())
    noise.bernoulli_(p)
    noise = Variable(noise.byte())

    # Set dropped units to alpha', then apply the affine transform a*x + b
    # that restores zero mean and unit variance
    output = input.masked_fill(noise, alpha)

    a = (keep_prob + alpha ** 2 * keep_prob * (1 - keep_prob)) ** (-0.5)
    b = -a * alpha * (1 - keep_prob)

    return output.mul_(a).add_(b)


def selu(x):
    alpha = 1.6732632423543772848170429916717
    scale = 1.0507009873554804934193349852946
    return scale * F.elu(x, alpha)


class SELUNet(nn.Module):
    def __init__(self, n_inner_layers, input_dim, hidden_dim, output_dim, dropout=0.05):

        super(SELUNet, self).__init__()

        self.dropout = dropout

        self.input_dim = input_dim
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.n_inner_layers = n_inner_layers
        self.fc_in = nn.Linear(input_dim, hidden_dim)
        for k in range(n_inner_layers):
            setattr(self, "fc_%s" % k, nn.Linear(hidden_dim, hidden_dim))
        self.fc_out = nn.Linear(hidden_dim, output_dim)

        # Initialize weights specifically for selu (std = 1/sqrt(fan_in), as
        # recommended in the SNN paper)

        # First layer
        init.normal(self.fc_in.weight, std=1. / np.sqrt(np.float32(self.input_dim)))
        init.constant(self.fc_in.bias, 0.)

        # Inner layers
        for i in range(self.n_inner_layers):
            init.normal(getattr(self, "fc_%s" % i).weight, std=1. / np.sqrt(np.float32(self.hidden_dim)))
            init.constant(getattr(self, "fc_%s" % i).bias, 0.)

        # Last layer
        init.normal(self.fc_out.weight, std=1. / np.sqrt(np.float32(self.hidden_dim)))
        init.constant(self.fc_out.bias, 0.)

    def forward(self, x, training=False):

        # First layer
        x = self.fc_in(x)
        x = selu(x)
        if self.dropout > 0:
            x = alpha_dropout(x, p=self.dropout, training=training)

        # Inner layers
        for i in range(self.n_inner_layers):
            x = getattr(self, "fc_%s" % i)(x)
            x = selu(x)
            if self.dropout > 0:
                x = alpha_dropout(x, p=self.dropout, training=training)

        # Output layer
        x = self.fc_out(x)

        return x
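
As a quick, standalone sanity check of the self-normalizing fixed point (illustrative, not part of the commit): with the `alpha` and `scale` constants above, pushing standard-normal pre-activations through SELU should produce outputs with roughly zero mean and unit variance. A numpy version of the same activation makes this easy to verify:

# Illustrative check of the SELU fixed point, using the same constants as selu() above
import numpy as np

alpha = 1.6732632423543772848170429916717
scale = 1.0507009873554804934193349852946


def selu_np(x):
    # Same as scale * elu(x, alpha), written with numpy
    return scale * np.where(x > 0, x, alpha * np.expm1(x))


x = np.random.randn(10 ** 6)  # ~N(0, 1) pre-activations
y = selu_np(x)
print(y.mean(), y.std())      # both should be close to 0 and 1 respectively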

SELU/plot_results.py

import matplotlib.pylab as plt
import matplotlib.gridspec as gridspec
import json
import os


def plot():

    ############################
    # Adapt to existing experiments
    ############################
    list_width = [0.5, 1, 1.5, 2]
    list_depth = [6, 10, 18, 34]
    learning_rate = "1E-5"
    result_folder = "results_1e-5"

    plt.figure(figsize=(12, 5))
    gs = gridspec.GridSpec(1, 2)

    ##################
    # SGD results
    ##################
    ax0 = plt.subplot(gs[0])

    for i in range(len(list_depth)):
        depth = list_depth[i]
        exp = "%s/RELUNet_depth_%s_opt_SGD_drop_0_bn_True.json" % (result_folder, depth)
        with open(exp, "r") as f:
            d_losses = json.load(f)
        ax0.plot(d_losses["train_loss"],
                 linewidth=list_width[i],
                 color="C0",
                 label="RELU, Depth: %s" % depth)

    for i in range(len(list_depth)):
        depth = list_depth[i]
        exp = "%s/SELUNet_depth_%s_opt_SGD_drop_0_bn_False.json" % (result_folder, depth)
        with open(exp, "r") as f:
            d_losses = json.load(f)
        ax0.plot(d_losses["train_loss"],
                 linewidth=list_width[i],
                 color="C1",
                 label="SELU, Depth: %s" % depth)
    ax0.legend(loc="best")
    ax0.set_title("SGD, Learning Rate = %s" % learning_rate, fontsize=16)
    ax0.set_yscale("log")
    ax0.set_ylim([1E-6, 10])
    ax0.set_xlabel("Epochs", fontsize=18)
    ax0.set_ylabel("Train logloss", fontsize=18)

    ##################
    # Adam results
    ##################
    ax1 = plt.subplot(gs[1])

    for i in range(len(list_depth)):
        depth = list_depth[i]
        exp = "%s/RELUNet_depth_%s_opt_Adam_drop_0_bn_True.json" % (result_folder, depth)
        with open(exp, "r") as f:
            d_losses = json.load(f)
        ax1.plot(d_losses["train_loss"],
                 linewidth=list_width[i],
                 color="C0",
                 label="RELU, Depth: %s" % depth)

    for i in range(len(list_depth)):
        depth = list_depth[i]
        exp = "%s/SELUNet_depth_%s_opt_Adam_drop_0_bn_False.json" % (result_folder, depth)
        with open(exp, "r") as f:
            d_losses = json.load(f)
        ax1.plot(d_losses["train_loss"],
                 linewidth=list_width[i],
                 color="C1",
                 label="SELU, Depth: %s" % depth)
    ax1.legend(loc="best")
    ax1.set_title("Adam, Learning Rate = %s" % learning_rate, fontsize=16)
    ax1.set_yscale("log")
    ax1.set_ylim([1E-6, 10])
    ax1.set_xlabel("Epochs", fontsize=18)
    ax1.set_ylabel("Train logloss", fontsize=18)

    if not os.path.exists("figures"):
        os.makedirs("figures")

    plt.savefig("figures/SELU_LR_%s.png" % learning_rate)


if __name__ == '__main__':

    plot()
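
For reference, the loss files read above are assumed to be JSON dicts with at least a `train_loss` list holding one value per epoch. A dummy file in that layout can be produced like this (illustrative only; the file name and values are made up):

# Illustrative only: write a dummy results file in the layout plot_results.py reads
import json
import os

os.makedirs("results_1e-5", exist_ok=True)     # folder used by result_folder above
d_losses = {"train_loss": [2.30, 1.12, 0.45]}  # one loss value per epoch
with open("results_1e-5/SELUNet_depth_6_opt_SGD_drop_0_bn_False.json", "w") as f:
    json.dump(d_losses, f)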

SELU/run_experiments.sh

# RELU
python main.py --use_cuda --n_inner_layers 4 --model RELUNet --learning_rate 1E-5 --nb_epoch 20 --batchnorm --optimizer Adam
python main.py --use_cuda --n_inner_layers 8 --model RELUNet --learning_rate 1E-5 --nb_epoch 20 --batchnorm --optimizer Adam
python main.py --use_cuda --n_inner_layers 16 --model RELUNet --learning_rate 1E-5 --nb_epoch 20 --batchnorm --optimizer Adam
python main.py --use_cuda --n_inner_layers 32 --model RELUNet --learning_rate 1E-5 --nb_epoch 20 --batchnorm --optimizer Adam
# SELU
python main.py --use_cuda --n_inner_layers 4 --model SELUNet --learning_rate 1E-5 --nb_epoch 20 --optimizer Adam
python main.py --use_cuda --n_inner_layers 8 --model SELUNet --learning_rate 1E-5 --nb_epoch 20 --optimizer Adam
python main.py --use_cuda --n_inner_layers 16 --model SELUNet --learning_rate 1E-5 --nb_epoch 20 --optimizer Adam
python main.py --use_cuda --n_inner_layers 32 --model SELUNet --learning_rate 1E-5 --nb_epoch 20 --optimizer Adam
# SELU + dropout
python main.py --use_cuda --n_inner_layers 4 --model SELUNet --learning_rate 1E-5 --nb_epoch 200 --dropout 0.05
python main.py --use_cuda --n_inner_layers 8 --model SELUNet --learning_rate 1E-5 --nb_epoch 200 --dropout 0.05
python main.py --use_cuda --n_inner_layers 16 --model SELUNet --learning_rate 1E-5 --nb_epoch 200 --dropout 0.05
python main.py --use_cuda --n_inner_layers 32 --model SELUNet --learning_rate 1E-5 --nb_epoch 200 --dropout 0.05
