package org.nd4j.examples.samediff.quickstart.modeling;

import org.deeplearning4j.datasets.iterator.impl.MnistDataSetIterator;
import org.nd4j.autodiff.listeners.impl.ScoreListener;
import org.nd4j.autodiff.listeners.records.History;
import org.nd4j.autodiff.samediff.SDVariable;
import org.nd4j.autodiff.samediff.SameDiff;
import org.nd4j.autodiff.samediff.TrainingConfig;
import org.nd4j.evaluation.classification.Evaluation;
import org.nd4j.evaluation.classification.Evaluation.Metric;
import org.nd4j.linalg.api.buffer.DataType;
import org.nd4j.linalg.api.ops.impl.layers.convolution.config.Conv2DConfig;
import org.nd4j.linalg.api.ops.impl.layers.convolution.config.Pooling2DConfig;
import org.nd4j.linalg.dataset.api.iterator.DataSetIterator;
import org.nd4j.linalg.learning.config.Adam;
import org.nd4j.weightinit.impl.XavierInitScheme;

import java.util.List;

/**
 * Based on an overview of transformers in PyTorch:
 * https://medium.com/the-dl/transformers-from-scratch-in-pytorch-8777e346ca51
 *
 * Transformers are widely used in many applications, from NLP through to computer vision.
 * This example shows the SameDiff ops used to build them.
 *
 * SameDiff supports both key attention ops used by transformers. Scaled dot-product attention:
 * <code>
 * SameDiff sd = SameDiff.create();
 * SDVariable queries = null;
 * SDVariable keys = null;
 * SDVariable values = null;
 * SDVariable mask = null;
 *
 * SDVariable attention = sd.nn().dotProductAttention(queries, keys, values, mask, true);
 * </code>
 *
 * And multi-head attention:
 * <code>
 * SameDiff sd = SameDiff.create();
 * SDVariable queries = null;
 * SDVariable keys = null;
 * SDVariable values = null;
 * SDVariable mask = null;
 * SDVariable wq = null;
 * SDVariable wk = null;
 * SDVariable wv = null;
 * SDVariable wo = null;
 * SDVariable multiHead = sd.nn().multiHeadDotProductAttention(queries, keys, values, wq, wk, wv, wo, mask, true);
 * </code>
 */
public class TransformersExample {

    public static SameDiff makeMNISTNet() {
        SameDiff sd = SameDiff.create();

        // The attention ops shown in the class Javadoc need real query/key/value inputs;
        // a runnable sketch with concrete shapes is in attentionBlockSketch() below.
        // The rest of this method builds the small CNN that is trained on MNIST in main().

        //Properties for MNIST dataset:
        int nIn = 28 * 28;
        int nOut = 10;

        //Create input and label variables
        SDVariable in = sd.placeHolder("input", DataType.FLOAT, -1, nIn);          //Shape: [?, 784] - i.e., minibatch x 784 for MNIST
        SDVariable label = sd.placeHolder("label", DataType.FLOAT, -1, nOut);      //Shape: [?, 10] - i.e., minibatch x 10 for MNIST

        SDVariable reshaped = in.reshape(-1, 1, 28, 28);

        Pooling2DConfig poolConfig = Pooling2DConfig.builder().kH(2).kW(2).sH(2).sW(2).build();

        Conv2DConfig convConfig = Conv2DConfig.builder().kH(3).kW(3).build();

        // layer 1: Conv2D with a 3x3 kernel and 4 output channels
        SDVariable w0 = sd.var("w0", new XavierInitScheme('c', 28 * 28, 26 * 26 * 4), DataType.FLOAT, 3, 3, 1, 4);
        SDVariable b0 = sd.zero("b0", 4);

        SDVariable conv1 = sd.cnn().conv2d(reshaped, w0, b0, convConfig);

        // layer 2: MaxPooling2D with a 2x2 kernel and stride, and ReLU activation
        SDVariable pool1 = sd.cnn().maxPooling2d(conv1, poolConfig);

        SDVariable relu1 = sd.nn().relu(pool1, 0);

        // layer 3: Conv2D with a 3x3 kernel and 8 output channels
        SDVariable w1 = sd.var("w1", new XavierInitScheme('c', 13 * 13 * 4, 11 * 11 * 8), DataType.FLOAT, 3, 3, 4, 8);
        SDVariable b1 = sd.zero("b1", 8);

        SDVariable conv2 = sd.cnn().conv2d(relu1, w1, b1, convConfig);

        // layer 4: MaxPooling2D with a 2x2 kernel and stride, and ReLU activation
        SDVariable pool2 = sd.cnn().maxPooling2d(conv2, poolConfig);

        SDVariable relu2 = sd.nn().relu(pool2, 0);

        SDVariable flat = relu2.reshape(-1, 5 * 5 * 8);

        // layer 5: Output layer on flattened input
        SDVariable wOut = sd.var("wOut", new XavierInitScheme('c', 5 * 5 * 8, 10), DataType.FLOAT, 5 * 5 * 8, 10);
        SDVariable bOut = sd.zero("bOut", 10);

        SDVariable z = sd.nn().linear("z", flat, wOut, bOut);

        // softmax crossentropy loss function
        SDVariable out = sd.nn().softmax("out", z, 1);
        SDVariable loss = sd.loss().softmaxCrossEntropy("loss", label, out, null);

        sd.setLossVariables(loss);

        return sd;
    }
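
    /**
     * A minimal, self-contained sketch of the two attention ops referenced in the class Javadoc.
     * This method is not part of the original quickstart flow: the name attentionBlockSketch and
     * all of the sizes below (batch size, feature sizes, sequence lengths, head count) are
     * illustrative assumptions, with input and weight shapes following the ND4J op documentation
     * for dotProductAttention and multiHeadDotProductAttention.
     */
    public static SameDiff attentionBlockSketch() {
        SameDiff sd = SameDiff.create();

        // Assumed sizes for the sketch
        int batchSize = 2;
        int featureKeys = 16;       // key/query feature size
        int featureValues = 16;     // value feature size
        int timesteps = 10;         // number of key/value positions
        int queryCount = 10;        // number of query positions
        int numHeads = 4;
        int projectedSize = 8;      // per-head projection size
        int outSize = 16;           // multi-head output feature size

        // Inputs are [batchSize, featureSize, sequenceLength]; the mask is [batchSize, timesteps]
        SDVariable queries = sd.placeHolder("queries", DataType.FLOAT, batchSize, featureKeys, queryCount);
        SDVariable keys = sd.placeHolder("keys", DataType.FLOAT, batchSize, featureKeys, timesteps);
        SDVariable values = sd.placeHolder("values", DataType.FLOAT, batchSize, featureValues, timesteps);
        SDVariable mask = sd.placeHolder("mask", DataType.FLOAT, batchSize, timesteps);

        // Scaled dot-product attention: softmax(q^T k / sqrt(dk)) v
        SDVariable attention = sd.nn().dotProductAttention(queries, keys, values, mask, true);

        // Multi-head attention weights, Xavier-initialised as elsewhere in this example.
        // Assumed shapes: Wq/Wk [numHeads, projectedSize, featureKeys], Wv [numHeads, projectedSize, featureValues],
        // Wo [numHeads * projectedSize, outSize]
        SDVariable wq = sd.var("wq", new XavierInitScheme('c', featureKeys, projectedSize), DataType.FLOAT, numHeads, projectedSize, featureKeys);
        SDVariable wk = sd.var("wk", new XavierInitScheme('c', featureKeys, projectedSize), DataType.FLOAT, numHeads, projectedSize, featureKeys);
        SDVariable wv = sd.var("wv", new XavierInitScheme('c', featureValues, projectedSize), DataType.FLOAT, numHeads, projectedSize, featureValues);
        SDVariable wo = sd.var("wo", new XavierInitScheme('c', numHeads * projectedSize, outSize), DataType.FLOAT, numHeads * projectedSize, outSize);

        SDVariable multiHead = sd.nn().multiHeadDotProductAttention(queries, keys, values, wq, wk, wv, wo, mask, true);

        return sd;
    }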

    public static void main(String[] args) throws Exception {
        SameDiff sd = makeMNISTNet();

        //Create and set the training configuration

        Evaluation evaluation = new Evaluation();

        double learningRate = 1e-3;
        TrainingConfig config = new TrainingConfig.Builder()
            .l2(1e-4)                               //L2 regularization
            .updater(new Adam(learningRate))        //Adam optimizer with specified learning rate
            .dataSetFeatureMapping("input")         //DataSet features array should be associated with variable "input"
            .dataSetLabelMapping("label")           //DataSet label array should be associated with variable "label"
            .trainEvaluation("out", 0, evaluation)  //Add a training evaluation
            .build();

        // You can add validation evaluations as well, but they have some issues in beta5 and most likely won't work.
        // If you want to use them, use the SNAPSHOT build.

        sd.setTrainingConfig(config);

        // Adding a listener to the SameDiff instance is necessary because of a beta5 bug, and is not necessary in snapshots
        sd.addListeners(new ScoreListener(20));

        int batchSize = 32;
        DataSetIterator trainData = new MnistDataSetIterator(batchSize, true, 12345);

        //Perform training for 4 epochs
        int numEpochs = 4;
        History hist = sd.fit()
            .train(trainData, numEpochs)
            .exec();
        List<Double> acc = hist.trainingEval(Metric.ACCURACY);

        System.out.println("Accuracy: " + acc);
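
        // Evaluate on the MNIST test set. This is a sketch rather than part of the original
        // quickstart flow: it assumes the sd.evaluate(DataSetIterator, String, IEvaluation...)
        // overload and reuses the "out" variable and batch size defined above.
        DataSetIterator testData = new MnistDataSetIterator(batchSize, false, 12345);
        Evaluation testEval = new Evaluation();
        sd.evaluate(testData, "out", testEval);
        System.out.println(testEval.stats());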
    }
}