deepjavalibrary
diff --git a/chapter_attention-mechanisms/attention-cues.ipynb b/chapter_attention-mechanisms/attention-cues.ipynb
Lines changed: 1 addition & 8 deletions
diff --git a/chapter_attention-mechanisms/attention-scoring-functions.ipynb b/chapter_attention-mechanisms/attention-scoring-functions.ipynb
Lines changed: 72 additions & 79 deletions
@@ -183,7 +183,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "NDManager manager = NDManager.newBaseManager(Functions.tryGpu(0));"
+    "NDManager manager = NDManager.newBaseManager();"
    ]
   },
   {
@@ -297,13 +297,6 @@
     "1. What can be the volitional cue when decoding a sequence token by token in machine translation? What are the nonvolitional cues and the sensory inputs?\n",
     "1. Randomly generate a $10 \\times 10$ matrix and use the softmax operation to ensure each row is a valid probability distribution. Visualize the output attention weights.\n"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
 
@@ -92,7 +92,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "NDManager manager = NDManager.newBaseManager(Functions.tryGpu(0));"
+    "NDManager manager = NDManager.newBaseManager();"
    ]
   },
   {
@@ -136,20 +136,19 @@
     "    // `X`: 3D NDArray, `validLens`: 1D or 2D NDArray\n",
     "    if (validLens == null) {\n",
     "        return X.softmax(-1);\n",
+    "    }\n",
+    "    \n",
+    "    Shape shape = X.getShape();\n",
+    "    if (validLens.getShape().dimension() == 1) {\n",
+    "        validLens = validLens.repeat(shape.get(1));\n",
     "    } else {\n",
-    "        Shape shape = X.getShape();\n",
-    "        if (validLens.getShape().dimension() == 1) {\n",
-    "            validLens = validLens.repeat(shape.get(1));\n",
-    "        } else {\n",
-    "            validLens = validLens.reshape(-1);\n",
-    "        }\n",
-    "        // On the last axis, replace masked elements with a very large negative\n",
-    "        // value, whose exponentiation outputs 0\n",
-    "        X =\n",
-    "                X.reshape(new Shape(-1, shape.get(shape.dimension() - 1)))\n",
-    "                        .sequenceMask(validLens, (float) -1E6);\n",
-    "        return X.softmax(-1).reshape(shape);\n",
+    "        validLens = validLens.reshape(-1);\n",
     "    }\n",
+    "    // On the last axis, replace masked elements with a very large negative\n",
+    "    // value, whose exponentiation outputs 0\n",
+    "    X = X.reshape(new Shape(-1, shape.get(shape.dimension() - 1)))\n",
+    "            .sequenceMask(validLens, (float) -1E6);\n",
+    "    return X.softmax(-1).reshape(shape);\n",
     "}"
    ]
   },
@@ -174,10 +173,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "System.out.println(\n",
-    "        maskedSoftmax(\n",
-    "                manager.randomUniform(0, 1, new Shape(2, 2, 4)),\n",
-    "                manager.create(new float[] {2, 3})));"
+    "maskedSoftmax(\n",
+    "        manager.randomUniform(0, 1, new Shape(2, 2, 4)),\n",
+    "        manager.create(new float[] {2, 3}));"
    ]
   },
   {
@@ -198,10 +196,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "System.out.println(\n",
-    "        maskedSoftmax(\n",
-    "                manager.randomUniform(0, 1, new Shape(2, 2, 4)),\n",
-    "                manager.create(new float[][] {{1, 3}, {2, 4}})));"
+    "maskedSoftmax(\n",
+    "        manager.randomUniform(0, 1, new Shape(2, 2, 4)),\n",
+    "        manager.create(new float[][] {{1, 3}, {2, 4}}));"
    ]
   },
   {
@@ -243,32 +240,31 @@
    "outputs": [],
    "source": [
     "/* Additive attention. */\n",
-    "public class AdditiveAttention extends AbstractBlock {\n",
-    "    private static final byte VERSION = 1;\n",
+    "public static class AdditiveAttention extends AbstractBlock {\n",
+    "\n",
     "    private Linear W_k;\n",
     "    private Linear W_q;\n",
     "    private Linear W_v;\n",
     "    private Dropout dropout;\n",
     "    public NDArray attentionWeights;\n",
     "\n",
     "    public AdditiveAttention(int numHiddens, float dropout) {\n",
-    "        super(VERSION);\n",
-    "        this.W_k = Linear.builder().setUnits(numHiddens).optBias(false).build();\n",
-    "        this.addChildBlock(\"W_k\", this.W_k);\n",
+    "        W_k = Linear.builder().setUnits(numHiddens).optBias(false).build();\n",
+    "        addChildBlock(\"W_k\", W_k);\n",
     "\n",
-    "        this.W_q = Linear.builder().setUnits(numHiddens).optBias(false).build();\n",
-    "        this.addChildBlock(\"W_q\", this.W_q);\n",
+    "        W_q = Linear.builder().setUnits(numHiddens).optBias(false).build();\n",
+    "        addChildBlock(\"W_q\", W_q);\n",
     "\n",
-    "        this.W_v = Linear.builder().setUnits(1).optBias(false).build();\n",
-    "        this.addChildBlock(\"W_v\", this.W_v);\n",
+    "        W_v = Linear.builder().setUnits(1).optBias(false).build();\n",
+    "        addChildBlock(\"W_v\", W_v);\n",
     "\n",
     "        this.dropout = Dropout.builder().optRate(dropout).build();\n",
-    "        this.addChildBlock(\"dropout\", this.dropout);\n",
+    "        addChildBlock(\"dropout\", this.dropout);\n",
     "    }\n",
     "\n",
     "    @Override\n",
     "    protected NDList forwardInternal(\n",
-    "            ParameterStore parameterStore,\n",
+    "            ParameterStore ps,\n",
     "            NDList inputs,\n",
     "            boolean training,\n",
     "            PairList<String, Object> params) {\n",
@@ -279,8 +275,8 @@
     "        NDArray values = inputs.get(2);\n",
     "        NDArray validLens = inputs.get(3);\n",
     "\n",
-    "        queries = this.W_q.forward(parameterStore, new NDList(queries), training, params).head();\n",
-    "        keys = this.W_k.forward(parameterStore, new NDList(keys), training, params).head();\n",
+    "        queries = W_q.forward(ps, new NDList(queries), training, params).head();\n",
+    "        keys = W_k.forward(ps, new NDList(keys), training, params).head();\n",
     "        // After dimension expansion, shape of `queries`: (`batchSize`, no. of\n",
     "        // queries, 1, `numHiddens`) and shape of `keys`: (`batchSize`, 1,\n",
     "        // no. of key-value pairs, `numHiddens`). Sum them up with\n",
@@ -290,18 +286,12 @@
     "        // There is only one output of `this.W_v`, so we remove the last\n",
     "        // one-dimensional entry from the shape. Shape of `scores`:\n",
     "        // (`batchSize`, no. of queries, no. of key-value pairs)\n",
-    "        NDArray result =\n",
-    "                this.W_v.forward(parameterStore, new NDList(features), training, params).head();\n",
+    "        NDArray result = W_v.forward(ps, new NDList(features), training, params).head();\n",
     "        NDArray scores = result.squeeze(-1);\n",
-    "        this.attentionWeights = maskedSoftmax(scores, validLens);\n",
-    "        // Shape of `values`: (`batchSize`, no. of key-value pairs, value\n",
-    "        // dimension)\n",
-    "        return new NDList(\n",
-    "                this.dropout\n",
-    "                        .forward(\n",
-    "                                parameterStore, new NDList(this.attentionWeights), training, params)\n",
-    "                        .head()\n",
-    "                        .batchDot(values));\n",
+    "        attentionWeights = maskedSoftmax(scores, validLens);\n",
+    "        // Shape of `values`: (`batchSize`, no. of key-value pairs, value dimension)\n",
+    "        NDList list = dropout.forward(ps, new NDList(attentionWeights), training, params);\n",
+    "        return new NDList(list.head().batchDot(values));\n",
     "    }\n",
     "\n",
     "    @Override\n",
@@ -310,8 +300,21 @@
     "    }\n",
     "\n",
     "    @Override\n",
-    "    public void initializeChildBlocks(NDManager manager, DataType dataType, Shape... inputShapes) {}\n",
-    "}"
+    "    public void initializeChildBlocks(\n",
+    "            NDManager manager, DataType dataType, Shape... inputShapes) {\n",
+    "        W_q.initialize(manager, dataType, inputShapes[0]);\n",
+    "        W_k.initialize(manager, dataType, inputShapes[1]);\n",
+    "        long[] q = W_q.getOutputShapes(new Shape[] {inputShapes[0]})[0].getShape();\n",
+    "        long[] k = W_k.getOutputShapes(new Shape[] {inputShapes[1]})[0].getShape();\n",
+    "        long w = Math.max(q[q.length - 2], k[k.length - 2]);\n",
+    "        long h = Math.max(q[q.length - 1], k[k.length - 1]);\n",
+    "        long[] shape = new long[] {2, 1, w, h};\n",
+    "        W_v.initialize(manager, dataType, new Shape(shape));\n",
+    "        long[] dropoutShape = new long[shape.length - 1];\n",
+    "        System.arraycopy(shape, 0, dropoutShape, 0, dropoutShape.length);\n",
+    "        dropout.initialize(manager, dataType, new Shape(dropoutShape));\n",
+    "    }\n",
+    "}\n"
    ]
   },
   {
@@ -343,12 +346,10 @@
     "NDArray validLens = manager.create(new float[] {2, 6});\n",
     "\n",
     "AdditiveAttention attention = new AdditiveAttention(8, 0.1f);\n",
-    "attention\n",
-    "        .forward(\n",
-    "                new ParameterStore(manager, false),\n",
-    "                new NDList(queries, keys, values, validLens),\n",
-    "                false)\n",
-    "        .head();"
+    "NDList input = new NDList(queries, keys, values, validLens);\n",
+    "ParameterStore ps = new ParameterStore(manager, false);\n",
+    "attention.initialize(manager, DataType.FLOAT32, input.getShapes());\n",
+    "attention.forward(ps, input, false).head();"
    ]
   },
   {
@@ -435,22 +436,20 @@
    "outputs": [],
    "source": [
     "/* Scaled dot product attention. */\n",
-    "public class DotProductAttention extends AbstractBlock {\n",
-    "    private static final byte VERSION = 1;\n",
+    "public static class DotProductAttention extends AbstractBlock {\n",
+    "\n",
     "    private Dropout dropout;\n",
     "    public NDArray attentionWeights;\n",
     "\n",
     "    public DotProductAttention(float dropout) {\n",
-    "        super(VERSION);\n",
-    "\n",
     "        this.dropout = Dropout.builder().optRate(dropout).build();\n",
     "        this.addChildBlock(\"dropout\", this.dropout);\n",
     "        this.dropout.setInitializer(new UniformInitializer(0.07f), Parameter.Type.WEIGHT);\n",
     "    }\n",
     "\n",
     "    @Override\n",
     "    protected NDList forwardInternal(\n",
-    "            ParameterStore parameterStore,\n",
+    "            ParameterStore ps,\n",
     "            NDList inputs,\n",
     "            boolean training,\n",
     "            PairList<String, Object> params) {\n",
@@ -459,7 +458,7 @@
     "        // Shape of `values`: (`batchSize`, no. of key-value pairs, value\n",
     "        // dimension)\n",
     "        // Shape of `valid_lens`: (`batchSize`,) or (`batchSize`, no. of queries)\n",
-    "        NDArray queries = inputs.head();\n",
+    "        NDArray queries = inputs.get(0);\n",
     "        NDArray keys = inputs.get(1);\n",
     "        NDArray values = inputs.get(2);\n",
     "        NDArray validLens = inputs.get(3);\n",
@@ -468,12 +467,8 @@
     "        // Swap the last two dimensions of `keys` and perform batchDot\n",
     "        NDArray scores = queries.batchDot(keys.swapAxes(1, 2)).div(Math.sqrt(2));\n",
     "        attentionWeights = maskedSoftmax(scores, validLens);\n",
-    "        return new NDList(\n",
-    "                this.dropout\n",
-    "                        .forward(\n",
-    "                                parameterStore, new NDList(this.attentionWeights), training, params)\n",
-    "                        .head()\n",
-    "                        .batchDot(values));\n",
+    "        NDList list = dropout.forward(ps, new NDList(attentionWeights), training, params);\n",
+    "        return new NDList(list.head().batchDot(values));\n",
     "    }\n",
     "\n",
     "    @Override\n",
@@ -482,7 +477,15 @@
     "    }\n",
     "\n",
     "    @Override\n",
-    "    public void initializeChildBlocks(NDManager manager, DataType dataType, Shape... inputShapes) {}\n",
+    "    public void initializeChildBlocks(\n",
+    "            NDManager manager, DataType dataType, Shape... inputShapes) {\n",
+    "        try (NDManager sub = manager.newSubManager()) {\n",
+    "            NDArray queries = sub.zeros(inputShapes[0], dataType);\n",
+    "            NDArray keys = sub.zeros(inputShapes[1], dataType);\n",
+    "            NDArray scores = queries.batchDot(keys.swapAxes(1, 2));\n",
+    "            dropout.initialize(manager, dataType, scores.getShape());\n",
+    "        }\n",
+    "    }\n",
     "}"
    ]
   },
@@ -508,12 +511,9 @@
    "source": [
     "queries = manager.randomNormal(0, 1, new Shape(2, 1, 2), DataType.FLOAT32);\n",
     "DotProductAttention productAttention = new DotProductAttention(0.5f);\n",
-    "productAttention\n",
-    "        .forward(\n",
-    "                new ParameterStore(manager, false),\n",
-    "                new NDList(queries, keys, values, validLens),\n",
-    "                false)\n",
-    "        .head();"
+    "input = new NDList(queries, keys, values, validLens);\n",
+    "productAttention.initialize(manager, DataType.FLOAT32, input.getShapes());\n",
+    "productAttention.forward(ps, input, false).head();"
    ]
   },
   {
@@ -562,13 +562,6 @@
     "1. Using matrix multiplications only, can you design a new scoring function for queries and keys with different vector lengths?\n",
     "1. When queries and keys have the same vector length, is vector summation a better design than dot product for the scoring function? Why or why not?\n"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
Original file line number	Diff line number	Diff line change
`@@ -183,7 +183,7 @@`
`183`	`183`	`"metadata": {},`
`184`	`184`	`"outputs": [],`
`185`	`185`	`"source": [`
`186`		`- "NDManager manager = NDManager.newBaseManager(Functions.tryGpu(0));"`
	`186`	`+ "NDManager manager = NDManager.newBaseManager();"`
`187`	`187`	`]`
`188`	`188`	`},`
`189`	`189`	`{`
`@@ -297,13 +297,6 @@`
`297`	`297`	`"1. What can be the volitional cue when decoding a sequence token by token in machine translation? What are the nonvolitional cues and the sensory inputs?\n",`
`298`	`298`	`"1. Randomly generate a $10 \\times 10$ matrix and use the softmax operation to ensure each row is a valid probability distribution. Visualize the output attention weights.\n"`
`299`	`299`	`]`
`300`		`- },`
`301`		`- {`
`302`		`- "cell_type": "code",`
`303`		`- "execution_count": null,`
`304`		`- "metadata": {},`
`305`		`- "outputs": [],`
`306`		`- "source": []`
`307`	`300`	`}`
`308`	`301`	`],`
`309`	`302`	`"metadata": {`