Commit 18cc9ce

[sentiment] Support training multi-hot MLP and writing out embedding files (#225)

Also in this change:

- Adjust some hyperparameters

Example screenshot from using the new `--embeddingFilesPrefix` flag + the Embedding Projector:

![image](https://user-images.githubusercontent.com/16824702/52145038-f0fce480-262d-11e9-9313-9a5014ace25f.png)

1 parent 16cf488 commit 18cc9ce
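
The commit title refers to a "multi-hot MLP": a plain dense network over a multi-hot (vocabulary-indicator) input vector. As a rough sketch of what such a model looks like in TensorFlow.js (the function name and layer sizes below are illustrative assumptions, not the code this commit adds to `train.js`):

```js
import * as tf from '@tensorflow/tfjs';

// Illustrative sketch only: a multi-hot MLP takes a float32 vector of length
// `numWords` (1.0 wherever a vocabulary word occurs in the review, else 0.0)
// and feeds it through plain dense layers. Layer sizes are assumptions here.
function buildMultihotModel(numWords) {
  const model = tf.sequential();
  model.add(tf.layers.dense(
      {units: 16, activation: 'relu', inputShape: [numWords]}));
  model.add(tf.layers.dense({units: 1, activation: 'sigmoid'}));
  model.compile(
      {loss: 'binaryCrossentropy', optimizer: 'adam', metrics: ['acc']});
  return model;
}
```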

File tree

4 files changed: +302 -61 lines changed

sentiment/README.md

+33 -1

```diff
@@ -37,8 +37,11 @@ yarn train <MODEL_TYPE>
 where `MODEL_TYPE` is a required argument that specifies what type of model is to be
 trained. The available options are:
 
+- `multihot`: A model that takes a multi-hot encoding of the words in the sequence.
+  In terms of data representation and model complexity, this is the simplest model
+  in this example.
 - `flatten`: A model that flattens the embedding vectors of all words in the sequence.
-- `cnn`: A 1D convolutional model.
+- `cnn`: A 1D convolutional model, with a dropout layer included.
 - `simpleRNN`: A model that uses a SimpleRNN layer (`tf.layers.simpleRNN`)
 - `lstm`: A model that uses an LSTM layer (`tf.layers.lstm`)
 - `bidirectionalLSTM`: A model that uses a bidirectional LSTM layer
@@ -65,5 +68,34 @@ Other arguments of the `yarn train` command include:
 - `--epochs`, `--batchSize`, and `--validationSplit` are training-related settings.
 - `--modelSavePath` allows you to specify where to store the model and metadata after
   training completes.
+- `--embeddingFilesPrefix`: Prefix for the path to which to save the embedding vectors
+  and labels files (optional). See the section below for details.
 
 The detailed code for training is in the file [train.js](./train.js).
+
+### Visualizing the word embeddings in the Embedding Projector
+
+If you train a word embedding-based model (e.g., `cnn` or `lstm`), you can let the
+`yarn train` script write the embedding vectors, together with the corresponding
+word labels, to files after the model training completes. This is done using the
+`--embeddingFilesPrefix` flag, e.g.,
+
+```sh
+yarn train --maxLen 500 cnn --epochs 2 --embeddingFilesPrefix /tmp/imdb_embed
+```
+
+The above command will generate two files:
+
+- `/tmp/imdb_embed_vectors.tsv`: A tab-separated-values file containing the numeric
+  values of the word embeddings. Each line contains the embedding vector for a
+  word.
+- `/tmp/imdb_embed_labels.tsv`: A file consisting of the word labels that correspond
+  to the vectors in the previous file. Each line is a word.
+
+These files can be directly uploaded to the Embedding Projector
+(https://projector.tensorflow.org/) for visualization using the
+[T-SNE](https://en.wikipedia.org/wiki/T-distributed_stochastic_neighbor_embedding) or
+[PCA](https://en.wikipedia.org/wiki/Principal_component_analysis) algorithms.
+
+See example screenshot:
+![image](https://user-images.githubusercontent.com/16824702/52145038-f0fce480-262d-11e9-9313-9a5014ace25f.png)
```
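
As a quick sanity check on the two generated files described above, the vectors and labels can be reloaded and paired row by row in Node. This is a hedged sketch assuming the files produced by the command shown earlier; it is not part of the example's code:

```js
import {readFileSync} from 'fs';

// Sketch (not from this commit): pair each word label with its embedding
// vector. Row i of the vectors file corresponds to line i of the labels file.
const vectors = readFileSync('/tmp/imdb_embed_vectors.tsv', 'utf-8')
    .trim().split('\n')
    .map(line => line.split('\t').map(Number));
const labels = readFileSync('/tmp/imdb_embed_labels.tsv', 'utf-8')
    .trim().split('\n');
console.log(labels.length === vectors.length);  // Should print `true`.
console.log(labels[100], vectors[100]);  // A word and its embedding vector.
```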

sentiment/data.js

+57 -11

```diff
@@ -39,10 +39,14 @@ const METADATA_TEMPLATE_URL =
  *   that exceed this limit will be marked as `OOV_INDEX`.
  * @param {string} maxLen Length of each sequence. Longer sequences will be
  *   pre-truncated; shorter ones will be pre-padded.
- * @return {tf.Tensor} The dataset represented as a 2D `tf.Tensor` of shape
- *   `[]` and dtype `int32` .
+ * @param {boolean} multihot Whether to use multi-hot encoding of the words.
+ *   Default: `false`.
+ * @return {tf.Tensor} If `multihot` is `false` (default), the dataset
+ *   represented as a 2D `tf.Tensor` of shape `[numExamples, maxLen]` and
+ *   dtype `int32`. Else, the dataset represented as a 2D `tf.Tensor` of
+ *   shape `[numExamples, numWords]` and dtype `float32`.
  */
-function loadFeatures(filePath, numWords, maxLen) {
+function loadFeatures(filePath, numWords, maxLen, multihot = false) {
   const buffer = fs.readFileSync(filePath);
   const numBytes = buffer.byteLength;
 
@@ -67,10 +71,39 @@ function loadFeatures(filePath, numWords, maxLen) {
   if (seq.length > 0) {
     sequences.push(seq);
   }
-  const paddedSequences =
-      padSequences(sequences, maxLen, 'pre', 'pre');
-  return tf.tensor2d(
-      paddedSequences, [paddedSequences.length, maxLen], 'int32');
+
+  // Get some sequence length stats.
+  let minLength = Infinity;
+  let maxLength = -Infinity;
+  sequences.forEach(seq => {
+    const length = seq.length;
+    if (length < minLength) {
+      minLength = length;
+    }
+    if (length > maxLength) {
+      maxLength = length;
+    }
+  });
+  console.log(`Sequence length: min = ${minLength}; max = ${maxLength}`);
+
+  if (multihot) {
+    // If requested by the arg, encode the sequences as multi-hot
+    // vectors.
+    const buffer = tf.buffer([sequences.length, numWords]);
+    sequences.forEach((seq, i) => {
+      seq.forEach(wordIndex => {
+        if (wordIndex !== OOV_CHAR) {
+          buffer.set(1, i, wordIndex);
+        }
+      });
+    });
+    return buffer.toTensor();
+  } else {
+    const paddedSequences =
+        padSequences(sequences, maxLen, 'pre', 'pre');
+    return tf.tensor2d(
+        paddedSequences, [paddedSequences.length, maxLen], 'int32');
+  }
 }
 
 /**
@@ -84,10 +117,23 @@ function loadTargets(filePath) {
   const buffer = fs.readFileSync(filePath);
   const numBytes = buffer.byteLength;
 
+  let numPositive = 0;
+  let numNegative = 0;
+
   let ys = [];
   for (let i = 0; i < numBytes; ++i) {
-    ys.push(buffer.readUInt8(i));
+    const y = buffer.readUInt8(i);
+    if (y === 1) {
+      numPositive++;
+    } else {
+      numNegative++;
+    }
+    ys.push(y);
   }
+
+  console.log(
+      `Loaded ${numPositive} positive examples and ` +
+      `${numNegative} negative examples.`);
   return tf.tensor2d(ys, [ys.length, 1], 'float32');
 }
 
@@ -171,13 +217,13 @@ async function maybeDownloadAndExtract() {
  *   xTest: The same as `xTrain`, but for the test dataset.
  *   yTest: The same as `yTrain`, but for the test dataset.
  */
-export async function loadData(numWords, len) {
+export async function loadData(numWords, len, multihot = false) {
   const dataDir = await maybeDownloadAndExtract();
 
   const trainFeaturePath = path.join(dataDir, 'imdb_train_data.bin');
-  const xTrain = loadFeatures(trainFeaturePath, numWords, len);
+  const xTrain = loadFeatures(trainFeaturePath, numWords, len, multihot);
   const testFeaturePath = path.join(dataDir, 'imdb_test_data.bin');
-  const xTest = loadFeatures(testFeaturePath, numWords, len);
+  const xTest = loadFeatures(testFeaturePath, numWords, len, multihot);
   const trainTargetsPath = path.join(dataDir, 'imdb_train_targets.bin');
   const yTrain = loadTargets(trainTargetsPath);
   const testTargetsPath = path.join(dataDir, 'imdb_test_targets.bin');
```
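
To make the multi-hot branch added to `loadFeatures` concrete, here is a toy illustration of the same encoding with an assumed six-word vocabulary (the `OOV_CHAR` filtering is omitted for brevity):

```js
import * as tf from '@tensorflow/tfjs';

// Toy illustration of the multi-hot encoding used in loadFeatures above
// (assumed vocabulary size of 6; the real vocabulary has thousands of words).
const numWords = 6;
const sequences = [[2, 4, 2], [1, 5]];

const buffer = tf.buffer([sequences.length, numWords]);
sequences.forEach((seq, i) => {
  seq.forEach(wordIndex => buffer.set(1, i, wordIndex));
});
buffer.toTensor().print();
// [[0, 0, 1, 0, 1, 0],   <- index 2 occurs twice, but the entry is still 1
//  [0, 1, 0, 0, 0, 1]]
```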

sentiment/embedding.js

+118 (new file)

```js
/**
 * @license
 * Copyright 2019 Google LLC. All Rights Reserved.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * =============================================================================
 */

/**
 * Utilities for extracting the embedding matrix of a model and writing it
 * out as files.
 */

import {writeFileSync} from 'fs';
import * as tf from '@tensorflow/tfjs';

/**
 * Extract the first embedding matrix from a TensorFlow.js model.
 *
 * @param {tf.Model} model An instance of tf.Model, assumed to contain an
 *   Embedding layer.
 * @returns {tf.Tensor} The embedding matrix from the first Embedding
 *   layer encountered while iterating through all layers of the model.
 * @throws Error if no embedding layer can be found in the model.
 */
function extractEmbeddingMatrix(model) {
  for (const layer of model.layers) {
    if (layer.getClassName() === 'Embedding') {
      const embed = layer.getWeights()[0];
      tf.util.assert(
          embed.rank === 2,
          `Expected the rank of an embedding matrix to be 2, ` +
          `but got ${embed.rank}`);
      return embed;
    }
  }
  throw new Error('Cannot find any Embedding layer in model.');
}

/**
 * Write the values of the first embedding matrix of a model to files.
 *
 * The word labels are written as well. The vectors and labels files are
 * directly loadable into the Embedding Projector
 * (https://projector.tensorflow.org/).
 *
 * @param {tf.Model} model An instance of tf.Model, assumed to contain an
 *   Embedding layer.
 * @param {string} prefix Path prefix for writing the vectors and labels files.
 *   For example, if `prefix` is `/tmp/embed`, then
 *   - the vectors will be written to `/tmp/embed_vectors.tsv`
 *   - the labels will be written to `/tmp/embed_labels.tsv`
 * @param {{[word: string]: number}} wordIndex A dictionary mapping words to
 *   their integer indices.
 * @param {number} indexFrom The base value of the integer indices.
 */
export async function writeEmbeddingMatrixAndLabels(
    model, prefix, wordIndex, indexFrom) {
  tf.util.assert(
      prefix != null && prefix.length > 0,
      `Null, undefined or empty path prefix`);

  const embed = extractEmbeddingMatrix(model);

  const numWords = embed.shape[0];
  const embedDims = embed.shape[1];
  const embedData = await embed.data();

  // Write the embedding matrix to file.
  let vectorsStr = '';
  let index = 0;
  for (let i = 0; i < numWords; ++i) {
    for (let j = 0; j < embedDims; ++j) {
      vectorsStr += embedData[index++].toFixed(5);
      if (j < embedDims - 1) {
        vectorsStr += '\t';
      } else {
        vectorsStr += '\n';
      }
    }
  }

  const vectorsFilePath = `${prefix}_vectors.tsv`;
  writeFileSync(vectorsFilePath, vectorsStr, {encoding: 'utf-8'});
  console.log(
      `Written embedding vectors (${numWords} * ${embedDims}) to: ` +
      `${vectorsFilePath}`);

  // Collect and write the word labels.
  const indexToWord = {};
  for (const word in wordIndex) {
    indexToWord[wordIndex[word]] = word;
  }

  let labelsStr = '';
  for (let i = 0; i < numWords; ++i) {
    if (i >= indexFrom) {
      labelsStr += indexToWord[i - indexFrom];
    } else {
      labelsStr += 'not-a-word';
    }
    labelsStr += '\n';
  }

  const labelsFilePath = `${prefix}_labels.tsv`;
  writeFileSync(labelsFilePath, labelsStr, {encoding: 'utf-8'});
  console.log(
      `Written embedding labels (${numWords}) to: ` +
      `${labelsFilePath}`);
}
```
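
A hedged usage sketch for the new module: the `model`, `wordIndex`, and `indexFrom` values below are stand-ins for values `train.js` would produce; only `writeEmbeddingMatrixAndLabels` itself comes from this commit.

```js
import {writeEmbeddingMatrixAndLabels} from './embedding';

// Sketch only: after training an embedding-based model, dump its embedding
// matrix and word labels for the Embedding Projector. `model`, `wordIndex`,
// and `indexFrom` are stand-ins for values produced elsewhere in train.js.
async function maybeExportEmbeddings(model, wordIndex, indexFrom, prefix) {
  if (prefix != null && prefix.length > 0) {
    await writeEmbeddingMatrixAndLabels(model, prefix, wordIndex, indexFrom);
  }
}
```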
