diff --git a/model/ver20220624/model.json b/model/ver20220624/model.json new file mode 100644 index 0000000..8593dbf --- /dev/null +++ b/model/ver20220624/model.json @@ -0,0 +1,199 @@ +{ + "modelTopology": { + "class_name": "Sequential", + "config": { + "name": "sequential_1", + "layers": [ + { + "class_name": "Dense", + "config": { + "units": 32, + "activation": "relu", + "use_bias": true, + "kernel_initializer": { + "class_name": "VarianceScaling", + "config": { + "scale": 1, + "mode": "fan_avg", + "distribution": "normal", + "seed": null + } + }, + "bias_initializer": { "class_name": "Zeros", "config": {} }, + "kernel_regularizer": null, + "bias_regularizer": null, + "activity_regularizer": null, + "kernel_constraint": null, + "bias_constraint": null, + "name": "dense_Dense1", + "trainable": true, + "batch_input_shape": [null, 512], + "dtype": "float32" + } + }, + { + "class_name": "BatchNormalization", + "config": { + "axis": -1, + "momentum": 0.99, + "epsilon": 0.001, + "center": true, + "scale": true, + "beta_initializer": { "class_name": "Zeros", "config": {} }, + "gamma_initializer": { "class_name": "Ones", "config": {} }, + "moving_mean_initializer": { "class_name": "Zeros", "config": {} }, + "moving_variance_initializer": { + "class_name": "Ones", + "config": {} + }, + "beta_regularizer": null, + "gamma_regularizer": null, + "beta_constraint": null, + "gamma_constraint": null, + "name": "batch_normalization_BatchNormalization1", + "trainable": true + } + }, + { + "class_name": "Dense", + "config": { + "units": 32, + "activation": "relu", + "use_bias": true, + "kernel_initializer": { + "class_name": "VarianceScaling", + "config": { + "scale": 1, + "mode": "fan_avg", + "distribution": "normal", + "seed": null + } + }, + "bias_initializer": { "class_name": "Zeros", "config": {} }, + "kernel_regularizer": null, + "bias_regularizer": null, + "activity_regularizer": null, + "kernel_constraint": null, + "bias_constraint": null, + "name": "dense_Dense2", + 
"trainable": true + } + }, + { + "class_name": "BatchNormalization", + "config": { + "axis": -1, + "momentum": 0.99, + "epsilon": 0.001, + "center": true, + "scale": true, + "beta_initializer": { "class_name": "Zeros", "config": {} }, + "gamma_initializer": { "class_name": "Ones", "config": {} }, + "moving_mean_initializer": { "class_name": "Zeros", "config": {} }, + "moving_variance_initializer": { + "class_name": "Ones", + "config": {} + }, + "beta_regularizer": null, + "gamma_regularizer": null, + "beta_constraint": null, + "gamma_constraint": null, + "name": "batch_normalization_BatchNormalization2", + "trainable": true + } + }, + { + "class_name": "Dense", + "config": { + "units": 1, + "activation": "sigmoid", + "use_bias": true, + "kernel_initializer": { + "class_name": "VarianceScaling", + "config": { + "scale": 1, + "mode": "fan_avg", + "distribution": "normal", + "seed": null + } + }, + "bias_initializer": { "class_name": "Zeros", "config": {} }, + "kernel_regularizer": null, + "bias_regularizer": null, + "activity_regularizer": null, + "kernel_constraint": null, + "bias_constraint": null, + "name": "dense_Dense3", + "trainable": true + } + } + ] + }, + "keras_version": "tfjs-layers 3.18.0", + "backend": "tensor_flow.js" + }, + "weightsManifest": [ + { + "paths": ["weights.bin"], + "weights": [ + { + "name": "dense_Dense1/kernel", + "shape": [512, 32], + "dtype": "float32" + }, + { "name": "dense_Dense1/bias", "shape": [32], "dtype": "float32" }, + { + "name": "batch_normalization_BatchNormalization1/gamma", + "shape": [32], + "dtype": "float32" + }, + { + "name": "batch_normalization_BatchNormalization1/beta", + "shape": [32], + "dtype": "float32" + }, + { + "name": "dense_Dense2/kernel", + "shape": [32, 32], + "dtype": "float32" + }, + { "name": "dense_Dense2/bias", "shape": [32], "dtype": "float32" }, + { + "name": "batch_normalization_BatchNormalization2/gamma", + "shape": [32], + "dtype": "float32" + }, + { + "name": 
"batch_normalization_BatchNormalization2/beta", + "shape": [32], + "dtype": "float32" + }, + { "name": "dense_Dense3/kernel", "shape": [32, 1], "dtype": "float32" }, + { "name": "dense_Dense3/bias", "shape": [1], "dtype": "float32" }, + { + "name": "batch_normalization_BatchNormalization1/moving_mean", + "shape": [32], + "dtype": "float32" + }, + { + "name": "batch_normalization_BatchNormalization1/moving_variance", + "shape": [32], + "dtype": "float32" + }, + { + "name": "batch_normalization_BatchNormalization2/moving_mean", + "shape": [32], + "dtype": "float32" + }, + { + "name": "batch_normalization_BatchNormalization2/moving_variance", + "shape": [32], + "dtype": "float32" + } + ] + } + ], + "format": "layers-model", + "generatedBy": "TensorFlow.js tfjs-layers v3.18.0", + "convertedBy": null +} diff --git a/model/ver20220624/weights.bin b/model/ver20220624/weights.bin new file mode 100644 index 0000000..e41e012 Binary files /dev/null and b/model/ver20220624/weights.bin differ diff --git a/trainer/datasets.ts b/trainer/datasets.ts new file mode 100644 index 0000000..fb95a3d --- /dev/null +++ b/trainer/datasets.ts @@ -0,0 +1,76 @@ +import * as use from '@tensorflow-models/universal-sentence-encoder' +import * as tf from '@tensorflow/tfjs-node' + +/** + * 스마일게이트 데이터셋을 universal-sentence-encoder 를 통해 encoding한 tf.data.Dataset을 반환한다. 
+ * @param filepath URL of the dataset TSV file. @see getUnsmileDataUrl
+ * @param encoder Universal Sentence Encoder used to embed each sentence string.
+ * @returns A dataset of `{ xs, ys }` elements, batched and then shuffled.
+ * @link https://github.com/smilegate-ai/korean_unsmile_dataset
+ */
+async function loadUnsmileData({
+  filepath,
+  encoder,
+}: {
+  filepath: string
+  encoder: use.UniversalSentenceEncoder
+}): Promise<tf.data.Dataset<tf.TensorContainer>> {
+  return tf.data
+    .csv(filepath, {
+      delimiter: '\t',
+      hasHeader: true,
+      configuredColumnsOnly: true,
+      columnConfigs: {
+        clean: {
+          dtype: 'int32',
+          isLabel: true,
+        },
+        문장: {
+          dtype: 'string',
+        },
+      },
+    })
+    .mapAsync(async (data: any) => {
+      const out = await encoder.embed(data.xs['문장'])
+      const xs = out.flatten()
+      // flatten() returns a new tensor handle, so the intermediate embedding
+      // has to be released explicitly; otherwise one tensor leaks per row.
+      out.dispose()
+      return {
+        xs,
+        ys: Object.values(data.ys),
+      }
+    })
+    // NOTE(review): shuffle() is applied after batch(), so it reorders whole
+    // batches (buffer of 32 batches) rather than individual rows - confirm.
+    .batch(32)
+    .shuffle(32)
+}
+
+/**
+ * Builds the Smilegate UnSmile datasets, embedded with the universal-sentence-encoder.
+ * Both the training split and the validation split are returned as tf.data.Dataset objects.
+ *
+ * @param encoder Universal Sentence Encoder used to embed the sentences.
+ * @returns The training dataset (`trainData`) and the validation dataset (`valData`).
+ */
+export async function loadUnsmileTrainValidData(
+  encoder: use.UniversalSentenceEncoder,
+): Promise<{
+  trainData: tf.data.Dataset<tf.TensorContainer>
+  valData: tf.data.Dataset<tf.TensorContainer>
+}> {
+  const trainData = await loadUnsmileData({
+    filepath: getUnsmileDataUrl('train', 'v1.0'),
+    encoder,
+  })
+  const valData = await loadUnsmileData({
+    filepath: getUnsmileDataUrl('valid', 'v1.0'),
+    encoder,
+  })
+  return { trainData, valData }
+}
+
+/**
+ * Helper that builds the URL of a Smilegate UnSmile dataset TSV file.
+ *
+ * @param type "train" or "valid"
+ * @param version "v1.0"
+ * @returns full url path
+ */
+function getUnsmileDataUrl(type: string, version: string): string {
+  return `https://raw.githubusercontent.com/smilegate-ai/korean_unsmile_dataset/main/unsmile_${type}_${version}.tsv`
+}
diff --git a/trainer/model.ts b/trainer/model.ts
new file mode 100644
index 0000000..5c2bd4f
--- /dev/null
+++ b/trainer/model.ts
@@ -0,0 +1,44 @@
+import * as tf from '@tensorflow/tfjs-node'
+import path from 'path'
+
+const FILE_SCHEME = 'file://'
+
+/**
+ * Loads a saved model, or creates a new model when loading fails.
+ *
+ * @param modelDirectoryPath Path of the directory holding the saved model. See https://www.tensorflow.org/js/guide/save_load for the accepted input format.
+ * @returns The loaded model, or a newly created model when loading fails.
+ */
+export async function getModel(
+  modelDirectoryPath: string,
+): Promise<tf.LayersModel> {
+  try {
+    const modelPath =
+      FILE_SCHEME +
+      path.join(modelDirectoryPath.replace(FILE_SCHEME, ''), 'model.json')
+    console.info(`Trying to load a model from ${modelPath}`)
+    return await tf.loadLayersModel(modelPath)
+  } catch (e) {
+    console.warn(e)
+    console.warn(`Unable to load a model. Creating a new model`)
+    return tf.sequential({
+      layers: [
+        tf.layers.dense({
+          inputDim: 512,
+          units: 32,
+          activation: 'relu',
+        }),
+        tf.layers.batchNormalization(),
+        tf.layers.dense({
+          units: 32,
+          activation: 'relu',
+        }),
+        tf.layers.batchNormalization(),
+        tf.layers.dense({
+          units: 1,
+          activation: 'sigmoid',
+        }),
+      ],
+    })
+  }
+}
diff --git a/trainer/package.json b/trainer/package.json
index 264b7b5..71bbba5 100644
--- a/trainer/package.json
+++ b/trainer/package.json
@@ -4,7 +4,7 @@
   "description": "",
   "main": "index.js",
   "scripts": {
-    "build": "npx ts-node trainer.ts"
+    "start": "ts-node trainer.ts"
   },
   "keywords": [],
   "author": "",
diff --git a/trainer/trainer.ts b/trainer/trainer.ts
index c6a7813..4c0dcf4 100644
--- a/trainer/trainer.ts
+++ b/trainer/trainer.ts
@@ -1,94 +1,52 @@
 import * as use from '@tensorflow-models/universal-sentence-encoder'
 import * as tf from '@tensorflow/tfjs-node'
 
-async function main() {
-  const encoder = await use.load()
-  const trainData = tf.data
-    .csv(
-      'https://raw.githubusercontent.com/smilegate-ai/korean_unsmile_dataset/main/unsmile_train_v1.0.tsv',
-      {
-        delimiter: '\t',
-        hasHeader: true,
-        configuredColumnsOnly: true,
-        columnConfigs: {
-          clean: {
-            dtype: 'int32',
-            isLabel: true,
-          },
-          문장: {
-            dtype: 'string',
-          },
-        },
-      },
-    )
-    .mapAsync(async (data: any) => {
-      const out = await encoder.embed(data.xs['문장'])
-      return {
-        xs: out.flatten(),
-        ys: Object.values(data.ys),
-      }
-    })
-    .batch(32)
-    .shuffle(32)
+import path from 'path'
+import { loadUnsmileTrainValidData } from './datasets'
+import { getModel } from './model'
 
-  const valData = tf.data
-    .csv(
-      'https://raw.githubusercontent.com/smilegate-ai/korean_unsmile_dataset/main/unsmile_valid_v1.0.tsv',
-      {
-        delimiter: '\t',
-        hasHeader: true,
-        configuredColumnsOnly: true,
-        columnConfigs: {
-          clean: {
-            dtype: 'int32',
-            isLabel: true,
-          },
-          문장: {
-            dtype: 'string',
-          },
-        },
-      },
-    )
-    .mapAsync(async (data: any) => {
-      const out = await encoder.embed(data.xs['문장'])
-      return {
-        xs: out.flatten(),
-        ys: Object.values(data.ys),
-      }
-    })
-    .batch(32)
-    .shuffle(32)
+// model will be saved into ${MODEL_DIRECTORY_PATH}/{model.json,weights.bin}
+const MODEL_DIRECTORY_PATH = `file://${path.join(
+  __dirname,
+  '..',
+  'model',
+  'ver20220624',
+)}`
 
-  const model = tf.sequential({
-    layers: [
-      tf.layers.dense({
-        inputDim: 512,
-        units: 512,
-        activation: 'relu',
-      }),
-      tf.layers.batchNormalization(),
-      tf.layers.dense({
-        units: 512,
-        activation: 'relu',
-      }),
-      tf.layers.batchNormalization(),
-      tf.layers.dense({
-        units: 1,
-        activation: 'sigmoid',
-      }),
-    ],
-  })
+/**
+ * Trains the classifier on the UnSmile dataset and saves it to MODEL_DIRECTORY_PATH.
+ *
+ * Loads the sentence encoder and the train/validation datasets, obtains a model
+ * from getModel, compiles and fits it, then logs the result of saving it.
+ */
+async function main() {
+  const encoder = await use.load()
+  const { trainData, valData } = await loadUnsmileTrainValidData(encoder)
+  const model = await getModel(MODEL_DIRECTORY_PATH)
 
   model.compile({
     optimizer: tf.train.adam(),
-    loss: tf.losses.sigmoidCrossEntropy,
-    metrics: [tf.metrics.binaryAccuracy],
+    // The output layer already applies a sigmoid activation, so the loss must
+    // take probabilities; tf.losses.sigmoidCrossEntropy expects raw logits and
+    // would apply the sigmoid a second time.
+    loss: 'binaryCrossentropy',
+    metrics: [
+      tf.metrics.binaryAccuracy,
+      tf.metrics.precision,
+      tf.metrics.recall,
+    ],
   })
 
-  model.fitDataset(trainData, {
-    epochs: 5,
+  model.summary()
+
+  await model.fitDataset(trainData, {
+    epochs: 1,
     validationData: valData,
+    callbacks: [
+      tf.callbacks.earlyStopping({
+        patience: 1,
+      }),
+    ],
   })
+
+  const savedResult = await model.save(MODEL_DIRECTORY_PATH)
+
+  if (savedResult.errors) {
+    console.error(savedResult)
+  } else {
+    console.info(savedResult)
+  }
 }
 
-main()
+// Report a failed run and exit with a non-zero status instead of leaving the
+// rejection unhandled.
+main().catch((error: unknown) => {
+  console.error(error)
+  process.exitCode = 1
+})
diff --git a/trainer/tsconfig.json b/trainer/tsconfig.json index 
ba64835..ce0f9e8 100644 --- a/trainer/tsconfig.json +++ b/trainer/tsconfig.json @@ -57,7 +57,7 @@ // "downlevelIteration": true, /* Emit more compliant, but verbose and less performant JavaScript for iteration. */ // "sourceRoot": "", /* Specify the root path for debuggers to find the reference source code. */ // "mapRoot": "", /* Specify the location where debugger should locate map files instead of generated locations. */ - // "inlineSourceMap": true, /* Include sourcemap files inside the emitted JavaScript. */ + "inlineSourceMap": true /* Include sourcemap files inside the emitted JavaScript. */, // "inlineSources": true, /* Include source code in the sourcemaps inside the emitted JavaScript. */ // "emitBOM": true, /* Emit a UTF-8 Byte Order Mark (BOM) in the beginning of output files. */ // "newLine": "crlf", /* Set the newline character for emitting files. */