Skip to content

PipeModeDataset leads to infinite loop / memory exhaust when re-using dataset with tf.keras #46

@fmannhardt

Description

@fmannhardt

I have tried out with several configurations to use PipeModeDataset together with tf.keras and I run into trouble re-using the same dataset (e.g. validation) for use in both fit and evaluate. It seems that on the second call the Sagemaker instance exhausts all available GPU memory and goes into some kind of loop.

This is my current training script (I will try to strip it down further, but this works perfectly when using File mode but fails on the evaluate call when executed in ``Pipe` mode:

import argparse, os

import logging
import math
import json

logging.getLogger().setLevel(logging.INFO)
logging.getLogger("tensorflow").setLevel(logging.INFO)
import tensorflow as tf

import glob

def load_data_as_dataset(channel_name, channel, data_config):

    def get_filenames(channel):
        return(glob.glob(channel + "/*.tfrecord"))

    mode = data_config[channel_name]['TrainingInputMode']    

    logging.info("Running {} in mode: {}".format(channel_name, mode))

    if mode == 'Pipe':
        # Construct a `PipeModeDataset` reading from a 'training' channel, using
        # the TF Record encoding.        
        from sagemaker_tensorflow import PipeModeDataset
        ds = PipeModeDataset(channel=channel_name, record_format='TFRecord')
    else:    
        filenames = get_filenames(channel)
        logging.info("Loading files {}".format(filenames))
        ds = tf.data.TFRecordDataset(filenames) 
    
    return ds

def extract_example(example_proto):

    image_feature_description = {
        'image/encoded': tf.io.FixedLenFeature([], tf.string),
        'image/class/obj1_center_x': tf.io.FixedLenFeature([], tf.float32),
        'image/class/obj1_center_y': tf.io.FixedLenFeature([], tf.float32),
        'image/class/obj2_center_x': tf.io.FixedLenFeature([], tf.float32),
        'image/class/obj2_center_y': tf.io.FixedLenFeature([], tf.float32)
    }

    feature = tf.io.parse_single_example(example_proto, image_feature_description)

    image = feature['image/encoded']
    image = tf.image.decode_jpeg(image, channels=3)
    image = tf.image.convert_image_dtype(image, tf.float32)

    return (image, tf.convert_to_tensor([feature['image/class/obj1_center_x'], 
                                         feature['image/class/obj1_center_y'],
                                         feature['image/class/obj2_center_x'],
                                         feature['image/class/obj2_center_y']]))

def train_preprocess(image, label):
    
    image = tf.image.random_brightness(image, max_delta=32.0 / 255.0)
    image = tf.image.random_saturation(image, lower=0.5, upper=1.5)

    #Make sure the image is still in [0, 1]
    image = tf.clip_by_value(image, 0.0, 1.0)

    return image, label


def build_model(input_shape):
    
    model = tf.keras.Sequential()

    model.add(tf.keras.layers.Conv2D(8, kernel_size=(3,3), padding='same', 
                                     input_shape=input_shape))
    model.add(tf.keras.layers.LeakyReLU())
    
    model.add(tf.keras.layers.Conv2D(8, kernel_size=(3,3), padding='same'))
    model.add(tf.keras.layers.LeakyReLU())    
    
    model.add(tf.keras.layers.MaxPooling2D(pool_size=(2,2), strides=2))

    model.add(tf.keras.layers.Conv2D(16, kernel_size=(3,3), padding='same'))
    model.add(tf.keras.layers.LeakyReLU())

    model.add(tf.keras.layers.Conv2D(16, kernel_size=(3,3), padding='same'))
    model.add(tf.keras.layers.LeakyReLU())
    
    model.add(tf.keras.layers.MaxPooling2D(pool_size=(2,2), strides=2))

    model.add(tf.keras.layers.Conv2D(32, kernel_size=(3,3), padding='same'))
    model.add(tf.keras.layers.LeakyReLU())

    model.add(tf.keras.layers.Conv2D(32, kernel_size=(3,3), padding='same'))
    model.add(tf.keras.layers.LeakyReLU())

    model.add(tf.keras.layers.GlobalAveragePooling2D())

    model.add(tf.keras.layers.Dense(32))
    model.add(tf.keras.layers.LeakyReLU())
    model.add(tf.keras.layers.Dropout(rate = 0.1))

    model.add(tf.keras.layers.Dense(4, activation='linear'))
    
    return (model)

if __name__ == '__main__':
        
    parser = argparse.ArgumentParser()
    
    #
    # Standard parameters required by Sagemaker
    #
    parser.add_argument('--gpu-count', type=int, default=os.environ['SM_NUM_GPUS'])
    parser.add_argument('--output-dir', type=str, default=os.environ.get('SM_OUTPUT_DIR'))        
    parser.add_argument('--model-dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    
    parser.add_argument('--data-config',type=json.loads,default=os.environ.get('SM_INPUT_DATA_CONFIG'))

    #
    # Input Channels
    #
    parser.add_argument('--training', type=str, required=False, default=os.environ.get('SM_CHANNEL_TRAINING'))
    parser.add_argument('--validation', type=str, required=False, default=os.environ.get('SM_CHANNEL_VALIDATION'))       

    #
    # Input Parameters
    #
    parser.add_argument('--num-channels', type=int, default=3)
    parser.add_argument('--img-height', type=int, default=416)
    parser.add_argument('--img-width', type=int, default=416)
    
    parser.add_argument('--num-samples', type=int, required=True)
    parser.add_argument('--num-validation', type=int, default=64)
    
    #
    # Training Parameters
    #
    parser.add_argument('--epochs', type=int, default=10)
    parser.add_argument('--learning-rate', type=float, default=0.01)
    parser.add_argument('--batch-size', type=int, default=16)
        
    args, _ = parser.parse_known_args()
    
    # NCWH format
    input_shape = (args.img_width,
                   args.img_height,
                   args.num_channels)     
    
    #
    # Build model
    #
    
    model = build_model(input_shape)
       
    model.compile(optimizer = tf.keras.optimizers.Adam(lr = args.learning_rate),
                  loss= "mse", 
                  metrics=['mse', 'mae'])
    
    
    len_train = args.num_samples
    logging.info("Training samples: {}".format(len_train))
    
    len_val = args.num_validation
    logging.info("Validation samples: {}".format(len_val))
    
    #
    # Prepare tf.dataset input
    #
    
    x_train = load_data_as_dataset("training", args.training, args.data_config)    
    x_train = x_train.shuffle(buffer_size = 8 * args.batch_size)    
    x_train = x_train.repeat()
    
    x_train = x_train.map(extract_example, num_parallel_calls=4)
    x_train = x_train.map(train_preprocess, num_parallel_calls=4)
    x_train = x_train.batch(args.batch_size)
    
    x_train = x_train.prefetch(1)
    
    if (args.validation != None and args.num_validation > 0):
        x_val = load_data_as_dataset("validation", args.validation, args.data_config)
        x_val = x_val.repeat()        
        x_val = x_val.map(extract_example, num_parallel_calls=4)
        x_val = x_val.batch(args.batch_size)
        
        x_val = x_val.prefetch(1)
    else:
        x_val = None
    
    #
    # Train model
    #
    
    logging.info("Batch size: {}".format(args.batch_size))
    
    steps_per_epoch = len_train // args.batch_size
    logging.info("Training steps per epoch {}".format(steps_per_epoch))
    
    validation_steps = len_val // args.batch_size
    logging.info("Validation steps per epoch {}".format(validation_steps))    
    
    if (x_val != None):
        history = model.fit(x = x_train,
                            validation_data = x_val, 
                            epochs = args.epochs,
                            steps_per_epoch = steps_per_epoch,
                            validation_steps = validation_steps,
                            verbose = 2)
    else:
        history = model.fit(x = x_train,
                            epochs = args.epochs,
                            steps_per_epoch = steps_per_epoch,
                            verbose = 2)        
    
    logging.info("Result: {}".format(history.history))
    
    #
    # Evaluate model
    #    
    
    if (x_val != None):
        score = model.evaluate(x_val,
                       steps=validation_steps,
                       verbose=2)        
        
        logging.info('Validation: {}'.format(list(zip(score, model.metrics_names))))
              
    #
    # Save/Export model
    #

    tf.contrib.saved_model.save_keras_model(model, os.path.join(args.model_dir, 'model/1'))
    model.save(os.path.join(args.output_dir, 'model.h5'))

I tried with framework version: v1.13, v1.14 both show the same behaviour and this seems to be related to re-using the dataset after model.fit is done. If I don't call model.evaluate, then everything is fine.

Unfortunately not much logging output except for this warning:

2019-08-12 07:44:06.993021: W tensorflow/core/framework/dataset.cc:393] Input of PipeModeDatasetOp::Dataset will not be optimized because the dataset does not implement the AsGraphDefInternal() method needed to apply optimizations.
2019-08-12 07:44:07.056622: W tensorflow/core/framework/dataset.cc:393] Input of PipeModeDatasetOp::Dataset will not be optimized because the dataset does not implement the AsGraphDefInternal() method needed to apply optimizations.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions