# two_sample/generate.py

import os
import sys

import numpy as np
from sklearn.utils import check_random_state


################################################################################
### Simple toy problems

def sample_SG(n, dim, rs=None):
    """Draw two samples from the same standard Gaussian ("SG").

    Both samples come from a zero-mean, identity-covariance normal in
    ``dim`` dimensions, so the two underlying distributions are identical.

    Parameters
    ----------
    n : number of points to draw for each sample (passed as ``size``).
    dim : dimensionality of the Gaussian.
    rs : seed or random state, normalised through ``check_random_state``.

    Returns
    -------
    (X, Y) : the two samples, ``X`` drawn before ``Y``.
    """
    rng = check_random_state(rs)
    mean, cov = np.zeros(dim), np.eye(dim)
    # The generator is consumed left to right, so X is drawn first, then Y.
    X, Y = (rng.multivariate_normal(mean, cov, size=n) for _ in range(2))
    return X, Y


def sample_GMD(n, dim, rs=None):
    """Draw two Gaussian samples that differ only in mean ("GMD").

    ``X`` is standard normal in ``dim`` dimensions; ``Y`` has the same
    identity covariance but its mean is shifted by 1 along the first axis.

    Parameters
    ----------
    n : number of points to draw for each sample (passed as ``size``).
    dim : dimensionality of the Gaussians.
    rs : seed or random state, normalised through ``check_random_state``.

    Returns
    -------
    (X, Y) : the unshifted and the mean-shifted sample, in that order.
    """
    rng = check_random_state(rs)
    cov = np.eye(dim)

    X = rng.multivariate_normal(np.zeros(dim), cov, size=n)

    # Only the first coordinate of the mean differs between the two samples.
    shifted_mean = np.zeros(dim)
    shifted_mean[0] = 1
    Y = rng.multivariate_normal(shifted_mean, cov, size=n)
    return X, Y


def sample_GVD(n, dim, rs=None):
    """Draw two Gaussian samples that differ only in variance ("GVD").

    ``X`` is standard normal in ``dim`` dimensions; ``Y`` is zero-mean with
    the same covariance except that the first coordinate has variance 2.

    Parameters
    ----------
    n : number of points to draw for each sample (passed as ``size``).
    dim : dimensionality of the Gaussians.
    rs : seed or random state, normalised through ``check_random_state``.

    Returns
    -------
    (X, Y) : the unit-variance and the inflated-variance sample, in that order.
    """
    rng = check_random_state(rs)
    mean = np.zeros(dim)

    X = rng.multivariate_normal(mean, np.eye(dim), size=n)

    # Only the variance of the first coordinate differs between the samples.
    wide_cov = np.eye(dim)
    wide_cov[0, 0] = 2
    Y = rng.multivariate_normal(mean, wide_cov, size=n)
    return X, Y


def sample_blobs(n, ratio, rows=5, cols=5, sep=10, rs=None):
    """Draw two 2-d samples laid out on a grid of Gaussian blobs.

    Every point is a Gaussian draw placed on a uniformly chosen node of a
    ``rows`` x ``cols`` grid with spacing ``sep``.  Blobs of ``X`` are
    isotropic; blobs of ``Y`` have unit variances and a correlation chosen
    so that the covariance eigenvalues have ratio ``ratio``.

    Parameters
    ----------
    n : number of points to draw for each sample.
    ratio : eigenvalue ratio of the within-blob covariance of ``Y``.
    rows, cols : number of grid positions along the first / second coordinate.
    sep : distance between neighbouring grid positions.
    rs : seed or random state, normalised through ``check_random_state``.

    Returns
    -------
    (X, Y) : arrays with two columns each.
    """
    rng = check_random_state(rs)

    # [[1, rho], [rho, 1]] has eigenvalues 1 +/- rho, so an eigenvalue ratio
    # of `ratio` corresponds to rho = (ratio - 1) / (ratio + 1).
    rho = (ratio - 1) / (ratio + 1)
    origin = np.zeros(2)

    # Within-blob variation: isotropic for X, correlated for Y.
    X = rng.multivariate_normal(origin, np.eye(2), size=n)
    Y = rng.multivariate_normal(
        origin, np.array([[1, rho], [rho, 1]]), size=n)

    # Shift every point onto a random grid node.  Offsets are drawn for X
    # (first coordinate, then second) before Y, keeping the random stream
    # in a fixed order.
    for points in (X, Y):
        for axis, n_positions in enumerate((rows, cols)):
            points[:, axis] += rng.randint(n_positions, size=n) * sep

    return X, Y


################################################################################
### Sample images from GANs

def _load_mnist(dset='t10k'):
    """Load one MNIST image file, downloading it first if it is missing.

    The file ``<dset>-images-idx3-ubyte.gz`` is looked up in the current
    working directory and fetched from the MNIST site when absent.
    (Approach adapted from Lasagne/examples/mnist.py.)

    Parameters
    ----------
    dset : file-name prefix of the image set; this module uses ``'t10k'``
        and ``'train'``.

    Returns
    -------
    float32 array of shape (num_images, 1, 28, 28) with values in [0, 1].
    """
    import gzip
    if sys.version_info[0] == 2:
        from urllib import urlretrieve
    else:
        from urllib.request import urlretrieve

    filename = dset + '-images-idx3-ubyte.gz'
    if not os.path.exists(filename):
        print("Downloading {}".format(filename))
        urlretrieve("http://yann.lecun.com/exdb/mnist/" + filename, filename)

    with gzip.open(filename, 'rb') as f:
        raw = f.read()

    # The first 16 bytes are skipped as the file header; the rest is one
    # unsigned byte per pixel.
    pixels = np.frombuffer(raw, np.uint8, offset=16)
    images = pixels.reshape(-1, 1, 28, 28)
    # Dividing by a float32 scalar rescales bytes to [0, 1] as float32.
    return images / np.float32(255)


def _sample_trained_minibatch_gan(params_file, n, batch_size, rs):
    """Draw ``n`` samples from a pre-trained GAN generator.

    Rebuilds the generator network with Lasagne/Theano, loads its trained
    parameters from ``params_file`` and runs it on fresh uniform noise,
    ``batch_size`` samples at a time, until at least ``n`` samples have been
    collected.

    Parameters
    ----------
    params_file : file opened with ``np.load``; the arrays stored under
        ``arr_0`` ... ``arr_8`` are used as the generator's trainable
        parameters.
    n : number of samples to return.
    batch_size : number of samples produced per generator call (fixed in the
        network's layer shapes).
    rs : object with a ``randint`` method (e.g. ``np.random.RandomState``);
        it is used directly, not passed through ``check_random_state``.

    Returns
    -------
    Array holding the first ``n`` generated samples.
    NOTE(review): presumably of shape (n, 3, 32, 32) with values in [-1, 1],
    given the last layer's declared shape and tanh nonlinearity -- confirm
    against ``nn.Deconv2DLayer`` / ``nn.weight_norm``.
    """
    # Heavy optional dependencies are imported lazily, only when GAN samples
    # are actually requested.
    import lasagne
    from lasagne.init import Normal
    import lasagne.layers as ll
    import theano as th
    from theano.sandbox.rng_mrg import MRG_RandomStreams
    import theano.tensor as T

    # Local module (not visible from this file) providing batch_norm,
    # weight_norm, Deconv2DLayer and relu.
    import nn

    # Seed both the Theano noise stream and Lasagne's own RNG from `rs`.
    # The two randint draws happen in this order.
    theano_rng = MRG_RandomStreams(rs.randint(2 ** 15))
    lasagne.random.set_rng(np.random.RandomState(rs.randint(2 ** 15)))

    # Generator input: uniform noise with 100 values per sample.
    noise_dim = (batch_size, 100)
    noise = theano_rng.uniform(size=noise_dim)
    # `ls` accumulates the layers; each new layer is stacked on ls[-1].
    ls = [ll.InputLayer(shape=noise_dim, input_var=noise)]
    ls.append(nn.batch_norm(
        ll.DenseLayer(ls[-1], num_units=4*4*512, W=Normal(0.05),
                      nonlinearity=nn.relu),
        g=None))
    # Reinterpret the dense output as 512 feature maps of size 4x4.
    ls.append(ll.ReshapeLayer(ls[-1], (batch_size,512,4,4)))
    ls.append(nn.batch_norm(
        nn.Deconv2DLayer(ls[-1], (batch_size,256,8,8), (5,5), W=Normal(0.05),
                         nonlinearity=nn.relu),
        g=None)) # 4 -> 8
    ls.append(nn.batch_norm(
        nn.Deconv2DLayer(ls[-1], (batch_size,128,16,16), (5,5), W=Normal(0.05),
                         nonlinearity=nn.relu),
        g=None)) # 8 -> 16
    ls.append(nn.weight_norm(
        nn.Deconv2DLayer(ls[-1], (batch_size,3,32,32), (5,5), W=Normal(0.05),
                         nonlinearity=T.tanh),
        train_g=True, init_stdv=0.1)) # 16 -> 32
    gen_dat = ll.get_output(ls[-1])

    # Load the nine saved parameter arrays, in index order, and install them
    # as the trainable parameters of the network.
    with np.load(params_file) as d:
        params = [d['arr_{}'.format(i)] for i in range(9)]
    ll.set_all_param_values(ls[-1], params, trainable=True)

    # Compiled function with no inputs: every call yields one new batch.
    sample_batch = th.function(inputs=[], outputs=gen_dat)
    samps = []
    # Keep generating whole batches until there are enough samples, then
    # drop the surplus from the final batch.
    while len(samps) < n:
        samps.extend(sample_batch())
    samps = np.array(samps[:n])
    return samps


def sample_mnist_minibatch_gan(
        n, params_file, batch_size=100, rs=None, mnist_images=None,
        discretize=None, bw=False, grayscale=True, clip=True, scaled=False,
        trim_edges=False):
    """Sample real MNIST digits (X) and GAN-generated digits (Y).

    The two sources differ in channel count, image size and value range, so
    both are post-processed into a common flattened format.

    Parameters
    ----------
    n : number of images to draw for each of ``X`` and ``Y``.
    params_file : trained generator parameters, forwarded to
        ``_sample_trained_minibatch_gan``.
    batch_size : generator batch size; capped at ``n``.
    rs : seed or random state, normalised through ``check_random_state``.
    mnist_images : array of real images, indexed by image along the first
        axis; loaded with ``_load_mnist()`` when ``None``.
    discretize : if truthy, number of equal-width bins on [0, 1]; each pixel
        is replaced by the midpoint of its bin.  Ignored when ``bw`` is set.
    bw : round every pixel to 0 or 1 (implies the grayscale conversion).
    grayscale : convert GAN images to one channel; otherwise the MNIST
        images are tiled to three channels.
    clip : clip GAN pixels to [0, 1].  Ignored when ``scaled`` is set.
    scaled : map GAN pixels from [-1, 1] to [0, 1] via ``(Y + 1) / 2``.
    trim_edges : crop a 2-pixel border off the GAN images; otherwise the
        MNIST images are zero-padded by 2 pixels on each side.

    Returns
    -------
    (X, Y) : MNIST and GAN images, each reshaped to ``(n, -1)``.
    """
    rs = check_random_state(rs)

    # The GAN samples are drawn before the MNIST subset so that the random
    # stream is consumed in a fixed order.
    Y = _sample_trained_minibatch_gan(params_file, n, min(n, batch_size), rs)

    if mnist_images is None:
        mnist_images = _load_mnist()

    X = mnist_images[rs.choice(mnist_images.shape[0], n, replace=False), :]

    # X is shape (n, 1, 28, 28); Y is (n, 3, 32, 32)
    # Process them to a common format:

    # GAN images are color, MNIST are grayscale
    if grayscale or bw:
        # 0.2125 R + 0.7154 G + 0.0721 B, per skimage.color.rgb2gray
        Y = np.einsum('nchw,c->nhw', Y, [0.2125, 0.7154, 0.0721])
        X = X[:, 0, :, :]
    else:
        X = np.tile(X, (1, 3, 1, 1))

    # GAN images are 32x32, MNIST are 28x28
    if trim_edges:
        Y = Y[..., 2:-2, 2:-2]
    else:
        # Grow only the two trailing (spatial) axes.  Matching axes by
        # "size == 28" would also resize the batch axis when n == 28 and
        # make the assignment below fail.
        padded = np.zeros(X.shape[:-2] + (32, 32), X.dtype)
        padded[..., 2:-2, 2:-2] = X
        X = padded

    # GAN images have range [-1, 1]; MNIST has [0, 1]
    if scaled:
        Y += 1
        Y /= 2
    elif clip:
        np.clip(Y, 0, 1, out=Y)

    # flatten
    X = X.reshape(n, -1)
    Y = Y.reshape(n, -1)

    # pixel-level differences make the problem too easy; maybe discretize
    if bw:
        X = X.round()
        Y = Y.round()
        if not scaled and not clip:
            # Unclipped GAN pixels can round to -1; force them into {0, 1}.
            np.clip(Y, 0, 1, out=Y)
    elif discretize:
        # The top edge sits just above 1 so that a pixel equal to 1 falls
        # in the last bin.
        bins = np.linspace(0, 1 + np.spacing(1), num=discretize + 1)
        midpoints = (bins[:-1] + bins[1:]) / 2.

        def to_midpoints(values):
            # Unclipped, unscaled GAN pixels may lie outside [0, 1].  Clamp
            # the bin index so they land in the nearest edge bin instead of
            # wrapping around (index -1) or raising IndexError.
            idx = np.digitize(values, bins) - 1
            return midpoints[np.clip(idx, 0, discretize - 1)]

        Y = to_midpoints(Y)
        X = to_midpoints(X)

    return X, Y


################################################################################
### Helpers to use with argparse

def add_problem_args(group):
    """Register the problem-selection and GAN post-processing options.

    Parameters
    ----------
    group : argparse parser or argument group; it must provide
        ``add_argument`` and ``add_mutually_exclusive_group``.

    The dataset options are mutually exclusive and exactly one is required.
    The remaining options control how GAN samples are post-processed.
    """
    def add_toggle(on_flag, off_flag, dest, help_text):
        # An on/off flag pair sharing one destination, default off.  The
        # "on" flag is registered first, so its default (False) is the one
        # argparse applies.
        pair = group.add_mutually_exclusive_group()
        pair.add_argument(on_flag, action='store_true', default=False,
                          help=help_text)
        pair.add_argument(off_flag, action='store_false', dest=dest)

    # Which dataset to generate: exactly one of these must be given.
    dataset = group.add_mutually_exclusive_group(required=True)
    dataset.add_argument('--sg', '--same-gaussian', metavar='DIM', type=int)
    dataset.add_argument('--gmd', '--gaussian-mean-difference',
                         metavar='DIM', type=int)
    dataset.add_argument('--gvd', '--gaussian-var-difference',
                         metavar='DIM', type=int)
    dataset.add_argument('--blobs', metavar='EIG_RATIO', type=float)
    dataset.add_argument('--mnist-minibatch-gan', metavar='PARAMS_FILE')
    dataset.add_argument('--mnist-traintest', action='store_true')

    add_toggle('--grayscale', '--no-grayscale', 'grayscale',
               "For GAN outputs: make images grayscale.")

    # --bw sits alone in its exclusive group; --discretize is registered on
    # the outer group, so the two may be combined on the command line.
    bw_only = group.add_mutually_exclusive_group()
    bw_only.add_argument(
        '--bw', action='store_true', default=False,
        help="For GAN outputs: make images black+white "
             "(implies --grayscale).")
    group.add_argument(
        '--discretize', metavar='N_BINS', type=int,
        help="For GAN outputs: discretize possible outputs into N_BINS "
             "bins. Note that `--grayscale --discretize 2` makes the "
             "outputs [.25, .75], where `--bw` makes them [0, 1].")

    add_toggle('--trim-edges', '--no-trim-edges', 'trim_edges',
               "For MNIST GANs: trim the outer border of samples.")

    # Value-range handling: at most one of clip / no-clip / scaled.
    value_range = group.add_mutually_exclusive_group()
    value_range.add_argument(
        '--clip', action='store_true', default=True,
        help="For GAN outputs: clip pixel values to [0, 1]. On by default.")
    value_range.add_argument(
        '--no-clip', action='store_false', dest='clip',
        help="For GAN outputs: leave pixels as they are, "
             "possibly in [-1, 1].")
    value_range.add_argument(
        '--scaled', action='store_true', default=False,
        help="For GAN outputs: scale pixel values to [0, 1].")


def generate_data(args, n, dtype=None, rs=None):
    """Generate the two samples selected by parsed command-line options.

    Parameters
    ----------
    args : namespace carrying the attributes that ``add_problem_args``
        registers (``sg``, ``gmd``, ``gvd``, ``blobs``,
        ``mnist_minibatch_gan``, ``mnist_traintest`` and, for the GAN
        problem, ``grayscale``, ``bw``, ``trim_edges``, ``clip``,
        ``scaled``, ``discretize``).
    n : number of points to draw for each sample.
    dtype : if not ``None``, both samples are cast to this dtype.
    rs : seed or random state forwarded to the selected sampler.

    Returns
    -------
    (X, Y) : the two samples.

    Raises
    ------
    ValueError : if none of the dataset options is set on ``args``.
    """
    if args.sg is not None:
        X, Y = sample_SG(n, args.sg, rs=rs)
    elif args.gmd is not None:
        X, Y = sample_GMD(n, args.gmd, rs=rs)
    elif args.gvd is not None:
        X, Y = sample_GVD(n, args.gvd, rs=rs)
    elif args.blobs is not None:
        X, Y = sample_blobs(n, args.blobs, rs=rs)
    elif args.mnist_minibatch_gan is not None:
        # The parameter file is stored under the dest argparse derives from
        # `--mnist-minibatch-gan`; no `mnist2_gan` attribute is ever defined.
        X, Y = sample_mnist_minibatch_gan(
            n, args.mnist_minibatch_gan, rs=rs, grayscale=args.grayscale,
            bw=args.bw, trim_edges=args.trim_edges, clip=args.clip,
            scaled=args.scaled, discretize=args.discretize)
    elif args.mnist_traintest:
        rs = check_random_state(rs)
        # MNIST loads as n x 1 x 28 x 28; want n x 784
        # X is a random subset of the 't10k' file, Y of the 'train' file.
        X = _load_mnist('t10k').reshape(-1, 784)
        X = X[rs.choice(X.shape[0], n, replace=False), :]
        Y = _load_mnist('train').reshape(-1, 784)
        Y = Y[rs.choice(Y.shape[0], n, replace=False), :]
    else:
        raise ValueError("No dataset passed")

    if dtype is not None:
        X = X.astype(dtype)
        Y = Y.astype(dtype)
    return X, Y