data-maker/data/gan.py

"""
usage :
    optional :
    --num_gpu   number of gpus to use will default to 1
    --epoch     steps per epoch default to 256
"""
import tensorflow as tf
from tensorflow.contrib.layers import l2_regularizer
import numpy as np
import pandas as pd
import time
import os
import sys
from data.params import SYS_ARGS
from data.bridge import Binary
import json
import pickle

# Process-wide GPU / logging configuration, applied once at import time:
#   CUDA_DEVICE_ORDER      enumerate devices by PCI bus id
#   CUDA_VISIBLE_DEVICES   expose only device "0" to this process
#   TF_CPP_MIN_LOG_LEVEL   TensorFlow native log threshold
# NOTE(review): these are set after `import tensorflow`; confirm that is early enough
# for the log level to take effect.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    'CUDA_VISIBLE_DEVICES': "0",
    'TF_CPP_MIN_LOG_LEVEL': '2',
})

# STEPS_PER_EPOCH     = int(SYS_ARGS['epoch']) if 'epoch' in SYS_ARGS else 256
# NUM_GPUS            = 1 if 'num_gpu' not in SYS_ARGS else int(SYS_ARGS['num_gpu'])
# BATCHSIZE_PER_GPU   = 2000
# TOTAL_BATCHSIZE     = BATCHSIZE_PER_GPU * NUM_GPUS

class void :
    """
    Empty placeholder type: instances carry no behavior and are used as plain
    attribute containers (attributes are attached after construction).
    """
class GNet :
    """
    This is the base class of the generative network components, the details are implemented in the subclasses.
    An instance of this class is accessed as follows
    object.layers.normalize applies the label-conditioned normalization implemented by normalize()
    object.get.variables    instantiates a variable under the cpu device scope and returns a reference (tensor)
    object.get.suffix       returns the file suffix built from the synthesized column name(s)
    """
    def __init__(self,**args):
        """
        Set the hyper-parameters from the keyword arguments and create the log folders
        :context        name of the sub-folder used for the model and meta data (required)
        :real           matrix of real data, rows are used as batch size and row count (optional)
        :label          matrix (or vector) of labels (optional)
        :num_gpu        number of gpus, defaults to 1
        :max_epochs     number of training epochs, defaults to 10
        :column         name (or list of names) of the synthesized column(s)
        :column_id      name of the identifier column
        :logger         object exposing write(row=...), optional
        :logs           root folder of the logs, defaults to 'logs'
        """
        self.layers = void()
        self.layers.normalize = self.normalize

        self.NUM_GPUS = 1 if 'num_gpu' not in args else args['num_gpu']

        self.X_SPACE_SIZE = args['real'].shape[1] if 'real' in args else 854
        self.G_STRUCTURE = [128,128] #[self.X_SPACE_SIZE, self.X_SPACE_SIZE]
        self.D_STRUCTURE = [self.X_SPACE_SIZE,256,128] #[self.X_SPACE_SIZE, self.X_SPACE_SIZE*2, self.X_SPACE_SIZE] #-- change 854 to number of diagnosis
        #
        # NOTE(review): the second test checks the number of rows (len) rather than the rank of the label,
        # it may have been meant as len(args['label'].shape) == 1 -- behavior is left untouched, confirm the intent
        if 'label' in args and len(args['label'].shape) == 2 :
            self.NUM_LABELS = args['label'].shape[1]
        elif 'label' in args and len(args['label']) == 1 :
            self.NUM_LABELS = args['label'].shape[0]
        else:
            self.NUM_LABELS = 8
        self.Z_DIM = 128 #self.X_SPACE_SIZE
        self.BATCHSIZE_PER_GPU = args['real'].shape[0] if 'real' in args else 256
        self.TOTAL_BATCHSIZE = self.BATCHSIZE_PER_GPU * self.NUM_GPUS
        self.STEPS_PER_EPOCH = 256 #int(np.load('ICD9/train.npy').shape[0] / 2000)
        self.MAX_EPOCHS = 10 if 'max_epochs' not in args else int(args['max_epochs'])
        self.ROW_COUNT = args['real'].shape[0] if 'real' in args else 100
        self.CONTEXT = args['context']
        self.ATTRIBUTES = {"id":args['column_id'] if 'column_id' in args else None,"synthetic":args['column'] if 'column' in args else None}
        self._REAL = args['real'] if 'real' in args else None
        self._LABEL = args['label'] if 'label' in args else None

        self.get = void()
        self.get.variables = self._variable_on_cpu
        self.get.suffix = lambda : "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic']
        self.logger = args['logger'] if 'logger' in args and args['logger'] else None
        self.init_logs(**args)

    def init_logs(self,**args):
        """
        Create the folder structure <logs>/train/<context> and <logs>/output/<context>
        and keep the paths in self.train_dir and self.out_dir
        :logs   root folder, defaults to 'logs'
        """
        self.log_dir = args['logs'] if 'logs' in args else 'logs'
        self.mkdir(self.log_dir)
        for key in ['train','output'] :
            self.mkdir(os.sep.join([self.log_dir,key]))
            self.mkdir (os.sep.join([self.log_dir,key,self.CONTEXT]))

        self.train_dir  = os.sep.join([self.log_dir,'train',self.CONTEXT])
        self.out_dir = os.sep.join([self.log_dir,'output',self.CONTEXT])

    def load_meta(self,column):
        """
        This function is designed to accommodate the use of the sub-classes outside of a strict dependency model,
        because prediction and training can happen independently.
        Every key found in <out_dir>/meta-<suffix>.json is set as an attribute of the instance,
        nothing is loaded if the file does not exist.
        :column     not used, the suffix is derived from self.ATTRIBUTES['synthetic']
        """
        suffix = self.get.suffix()
        _name = os.sep.join([self.out_dir,'meta-'+suffix+'.json'])
        if os.path.exists(_name) :
            # the file is closed as soon as it has been read
            with open(_name) as f :
                attr = json.loads(f.read())
            for key in attr :
                setattr(self,key,attr[key])
        #
        # The folders are recomputed because CONTEXT may have been overwritten by the meta data
        self.train_dir  = os.sep.join([self.log_dir,'train',self.CONTEXT])
        self.out_dir = os.sep.join([self.log_dir,'output',self.CONTEXT])


    def log_meta(self,**args) :
        """
        Write the hyper-parameters of this instance to <out_dir>/meta-<suffix>.json
        :key    name of an optional additional entry
        :value  value of the additional entry (only used when key is provided too)
        :return the dictionary that was written
        """
        _object = {
            'CONTEXT':self.CONTEXT,
            'ATTRIBUTES':self.ATTRIBUTES,
            'BATCHSIZE_PER_GPU':self.BATCHSIZE_PER_GPU,
            'Z_DIM':self.Z_DIM,
            "X_SPACE_SIZE":self.X_SPACE_SIZE,
            "D_STRUCTURE":self.D_STRUCTURE,
            "G_STRUCTURE":self.G_STRUCTURE,
            "NUM_GPUS":self.NUM_GPUS,
            "NUM_LABELS":self.NUM_LABELS,
            "MAX_EPOCHS":self.MAX_EPOCHS,
            "ROW_COUNT":self.ROW_COUNT
        }
        if 'key' in args and 'value' in args :
            # the entry goes in the dictionary being written (not the builtin `object`)
            _object[args['key']] = args['value']
        suffix = self.get.suffix()
        _name = os.sep.join([self.out_dir,'meta-'+suffix])

        with open(_name+'.json','w') as f :
            f.write(json.dumps(_object))
        return _object

    def mkdir (self,path):
        """
        Create a folder (and its missing parents), nothing happens if it already exists
        """
        os.makedirs(path,exist_ok=True)


    def normalize(self,**args):
        """
        This function normalizes a network layer, the offset and scale are looked up per label (conditional normalization)
        inputs      input layer of the neural network
        name        suffix of the names of the offset/scale variables
        labels      labels (attributes not synthesized) by default None
        n_labels    number of labels default None

        Any other keyword (e.g. shift) is ignored: the axis of the moments is decided by the class of the instance
        """
        inputs  = args['inputs']
        name    = args['name']
        labels  = None if 'labels' not in args else args['labels']
        n_labels= None if 'n_labels' not in args else args['n_labels']
        #
        # moments are computed along axis 0 for a Generator instance and along axis 1 for any other class
        shift   = [0] if self.__class__.__name__.lower() == 'generator' else [1]
        mean, var   = tf.nn.moments(inputs, shift, keep_dims=True)
        shape       = inputs.shape[1].value
        offset_m    = self.get.variables(shape=[n_labels,shape], name='offset'+name,
                                    initializer=tf.zeros_initializer)
        scale_m = self.get.variables(shape=[n_labels,shape], name='scale'+name,
                                initializer=tf.ones_initializer)

        # one row of offset/scale is selected per sample according to its label
        offset  = tf.nn.embedding_lookup(offset_m, labels)
        scale   = tf.nn.embedding_lookup(scale_m, labels)
        result  = tf.nn.batch_normalization(inputs, mean, var, offset, scale, 1e-8)
        return result

    def _variable_on_cpu(self,**args):
        """
        This function creates (or retrieves) a variable under the '/cpu:0' device scope
        :name           name of the variable
        :shape          shape of the variable
        :initializer    optional initializer, by default None
        """

        name = args['name']
        shape = args['shape']
        initializer=None if 'initializer' not in args else args['initializer']
        with tf.device('/cpu:0') :
            cpu_var =  tf.compat.v1.get_variable(name,shape,initializer= initializer)
        return cpu_var

    def average_gradients(self,tower_grads):
        """
        Average the gradients of every variable across the towers
        :tower_grads    list (one entry per tower) of lists of (gradient,variable) pairs
        :return         list of (averaged gradient,variable) pairs, the variable is taken from the first tower
        """
        average_grads = []
        for grad_and_vars in zip(*tower_grads):
            # stack the gradients of a variable on a new leading axis then average over it
            grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars]

            grad = tf.concat(axis=0, values=grads)
            grad = tf.reduce_mean(grad, 0)

            v = grad_and_vars[0][1]
            average_grads.append((grad, v))
        return average_grads


class Generator (GNet):
    """
    This class builds the network that generates candidate datasets; it aggregates a discriminator
    whose score on the candidates is used as the generator loss.
    """
    def __init__(self,**args):
        GNet.__init__(self,**args)
        self.discriminator = Discriminator(**args)

    def loss(self,**args):
        """
        Compute the generator loss: negated mean discriminator output on the candidates plus the regularization losses
        :fake   generated candidates
        :label  labels associated with the candidates
        :return the loss twice (loss,loss)
        """
        candidates  = args['fake']
        condition   = args['label']
        critic_out  = self.discriminator.network(inputs=candidates, label=condition)
        penalties   = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        g_loss      = -tf.reduce_mean(critic_out) + sum(penalties)
        tf.add_to_collection('glosses', g_loss)
        return g_loss, g_loss

    def load_meta(self, column):
        """
        Load the meta data for this instance and for the aggregated discriminator
        """
        super().load_meta(column)
        self.discriminator.load_meta(column)

    def network(self,**args) :
        """
        This function will build the network that will generate the synthetic candidates
        :inputs matrix fed to the first layer
        :dim    number of columns of inputs, defaults to Z_DIM
        :label  labels used by the conditional normalization
        """
        out     = args['inputs']
        in_dim  = args['dim'] if 'dim' in args else self.Z_DIM
        label   = args['label']
        depth   = len(self.G_STRUCTURE)

        with tf.compat.v1.variable_scope('G', reuse=tf.compat.v1.AUTO_REUSE , regularizer=l2_regularizer(0.00001)):
            #
            # residual blocks: matmul -> normalization -> relu, added back to the block input
            for idx, width in enumerate(self.G_STRUCTURE[:-1]):
                weights = self.get.variables(name='W_' + str(idx), shape=[in_dim, width])
                normed  = self.normalize(inputs=tf.matmul(out, weights),shift=0, name='cbn' + str(idx), labels=label, n_labels=self.NUM_LABELS)
                out     = out + tf.nn.relu(normed)
                in_dim  = width
            #
            # last residual block, same layout with a tanh activation
            last    = depth - 1
            weights = self.get.variables(name='W_' + str(last), shape=[in_dim, self.G_STRUCTURE[-1]])
            normed  = self.normalize(inputs=tf.matmul(out, weights), name='cbn' + str(last),
                        labels=label, n_labels=self.NUM_LABELS)
            out     = out + tf.nn.tanh(normed)
            #
            # output layer: affine map to X_SPACE_SIZE columns followed by a sigmoid
            weights = self.get.variables(name='W_' + str(last+1), shape=[self.Z_DIM, self.X_SPACE_SIZE])
            offset  = self.get.variables(name='b_' + str(last+1), shape=[self.X_SPACE_SIZE])
            out     = tf.nn.sigmoid(tf.add(tf.matmul(out, weights), offset))
        return out

class Discriminator(GNet):
    """
    This class builds the network that scores a dataset (real or generated) and the associated loss
    (wasserstein distance with a gradient penalty)
    """
    def __init__(self,**args):
        GNet.__init__(self,**args)
    def network(self,**args):
        """
        This function will apply a computational graph on a dataset passed in with the associated labels and the last layer must have a single output (neuron)
        :inputs     matrix of data to be scored
        :label      labels used by the conditional normalization
        :return     tensor with a single column
        """
        x = args['inputs']
        label = args['label']
        with tf.compat.v1.variable_scope('D', reuse=tf.compat.v1.AUTO_REUSE , regularizer=l2_regularizer(0.00001)):
            #
            # hidden layers: affine map -> relu -> normalization, sizes are given by D_STRUCTURE
            for i, dim in enumerate(self.D_STRUCTURE[1:]):
                kernel = self.get.variables(name='W_' + str(i), shape=[self.D_STRUCTURE[i], dim])
                bias = self.get.variables(name='b_' + str(i), shape=[dim])
                x = tf.nn.relu(tf.add(tf.matmul(x, kernel), bias))
                x = self.normalize(inputs=x, name='cln' + str(i), shift=1,labels=label, n_labels=self.NUM_LABELS)
            #
            # output layer with a single neuron and no activation
            i = len(self.D_STRUCTURE)
            kernel = self.get.variables(name='W_' + str(i), shape=[self.D_STRUCTURE[-1], 1])
            bias = self.get.variables(name='b_' + str(i), shape=[1])
            y = tf.add(tf.matmul(x, kernel), bias)
        return y

    def loss(self,**args) :
        """
        This function computes the loss of the discriminator
        :real   real data
        :fake   generated data
        :label  labels associated with the data
        :return (w_distance,loss) where loss = w_distance + 10 * gradient penalty + regularization losses
        """
        real    = args['real']
        fake    = args['fake']
        label   = args['label']
        epsilon = tf.random.uniform(shape=[self.BATCHSIZE_PER_GPU,1],minval=0,maxval=1)

        # random interpolation between the real and the generated rows
        x_hat       = real + epsilon * (fake - real)
        y_hat_fake  = self.network(inputs=fake, label=label)

        y_hat_real  = self.network(inputs=real, label=label)
        y_hat       = self.network(inputs=x_hat, label=label)

        # the penalty pushes the norm of the gradient at the interpolated rows towards 1
        grad        = tf.gradients(y_hat, [x_hat])[0]
        slopes      = tf.sqrt(tf.reduce_sum(tf.square(grad), 1))
        gradient_penalty = tf.reduce_mean((slopes - 1.) ** 2)
        all_regs    = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        w_distance  = -tf.reduce_mean(y_hat_real) + tf.reduce_mean(y_hat_fake)
        loss        = w_distance + 10 * gradient_penalty + sum(all_regs)
        tf.add_to_collection('dlosses', loss)

        return w_distance, loss
class Train (GNet):
    """
    This class trains a generator against a discriminator on real data with its labels,
    the model is saved in train_dir once the last epoch is completed
    """
    def __init__(self,**args):
        """
        :real       matrix of real data (required)
        :label      matrix of labels (required)
        :column     name (or list of names) of the synthesized column(s) (required)
        The remaining keywords are those of GNet; the meta data is written on construction
        """
        GNet.__init__(self,**args)
        self.generator = Generator(**args)
        self.discriminator = Discriminator(**args)
        self._REAL = args['real']
        self._LABEL= args['label']
        self.column = args['column']

        self.meta = self.log_meta()
    def load_meta(self, column):
        """
        This function will delegate the calls to load meta data to its dependents
        column name
        """
        super().load_meta(column)
        self.generator.load_meta(column)
        self.discriminator.load_meta(column)
    def loss(self,**args):
        """
        This function will compute a "tower" loss of the generated candidate against real data
        Training will consist in having both generator and discriminators
        :scope  name scope of the tower, used to filter the collected losses
        :stage  'D' for the discriminator loss, anything else for the generator loss
        :real   batch of real data
        :label  batch of labels
        :return (total loss,w) where w is the first value returned by the loss of the stage
        """

        scope   = args['scope']
        stage   = args['stage']
        real    = args['real']
        label   = args['label']
        label   = tf.cast(label, tf.int32)
        #
        # The label matrix is collapsed into a single integer per row:
        # column 1 times (number of columns - 2) plus the dot product of columns 2.. with [0,1,2,...]
        # @TODO: Ziqi needs to explain what's going on here
        m = [[i] for i in np.arange(self._LABEL.shape[1]-2)]
        label   = label[:, 1] * len(m) + tf.squeeze(
            tf.matmul(label[:, 2:], tf.constant(m, dtype=tf.int32))
            )
        z = tf.random.normal(shape=[self.BATCHSIZE_PER_GPU, self.Z_DIM])

        fake = self.generator.network(inputs=z, label=label)
        if stage == 'D':
            w, loss = self.discriminator.loss(real=real, fake=fake, label=label)
            losses = tf.get_collection('dlosses', scope)
        else:
            w, loss = self.generator.loss(fake=fake, label=label)
            losses = tf.get_collection('glosses', scope)

        total_loss = tf.add_n(losses, name='total_loss')

        return total_loss, w
    def input_fn(self):
        """
        This function builds the input pipeline: placeholders shaped like the real data and the labels
        are sliced, repeated 10000 times and grouped in batches of BATCHSIZE_PER_GPU rows
        :return (iterator,features placeholder,labels placeholder), the iterator must be initialized by feeding the placeholders
        """
        features_placeholder = tf.compat.v1.placeholder(shape=self._REAL.shape, dtype=tf.float32)
        labels_placeholder = tf.compat.v1.placeholder(shape=self._LABEL.shape, dtype=tf.float32)
        dataset = tf.data.Dataset.from_tensor_slices((features_placeholder, labels_placeholder))
        dataset = dataset.repeat(10000)
        dataset = dataset.batch(batch_size=self.BATCHSIZE_PER_GPU)
        dataset = dataset.prefetch(1)
        iterator = dataset.make_initializable_iterator()
        return iterator, features_placeholder, labels_placeholder

    def network(self,**args):
        """
        This function builds one tower per gpu for a stage and the operation applying the averaged gradients
        :stage  'D' or 'G', also the variable scope of the variables being trained
        :opt    optimizer used to compute and apply the gradients
        :return (train operation,mean of the per-tower w,iterator,features placeholder,labels placeholder)
        """
        stage   = args['stage']
        opt     = args['opt']
        tower_grads = []
        per_gpu_w   = []
        iterator, features_placeholder, labels_placeholder = self.input_fn()
        with tf.compat.v1.variable_scope(tf.compat.v1.get_variable_scope()):
            for i in range(self.NUM_GPUS):
                with tf.device('/gpu:%d' % i):
                    with tf.name_scope('%s_%d' % ('TOWER', i)) as scope:
                        #
                        # The tower consumes the batch served by the iterator (fed through the placeholders)
                        # instead of embedding the whole numpy arrays in the graph as constants
                        (real, label) = iterator.get_next()
                        loss, w = self.loss(scope=scope, stage=stage, real=real, label=label)
                        tf.get_variable_scope().reuse_variables()
                        vars_ = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=stage)
                        grads = opt.compute_gradients(loss, vars_)
                        tower_grads.append(grads)
                        per_gpu_w.append(w)

        grads = self.average_gradients(tower_grads)
        apply_gradient_op = opt.apply_gradients(grads)

        mean_w = tf.reduce_mean(per_gpu_w)
        train_op = apply_gradient_op
        return train_op, mean_w, iterator, features_placeholder, labels_placeholder
    def apply(self,**args):
        """
        This function runs the training: for every step the discriminator is updated twice and the generator once.
        The model is saved under train_dir/<suffix>-<epoch> when the epoch is a multiple of MAX_EPOCHS
        and the per-epoch distances are sent to the logger (if any)
        """
        REAL = self._REAL
        LABEL= self._LABEL
        with tf.device('/cpu:0'):
            opt_d = tf.compat.v1.train.AdamOptimizer(1e-4)
            opt_g = tf.compat.v1.train.AdamOptimizer(1e-4)

            train_d, w_distance, iterator_d, features_placeholder_d, labels_placeholder_d = self.network(stage='D', opt=opt_d)
            train_g, _, iterator_g, features_placeholder_g, labels_placeholder_g = self.network(stage='G', opt=opt_g)
            saver   = tf.compat.v1.train.Saver()
            init    = tf.global_variables_initializer()
            logs = []
            with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess:
                sess.run(init)
                #
                # each stage has its own iterator, both are fed the same data
                sess.run(iterator_d.initializer,
                        feed_dict={features_placeholder_d: REAL, labels_placeholder_d: LABEL})
                sess.run(iterator_g.initializer,
                        feed_dict={features_placeholder_g: REAL, labels_placeholder_g: LABEL})

                for epoch in range(1, self.MAX_EPOCHS + 1):
                    start_time = time.time()
                    w_sum = 0
                    for _ in range(self.STEPS_PER_EPOCH):
                        for _ in range(2):
                            _, w = sess.run([train_d, w_distance])
                            w_sum += w
                        sess.run(train_g)
                    duration = time.time() - start_time

                    assert not np.isnan(w_sum), 'Model diverged with loss = NaN'

                    # w_sum accumulates 2 discriminator updates per step, hence the denominator
                    distance = -w_sum/(self.STEPS_PER_EPOCH*2)
                    format_str = 'epoch: %d, w_distance = %f (%.1f)'
                    print(format_str % (epoch, distance, duration))

                    logs.append({"epoch":epoch,"distance":distance })

                    if epoch % self.MAX_EPOCHS == 0:
                        suffix = self.get.suffix()
                        _name  = os.sep.join([self.train_dir,suffix])
                        saver.save(sess, _name, write_meta_graph=False, global_step=epoch)
                        if self.logger :
                            row = {"logs":logs}

                            self.logger.write(row=row)

class Predict(GNet):
    """
    This class generates synthetic data given a learned model
    """
    def __init__(self,**args):
        """
        :values     list of values of the synthesized column, indexed by the position found in the generated matrix (required)
        The remaining keywords are those of GNet
        """
        GNet.__init__(self,**args)
        self.generator = Generator(**args)
        self.values  = args['values']
    def load_meta(self, column):
        """
        Load the meta data for this instance and for the aggregated generator
        """
        super().load_meta(column)
        self.generator.load_meta(column)
    def apply(self,**args):
        """
        This function restores the model saved under train_dir/<suffix>-<MAX_EPOCHS>, generates a matrix
        conditioned on the labels of this instance and translates it to values of the synthesized column(s)
        :return dictionary {column:[values]}
        """
        suffix = self.get.suffix()
        model_dir = os.sep.join([self.train_dir,suffix+'-'+str(self.MAX_EPOCHS)])
        demo = self._LABEL
        tf.compat.v1.reset_default_graph()
        z = tf.random.normal(shape=[self.BATCHSIZE_PER_GPU, self.Z_DIM])
        y = tf.compat.v1.placeholder(shape=[self.BATCHSIZE_PER_GPU, self.NUM_LABELS], dtype=tf.int32)
        #
        # same collapsing of the label matrix to a single integer per row as the one used for training
        ma = [[i] for i in np.arange(self.NUM_LABELS - 2)]
        label = y[:, 1] * len(ma) + tf.squeeze(tf.matmul(y[:, 2:], tf.constant(ma, dtype=tf.int32)))

        fake = self.generator.network(inputs=z, label=label)
        saver = tf.compat.v1.train.Saver()
        with tf.compat.v1.Session() as sess:

            # the variables are restored from the checkpoint, no initialization is needed
            saver.restore(sess, model_dir)
            f = sess.run(fake,feed_dict={y:demo})
            #
            # the sigmoid outputs are rounded to a 0/1 matrix
            #
            df =  ( pd.DataFrame(np.round(f).astype(np.int32)))
            #
            # In case we are dealing with actual values like diagnosis codes we keep, for every row,
            # the index of the last column that is set (rows without any column set keep index 0)
            #
            columns = self.ATTRIBUTES['synthetic'] if isinstance(self.ATTRIBUTES['synthetic'],list)else [self.ATTRIBUTES['synthetic']]

            r = np.zeros((self.ROW_COUNT,len(columns)))
            for col in df :
                i = np.where(df[col])[0]
                r[i] = col

            df = pd.DataFrame(r,columns=columns)

            # NOTE(review): int(value) is applied to a row, this presumably only works with a single synthesized column -- confirm
            df[df.columns] = (df.apply(lambda value: self.values[ int(value)],axis=1))
            # 'list' is the orient documented by pandas, 'lists' is rejected by recent versions
            return df.to_dict(orient='list')
            # return df.to_dict(orient='list')
            # count = str(len(os.listdir(self.out_dir)))
            # _name = os.sep.join([self.out_dir,self.CONTEXT+'-'+count+'.csv'])
            # df.to_csv(_name,index=False)


            # output.extend(np.round(f))

            # for m in range(2):
            #     for n in range(2, self.NUM_LABELS):
            #         idx1 = (demo[:, m] == 1)
            #         idx2 = (demo[:, n] == 1)
            #         idx = [idx1[j] and idx2[j] for j in range(len(idx1))]
            #         num = np.sum(idx)
            #         print ("___________________list__")
            #         print (idx1)
            #         print (idx2)
            #         print (idx)
            #         print (num)
            #         print ("_____________________")
            #         nbatch = int(np.ceil(num / self.BATCHSIZE_PER_GPU))
            #         label_input = np.zeros((nbatch*self.BATCHSIZE_PER_GPU, self.NUM_LABELS))
            #         label_input[:, n] = 1
            #         label_input[:, m] = 1
            #         output = []
            #         for i in range(nbatch):
            #             f = sess.run(fake,feed_dict={y: label_input[i* self.BATCHSIZE_PER_GPU:(i+1)* self.BATCHSIZE_PER_GPU]})
            #             output.extend(np.round(f))
            #         output = np.array(output)[:num]
                    # print ([m,n,output])

                    # np.save(self.out_dir + str(m) + str(n), output)


if __name__ == '__main__' :
    #
    # Command line entry point, the parameters are read from SYS_ARGS :
    #   column      name of the column to synthesize (required)
    #   id          name of the identifier column, defaults to person_id
    #   raw-data    path of the csv file (required)
    #   max_epochs  number of epochs, defaults to 10 (training only)
    #   train|learn train a model, generate produces synthetic values from a trained model
    #
    column      = SYS_ARGS['column']
    column_id   = SYS_ARGS['id'] if 'id' in SYS_ARGS else 'person_id'
    #
    # The file is read and the labels are encoded once, both modes share them
    df = pd.read_csv(SYS_ARGS['raw-data'])
    LABEL = pd.get_dummies(df[column_id]).astype(np.float32).values

    # the context is the file name without its last 4 characters (i.e. the .csv extension)
    context     = SYS_ARGS['raw-data'].split(os.sep)[-1:][0][:-4]
    if set(['train','learn']) & set(SYS_ARGS.keys()):
        #
        # -- the column is converted to a binary matrix and used as the real data to train upon
        #
        max_epochs = np.int32(SYS_ARGS['max_epochs']) if 'max_epochs' in SYS_ARGS else 10
        REAL    = pd.get_dummies(df[column]).astype(np.float32).values
        trainer = Train(context=context,max_epochs=max_epochs,real=REAL,label=LABEL,column=column,column_id=column_id)
        trainer.apply()
    elif 'generate' in SYS_ARGS:
        # sorted unique values, the generated indexes are translated against this list
        values = df[column].unique().tolist()
        values.sort()

        p = Predict(context=context,label=LABEL,values=values,column=column)
        p.load_meta(column)
        r = p.apply()
        print (df)
        print ()
        df[column] = r[column]
        print (df)
    else:
        print (SYS_ARGS.keys())
        print (__doc__)