2019-12-12 17:04:41 +00:00
"""
2020-01-10 19:12:58 +00:00
This code was originally written by Ziqi Zhang <ziqi.zhang@vanderbilt.edu> in order to generate synthetic data.
The code is an implementation of a Generative Adversarial Network that uses the Wasserstein Distance (WGAN).
It is intended to be used in 2 modes (embedded in code or using the CLI).
USAGE :
    The following parameters should be provided in a configuration file (JSON format)
    python data/maker --config <path-to-config-file.json>
CONFIGURATION FILE STRUCTURE :
2020-02-13 23:30:56 +00:00
    context     what it is you are loading (stroke, hypertension, ...)
    data        path of the file to be loaded
    logs        folder to store the trained model and meta data about learning
    max_epochs  number of iterations in learning
    num_gpu     number of GPUs to be used (will still run if the GPUs are not available)
2020-01-10 19:12:58 +00:00
EMBEDDED IN CODE :
2019-12-12 17:04:41 +00:00
"""
import tensorflow as tf
2022-01-13 21:05:00 +00:00
# from tensorflow.contrib.layers import l2_regularizer
from tensorflow . keras import layers
from tensorflow . keras . regularizers import L2 as l2_regularizer
2019-12-12 17:04:41 +00:00
import numpy as np
import pandas as pd
import time
import os
import sys
2020-01-01 05:27:53 +00:00
from data . params import SYS_ARGS
from data . bridge import Binary
2019-12-12 17:04:41 +00:00
import json
2020-01-04 03:47:05 +00:00
import pickle
2019-12-12 17:04:41 +00:00
# Make CUDA enumerate devices in PCI bus order so GPU indices are stable.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# Default to the first GPU; GNet.__init__ removes this variable again when
# no explicit 'gpu' argument is supplied.
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
# Silence TensorFlow's C++ INFO and WARNING log output.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
# Everything below is written against the TF1 graph/session API.
tf.compat.v1.disable_eager_execution()
2020-02-13 23:30:56 +00:00
# STEPS_PER_EPOCH = int(SYS_ARGS['epoch']) if 'epoch' in SYS_ARGS else 256
# NUM_GPUS = 1 if 'num_gpu' not in SYS_ARGS else int(SYS_ARGS['num_gpu'])
2019-12-12 17:04:41 +00:00
# BATCHSIZE_PER_GPU = 2000
2020-02-13 23:30:56 +00:00
# TOTAL_BATCHSIZE = BATCHSIZE_PER_GPU * NUM_GPUS
2019-12-12 17:04:41 +00:00
class void:
    """Empty attribute container, used as a lightweight namespace (e.g. ``self.get``, ``self.layers``)."""
2019-12-12 17:04:41 +00:00
class GNet:
    """
    Base class of the generative-network components; the details are implemented in the subclasses.

    An instance of this class is accessed as follows
        object.layers.normalize     applies batch normalization or otherwise
        object.get.variables        instantiates variables on cpu and returns a reference (tensor)
    """

    def log(self, **args):
        """Merge the keyword arguments into ``self.logs``; keys already in ``self.logs`` keep their value."""
        self.logs = dict(args, **self.logs)

    def __init__(self, **args):
        """
        Configure the network from keyword arguments.

        Recognised keys (all optional except ``context``):
            context     name of the dataset/model, used in folder and file names
            real        2-D array of real data (rows x features)
            label       array of labels (conditioning attributes)
            gpu         list of GPU chip indices to use
            partition   partition identifier, appended to the log folders
            batch_size  proposed batch size per GPU (default 2000)
            max_epochs  number of training epochs (default 10)
            column_id, column   stored in ``self.ATTRIBUTES``
            logs        root folder for training/output artifacts (default 'logs')
            logger      object exposing ``write(dict)``
        :raises KeyError: when ``context`` is missing
        """
        self.layers = void()
        self.layers.normalize = self.normalize
        self.logs = {}

        self.GPU_CHIPS = None if 'gpu' not in args else args['gpu']
        if self.GPU_CHIPS is None:
            # No GPU requested: keep a single logical tower and let TensorFlow
            # see every device (soft placement is enabled at session creation).
            self.GPU_CHIPS = [0]
            if 'CUDA_VISIBLE_DEVICES' in os.environ:
                os.environ.pop('CUDA_VISIBLE_DEVICES')
            self.NUM_GPUS = 0
        else:
            self.NUM_GPUS = len(self.GPU_CHIPS)

        self.PARTITION = args['partition'] if 'partition' in args else None

        has_real = 'real' in args
        self.X_SPACE_SIZE = args['real'].shape[1] if has_real else 854

        # An explicit label=None is treated the same as an absent label.
        label = args.get('label')
        if label is not None and len(label.shape) == 2:
            self.NUM_LABELS = label.shape[1]
        elif label is not None and len(label) == 1:
            # NOTE(review): this tests len(label), not len(label.shape); a 1-D label
            # with more than one row therefore ends up with NUM_LABELS = None -- confirm intent.
            self.NUM_LABELS = label.shape[0]
        else:
            self.NUM_LABELS = None

        self.Z_DIM = 128  # -- used as rows down stream
        self.G_STRUCTURE = [self.Z_DIM, self.Z_DIM]
        self.D_STRUCTURE = [self.X_SPACE_SIZE, 256, self.Z_DIM]

        PROPOSED_BATCH_PER_GPU = 2000 if 'batch_size' not in args else int(args['batch_size'])
        self.BATCHSIZE_PER_GPU = PROPOSED_BATCH_PER_GPU
        if has_real and args['real'].shape[0] < PROPOSED_BATCH_PER_GPU:
            # Never ask for a batch larger than the data itself.
            self.BATCHSIZE_PER_GPU = int(args['real'].shape[0] * 1)

        # NOTE(review): NUM_GPUS is 0 when no 'gpu' argument is given, which makes
        # TOTAL_BATCHSIZE 0 as well -- confirm whether any consumer relies on it.
        self.TOTAL_BATCHSIZE = self.BATCHSIZE_PER_GPU * self.NUM_GPUS
        self.STEPS_PER_EPOCH = 256
        self.MAX_EPOCHS = 10 if 'max_epochs' not in args else int(args['max_epochs'])
        self.ROW_COUNT = args['real'].shape[0] if has_real else 100
        self.CONTEXT = args['context']
        self.ATTRIBUTES = {
            "id": args['column_id'] if 'column_id' in args else None,
            "synthetic": args['column'] if 'column' in args else None,
        }
        self._REAL = args['real'] if has_real else None
        self._LABEL = args['label'] if 'label' in args else None

        self.get = void()
        self.get.variables = self._variable_on_cpu
        self.get.suffix = lambda: "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'], list) else self.ATTRIBUTES['synthetic']
        self.logger = args['logger'] if 'logger' in args and args['logger'] else None
        self.init_logs(**args)

    def _set_directories(self, **args):
        """Derive ``train_dir`` / ``out_dir`` from ``log_dir``, ``CONTEXT`` and the optional ``partition``."""
        self.train_dir = os.sep.join([self.log_dir, 'train', self.CONTEXT])
        self.out_dir = os.sep.join([self.log_dir, 'output', self.CONTEXT])
        if 'partition' in args:
            partition = str(args['partition'])
            self.train_dir = os.sep.join([self.train_dir, partition])
            self.out_dir = os.sep.join([self.out_dir, partition])

    def init_logs(self, **args):
        """
        Create the folder layout ``<logs>/{train,output}/<CONTEXT>[/<partition>]``.

        :logs       root folder (default 'logs')
        :partition  optional partition identifier
        """
        self.log_dir = args['logs'] if 'logs' in args else 'logs'
        self._set_directories(**args)
        # mkdir creates every missing intermediate folder as well.
        self.mkdir(self.train_dir)
        self.mkdir(self.out_dir)

    def load_meta(self, **args):
        """
        This function is designed to accommodate the uses of the sub-classes outside of a strict dependency model,
        because prediction and training can happen independently.

        Every key found in ``<out_dir>/meta-<CONTEXT>.json`` (when the file exists) is set as an
        attribute of this instance, then the train/output folders are recomputed.
        :partition  optional partition identifier
        """
        suffix = self.CONTEXT
        _name = os.sep.join([self.out_dir, 'meta-' + suffix + '.json'])
        if os.path.exists(_name):
            with open(_name) as f:
                attr = json.load(f)
            for key, value in attr.items():
                setattr(self, key, value)
        self._set_directories(**args)

    def log_meta(self, **args):
        """
        Persist the network configuration to ``<out_dir>/meta-<CONTEXT>.json`` and return it.

        :key, value     optional extra entry added to the stored object (both must be provided)
        :return         the dictionary that was written
        """
        _object = {
            'CONTEXT': self.CONTEXT,
            'ATTRIBUTES': self.ATTRIBUTES,
            'BATCHSIZE_PER_GPU': self.BATCHSIZE_PER_GPU,
            'Z_DIM': self.Z_DIM,
            "X_SPACE_SIZE": self.X_SPACE_SIZE,
            "D_STRUCTURE": self.D_STRUCTURE,
            "G_STRUCTURE": self.G_STRUCTURE,
            "NUM_GPUS": self.NUM_GPUS,
            "GPU_CHIPS": self.GPU_CHIPS,
            "NUM_LABELS": self.NUM_LABELS,
            "MAX_EPOCHS": self.MAX_EPOCHS,
            "ROW_COUNT": self.ROW_COUNT,
        }
        if args and 'key' in args and 'value' in args:
            # The extra entry goes into the dictionary being written
            # (the previous code indexed the builtin `object` and raised TypeError).
            _object[args['key']] = args['value']
        suffix = self.CONTEXT
        _name = os.sep.join([self.out_dir, 'meta-' + suffix])
        with open(_name + '.json', 'w') as f:
            f.write(json.dumps(_object))
        return _object

    def mkdir(self, path):
        """Create ``path`` and any missing parent folders; do nothing when ``path`` already exists."""
        if not os.path.exists(path):
            # makedirs copes with absolute paths, which a component-by-component
            # walk over path.split(os.sep) does not (its first component is '').
            os.makedirs(path, exist_ok=True)

    def normalize(self, **args):
        """
        This function will perform a batch normalization on a network layer
            inputs      input layer of the neural network
            name        name suffix of the offset/scale variables
            labels      labels (attributes not synthesized), by default None
            n_labels    number of labels, default None
        Any other keyword (callers pass ``shift``) is accepted and ignored.
        """
        inputs = args['inputs']
        name = args['name']
        labels = None if 'labels' not in args else args['labels']
        n_labels = None if 'n_labels' not in args else args['n_labels']
        # Moments are taken over axis 0 for Generator instances and over axis 1 otherwise.
        shift = [0] if self.__class__.__name__.lower() == 'generator' else [1]
        mean, var = tf.nn.moments(inputs, shift, keepdims=True)
        shape = inputs.shape[1]
        if labels is not None:
            # NOTE(review): the offset table has a single row while the scale table has
            # n_labels rows; a label index > 0 looks out of range for the offset lookup.
            # Confirm intent before changing -- it would alter checkpoint shapes.
            offset_m = self.get.variables(shape=[1, shape], name='offset' + name, initializer=tf.zeros_initializer)
            scale_m = self.get.variables(shape=[n_labels, shape], name='scale' + name, initializer=tf.ones_initializer)
            offset = tf.nn.embedding_lookup(offset_m, labels)
            scale = tf.nn.embedding_lookup(scale_m, labels)
        else:
            offset = None
            scale = None
        result = tf.nn.batch_normalization(inputs, mean, var, offset, scale, 1e-8)
        return result

    def _variable_on_cpu(self, **args):
        """
        This function makes sure variables/tensors are not created on the GPU but rather on the CPU
            name            variable name
            shape           variable shape
            initializer     optional initializer (default None)
        """
        name = args['name']
        shape = args['shape']
        initializer = None if 'initializer' not in args else args['initializer']
        with tf.device('/cpu:0'):
            cpu_var = tf.compat.v1.get_variable(name, shape, initializer=initializer)
        return cpu_var

    def average_gradients(self, tower_grads):
        """
        Average the gradients computed by each tower.

        :tower_grads    one list of (gradient, variable) pairs per tower, all in the same variable order
        :return         list of (mean gradient, variable) pairs; the variable is taken from the first tower
        """
        average_grads = []
        for grad_and_vars in zip(*tower_grads):
            grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars]
            grad = tf.reduce_mean(tf.concat(axis=0, values=grads), 0)
            average_grads.append((grad, grad_and_vars[0][1]))
        return average_grads
2019-12-12 17:04:41 +00:00
class Generator(GNet):
    """
    This class is designed to handle generation of candidate datasets; for this it aggregates a
    discriminator, which allows the generator not to be random.
    """

    def __init__(self, **args):
        """Accepts the same keyword arguments as GNet and forwards them to the embedded Discriminator."""
        GNet.__init__(self, **args)
        self.discriminator = Discriminator(**args)

    def loss(self, **args):
        """
        Compute the generator loss and register it in the 'glosses' collection.
            :fake   generated samples
            :label  labels used to condition the discriminator (may be None)
        :return     the loss, twice (callers unpack two values)
        """
        fake = args['fake']
        label = args['label']
        y_hat_fake = self.discriminator.network(inputs=fake, label=label)
        all_regs = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES)
        loss = -tf.reduce_mean(y_hat_fake) + sum(all_regs)
        tf.compat.v1.add_to_collection('glosses', loss)
        return loss, loss

    def load_meta(self, **args):
        """Load the stored meta data for this instance and for the embedded discriminator."""
        super().load_meta(**args)
        self.discriminator.load_meta(**args)

    def network(self, **args):
        """
        This function will build the network that will generate the synthetic candidates
            :inputs     matrix of noise fed to the first layer
            :dim        number of columns of ``inputs`` (defaults to Z_DIM)
            :label      labels used for the conditional normalization (may be None)
        :return         tensor with X_SPACE_SIZE columns, values squashed by a sigmoid
        """
        x = args['inputs']
        tmp_dim = self.Z_DIM if 'dim' not in args else args['dim']
        label = args['label']

        with tf.compat.v1.variable_scope('G', reuse=tf.compat.v1.AUTO_REUSE, regularizer=l2_regularizer(0.00001)):
            # Hidden layers: matmul -> normalize -> relu, added back onto the input.
            for i, dim in enumerate(self.G_STRUCTURE[:-1]):
                kernel = self.get.variables(name='W_' + str(i), shape=[tmp_dim, dim])
                h1 = self.normalize(inputs=tf.matmul(x, kernel), shift=0, name='cbn' + str(i), labels=label, n_labels=self.NUM_LABELS)
                h2 = tf.nn.relu(h1)
                x = x + h2
                tmp_dim = dim

            # Last hidden layer: same shape of computation, with tanh instead of relu.
            i = len(self.G_STRUCTURE) - 1
            kernel = self.get.variables(name='W_' + str(i), shape=[tmp_dim, self.G_STRUCTURE[-1]])
            h1 = self.normalize(inputs=tf.matmul(x, kernel), name='cbn' + str(i),
                                labels=label, n_labels=self.NUM_LABELS)
            h2 = tf.nn.tanh(h1)
            x = x + h2

            # Output layer: project onto the data space.
            kernel = self.get.variables(name='W_' + str(i + 1), shape=[self.Z_DIM, self.X_SPACE_SIZE])
            bias = self.get.variables(name='b_' + str(i + 1), shape=[self.X_SPACE_SIZE])
            x = tf.nn.sigmoid(tf.add(tf.matmul(x, kernel), bias))
        return x
2019-12-12 17:04:41 +00:00
class Discriminator(GNet):
    """Critic of the WGAN: scores samples with a single output neuron."""

    def __init__(self, **args):
        """Accepts the same keyword arguments as GNet."""
        GNet.__init__(self, **args)

    def network(self, **args):
        """
        This function will apply a computational graph on a dataset passed in with the associated
        labels; the last layer has a single output (neuron)
            :inputs     samples to score
            :label      labels used for the conditional normalization (may be None)
        :return         tensor with one column
        """
        x = args['inputs']
        label = args['label']
        with tf.compat.v1.variable_scope('D', reuse=tf.compat.v1.AUTO_REUSE, regularizer=l2_regularizer(0.00001)):
            for i, dim in enumerate(self.D_STRUCTURE[1:]):
                kernel = self.get.variables(name='W_' + str(i), shape=[self.D_STRUCTURE[i], dim])
                bias = self.get.variables(name='b_' + str(i), shape=[dim])
                x = tf.nn.relu(tf.add(tf.matmul(x, kernel), bias))
                x = self.normalize(inputs=x, name='cln' + str(i), shift=1, labels=label, n_labels=self.NUM_LABELS)
            # The output layer is indexed with len(D_STRUCTURE); the variable names
            # are part of saved checkpoints, so the index is kept as is.
            i = len(self.D_STRUCTURE)
            kernel = self.get.variables(name='W_' + str(i), shape=[self.D_STRUCTURE[-1], 1])
            bias = self.get.variables(name='b_' + str(i), shape=[1])
            y = tf.add(tf.matmul(x, kernel), bias)
        return y

    def loss(self, **args):
        """
        This function computes the critic loss (Wasserstein distance plus gradient penalty plus
        regularization) and registers it in the 'dlosses' collection
            :real   batch of real samples
            :fake   batch of generated samples
            :label  labels associated with the batch (may be None)
        :return     (w_distance, loss)
        """
        real = args['real']
        fake = args['fake']
        label = args['label']
        # Random points on the segments joining real and fake samples.
        epsilon = tf.random.uniform(shape=[self.BATCHSIZE_PER_GPU, 1], minval=0, maxval=1)
        x_hat = real + epsilon * (fake - real)
        y_hat_fake = self.network(inputs=fake, label=label)
        y_hat_real = self.network(inputs=real, label=label)
        y_hat = self.network(inputs=x_hat, label=label)
        # Gradient penalty: push the gradient norm at the interpolated points towards 1.
        grad = tf.gradients(y_hat, [x_hat])[0]
        slopes = tf.sqrt(tf.reduce_sum(tf.square(grad), 1))
        gradient_penalty = tf.reduce_mean((slopes - 1.) ** 2)
        all_regs = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES)
        w_distance = -tf.reduce_mean(y_hat_real) + tf.reduce_mean(y_hat_fake)
        loss = w_distance + 10 * gradient_penalty + sum(all_regs)
        tf.compat.v1.add_to_collection('dlosses', loss)
        return w_distance, loss
2019-12-12 17:04:41 +00:00
class Train(GNet):
    """Builds the multi-tower training graph and runs the WGAN training loop."""

    def __init__(self, **args):
        """
        Accepts the GNet keyword arguments; ``real`` is mandatory here.
        The meta data is written to disk and, when a logger is set, a "start" record is emitted.
        """
        GNet.__init__(self, **args)
        self.generator = Generator(**args)
        self.discriminator = Discriminator(**args)
        self._REAL = args['real']
        self._LABEL = args['label'] if 'label' in args else None
        self.meta = self.log_meta()
        if self.logger:
            self.logger.write({"module": "gan-train", "action": "start", "input": {"partition": self.PARTITION, "meta": self.meta}})

    def load_meta(self, column=None, **args):
        """
        This function will delegate the calls to load meta data to its dependents
            column  kept for backward compatibility; unused (meta files are named after CONTEXT)
            **args  forwarded as is (e.g. partition)
        """
        # The parent implementations only accept keyword arguments, so `column`
        # must not be forwarded positionally.
        super().load_meta(**args)
        self.generator.load_meta(**args)
        self.discriminator.load_meta(**args)

    def loss(self, **args):
        """
        This function will compute a "tower" loss of the generated candidate against real data
        Training will consist in having both generator and discriminators
            :scope  name scope of the tower, used to filter the loss collection
            :stage  'D' for the discriminator, anything else for the generator
            :real   batch of real samples
            :label  batch of labels (may be None)
        :return     (total_loss, w)
        """
        scope = args['scope']
        stage = args['stage']
        real = args['real']
        label = args['label']
        if label is not None:
            label = tf.cast(label, tf.int32)
            #
            # @TODO: Ziqi needs to explain what's going on here
            # Column 1 and the weighted columns 2.. are folded into a single integer index per row.
            m = [[i] for i in range(self._LABEL.shape[1] - 2)]
            label = label[:, 1] * len(m) + tf.squeeze(
                tf.matmul(label[:, 2:], tf.constant(m, dtype=tf.int32))
            )
        z = tf.random.normal(shape=[self.BATCHSIZE_PER_GPU, self.Z_DIM])
        fake = self.generator.network(inputs=z, label=label)
        if stage == 'D':
            w, loss = self.discriminator.loss(real=real, fake=fake, label=label)
            flag = 'dlosses'
        else:
            w, loss = self.generator.loss(fake=fake, label=label)
            flag = 'glosses'
        losses = tf.compat.v1.get_collection(flag, scope)
        total_loss = tf.add_n(losses, name='total_loss')
        return total_loss, w

    def input_fn(self):
        """
        Build the input pipeline.

        :return     (initializable iterator, features placeholder, labels placeholder); the labels
                    placeholder is only wired into the dataset when labels were provided
        """
        features_placeholder = tf.compat.v1.placeholder(shape=self._REAL.shape, dtype=tf.float32)
        LABEL_SHAPE = [None, None] if self._LABEL is None else self._LABEL.shape
        labels_placeholder = tf.compat.v1.placeholder(shape=LABEL_SHAPE, dtype=tf.float32)
        if self._LABEL is not None:
            dataset = tf.data.Dataset.from_tensor_slices((features_placeholder, labels_placeholder))
        else:
            dataset = tf.data.Dataset.from_tensor_slices(features_placeholder)
        dataset = dataset.repeat(10000)
        dataset = dataset.batch(batch_size=self.BATCHSIZE_PER_GPU)
        dataset = dataset.prefetch(1)
        iterator = tf.compat.v1.data.make_initializable_iterator(dataset)
        return iterator, features_placeholder, labels_placeholder

    def network(self, **args):
        """
        Build one tower per entry of GPU_CHIPS and the op applying the averaged gradients.
            :stage  'D' or 'G'; also the variable scope whose trainable variables are updated
            :opt    optimizer used to compute and apply the gradients
        :return     (train_op, mean_w, iterator, features_placeholder, labels_placeholder)
        """
        stage = args['stage']
        opt = args['opt']
        tower_grads = []
        per_gpu_w = []
        iterator, features_placeholder, labels_placeholder = self.input_fn()
        with tf.compat.v1.variable_scope(tf.compat.v1.get_variable_scope()):
            #
            # @TODO: Find a way to handle this across multiple CPU in case the GPU are not available
            #   - abstract hardware specification
            #   - determine if the GPU/CPU are busy
            #
            for i in self.GPU_CHIPS:
                with tf.device('/gpu:%d' % i):
                    with tf.name_scope('%s_%d' % ('TOWER', i)) as scope:
                        if self._LABEL is not None:
                            (real, label) = iterator.get_next()
                        else:
                            real = iterator.get_next()
                            label = None
                        loss, w = self.loss(scope=scope, stage=stage, real=real, label=label)
                        # Subsequent towers share the variables created by the first one.
                        tf.compat.v1.get_variable_scope().reuse_variables()
                        vars_ = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, scope=stage)
                        grads = opt.compute_gradients(loss, vars_)
                        tower_grads.append(grads)
                        per_gpu_w.append(w)
        grads = self.average_gradients(tower_grads)
        apply_gradient_op = opt.apply_gradients(grads)
        mean_w = tf.reduce_mean(per_gpu_w)
        train_op = apply_gradient_op
        return train_op, mean_w, iterator, features_placeholder, labels_placeholder

    def apply(self, **args):
        """
        Run the training loop for MAX_EPOCHS epochs of STEPS_PER_EPOCH steps (two discriminator
        updates per generator update), saving a checkpoint at epochs 5, 10, 20, 50, 75 and MAX_EPOCHS.
        The default graph is reset once training is over.
        """
        REAL = self._REAL
        LABEL = self._LABEL
        with tf.device('/cpu:0'):
            opt_d = tf.compat.v1.train.AdamOptimizer(1e-4)
            opt_g = tf.compat.v1.train.AdamOptimizer(1e-4)
            train_d, w_distance, iterator_d, features_placeholder_d, labels_placeholder_d = self.network(stage='D', opt=opt_d)
            train_g, _, iterator_g, features_placeholder_g, labels_placeholder_g = self.network(stage='G', opt=opt_g)
            saver = tf.compat.v1.train.Saver()
            init = tf.compat.v1.global_variables_initializer()

        # The datasets are built from the label placeholders whenever labels exist,
        # so those placeholders must be fed when the iterators are initialised.
        feed_d = {features_placeholder_d: REAL}
        feed_g = {features_placeholder_g: REAL}
        if LABEL is not None:
            feed_d[labels_placeholder_d] = LABEL
            feed_g[labels_placeholder_g] = LABEL

        logs = []
        with tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess:
            sess.run(init)
            sess.run(iterator_d.initializer, feed_dict=feed_d)
            sess.run(iterator_g.initializer, feed_dict=feed_g)

            for epoch in range(1, self.MAX_EPOCHS + 1):
                start_time = time.time()
                w_sum = 0
                for i in range(self.STEPS_PER_EPOCH):
                    for _ in range(2):
                        _, w = sess.run([train_d, w_distance])
                        w_sum += w
                    sess.run(train_g)
                duration = time.time() - start_time

                assert not np.isnan(w_sum), 'Model diverged with loss = NaN'

                format_str = 'epoch: %d, w_distance = %f (%.1f)'
                print(format_str % (epoch, -w_sum / (self.STEPS_PER_EPOCH * 2), duration))
                logs.append({"epoch": epoch, "distance": -w_sum / (self.STEPS_PER_EPOCH * 2)})

                if epoch in [5, 10, 20, 50, 75, self.MAX_EPOCHS]:
                    suffix = self.CONTEXT
                    _name = os.sep.join([self.train_dir, suffix])
                    saver.save(sess, _name, write_meta_graph=False, global_step=epoch)
                    if self.logger:
                        row = {"module": "gan-train", "action": "logs", "input": {"partition": self.PARTITION, "logs": logs}}
                        self.logger.write(row)
            #
            # @TODO:
            #   We should upload the files in the checkpoint
            #   This would allow the learnt model to be portable to another system
            #
        tf.compat.v1.reset_default_graph()
2019-12-12 17:04:41 +00:00
2020-02-13 23:30:56 +00:00
class Predict ( GNet ) :
2019-12-12 17:04:41 +00:00
"""
2020-02-13 23:30:56 +00:00
This class uses synthetic data given a learned model
2019-12-12 17:04:41 +00:00
"""
2020-02-13 23:30:56 +00:00
def __init__(self, **args):
    """
    Accepts the GNet keyword arguments plus
        values      stored as ``self.values``
        row_count   number of rows to generate
        missing     optional list of values considered missing (default [])
    :raises KeyError: when ``values`` or ``row_count`` is not provided
    """
    GNet.__init__(self, **args)
    self.generator = Generator(**args)
    self.values = args['values']
    self.ROW_COUNT = args['row_count']
    # Remembered so that load_meta can restore it after the stored meta data overwrote ROW_COUNT.
    self.oROW_COUNT = self.ROW_COUNT
    self.MISSING_VALUES = args['missing'] if 'missing' in args else []
2020-04-02 05:04:05 +00:00
2020-03-25 22:43:23 +00:00
# self.MISSING_VALUES = args['no_value']
# self.MISSING_VALUES = int(args['no_value']) if args['no_value'].isnumeric() else np.na if args['no_value'] in ['na','NA','N/A'] else args['no_value']
2021-04-07 20:30:59 +00:00
def load_meta(self, **args):
    """
    Load the stored meta data for this instance and for the generator, then restore the
    row count requested at construction (the stored ROW_COUNT describes the training data).
    """
    super().load_meta(**args)
    self.generator.load_meta(**args)
    self.ROW_COUNT = self.oROW_COUNT
2020-02-13 23:30:56 +00:00
def apply(self, **args):
    """
    Restore the checkpoint saved at MAX_EPOCHS and generate synthetic rows.
        :candidates     number of times the generator is sampled (default 1)
    :return             a one-element list holding the first sampled matrix, rounded to integers
    """
    suffix = self.CONTEXT
    model_dir = os.sep.join([self.train_dir, suffix + '-' + str(self.MAX_EPOCHS)])
    labels = self._LABEL
    #
    # setup computational graph
    tf.compat.v1.reset_default_graph()
    z = tf.random.normal(shape=[self.ROW_COUNT, self.Z_DIM])
    y = tf.compat.v1.placeholder(shape=[self.ROW_COUNT, self.NUM_LABELS], dtype=tf.int32)
    if labels is not None:
        ma = [[i] for i in np.arange(self.NUM_LABELS - 2)]
        label = y[:, 1] * len(ma) + tf.squeeze(tf.matmul(y[:, 2:], tf.constant(ma, dtype=tf.int32)))
    else:
        label = None

    fake = self.generator.network(inputs=z, label=label)
    saver = tf.compat.v1.train.Saver()
    CANDIDATE_COUNT = args['candidates'] if 'candidates' in args else 1
    candidates = []

    with tf.compat.v1.Session() as sess:
        saver.restore(sess, model_dir)
        # `labels` is an array: test it against None, its truth value is ambiguous.
        feed = None if labels is None else {y: labels}
        for _ in np.arange(CANDIDATE_COUNT):
            _matrix = sess.run(fake, feed_dict=feed)
            candidates.append(np.array([np.round(row).astype(int) for row in _matrix]))
    return [candidates[0]]
2021-03-29 16:10:57 +00:00
def _apply(self, **args):
    """
    Restore the trained generator from disk and return a synthetic sample.

    The default graph is reset and rebuilt, the checkpoint found under
    <train_dir>/<CONTEXT>-<MAX_EPOCHS> is restored, and the generator output
    is evaluated on gaussian noise of shape [ROW_COUNT, Z_DIM]. When
    self._LABEL is set it is fed to the label placeholder so that the
    generator is conditioned on it.

    :param args: accepted for interface compatibility, not read by this method
    :return: the value of the generator output for the last evaluation
             (presumably one row per ROW_COUNT entry -- TODO confirm against
             generator.network)
    """
    # Number of times the generator output is evaluated.
    # NOTE(review): only the last evaluation is returned, the earlier ones are
    # discarded. The count is kept as-is to preserve the existing behavior;
    # confirm whether a single evaluation would be sufficient.
    CANDIDATE_COUNT = 5

    suffix = self.CONTEXT
    model_dir = os.sep.join([self.train_dir, suffix + '-' + str(self.MAX_EPOCHS)])
    labels = self._LABEL

    #
    # setup computational graph
    tf.compat.v1.reset_default_graph()
    z = tf.random.normal(shape=[self.ROW_COUNT, self.Z_DIM])
    y = tf.compat.v1.placeholder(shape=[self.ROW_COUNT, self.NUM_LABELS], dtype=tf.int32)
    if labels is not None:
        # Collapse the label columns into a single integer per row :
        # column 1 is scaled by the number of remaining columns and added to
        # the index-weighted sum of columns 2 and beyond.
        ma = [[i] for i in np.arange(self.NUM_LABELS - 2)]
        label = y[:, 1] * len(ma) + tf.squeeze(tf.matmul(y[:, 2:], tf.constant(ma, dtype=tf.int32)))
    else:
        label = None

    fake = self.generator.network(inputs=z, label=label)
    saver = tf.compat.v1.train.Saver()

    #
    # The placeholder must be fed whenever labels were provided. The test is
    # done against None explicitly because the truth value of a multi-element
    # numpy array is ambiguous and raises a ValueError.
    feed = {y: labels} if labels is not None else None

    _matrix = None
    with tf.compat.v1.Session() as sess:
        # Variables come from the checkpoint, hence no initializer is run
        saver.restore(sess, model_dir)
        for _ in range(CANDIDATE_COUNT):
            _matrix = sess.run(fake, feed_dict=feed)

    # Leave a clean default graph behind for whatever runs next
    tf.compat.v1.reset_default_graph()
    return _matrix
2020-02-13 23:30:56 +00:00