"""
This code was originally written by Ziqi Zhang <ziqi.zhang@vanderbilt.edu> in order to generate synthetic data.
The code is an implementation of a Generative Adversarial Network that uses the Wasserstein Distance (WGAN).
It is intended to be used in 2 modes (embedded in code or using CLI)

USAGE :

The following parameters should be provided in a configuration file (JSON format) :
    python data/maker --config <path-to-config-file.json>

CONFIGURATION FILE STRUCTURE :

    context     what it is you are loading (stroke, hypertension, ...)
    data        path of the file to be loaded
    logs        folder to store training model and meta data about learning
    max_epochs  number of iterations in learning
    num_gpu     number of gpus to be used (will still run if the GPUs are not available)
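
A sample configuration, with illustrative values only (the keys are the ones documented above) :

    {
        "context"    : "stroke",
        "data"       : "data/stroke.csv",
        "logs"       : "logs",
        "max_epochs" : 10,
        "num_gpu"    : 1
    }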

EMBEDDED IN CODE :
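
A minimal sketch of embedded use; the argument names mirror the Train constructor below, while the file and column names are made-up examples :

    import pandas as pd
    import numpy as np

    df = pd.read_csv('stroke.csv')
    REAL = pd.get_dummies(df['icd9']).astype(np.float32).values
    LABEL = pd.get_dummies(df['person_id']).astype(np.float32).values
    trainer = Train(context='stroke', max_epochs=10, real=REAL, label=LABEL,
                    column='icd9', column_id='person_id')
    trainer.apply()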
"""
import tensorflow as tf
from tensorflow.contrib.layers import l2_regularizer
import numpy as np
import pandas as pd
import time
import os
import sys

from data.params import SYS_ARGS
from data.bridge import Binary

import json
import pickle

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
# STEPS_PER_EPOCH = int(SYS_ARGS['epoch']) if 'epoch' in SYS_ARGS else 256
# NUM_GPUS = 1 if 'num_gpu' not in SYS_ARGS else int(SYS_ARGS['num_gpu'])
# BATCHSIZE_PER_GPU = 2000
# TOTAL_BATCHSIZE = BATCHSIZE_PER_GPU * NUM_GPUS

class void:
    pass

class GNet:
    """
    This is the base class of the generative network functions, the details will be implemented in the subclasses.
    An instance of this class is accessed as follows :
        object.layers.normalize    applies batch normalization or otherwise
        object.get.variables       instantiates variables on cpu and returns a reference (tensor)
    """
    def log(self, **args):
        self.logs = dict(args, **self.logs)

    def __init__(self, **args):
        self.layers = void()
        self.layers.normalize = self.normalize
        self.logs = {}

        self.NUM_GPUS = 1 if 'num_gpu' not in args else args['num_gpu']
        self.PARTITION = args['partition'] if 'partition' in args else None
        # if self.NUM_GPUS > 1 :
        #    os.environ['CUDA_VISIBLE_DEVICES'] = "4"

        self.X_SPACE_SIZE = args['real'].shape[1] if 'real' in args else 854
        self.G_STRUCTURE = [128, 128]  # [self.X_SPACE_SIZE, self.X_SPACE_SIZE]
        self.D_STRUCTURE = [self.X_SPACE_SIZE, 256, 128]  # [self.X_SPACE_SIZE, self.X_SPACE_SIZE*2, self.X_SPACE_SIZE] #-- change 854 to number of diagnosis
        # self.NUM_LABELS = 8 if 'label' not in args elif len(args['label'].shape) args['label'].shape[1]
        if 'label' in args and len(args['label'].shape) == 2:
            self.NUM_LABELS = args['label'].shape[1]
        elif 'label' in args and len(args['label'].shape) == 1:
            self.NUM_LABELS = args['label'].shape[0]
        else:
            self.NUM_LABELS = None
        # self.Z_DIM = 128 #self.X_SPACE_SIZE
        self.Z_DIM = 128  # -- used as rows downstream
        self.G_STRUCTURE = [self.Z_DIM, self.Z_DIM]
        PROPOSED_BATCH_PER_GPU = 2000 if 'batch_size' not in args else int(args['batch_size'])
        self.BATCHSIZE_PER_GPU = PROPOSED_BATCH_PER_GPU
        if 'real' in args:
            self.D_STRUCTURE = [args['real'].shape[1], 256, self.Z_DIM]
            if args['real'].shape[0] < PROPOSED_BATCH_PER_GPU:
                self.BATCHSIZE_PER_GPU = int(args['real'].shape[0])
        # self.BATCHSIZE_PER_GPU = 2000 if 'batch_size' not in args else int(args['batch_size'])
        self.TOTAL_BATCHSIZE = self.BATCHSIZE_PER_GPU * self.NUM_GPUS
        self.STEPS_PER_EPOCH = 256  # int(np.load('ICD9/train.npy').shape[0] / 2000)
        self.MAX_EPOCHS = 10 if 'max_epochs' not in args else int(args['max_epochs'])
        self.ROW_COUNT = args['real'].shape[0] if 'real' in args else 100
        self.CONTEXT = args['context']
        self.ATTRIBUTES = {"id": args['column_id'] if 'column_id' in args else None, "synthetic": args['column'] if 'column' in args else None}
        self._REAL = args['real'] if 'real' in args else None
        self._LABEL = args['label'] if 'label' in args else None

        self.get = void()
        self.get.variables = self._variable_on_cpu
        self.get.suffix = lambda: "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'], list) else self.ATTRIBUTES['synthetic']
        self.logger = args['logger'] if 'logger' in args and args['logger'] else None
        self.init_logs(**args)
    def init_logs(self, **args):
        self.log_dir = args['logs'] if 'logs' in args else 'logs'
        self.mkdir(self.log_dir)
        #
        #
        for key in ['train', 'output']:
            self.mkdir(os.sep.join([self.log_dir, key]))
            self.mkdir(os.sep.join([self.log_dir, key, self.CONTEXT]))

        self.train_dir = os.sep.join([self.log_dir, 'train', self.CONTEXT])
        self.out_dir = os.sep.join([self.log_dir, 'output', self.CONTEXT])
        if self.logger:
            #
            # We will clear the logs from the data-store
            #
            column = self.ATTRIBUTES['synthetic']
            db = self.logger.db
            if db[column].count() > 0:
                db.backup.insert({'name': column, 'logs': list(db[column].find())})
                db[column].drop()
    def load_meta(self, column):
        """
        This function is designed to accommodate the uses of the sub-classes outside of a strict dependency model.
        Because prediction and training can happen independently
        """
        # suffix = "-".join(column) if isinstance(column,list) else column
        suffix = self.get.suffix()
        _name = os.sep.join([self.out_dir, 'meta-' + suffix + '.json'])
        if os.path.exists(_name):
            attr = json.loads((open(_name)).read())
            for key in attr:
                value = attr[key]
                setattr(self, key, value)
            self.train_dir = os.sep.join([self.log_dir, 'train', self.CONTEXT])
            self.out_dir = os.sep.join([self.log_dir, 'output', self.CONTEXT])
    def log_meta(self, **args):
        _object = {
            # '_id':'meta',
            'CONTEXT': self.CONTEXT,
            'ATTRIBUTES': self.ATTRIBUTES,
            'BATCHSIZE_PER_GPU': self.BATCHSIZE_PER_GPU,
            'Z_DIM': self.Z_DIM,
            "X_SPACE_SIZE": self.X_SPACE_SIZE,
            "D_STRUCTURE": self.D_STRUCTURE,
            "G_STRUCTURE": self.G_STRUCTURE,
            "NUM_GPUS": self.NUM_GPUS,
            "NUM_LABELS": self.NUM_LABELS,
            "MAX_EPOCHS": self.MAX_EPOCHS,
            "ROW_COUNT": self.ROW_COUNT
        }
        if args and 'key' in args and 'value' in args:
            key = args['key']
            value = args['value']
            _object[key] = value
        # suffix = "-".join(self.column) if isinstance(self.column,list) else self.column
        suffix = self.get.suffix()
        _name = os.sep.join([self.out_dir, 'meta-' + suffix])
        f = open(_name + '.json', 'w')
        f.write(json.dumps(_object))
        f.close()
        return _object
    def mkdir(self, path):
        if not os.path.exists(path):
            #
            # create every missing intermediate folder along the path
            root = []
            for loc in path.split(os.sep):
                root.append(loc)
                if not loc:
                    continue  # -- leading os.sep of an absolute path
                if not os.path.exists(os.sep.join(root)):
                    os.mkdir(os.sep.join(root))
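    #
    # Note: the normalization below is a label-conditioned variant of batch
    # normalization; when labels are provided, a per-label scale (and an offset)
    # is looked up with tf.nn.embedding_lookup so that each label class gets its
    # own normalization parameters. This reading is inferred from the code.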
    def normalize(self, **args):
        """
        This function will perform a batch normalization on a network layer
            inputs      input layer of the neural network
            name        name of the scope
            labels      labels (attributes not synthesized), None by default
            n_labels    number of labels, None by default
        """
        inputs = args['inputs']
        name = args['name']
        labels = None if 'labels' not in args else args['labels']
        n_labels = None if 'n_labels' not in args else args['n_labels']
        shift = [0] if self.__class__.__name__.lower() == 'generator' else [1]  # -- moments over the batch axis for the generator, over the feature axis otherwise
        mean, var = tf.nn.moments(inputs, shift, keep_dims=True)
        shape = inputs.shape[1].value
        if labels is not None:
            offset_m = self.get.variables(shape=[1, shape], name='offset' + name,
                                          initializer=tf.zeros_initializer)
            scale_m = self.get.variables(shape=[n_labels, shape], name='scale' + name,
                                         initializer=tf.ones_initializer)
            offset = tf.nn.embedding_lookup(offset_m, labels)
            scale = tf.nn.embedding_lookup(scale_m, labels)
        else:
            offset = None
            scale = None
        result = tf.nn.batch_normalization(inputs, mean, var, offset, scale, 1e-8)
        return result
    def _variable_on_cpu(self, **args):
        """
        This function makes sure variables/tensors are not created on the GPU but rather on the CPU
        """
        name = args['name']
        shape = args['shape']
        initializer = None if 'initializer' not in args else args['initializer']
        with tf.device('/cpu:0'):
            cpu_var = tf.compat.v1.get_variable(name, shape, initializer=initializer)
        return cpu_var
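
    #
    # Note: average_gradients follows the classic TensorFlow multi-GPU "tower"
    # pattern; every GPU produces (gradient, variable) pairs for the same set of
    # variables, and the per-variable gradients are averaged across towers before
    # a single apply_gradients call.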
    def average_gradients(self, tower_grads):
        average_grads = []
        for grad_and_vars in zip(*tower_grads):
            grads = []
            for g, _ in grad_and_vars:
                expanded_g = tf.expand_dims(g, 0)
                grads.append(expanded_g)
            grad = tf.concat(axis=0, values=grads)
            grad = tf.reduce_mean(grad, 0)
            v = grad_and_vars[0][1]
            grad_and_var = (grad, v)
            average_grads.append(grad_and_var)
        return average_grads

class Generator(GNet):
    """
    This class is designed to handle the generation of candidate datasets; to do so it aggregates a discriminator, which allows the generator to be guided rather than purely random.
    """
    def __init__(self, **args):
        GNet.__init__(self, **args)
        self.discriminator = Discriminator(**args)
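    #
    # Note: the objective below is the WGAN generator loss, -E[D(G(z))]; the
    # generator is trained to maximize the critic's score on fake samples (plus
    # the collected L2 regularization terms).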
    def loss(self, **args):
        fake = args['fake']
        label = args['label']
        y_hat_fake = self.discriminator.network(inputs=fake, label=label)
        # all_regs = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        all_regs = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES)
        loss = -tf.reduce_mean(y_hat_fake) + sum(all_regs)
        # tf.add_to_collection('glosses', loss)
        tf.compat.v1.add_to_collection('glosses', loss)
        return loss, loss
    def load_meta(self, column):
        super().load_meta(column)
        self.discriminator.load_meta(column)
    def network(self, **args):
        """
        This function will build the network that will generate the synthetic candidates
            :inputs     matrix of data that we need
            :dim        dimensions of ...
        """
        x = args['inputs']
        tmp_dim = self.Z_DIM if 'dim' not in args else args['dim']
        label = args['label']

        with tf.compat.v1.variable_scope('G', reuse=tf.compat.v1.AUTO_REUSE, regularizer=l2_regularizer(0.00001)):
            for i, dim in enumerate(self.G_STRUCTURE[:-1]):
                kernel = self.get.variables(name='W_' + str(i), shape=[tmp_dim, dim])
                h1 = self.normalize(inputs=tf.matmul(x, kernel), shift=0, name='cbn' + str(i), labels=label, n_labels=self.NUM_LABELS)
                h2 = tf.nn.relu(h1)
                x = x + h2
                tmp_dim = dim
            i = len(self.G_STRUCTURE) - 1
            #
            # This seems to be an extra hidden layer:
            # Its goal is to map continuous values to discrete values (pre-trained to do this)
            kernel = self.get.variables(name='W_' + str(i), shape=[tmp_dim, self.G_STRUCTURE[-1]])
            h1 = self.normalize(inputs=tf.matmul(x, kernel), name='cbn' + str(i),
                                labels=label, n_labels=self.NUM_LABELS)
            h2 = tf.nn.tanh(h1)
            x = x + h2
            # This seems to be the output layer
            #
            kernel = self.get.variables(name='W_' + str(i + 1), shape=[self.Z_DIM, self.X_SPACE_SIZE])
            bias = self.get.variables(name='b_' + str(i + 1), shape=[self.X_SPACE_SIZE])
            x = tf.nn.sigmoid(tf.add(tf.matmul(x, kernel), bias))
        return x

class Discriminator(GNet):
    def __init__(self, **args):
        GNet.__init__(self, **args)
    def network(self, **args):
        """
        This function will apply a computational graph on a dataset passed in with the associated labels; the last layer must have a single output (neuron)
            :inputs
            :label
        """
        x = args['inputs']
        label = args['label']
        with tf.compat.v1.variable_scope('D', reuse=tf.compat.v1.AUTO_REUSE, regularizer=l2_regularizer(0.00001)):
            for i, dim in enumerate(self.D_STRUCTURE[1:]):
                kernel = self.get.variables(name='W_' + str(i), shape=[self.D_STRUCTURE[i], dim])
                bias = self.get.variables(name='b_' + str(i), shape=[dim])
                # print (["\t",bias,kernel])
                x = tf.nn.relu(tf.add(tf.matmul(x, kernel), bias))
                x = self.normalize(inputs=x, name='cln' + str(i), shift=1, labels=label, n_labels=self.NUM_LABELS)
            i = len(self.D_STRUCTURE)
            kernel = self.get.variables(name='W_' + str(i), shape=[self.D_STRUCTURE[-1], 1])
            bias = self.get.variables(name='b_' + str(i), shape=[1])
            y = tf.add(tf.matmul(x, kernel), bias)
        return y
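    #
    # Note: the loss below is the WGAN-GP critic loss of Gulrajani et al. (2017),
    # "Improved Training of Wasserstein GANs" :
    #   loss = E[D(fake)] - E[D(real)] + 10 * E[(||grad D(x_hat)||_2 - 1)^2] + regularization
    # where x_hat is a random interpolation between real and fake samples and 10
    # is the gradient-penalty coefficient (lambda).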
    def loss(self, **args):
        """
        This function computes the loss of the discriminator (critic)
            :real
            :fake
            :label
        """
        real = args['real']
        fake = args['fake']
        label = args['label']
        epsilon = tf.random.uniform(shape=[self.BATCHSIZE_PER_GPU, 1], minval=0, maxval=1)
        x_hat = real + epsilon * (fake - real)
        y_hat_fake = self.network(inputs=fake, label=label)
        y_hat_real = self.network(inputs=real, label=label)
        y_hat = self.network(inputs=x_hat, label=label)

        grad = tf.gradients(y_hat, [x_hat])[0]
        slopes = tf.sqrt(tf.reduce_sum(tf.square(grad), 1))
        gradient_penalty = tf.reduce_mean((slopes - 1.) ** 2)
        # all_regs = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        all_regs = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES)
        w_distance = -tf.reduce_mean(y_hat_real) + tf.reduce_mean(y_hat_fake)
        loss = w_distance + 10 * gradient_penalty + sum(all_regs)
        # tf.add_to_collection('dlosses', loss)
        tf.compat.v1.add_to_collection('dlosses', loss)
        return w_distance, loss

class Train(GNet):
    def __init__(self, **args):
        GNet.__init__(self, **args)
        self.generator = Generator(**args)
        self.discriminator = Discriminator(**args)
        self._REAL = args['real']
        self._LABEL = args['label'] if 'label' in args else None
        self.column = args['column']
        # print ([" *** ",self.BATCHSIZE_PER_GPU])
        self.meta = self.log_meta()
        if (self.logger):
            self.logger.write({"module": "gan-train", "action": "start", "input": {"partition": self.PARTITION, "meta": self.meta}})

        # self.log (real_shape=list(self._REAL.shape),label_shape = self._LABEL.shape,meta_data=self.meta)
    def load_meta(self, column):
        """
        This function will delegate the calls to load meta data to its dependents
            column  name
        """
        super().load_meta(column)
        self.generator.load_meta(column)
        self.discriminator.load_meta(column)
    def loss(self, **args):
        """
        This function will compute a "tower" loss of the generated candidate against real data
        Training will consist in having both generator and discriminators
            :scope
            :stage
            :real
            :label
        """
        scope = args['scope']
        stage = args['stage']
        real = args['real']
        label = args['label']

        if label is not None:
            label = tf.cast(label, tf.int32)
            #
            # @TODO: Ziqi needs to explain what's going on here
            # (it appears to collapse the one-hot label matrix into a single
            # integer class index: column 1 provides the high-order digit and the
            # argmax of columns 2: the low-order digit, i.e. a mixed-radix encoding)
            m = [[i] for i in np.arange(self._LABEL.shape[1] - 2)]
            label = label[:, 1] * len(m) + tf.squeeze(
                tf.matmul(label[:, 2:], tf.constant(m, dtype=tf.int32))
            )
        # label = label[:,1] * 4 + tf.squeeze( label[:,2]*[[0],[1],[2],[3]] )
        z = tf.random.normal(shape=[self.BATCHSIZE_PER_GPU, self.Z_DIM])
        fake = self.generator.network(inputs=z, label=label)
        if stage == 'D':
            w, loss = self.discriminator.loss(real=real, fake=fake, label=label)
            # losses = tf.get_collection('dlosses', scope)
            flag = 'dlosses'
            losses = tf.compat.v1.get_collection('dlosses', scope)
        else:
            w, loss = self.generator.loss(fake=fake, label=label)
            # losses = tf.get_collection('glosses', scope)
            flag = 'glosses'
            losses = tf.compat.v1.get_collection('glosses', scope)
        # losses = tf.compat.v1.get_collection(flag, scope)
        total_loss = tf.add_n(losses, name='total_loss')
        # print (total_loss)
        return total_loss, w
    def input_fn(self):
        """
        This function produces the dataset iterator together with the feed placeholders for the real data and the (optional) labels
        """
        features_placeholder = tf.compat.v1.placeholder(shape=self._REAL.shape, dtype=tf.float32)
        LABEL_SHAPE = [None, None] if self._LABEL is None else self._LABEL.shape
        labels_placeholder = tf.compat.v1.placeholder(shape=LABEL_SHAPE, dtype=tf.float32)
        if self._LABEL is not None:
            dataset = tf.data.Dataset.from_tensor_slices((features_placeholder, labels_placeholder))
        else:
            dataset = tf.data.Dataset.from_tensor_slices(features_placeholder)
            # labels_placeholder = None
        dataset = dataset.repeat(10000)
        dataset = dataset.batch(batch_size=self.BATCHSIZE_PER_GPU)
        dataset = dataset.prefetch(1)
        # iterator = dataset.make_initializable_iterator()
        iterator = tf.compat.v1.data.make_initializable_iterator(dataset)
        return iterator, features_placeholder, labels_placeholder
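    #
    # Note: network() builds one "tower" per GPU over a shared set of variables
    # (the variables live on the CPU via _variable_on_cpu), collects the
    # per-tower gradients and averages them with average_gradients before a
    # single optimizer step.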
    def network(self, **args):
        stage = args['stage']
        opt = args['opt']
        tower_grads = []
        per_gpu_w = []
        iterator, features_placeholder, labels_placeholder = self.input_fn()
        with tf.compat.v1.variable_scope(tf.compat.v1.get_variable_scope()):
            for i in range(self.NUM_GPUS):
                with tf.device('/gpu:%d' % i):
                    with tf.name_scope('%s_%d' % ('TOWER', i)) as scope:
                        if self._LABEL is not None:
                            (real, label) = iterator.get_next()
                        else:
                            real = iterator.get_next()
                            label = None
                        loss, w = self.loss(scope=scope, stage=stage, real=real, label=label)
                        # tf.get_variable_scope().reuse_variables()
                        tf.compat.v1.get_variable_scope().reuse_variables()
                        # vars_ = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=stage)
                        vars_ = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, scope=stage)
                        grads = opt.compute_gradients(loss, vars_)
                        tower_grads.append(grads)
                        per_gpu_w.append(w)

        grads = self.average_gradients(tower_grads)
        apply_gradient_op = opt.apply_gradients(grads)
        mean_w = tf.reduce_mean(per_gpu_w)
        train_op = apply_gradient_op
        return train_op, mean_w, iterator, features_placeholder, labels_placeholder
    def apply(self, **args):
        # max_epochs = args['max_epochs'] if 'max_epochs' in args else 10
        REAL = self._REAL
        LABEL = self._LABEL
        if (self.logger):
            pass
        with tf.device('/cpu:0'):
            opt_d = tf.compat.v1.train.AdamOptimizer(1e-4)
            opt_g = tf.compat.v1.train.AdamOptimizer(1e-4)
            train_d, w_distance, iterator_d, features_placeholder_d, labels_placeholder_d = self.network(stage='D', opt=opt_d)
            train_g, _, iterator_g, features_placeholder_g, labels_placeholder_g = self.network(stage='G', opt=opt_g)
        # saver = tf.train.Saver()
        saver = tf.compat.v1.train.Saver()
        # init = tf.global_variables_initializer()
        init = tf.compat.v1.global_variables_initializer()
        logs = []
        # with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess:
        with tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess:

            sess.run(init)
            #
            # -- the label placeholders must be fed as well when the model is conditioned on labels
            feed_d = {features_placeholder_d: REAL} if LABEL is None else {features_placeholder_d: REAL, labels_placeholder_d: LABEL}
            feed_g = {features_placeholder_g: REAL} if LABEL is None else {features_placeholder_g: REAL, labels_placeholder_g: LABEL}
            sess.run(iterator_d.initializer, feed_dict=feed_d)
            sess.run(iterator_g.initializer, feed_dict=feed_g)

            for epoch in range(1, self.MAX_EPOCHS + 1):
                start_time = time.time()
                w_sum = 0
                for i in range(self.STEPS_PER_EPOCH):
                    #
                    # WGAN convention : several critic (discriminator) updates per generator update
                    for _ in range(2):
                        _, w = sess.run([train_d, w_distance])
                        w_sum += w
                    sess.run(train_g)
                duration = time.time() - start_time
                assert not np.isnan(w_sum), 'Model diverged with loss = NaN'
                format_str = 'epoch: %d, w_distance = %f (%.1f)'
                print(format_str % (epoch, -w_sum / (self.STEPS_PER_EPOCH * 2), duration))
                # print (dir(w_distance))
                logs.append({"epoch": epoch, "distance": -w_sum / (self.STEPS_PER_EPOCH * 2)})

                # if epoch % self.MAX_EPOCHS == 0 :
                if epoch in [5, 10, 20, 50, 75, self.MAX_EPOCHS]:
                    # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic']
                    suffix = self.get.suffix()
                    _name = os.sep.join([self.train_dir, suffix])
                    # saver.save(sess, self.train_dir, write_meta_graph=False, global_step=epoch)
                    saver.save(sess, _name, write_meta_graph=False, global_step=epoch)
                #
                #
            if self.logger:
                row = {"module": "gan-train", "action": "logs", "input": {"partition": self.PARTITION, "logs": logs}}  # ,"model":pickle.dump(sess)}
                self.logger.write(row)
            #
            # @TODO:
            # We should upload the files in the checkpoint
            # This would allow the learnt model to be portable to another system
            #
        tf.compat.v1.reset_default_graph()
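#
# Note: a sketch of the expected generation flow, mirroring the __main__ block at
# the bottom of this module (the column name is a made-up example) :
#
#   p = Predict(context='stroke', label=LABEL, values=sorted(df['icd9'].unique().tolist()), column='icd9')
#   p.load_meta('icd9')
#   r = p.apply()   #-- a dict of lists keyed by the synthesized column
#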
class Predict(GNet):
    """
    This class generates synthetic data given a learned model
    """
    def __init__(self, **args):
        GNet.__init__(self, **args)
        self.generator = Generator(**args)
        self.values = args['values']
        self.ROW_COUNT = args['row_count'] if 'row_count' in args else self.ROW_COUNT
        self.oROW_COUNT = self.ROW_COUNT
        self.MISSING_VALUES = np.nan_to_num(np.nan)
        if 'no_value' in args and args['no_value'] not in ['na', '', 'NA']:
            self.MISSING_VALUES = args['no_value']

        # self.MISSING_VALUES = args['no_value']
        # self.MISSING_VALUES = int(args['no_value']) if args['no_value'].isnumeric() else np.na if args['no_value'] in ['na','NA','N/A'] else args['no_value']
    def load_meta(self, column):
        super().load_meta(column)
        self.generator.load_meta(column)
        self.ROW_COUNT = self.oROW_COUNT
    def apply(self, **args):
        # print (self.train_dir)
        # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic']
        suffix = self.get.suffix()
        model_dir = os.sep.join([self.train_dir, suffix + '-' + str(self.MAX_EPOCHS)])
        demo = self._LABEL  # np.zeros([self.ROW_COUNT,self.NUM_LABELS]) # args['demo']
        tf.compat.v1.reset_default_graph()
        z = tf.random.normal(shape=[self.ROW_COUNT, self.Z_DIM])

        y = tf.compat.v1.placeholder(shape=[self.ROW_COUNT, self.NUM_LABELS], dtype=tf.int32)
        if self._LABEL is not None:
            ma = [[i] for i in np.arange(self.NUM_LABELS - 2)]
            label = y[:, 1] * len(ma) + tf.squeeze(tf.matmul(y[:, 2:], tf.constant(ma, dtype=tf.int32)))
        else:
            label = None

        fake = self.generator.network(inputs=z, label=label)
        init = tf.compat.v1.global_variables_initializer()
        saver = tf.compat.v1.train.Saver()
        df = pd.DataFrame()
        CANDIDATE_COUNT = 10  # 0 if self.ROW_COUNT < 1000 else 100
        NTH_VALID_CANDIDATE = count = np.random.choice(np.arange(2, 60), 2)[0]
        with tf.compat.v1.Session() as sess:
            # sess.run(init)
            saver.restore(sess, model_dir)
            if self._LABEL is not None:
                labels = np.zeros((self.ROW_COUNT, self.NUM_LABELS))
                labels = demo
            else:
                labels = None

            found = []
            ratio = []
            __x__ = None
            __ratio = 0
            for i in np.arange(CANDIDATE_COUNT):
                if labels is not None:
                    f = sess.run(fake, feed_dict={y: labels})
                else:
                    f = sess.run(fake)
                #
                # if we are dealing with numeric values only we can perform a simple marginal sum against the indexes
                # The code below will ensure we have some acceptable cardinal relationships between id and synthetic values
                #
                # df = pd.DataFrame(np.round(f)).astype(np.int32)
                df = pd.DataFrame(np.round(f), dtype=int)

                p = 0 not in df.sum(axis=1).values
                x = df.sum(axis=1).values

                if np.divide(np.sum(x), x.size) > .9 or (p and np.sum(x) == x.size):
                    ratio.append(np.divide(np.sum(x), x.size))
                    found.append(df)
                    # break
                    if len(found) == CANDIDATE_COUNT:
                        break
                else:
                    __x__ = df if __x__ is None or np.where(x > 0)[0].size > np.where(__x__ > 0)[0].size else __x__
                    __ratio = np.divide(np.sum(x), x.size) if __x__ is None or np.where(x > 0)[0].size > np.where(__x__ > 0)[0].size else __ratio
                    continue

            # i = df.T.index.astype(np.int32) #-- These are numeric pseudonyms
            # df = (i * df).sum(axis=1)
            #
            # In case we are dealing with actual values like diagnosis codes we can perform
            #
            N = len(found)
            _index = [i for i in range(0, N) if found[i].shape[1] == len(self.values)]
            if not _index and not found:
                df = __x__
                INDEX = -1
            else:
                if not _index:
                    # INDEX = np.random.choice(np.arange(len(found)),1)[0]
                    INDEX = ratio.index(np.max(ratio))
                else:
                    INDEX = _index[0]
                df = found[INDEX]

            columns = self.ATTRIBUTES['synthetic'] if isinstance(self.ATTRIBUTES['synthetic'], list) else [self.ATTRIBUTES['synthetic']]
            # r = np.zeros((self.ROW_COUNT,len(columns)))
            # r = np.zeros(self.ROW_COUNT)
            if self.logger:
                info = {"found": len(found), "rows": df.shape[0], "cols": df.shape[1], "expected": len(self.values)}
                if df.shape[1] > len(self.values):
                    df = df.iloc[:, :len(self.values)]
                if INDEX > 0:
                    info = dict(info, **{"selected": INDEX, "ratio": ratio[INDEX]})
                else:
                    info['selected'] = -1
                    info['ratio'] = __ratio
                info['partition'] = self.PARTITION
                self.logger.write({"module": "gan-generate", "action": "generate", "input": info})
            # df.columns = self.values
            if len(found) or df.columns.size <= len(self.values):
                ii = df.apply(lambda row: np.sum(row) == 0, axis=1)
                # print ([' **** ',ii.sum()])
                if np.sum(ii) > 0:
                    #
                    # @TODO Have this be a configurable variable
                    missing = np.repeat(self.MISSING_VALUES, np.where(ii == 1)[0].size)
                else:
                    missing = []
                #
                # @TODO:
                # Log the findings here in terms of ratio, missing, candidate count
                # print ([np.max(ratio),len(missing),len(found),i])
                i = np.where(ii == 0)[0]

                df = pd.DataFrame(df.iloc[i].apply(lambda row: self.values[np.random.choice(np.where(row != 0)[0], 1)[0]], axis=1))
                df.columns = columns
                df = df[columns[0]].append(pd.Series(missing))
                if self.logger:
                    info = {"missing": i.size, "rows": df.shape[0], "cols": 1, 'partition': self.PARTITION}
                    self.logger.write({"module": "gan-generate", "action": "compile.io", "input": info})

        # print (df.head())
        tf.compat.v1.reset_default_graph()
        df = pd.DataFrame(df)
        df.columns = columns
        return df.to_dict(orient='list')
# count = str(len(os.listdir(self.out_dir)))
# _name = os.sep.join([self.out_dir,self.CONTEXT+'-'+count+'.csv'])
# df.to_csv(_name,index=False)
# output.extend(np.round(f))
# for m in range(2):
# for n in range(2, self.NUM_LABELS):
# idx1 = (demo[:, m] == 1)
# idx2 = (demo[:, n] == 1)
# idx = [idx1[j] and idx2[j] for j in range(len(idx1))]
# num = np.sum(idx)
# print ("___________________list__")
# print (idx1)
# print (idx2)
# print (idx)
# print (num)
# print ("_____________________")
# nbatch = int(np.ceil(num / self.BATCHSIZE_PER_GPU))
# label_input = np.zeros((nbatch*self.BATCHSIZE_PER_GPU, self.NUM_LABELS))
# label_input[:, n] = 1
# label_input[:, m] = 1
# output = []
# for i in range(nbatch):
# f = sess.run(fake,feed_dict={y: label_input[i* self.BATCHSIZE_PER_GPU:(i+1)* self.BATCHSIZE_PER_GPU]})
# output.extend(np.round(f))
# output = np.array(output)[:num]
# print ([m,n,output])
# np.save(self.out_dir + str(m) + str(n), output)
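#
# Note: a sketch of the expected CLI invocations; the exact flag syntax depends on
# how data.params.SYS_ARGS parses the command line, only the key names used below
# ('raw-data', 'column', 'id', 'train'/'learn', 'generate', 'max_epochs') are
# taken from the code :
#
#   python data/maker --raw-data stroke.csv --column icd9 --id person_id --train --max_epochs 10
#   python data/maker --raw-data stroke.csv --column icd9 --id person_id --generate
#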
if __name__ == '__main__':
    #
    # Now we get things done ...
    column = SYS_ARGS['column']
    column_id = SYS_ARGS['id'] if 'id' in SYS_ARGS else 'person_id'
    column_id = column_id.split(',') if ',' in column_id else column_id
    df = pd.read_csv(SYS_ARGS['raw-data'])
    LABEL = pd.get_dummies(df[column_id]).astype(np.float32).values

    context = SYS_ARGS['raw-data'].split(os.sep)[-1:][0][:-4]
    if set(['train', 'learn']) & set(SYS_ARGS.keys()):
        df = pd.read_csv(SYS_ARGS['raw-data'])
        # cols = SYS_ARGS['column']
        # _map,_df = (Binary()).Export(df)
        # i = np.arange(_map[column]['start'],_map[column]['end'])
        max_epochs = np.int32(SYS_ARGS['max_epochs']) if 'max_epochs' in SYS_ARGS else 10
        # REAL = _df[:,i]
        REAL = pd.get_dummies(df[column]).astype(np.float32).values
        LABEL = pd.get_dummies(df[column_id]).astype(np.float32).values
        trainer = Train(context=context, max_epochs=max_epochs, real=REAL, label=LABEL, column=column, column_id=column_id)
        trainer.apply()

        #
        # We should train upon this data
        #
        # -- we need to convert the data-frame to binary matrix, given a column
        #
        pass
    elif 'generate' in SYS_ARGS:
        values = df[column].unique().tolist()
        values.sort()

        p = Predict(context=context, label=LABEL, values=values, column=column)
        p.load_meta(column)
        r = p.apply()
        # print (df)
        # print ()
        df[column] = r[column]
        # print (df)

    else:
        print(SYS_ARGS.keys())
        print(__doc__)
    pass