bug fixes: design improvements

This commit is contained in:
Steve Nyemba 2021-03-29 11:10:57 -05:00
parent 71097103da
commit 97bae5ef92
5 changed files with 506 additions and 359 deletions

View File

@ -1,2 +1,17 @@
import data.params as params
from data.params import SYS_ARGS
import transport
from multiprocessing import Process, Queue
from data.maker import prepare
class Trainer (Process) :
pass
class Maker(Process):
pass
if __name__ == '__main__' :
logger = transport.factory.instance(SYS_ARGS['store']['logger'])

View File

@ -111,15 +111,15 @@ class GNet :
self.train_dir = os.sep.join([self.log_dir,'train',self.CONTEXT])
self.out_dir = os.sep.join([self.log_dir,'output',self.CONTEXT])
if self.logger :
#
# We will clear the logs from the data-store
#
column = self.ATTRIBUTES['synthetic']
db = self.logger.db
if db[column].count() > 0 :
db.backup.insert({'name':column,'logs':list(db[column].find()) })
db[column].drop()
# if self.logger :
# We will clear the logs from the data-store
# column = self.ATTRIBUTES['synthetic']
# db = self.logger.db
# if db[column].count() > 0 :
# db.backup.insert({'name':column,'logs':list(db[column].find()) })
# db[column].drop()
def load_meta(self,column):
"""
@ -127,7 +127,7 @@ class GNet :
Because prediction and training can happen independently
"""
# suffix = "-".join(column) if isinstance(column,list)else column
suffix = self.get.suffix()
suffix = self.CONTEXT #self.get.suffix()
_name = os.sep.join([self.out_dir,'meta-'+suffix+'.json'])
if os.path.exists(_name) :
attr = json.loads((open(_name)).read())
@ -159,7 +159,7 @@ class GNet :
value= args['value']
object[key] = value
# suffix = "-".join(self.column) if isinstance(self.column,list) else self.column
suffix = self.get.suffix()
suffix = self.CONTEXT #self.get.suffix()
_name = os.sep.join([self.out_dir,'meta-'+suffix])
f = open(_name+'.json','w')
@ -351,7 +351,7 @@ class Train (GNet):
self.discriminator = Discriminator(**args)
self._REAL = args['real']
self._LABEL= args['label'] if 'label' in args else None
self.column = args['column']
# self.column = args['column']
# print ([" *** ",self.BATCHSIZE_PER_GPU])
self.meta = self.log_meta()
@ -438,6 +438,11 @@ class Train (GNet):
per_gpu_w = []
iterator, features_placeholder, labels_placeholder = self.input_fn()
with tf.compat.v1.variable_scope(tf.compat.v1.get_variable_scope()):
#
# @TODO: Find a way to handle this across multiple CPU in case the GPU are not available
# - abstract hardware specification
# - determine if the GPU/CPU are busy
#
for i in range(self.NUM_GPUS):
with tf.device('/gpu:%d' % i):
with tf.name_scope('%s_%d' % ('TOWER', i)) as scope:
@ -510,7 +515,7 @@ class Train (GNet):
# if epoch % self.MAX_EPOCHS == 0:
if epoch in [5,10,20,50,75, self.MAX_EPOCHS] :
# suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic']
suffix = self.get.suffix()
suffix = self.CONTEXT #self.get.suffix()
_name = os.sep.join([self.train_dir,suffix])
# saver.save(sess, self.train_dir, write_meta_graph=False, global_step=epoch)
saver.save(sess, _name, write_meta_graph=False, global_step=epoch)
@ -539,7 +544,8 @@ class Predict(GNet):
# self.MISSING_VALUES = np.nan_to_num(np.nan)
# if 'no_value' in args and args['no_value'] not in ['na','','NA'] :
# self.MISSING_VALUES = args['no_value']
self.MISSING_VALUES = args['missing']
self.MISSING_VALUES = args['missing'] if 'missing' in args else []
# self.MISSING_VALUES = args['no_value']
# self.MISSING_VALUES = int(args['no_value']) if args['no_value'].isnumeric() else np.na if args['no_value'] in ['na','NA','N/A'] else args['no_value']
@ -548,9 +554,56 @@ class Predict(GNet):
self.generator.load_meta(column)
self.ROW_COUNT = self.oROW_COUNT
def apply(self,**args):
suffix = self.CONTEXT #self.get.suffix()
model_dir = os.sep.join([self.train_dir,suffix+'-'+str(self.MAX_EPOCHS)])
demo = self._LABEL #np.zeros([self.ROW_COUNT,self.NUM_LABELS]) #args['de"shape":{"LABEL":list(self._LABEL.shape)} mo']
#
# setup computational graph
tf.compat.v1.reset_default_graph()
z = tf.random.normal(shape=[self.ROW_COUNT, self.Z_DIM])
y = tf.compat.v1.placeholder(shape=[self.ROW_COUNT, self.NUM_LABELS], dtype=tf.int32)
if self._LABEL is not None :
ma = [[i] for i in np.arange(self.NUM_LABELS - 2)]
label = y[:, 1] * len(ma) + tf.squeeze(tf.matmul(y[:, 2:], tf.constant(ma, dtype=tf.int32)))
else:
label = None
fake = self.generator.network(inputs=z, label=label)
init = tf.compat.v1.global_variables_initializer()
saver = tf.compat.v1.train.Saver()
df = pd.DataFrame()
CANDIDATE_COUNT = args['candidates'] if 'candidates' in args else 1 #0 if self.ROW_COUNT < 1000 else 100
candidates = []
with tf.compat.v1.Session() as sess:
saver.restore(sess, model_dir)
if self._LABEL is not None :
# labels = np.zeros((self.ROW_COUNT,self.NUM_LABELS) )
labels= demo
else:
labels = None
for i in np.arange(CANDIDATE_COUNT) :
if labels :
_matrix = sess.run(fake,feed_dict={y:labels})
else:
_matrix = sess.run(fake)
#
# if we are dealing with numeric values only we can perform a simple marginal sum against the indexes
# The code below will insure we have some acceptable cardinal relationships between id and synthetic values
#
# df = pd.DataFrame(np.round(f)).astype(np.int32)
candidates.append (np.round(_matrix).astype(np.int64))
# return candidates[0] if len(candidates) == 1 else candidates
return candidates
def _apply(self,**args):
# print (self.train_dir)
# suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic']
suffix = self.get.suffix()
suffix = self.CONTEXT #self.get.suffix()
model_dir = os.sep.join([self.train_dir,suffix+'-'+str(self.MAX_EPOCHS)])
demo = self._LABEL #np.zeros([self.ROW_COUNT,self.NUM_LABELS]) #args['de"shape":{"LABEL":list(self._LABEL.shape)} mo']
tf.compat.v1.reset_default_graph()
@ -567,11 +620,12 @@ class Predict(GNet):
init = tf.compat.v1.global_variables_initializer()
saver = tf.compat.v1.train.Saver()
df = pd.DataFrame()
CANDIDATE_COUNT = 10 #0 if self.ROW_COUNT < 1000 else 100
CANDIDATE_COUNT = 5 #0 if self.ROW_COUNT < 1000 else 100
NTH_VALID_CANDIDATE = count = np.random.choice(np.arange(2,60),2)[0]
with tf.compat.v1.Session() as sess:
# sess.run(init)
saver.restore(sess, model_dir)
if self._LABEL is not None :
labels = np.zeros((self.ROW_COUNT,self.NUM_LABELS) )
@ -585,109 +639,110 @@ class Predict(GNet):
__ratio=0
for i in np.arange(CANDIDATE_COUNT) :
if labels :
f = sess.run(fake,feed_dict={y:labels})
_matrix = sess.run(fake,feed_dict={y:labels})
else:
f = sess.run(fake)
_matrix = sess.run(fake)
#
# if we are dealing with numeric values only we can perform a simple marginal sum against the indexes
# The code below will insure we have some acceptable cardinal relationships between id and synthetic values
#
# df = pd.DataFrame(np.round(f)).astype(np.int32)
df = pd.DataFrame(np.round(f),dtype=int)
found.append (np.round(_matrix).astype(np.int64))
# df = pd.DataFrame(np.round(_matrix),dtype=int)
p = 0 not in df.sum(axis=1).values
x = df.sum(axis=1).values
# x = df.sum(axis=1).values
if np.divide( np.sum(x), x.size) > .9 or p and np.sum(x) == x.size :
ratio.append(np.divide( np.sum(x), x.size))
found.append(df)
# if np.divide( np.sum(x), x.size) > .9 or p and np.sum(x) == x.size :
# ratio.append(np.divide( np.sum(x), x.size))
# found.append(df)
# break
if len(found) == CANDIDATE_COUNT:
# # break
# if len(found) == CANDIDATE_COUNT:
break
else:
__x__ = df if __x__ is None or np.where(x > 0)[0].size > np.where(__x__ > 0)[0].size else __x__
__ratio = np.divide( np.sum(x), x.size) if __x__ is None or np.where(x > 0)[0].size > np.where(__x__ > 0)[0].size else __ratio
continue
# break
# else:
# __x__ = df if __x__ is None or np.where(x > 0)[0].size > np.where(__x__ > 0)[0].size else __x__
# __ratio = np.divide( np.sum(x), x.size) if __x__ is None or np.where(x > 0)[0].size > np.where(__x__ > 0)[0].size else __ratio
# continue
# i = df.T.index.astype(np.int32) #-- These are numeric pseudonyms
# df = (i * df).sum(axis=1)
#
# In case we are dealing with actual values like diagnosis codes we can perform
#
N = len(found)
_index = [i for i in range(0,N) if found[i].shape[1] == len(self.values)]
if not _index and not found :
df = __x__
INDEX = -1
else :
if not _index :
INDEX = np.random.choice(np.arange(len(found)),1)[0]
INDEX = ratio.index(np.max(ratio))
else:
INDEX = _index[0]
# N = len(found)
# _index = [i for i in range(0,N) if found[i].shape[1] == len(self.values)]
# if not _index and not found :
# df = __x__
# INDEX = -1
# else :
# if not _index :
# INDEX = np.random.choice(np.arange(len(found)),1)[0]
# INDEX = ratio.index(np.max(ratio))
# else:
# INDEX = _index[0]
df = found[INDEX]
columns = self.ATTRIBUTES['synthetic'] if isinstance(self.ATTRIBUTES['synthetic'],list)else [self.ATTRIBUTES['synthetic']]
# df = found[INDEX]
# columns = self.ATTRIBUTES['synthetic'] if isinstance(self.ATTRIBUTES['synthetic'],list)else [self.ATTRIBUTES['synthetic']]
# r = np.zeros((self.ROW_COUNT,len(columns)))
# r = np.zeros(self.ROW_COUNT)
if self.logger :
info = {"found":len(found),"rows":df.shape[0],"cols":df.shape[1],"expected":len(self.values)}
if df.shape[1] > len(self.values) :
df = df.iloc[:len(self.values)]
if INDEX > 0 :
info =dict(info ,**{"selected":INDEX, "ratio": ratio[INDEX] })
else :
# if self.logger :
# info = {"found":len(found),"rows":df.shape[0],"cols":df.shape[1],"expected":len(self.values)}
# if df.shape[1] > len(self.values) :
# df = df.iloc[:len(self.values)]
# if INDEX > 0 :
# info =dict(info ,**{"selected":INDEX, "ratio": ratio[INDEX] })
# else :
info['selected'] = -1
info['ratio'] = __ratio
info['partition'] = self.PARTITION
self.logger.write({"module":"gan-generate","action":"generate","input":info})
# df.columns = self.values
if len(found) or df.columns.size <= len(self.values):
ii = df.apply(lambda row: np.sum(row) == 0 ,axis=1)
missing = []
if ii.sum() > 0 :
#
# If the generator had a reductive effect we should be able to get random values from either :
# - The space of outliers
# - existing values for smaller spaces that have suffered over training
#
# info['selected'] = -1
# info['ratio'] = __ratio
# info['partition'] = self.PARTITION
# self.logger.write({"module":"gan-generate","action":"generate","input":info})
# # df.columns = self.values
# if len(found) or df.columns.size <= len(self.values):
# ii = df.apply(lambda row: np.sum(row) == 0 ,axis=1)
# missing = []
# if ii.sum() > 0 :
# #
# # If the generator had a reductive effect we should be able to get random values from either :
# # - The space of outliers
# # - existing values for smaller spaces that have suffered over training
# #
N = ii.sum()
missing_values = self.MISSING_VALUES if self.MISSING_VALUES else self.values
missing = np.random.choice(missing_values,N)
# missing = []
#
# @TODO:
# Log the findings here in terms of ratio, missing, candidate count
# print ([np.max(ratio),len(missing),len(found),i])
i = np.where(ii == 0)[0]
# N = ii.sum()
# missing_values = self.MISSING_VALUES if self.MISSING_VALUES else self.values
# missing = np.random.choice(missing_values,N)
# # missing = []
# #
# # @TODO:
# # Log the findings here in terms of ratio, missing, candidate count
# # print ([np.max(ratio),len(missing),len(found),i])
# i = np.where(ii == 0)[0]
df = pd.DataFrame( df.iloc[i].apply(lambda row: self.values[np.random.choice(np.where(row != 0)[0],1)[0]] ,axis=1))
df.columns = columns
df = df[columns[0]].append(pd.Series(missing))
# df = pd.DataFrame( df.iloc[i].apply(lambda row: self.values[np.random.choice(np.where(row != 0)[0],1)[0]] ,axis=1))
# df.columns = columns
# df = df[columns[0]].append(pd.Series(missing))
if self.logger :
# if self.logger :
info= {"missing": i.size,"rows":df.shape[0],"cols":1,'partition':self.PARTITION}
self.logger.write({"module":"gan-generate","action":"compile.io","input":info})
# info= {"missing": i.size,"rows":df.shape[0],"cols":1,'partition':self.PARTITION}
# self.logger.write({"module":"gan-generate","action":"compile.io","input":info})
# print(df.head())
tf.compat.v1.reset_default_graph()
df = pd.DataFrame(df)
df.columns = columns
np.random.shuffle(df[columns[0]].values)
return df.to_dict(orient='list')
# df = pd.DataFrame(df)
# df.columns = columns
# np.random.shuffle(df[columns[0]].values)
# return df.to_dict(orient='list')
return _matrix
if __name__ == '__main__' :

View File

@ -14,6 +14,11 @@ import data.gan as gan
from transport import factory
from data.bridge import Binary
import threading as thread
from data.maker import prepare
import copy
import os
import json
class ContinuousToDiscrete :
ROUND_UP = 2
@staticmethod
@ -77,8 +82,62 @@ class ContinuousToDiscrete :
def train (**_args):
"""
:params sql
:params store
"""
#
# Let us prepare the data by calling the utility function
#
if 'file' in _args :
#
# We are reading data from a file
_args['data'] = pd.read_csv(_args['file'])
else:
#
# data will be read from elsewhere (a data-store)...
pass
# if 'ignore' in _args and 'columns' in _args['ignore']:
def train (**args) :
_inputhandler = prepare.Input(**_args)
values,_matrix = _inputhandler.convert()
args = {"real":_matrix,"context":_args['context']}
_map = {}
if 'store' in _args :
#
# This
args['store'] = copy.deepcopy(_args['store']['logs'])
args['store']['args']['doc'] = _args['context']
logger = factory.instance(**args['store'])
args['logger'] = logger
for key in _inputhandler._map :
beg = _inputhandler._map[key]['beg']
end = _inputhandler._map[key]['end']
values = _inputhandler._map[key]['values'].tolist()
_map[key] = {"beg":beg,"end":end,"values":np.array(values).astype(str).tolist()}
info = {"rows":_matrix.shape[0],"cols":_matrix.shape[1],"map":_map}
logger.write({"module":"gan-train","action":"data-prep","context":_args['context'],"input":info})
args['logs'] = _args['logs'] if 'logs' in _args else 'logs'
args ['max_epochs'] = _args['max_epochs']
args['matrix_size'] = _matrix.shape[0]
args['batch_size'] = 2000
args['partition'] = 0 if 'partition' not in _args else _args['partition']
os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) if 'gpu' in args else '0'
trainer = gan.Train(**args)
#
# @TODO: Write the map.json in the output directory for the logs
#
f = open(os.sep.join([_args['logs'],'output',_args['context'],'map.json']),'w')
f.write(json.dumps(_map))
f.close()
trainer.apply()
pass
def _train (**args) :
"""
This function is intended to train the GAN in order to learn about the distribution of the features
:column columns that need to be synthesized (discrete)
@ -122,18 +181,53 @@ def train (**args) :
# If the s
trainer = gan.Train(**args)
trainer.apply()
def post(**args):
"""
This uploads the tensorflow checkpoint to a data-store (mongodb, biguqery, s3)
"""
pass
def get(**args):
"""
This function will restore a checkpoint from a persistant storage on to disk
"""
pass
def generate(**args):
def generate(**_args):
"""
This function will generate a set of records, before we must load the parameters needed
:param data
:param context
:param logs
"""
f = open(os.sep.join([_args['logs'],'output',_args['context'],'map.json']))
_map = json.loads(f.read())
f.close()
if 'file' in _args :
df = pd.read_csv(_args['file'])
else:
df = _args['data'] if not isinstance(_args['data'],str) else pd.read_csv(_args['data'])
args = {"context":_args['context'],"max_epochs":_args['max_epochs'],"candidates":_args['candidates']}
args['logs'] = _args['logs'] if 'logs' in _args else 'logs'
args ['max_epochs'] = _args['max_epochs']
# args['matrix_size'] = _matrix.shape[0]
args['batch_size'] = 2000
args['partition'] = 0 if 'partition' not in _args else _args['partition']
args['row_count'] = df.shape[0]
#
# @TODO: perhaps get the space of values here ... (not sure it's a good idea)
#
_args['map'] = _map
_inputhandler = prepare.Input(**_args)
values,_matrix = _inputhandler.convert()
args['values'] = np.array(values)
if 'gpu' in _args :
os.environ['CUDA_VISIBLE_DEVICES'] = str(_args['gpu'])
handler = gan.Predict (**args)
handler.load_meta(None)
#
# Let us now format the matrices as we expect them to be
#
candidates = handler.apply(candidates=args['candidates'])
return [_inputhandler.revert(matrix=_matrix) for _matrix in candidates]
def _generate(**args):
"""
This function will generate a synthetic dataset on the basis of a model that has been learnt for the dataset
@return pandas.DataFrame

View File

@ -1,32 +0,0 @@
import pandas as pd
import data.maker
from data.params import SYS_ARGS
import json
from scipy.stats import wasserstein_distance as wd
import risk
import numpy as np
if 'config' in SYS_ARGS :
ARGS = json.loads(open(SYS_ARGS['config']).read())
if 'generate' not in SYS_ARGS :
data.maker.train(**ARGS)
else:
#
#
ARGS['no_value'] = ''
_df = data.maker.generate(**ARGS)
odf = pd.read_csv (ARGS['data'])
odf.columns = [name.lower() for name in odf.columns]
column = ARGS['column'] if isinstance(ARGS['column'],list) else [ARGS['column']]
# print (odf.head())
# print (_df.head())
print(odf.join(_df[column],rsuffix='_io'))
# print (_df[column].risk.evaluate(flag='synth'))
# print (odf[column].risk.evaluate(flag='original'))
# _x = pd.get_dummies(_df[column]).values
# y = pd.get_dummies(odf[column]).values
# N = _df.shape[0]
# print (np.mean([ wd(_x[i],y[i])for i in range(0,N)]))
# print (wd(_x[0],y[0]) )
# column = SYS_ARGS['column']
# odf = open(SYS_ARGS['data'])

View File

@ -9,7 +9,7 @@ import pandas as pd
from google.oauth2 import service_account
from google.cloud import bigquery as bq
import data.maker
import copy
from data.params import SYS_ARGS
#
@ -69,53 +69,45 @@ class Components :
This function will perform training on the basis of a given pointer that reads data
"""
#
# @TODO: we need to log something here about the parameters being passed
# pointer = args['reader'] if 'reader' in args else lambda: Components.get(**args)
df = args['data']
schema = None
if 'file' in args :
#
# Now we can parse the arguments and submit the entire thing to training
#
logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']})
log_folder = args['logs'] if 'logs' in args else 'logs'
PART_SIZE = int(args['part_size'])
partition = args['partition']
log_folder = os.sep.join([log_folder,args['context'],str(partition)])
_args = {"batch_size":2000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger}
_args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs'])
if 'batch_size' in args :
_args['batch_size'] = int(args['batch_size'])
_args['matrix_size'] = args['matrix_size'] if 'matrix_size' in args else 128 #
# We ask the process to assume 1 gpu given the system number of GPU and that these tasks can run in parallel
#
if int(args['num_gpu']) > 1 :
_args['gpu'] = int(args['gpu']) if int(args['gpu']) < 8 else np.random.choice(np.arange(8)).astype(int)
df = pd.read_csv(args['file'])
del args['file']
elif 'data' not in args :
reader = factory.instance(**args['store']['source'])
if 'row_limit' in args :
df = reader.read(sql=args['sql'],limit=args['row_limit'])
else:
df = reader.read(sql=args['sql'])
schema = reader.meta(table=args['from']) if hasattr(reader,'meta') and 'from' in args else None
else:
_args['gpu'] = 0
_args['num_gpu'] = 1
os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu'])
_args['partition'] = int(partition)
_args['continuous']= args['continuous'] if 'continuous' in args else []
_args['store'] = {'type':'mongo.MongoWriter','args':{'dbname':'aou','doc':args['context']}}
_args['data'] = args['data']
df = args['data']
# print (['partition ',partition,df.value_source_concept_id.unique()])
#
# @log :
# Logging information about the training process for this partition (or not)
#
info = {"rows":df.shape[0],"cols":df.shape[1], "partition":int(partition),"logs":_args['logs']}
# df = df.fillna('')
if schema :
_schema = {}
for _item in schema :
_type = int
_value = 0
if _item.field_type == 'FLOAT' :
_type =float
elif _item.field_type != 'INTEGER' :
_type = str
_value = ''
_schema[_item.name] = _type
df[_item.name] = df[_item.name].fillna(_value).astype(_type)
args['schema'] = _schema
# df[_item.name] = df[_item.name].astype(_type)
_args = copy.deepcopy(args)
# _args['store'] = args['store']['source']
_args['data'] = df
logger.write({"module":"train","action":"train","input":info})
data.maker.train(**_args)
if 'autopilot' in ( list(args.keys())) :
print (['autopilot mode enabled ....'])
print (['autopilot mode enabled ....',args['context']])
self.generate(args)
pass
@ -129,141 +121,167 @@ class Components :
"""
This function will generate data and store it to a given,
"""
logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']})
log_folder = args['logs'] if 'logs' in args else 'logs'
partition = args['partition'] if 'partition' in args else ''
log_folder = os.sep.join([log_folder,args['context'],str(partition)])
store = args['store']['logs']
store['doc'] = args['context']
logger = factory.instance(**store) #type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']})
_args = {"batch_size":2000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger}
_args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs'])
ostore = args['store']['target']
writer = factory.instance(**ostore)
# log_folder = args['logs'] if 'logs' in args else 'logs'
# partition = args['partition'] if 'partition' in args else ''
# log_folder = os.sep.join([log_folder,args['context'],str(partition)])
# _args = {"batch_size":2000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger}
# _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs'])
# _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1
if 'batch_size' in args :
_args['batch_size'] = int(args['batch_size'])
# if 'batch_size' in args :
# _args['batch_size'] = int(args['batch_size'])
if int(args['num_gpu']) > 1 :
_args['gpu'] = int(args['gpu']) if int(args['gpu']) < 8 else np.random.choice(np.arange(8)).astype(int)
else:
_args['gpu'] = 0
_args['num_gpu'] = 1
os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu'])
# _args['no_value']= args['no_value']
_args['matrix_size'] = args['matrix_size'] if 'matrix_size' in args else 128
# if int(args['num_gpu']) > 1 :
# _args['gpu'] = int(args['gpu']) if int(args['gpu']) < 8 else np.random.choice(np.arange(8)).astype(int)
# else:
# _args['gpu'] = 0
# _args['num_gpu'] = 1
# os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu'])
# # _args['no_value']= args['no_value']
# _args['matrix_size'] = args['matrix_size'] if 'matrix_size' in args else 128
# MAX_ROWS = args['max_rows'] if 'max_rows' in args else 0
PART_SIZE = int(args['part_size']) if 'part_size' in args else 8
# # MAX_ROWS = args['max_rows'] if 'max_rows' in args else 0
# PART_SIZE = int(args['part_size']) if 'part_size' in args else 8
# credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json')
# _args['data'] = pd.read_gbq(SQL,credentials=credentials,dialect='standard').dropna()
# reader = args['reader']
# df = reader()
df = args['reader']() if 'reader' in args else args['data']
schema = args['schema'] if 'schema' in args else None
if 'file' in args :
# if 'slice' in args and 'max_rows' in args['slice']:
# max_rows = args['slice']['max_rows']
# if df.shape[0] > max_rows :
# print (".. slicing ")
# i = np.random.choice(df.shape[0],max_rows,replace=False)
# df = df.iloc[i]
df = pd.read_csv(args['file'])
else:
if 'data' not in args :
reader = factory.instance(**args['store']['source'])
if 'row_limit' in args :
df = reader.read(sql=args['sql'],limit=args['row_limit'])
else:
df = reader.read(sql=args['sql'])
if 'schema' not in args and hasattr(reader,'meta'):
schema = reader.meta(table=args['from'])
else:
#
# This will account for autopilot mode ...
df = args['data']
# bounds = Components.split(df,MAX_ROWS,PART_SIZE)
# if partition != '' :
# columns = args['columns']
# df = np.array_split(df[columns].values,PART_SIZE)
# df = pd.DataFrame(df[ int (partition) ],columns = columns)
# max_rows = int(args['partition_max_rows']) if 'partition_max_rows' in args else 1000000
# N = np.divide(df.shape[0],max_rows).astype(int) + 1
info = {"name":args['columns'],"parition":int(partition),"gpu":_args["gpu"],"rows":int(df.shape[0]),"cols":int(df.shape[1]),"space":df[args['columns'][0]].unique().size, "part_size":int(PART_SIZE)}
logger.write({"module":"generate","action":"partition","input":info})
_args['partition'] = int(partition)
_args['continuous']= args['continuous'] if 'continuous' in args else []
#
# How many rows sub-partition must we divide this into ?
# let us fix the data types here every _id field will be an np.int64...
#
schema = args['schema']
for item in schema :
if item.field_type == 'INTEGER' and df[item.name].dtype != np.int64:
df[item.name] = np.array(df[item.name].values,dtype=np.int64)
elif item.field_type == 'STRING' and df[item.name].dtype != object :
df[item.name] = np.array(df[item.name],dtype=object)
# for name in df.columns.tolist():
# if name.endswith('_id') :
# if df[name].isnull().sum() > 0 and name not in ['unique_device_id']:
# df[name].fillna(np.nan_to_num(np.nan),inplace=True)
# df[name] = df[name].astype(int)
_info = {"module":"gan-prep","action":"read","shape":{"rows":df.shape[0],"columns":df.shape[0]}}
_dc = pd.DataFrame()
# for mdf in df :
_args['data'] = df
args['data'] = df
args['candidates'] = 1 if 'candidates' not in args else int(args['candidates'])
_dc = _dc.append(data.maker.generate(**_args))
candidates = (data.maker.generate(**args))
if 'sql.BQWriter' in ostore['type'] :
#table = ".".join([ostore['['dataset'],args['context']])
# writer = factory.instance(**ostore)
_columns = None
skip_columns = []
_schema = [{"name":field.name,"type":field.field_type,"description":field.description} for field in schema] if schema else []
for _df in candidates :
#
# we need to format the fields here to make sure we have something cohesive
#
#
# We need to post the generate the data in order to :
# 1. compare immediately
# 2. synthetic copy
#
if not skip_columns :
# _columns = set(df.columns) - set(_df.columns)
if 'ignore' in args and 'columns' in args['ignore'] :
cols = _dc.columns.tolist()
for name in args['ignore']['columns'] :
for _name in _df.columns:
if _name in name:
skip_columns.append(_name)
#
# We perform a series of set operations to insure that the following conditions are met:
# - the synthetic dataset only has fields that need to be synthesized
# - The original dataset has all the fields except those that need to be synthesized
#
data_comp = _args['data'][args['columns']].join(_dc[args['columns']],rsuffix='_io') #-- will be used for comparison (store this in big query)
#
# performing basic analytics on the synthetic data generated (easy to quickly asses)
#
info = {"module":"generate","action":"io.metrics","input":{"rows":data_comp.shape[0],"partition":partition,"logs":[]}}
_df = _df[list(set(_df.columns) - set(skip_columns))]
#
# @TODO: Send data over to a process for analytics
if set(df.columns) & set(_df.columns) :
_columns = set(df.columns) - set(_df.columns)
df = df[_columns]
base_cols = list(set(_args['data'].columns) - set(args['columns'])) #-- rebuilt the dataset (and store it)
cols = _dc.columns.tolist()
for name in cols :
_args['data'][name] = _dc[name]
#
# Let us merge the dataset here and and have a comprehensive dataset
#
#-- Let us store all of this into bigquery
prefix = args['notify']+'.'+_args['context']
partition = str(partition)
table = '_'.join([prefix,partition,'io']).replace('__','_')
folder = os.sep.join([args['logs'],args['context'],partition,'output'])
if 'file' in args :
_df = pd.DataFrame.join(df,_df)
_fname = os.sep.join([folder,table.replace('_io','_full_io.csv')])
_pname = os.sep.join([folder,table])+'.csv'
data_comp.to_csv( _pname,index=False)
_args['data'].to_csv(_fname,index=False)
_id = 'path'
writer.write(_df,schema=_schema,table=args['from'])
# writer.write(df,table=table)
pass
else:
pass
credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json')
_pname = os.sep.join([folder,table+'.csv'])
_fname = table.replace('_io','_full_io')
partial = '.'.join(['io',args['context']+'_partial_io'])
complete= '.'.join(['io',args['context']+'_full_io'])
data_comp.to_csv(_pname,index=False)
if 'dump' in args :
print (_args['data'].head())
else:
Components.lock.acquire()
data_comp.to_gbq(if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000)
_args['data'].to_gbq(if_exists='append',destination_table=complete,credentials=credentials,chunksize=90000)
Components.lock.release()
_id = 'dataset'
info = {"full":{_id:_fname,"rows":_args['data'].shape[0]},"partial":{"path":_pname,"rows":data_comp.shape[0]} }
if partition :
info ['partition'] = int(partition)
logger.write({"module":"generate","action":"write","input":info} )
# #
# # We need to post the generate the data in order to :
# # 1. compare immediately
# # 2. synthetic copy
# #
# cols = _dc.columns.tolist()
# data_comp = _args['data'][args['columns']].join(_dc[args['columns']],rsuffix='_io') #-- will be used for comparison (store this in big query)
# #
# # performing basic analytics on the synthetic data generated (easy to quickly asses)
# #
# info = {"module":"generate","action":"io.metrics","input":{"rows":data_comp.shape[0],"partition":partition,"logs":[]}}
# #
# # @TODO: Send data over to a process for analytics
# base_cols = list(set(_args['data'].columns) - set(args['columns'])) #-- rebuilt the dataset (and store it)
# cols = _dc.columns.tolist()
# for name in cols :
# _args['data'][name] = _dc[name]
# #
# #-- Let us store all of this into bigquery
# prefix = args['notify']+'.'+_args['context']
# partition = str(partition)
# table = '_'.join([prefix,partition,'io']).replace('__','_')
# folder = os.sep.join([args['logs'],args['context'],partition,'output'])
# if 'file' in args :
# _fname = os.sep.join([folder,table.replace('_io','_full_io.csv')])
# _pname = os.sep.join([folder,table])+'.csv'
# data_comp.to_csv( _pname,index=False)
# _args['data'].to_csv(_fname,index=False)
# _id = 'path'
# else:
# credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json')
# _pname = os.sep.join([folder,table+'.csv'])
# _fname = table.replace('_io','_full_io')
# partial = '.'.join(['io',args['context']+'_partial_io'])
# complete= '.'.join(['io',args['context']+'_full_io'])
# data_comp.to_csv(_pname,index=False)
# if 'dump' in args :
# print (_args['data'].head())
# else:
# Components.lock.acquire()
# data_comp.to_gbq(if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000)
# _args['data'].to_gbq(if_exists='append',destination_table=complete,credentials=credentials,chunksize=90000)
# Components.lock.release()
# _id = 'dataset'
# info = {"full":{_id:_fname,"rows":_args['data'].shape[0]},"partial":{"path":_pname,"rows":data_comp.shape[0]} }
# if partition :
# info ['partition'] = int(partition)
# logger.write({"module":"generate","action":"write","input":info} )
@ -308,98 +326,95 @@ if __name__ == '__main__' :
# Log what was initiated so we have context of this processing ...
#
# if 'listen' not in SYS_ARGS :
if 'file' in args :
DATA = pd.read_csv(args['file']) ;
schema = []
else:
DATA = Components().get(args)
client = bq.Client.from_service_account_json(args["private_key"])
schema = client.get_table(client.dataset(args['dataset']).table(args['from'])).schema
# if 'file' in args :
# DATA = pd.read_csv(args['file']) ;
# schema = []
# else:
# DATA = Components().get(args)
# client = bq.Client.from_service_account_json(args["private_key"])
# schema = client.get_table(client.dataset(args['dataset']).table(args['from'])).schema
COLUMNS = DATA.columns
DATA = np.array_split(DATA,PART_SIZE)
args['schema'] = schema
# COLUMNS = DATA.columns
# DATA = np.array_split(DATA,PART_SIZE)
# args['schema'] = schema
if 'generate' in SYS_ARGS :
#
# Let us see if we have partitions given the log folder
content = os.listdir( os.sep.join([args['logs'],args['context']]))
content = os.listdir( os.sep.join([args['logs'],'train',args['context']]))
generator = Components()
if ''.join(content).isnumeric() :
#
# we have partitions we are working with
# if ''.join(content).isnumeric() :
# #
# # we have partitions we are working with
jobs = []
# jobs = []
# columns = DATA.columns.tolist()
# # columns = DATA.columns.tolist()
# DATA = np.array_split(DATA,PART_SIZE)
# # DATA = np.array_split(DATA,PART_SIZE)
for index in range(0,PART_SIZE) :
if 'focus' in args and int(args['focus']) != index :
#
# This handles failures/recoveries for whatever reason
# If we are only interested in generating data for a given partition
continue
# index = id.index(id)
# for index in range(0,PART_SIZE) :
# if 'focus' in args and int(args['focus']) != index :
# #
# # This handles failures/recoveries for whatever reason
# # If we are only interested in generating data for a given partition
# continue
# # index = id.index(id)
args['partition'] = index
args['data'] = DATA[index]
if int(args['num_gpu']) > 1 :
args['gpu'] = index
else:
args['gpu']=0
# args['partition'] = index
# args['data'] = DATA[index]
# if int(args['num_gpu']) > 1 :
# args['gpu'] = index
# else:
# args['gpu']=0
make = lambda _args: (Components()).generate(_args)
job = Process(target=make,args=(args,))
job.name = 'generator # '+str(index)
job.start()
jobs.append(job)
# if len(jobs) == 1 :
# job.join()
# make = lambda _args: (Components()).generate(_args)
# job = Process(target=make,args=(args,))
# job.name = 'generator # '+str(index)
# job.start()
# jobs.append(job)
# # if len(jobs) == 1 :
# # job.join()
print (["Started ",len(jobs),"generators" if len(jobs)>1 else "generator" ])
while len(jobs)> 0 :
jobs = [job for job in jobs if job.is_alive()]
time.sleep(2)
# print (["Started ",len(jobs),"generators" if len(jobs)>1 else "generator" ])
# while len(jobs)> 0 :
# jobs = [job for job in jobs if job.is_alive()]
# time.sleep(2)
# generator.generate(args)
else:
generator.generate(args)
# # generator.generate(args)
# else:
# generator.generate(args)
# Components.generate(args)
elif 'shuffle' in SYS_ARGS:
generator.generate(args)
for data in DATA :
args['data'] = data
_df = (Components()).shuffle(args)
else:
# DATA = np.array_split(DATA,PART_SIZE)
agent = Components()
agent.train(**args)
# jobs = []
# for index in range(0,PART_SIZE) :
# if 'focus' in args and int(args['focus']) != index :
# continue
# args['part_size'] = PART_SIZE
# args['partition'] = index
# args['data'] = DATA[index]
# if int(args['num_gpu']) > 1 :
# args['gpu'] = index
# else:
# args['gpu']=0
jobs = []
for index in range(0,PART_SIZE) :
if 'focus' in args and int(args['focus']) != index :
continue
args['part_size'] = PART_SIZE
args['partition'] = index
args['data'] = DATA[index]
if int(args['num_gpu']) > 1 :
args['gpu'] = index
else:
args['gpu']=0
make = lambda _args: (Components()).train(**_args)
job = Process(target=make,args=( dict(args),))
job.name = 'Trainer # ' + str(index)
job.start()
jobs.append(job)
# args['gpu']
print (["Started ",len(jobs),"trainers" if len(jobs)>1 else "trainer" ])
while len(jobs)> 0 :
jobs = [job for job in jobs if job.is_alive()]
time.sleep(2)
# make = lambda _args: (Components()).train(**_args)
# job = Process(target=make,args=( dict(args),))
# job.name = 'Trainer # ' + str(index)
# job.start()
# jobs.append(job)
# # args['gpu']
# print (["Started ",len(jobs),"trainers" if len(jobs)>1 else "trainer" ])
# while len(jobs)> 0 :
# jobs = [job for job in jobs if job.is_alive()]
# time.sleep(2)
# trainer = Components()
# trainer.train(**args)