bug fix around shape of candidate data to generate

This commit is contained in:
Steve Nyemba 2020-02-25 11:41:40 -06:00
parent c1a500fe4c
commit 553ee75a06
2 changed files with 39 additions and 18 deletions

View File

@ -166,7 +166,15 @@ class GNet :
return _object return _object
def mkdir (self,path): def mkdir (self,path):
if not os.path.exists(path) : if not os.path.exists(path) :
os.mkdir(path) if os.sep in path :
pass
root = []
for loc in path.split(os.sep) :
root.append(loc)
os.mkdir(os.sep.join(root))
else:
os.mkdir(path)
def normalize(self,**args): def normalize(self,**args):
@ -520,8 +528,10 @@ class Predict(GNet):
""" """
def __init__(self,**args): def __init__(self,**args):
GNet.__init__(self,**args) GNet.__init__(self,**args)
self.generator = Generator(**args) self.generator = Generator(**args)
self.values = args['values'] self.values = args['values']
self.ROW_COUNT = args['row_count']
self.MISSING_VALUES = args['no_value']
def load_meta(self, column): def load_meta(self, column):
super().load_meta(column) super().load_meta(column)
self.generator.load_meta(column) self.generator.load_meta(column)
@ -532,8 +542,8 @@ class Predict(GNet):
model_dir = os.sep.join([self.train_dir,suffix+'-'+str(self.MAX_EPOCHS)]) model_dir = os.sep.join([self.train_dir,suffix+'-'+str(self.MAX_EPOCHS)])
demo = self._LABEL #np.zeros([self.ROW_COUNT,self.NUM_LABELS]) #args['de"shape":{"LABEL":list(self._LABEL.shape)} mo'] demo = self._LABEL #np.zeros([self.ROW_COUNT,self.NUM_LABELS]) #args['de"shape":{"LABEL":list(self._LABEL.shape)} mo']
tf.compat.v1.reset_default_graph() tf.compat.v1.reset_default_graph()
z = tf.random.normal(shape=[self.BATCHSIZE_PER_GPU, self.Z_DIM]) z = tf.random.normal(shape=[self.ROW_COUNT, self.Z_DIM])
y = tf.compat.v1.placeholder(shape=[self.BATCHSIZE_PER_GPU, self.NUM_LABELS], dtype=tf.int32) y = tf.compat.v1.placeholder(shape=[self.ROW_COUNT, self.NUM_LABELS], dtype=tf.int32)
if self._LABEL is not None : if self._LABEL is not None :
ma = [[i] for i in np.arange(self.NUM_LABELS - 2)] ma = [[i] for i in np.arange(self.NUM_LABELS - 2)]
label = y[:, 1] * len(ma) + tf.squeeze(tf.matmul(y[:, 2:], tf.constant(ma, dtype=tf.int32))) label = y[:, 1] * len(ma) + tf.squeeze(tf.matmul(y[:, 2:], tf.constant(ma, dtype=tf.int32)))
@ -556,7 +566,7 @@ class Predict(GNet):
labels = None labels = None
found = [] found = []
ratio = []
for i in np.arange(CANDIDATE_COUNT) : for i in np.arange(CANDIDATE_COUNT) :
if labels : if labels :
f = sess.run(fake,feed_dict={y:labels}) f = sess.run(fake,feed_dict={y:labels})
@ -569,10 +579,11 @@ class Predict(GNet):
df = ( pd.DataFrame(np.round(f).astype(np.int32))) df = ( pd.DataFrame(np.round(f).astype(np.int32)))
p = 0 not in df.sum(axis=1).values p = 0 not in df.sum(axis=1).values
x = df.sum(axis=1).values x = df.sum(axis=1).values
print ( [np.sum(x),x.size])
if np.divide( np.sum(x), x.size) : if np.divide( np.sum(x), x.size) > .9 or p:
ratio.append(np.divide( np.sum(x), x.size))
found.append(df) found.append(df)
if len(found) == NTH_VALID_CANDIDATE or i == CANDIDATE_COUNT: if i == CANDIDATE_COUNT:
break break
else: else:
continue continue
@ -582,8 +593,9 @@ class Predict(GNet):
# #
# In case we are dealing with actual values like diagnosis codes we can perform # In case we are dealing with actual values like diagnosis codes we can perform
# #
INDEX = np.random.choice(np.arange(len(found)),1)[0]
df = found[np.random.choice(np.arange(len(found)),1)[0]] INDEX = ratio.index(np.max(ratio))
df = found[INDEX]
columns = self.ATTRIBUTES['synthetic'] if isinstance(self.ATTRIBUTES['synthetic'],list)else [self.ATTRIBUTES['synthetic']] columns = self.ATTRIBUTES['synthetic'] if isinstance(self.ATTRIBUTES['synthetic'],list)else [self.ATTRIBUTES['synthetic']]
# r = np.zeros((self.ROW_COUNT,len(columns))) # r = np.zeros((self.ROW_COUNT,len(columns)))
@ -592,9 +604,20 @@ class Predict(GNet):
if len(found): if len(found):
print (len(found),NTH_VALID_CANDIDATE) print (len(found),NTH_VALID_CANDIDATE)
# x = df * self.values # x = df * self.values
#
df = pd.DataFrame( df.apply(lambda row: self.values[np.random.choice(np.where(row != 0)[0],1)[0]] ,axis=1)) # let's get the missing rows (if any) ...
#
ii = df.apply(lambda row: np.sum(row) == 0 ,axis=1)
if ii :
#
#@TODO Have this be a configurable variable
missing = np.repeat(0, np.where(ii==1)[0].size)
else:
missing = []
i = np.where(ii == 0)[0]
df = pd.DataFrame( df.iloc.apply(lambda row: self.values[np.random.choice(np.where(row != 0)[0],1)[0]] ,axis=1))
df.columns = columns df.columns = columns
df = df[columns[0]].append(pd.Series(missing))

View File

@ -77,25 +77,23 @@ def generate(**args):
df = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data']) df = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data'])
column = args['column'] if (isinstance(args['column'],list)) else [args['column']] column = args['column'] if (isinstance(args['column'],list)) else [args['column']]
column_id = args['id'] # column_id = args['id']
# #
#@TODO: #@TODO:
# If the identifier is not present, we should fine a way to determine or make one # If the identifier is not present, we should fine a way to determine or make one
# #
# args['label'] = pd.get_dummies(df[column_id]).astype(np.float32).values
bwrangler = Binary()
# args['label'] = bwrangler.Export(df[[column_id]])
_df = df.copy() _df = df.copy()
for col in column : for col in column :
args['context'] = col args['context'] = col
args['column'] = col args['column'] = col
values = df[col].unique().tolist() values = df[col].unique().tolist()
# values.sort()
args['values'] = values args['values'] = values
args['row_count'] = df.shape[0]
# #
# we can determine the cardinalities here so we know what to allow or disallow # we can determine the cardinalities here so we know what to allow or disallow
handler = gan.Predict (**args) handler = gan.Predict (**args)
handler.load_meta(col) handler.load_meta(col)
# handler.ROW_COUNT = df[col].shape[0]
r = handler.apply() r = handler.apply()
# print (r) # print (r)
_df[col] = r[col] _df[col] = r[col]