bug fix with binary matrix generation

Steve Nyemba 2020-02-18 02:59:39 -06:00
parent ce55848cc8
commit 0f0c2642c2
3 changed files with 15 additions and 13 deletions

View File

@@ -191,12 +191,13 @@ class Binary :
         #
         # This will give us a map of how each column was mapped to a bitstream
-        _map = df.fillna(np.nan).apply(lambda column: self.__stream(column),axis=0)
+        # _map = df.fillna(np.nan).apply(lambda column: self.__stream(column),axis=0)
+        _map = df.fillna('').apply(lambda column: self.__stream(column),axis=0)
         #
         # We will merge this to have a healthy matrix
         _matrix = _map.apply(lambda row: list(list(itertools.chain(*row.values.tolist()))),axis=1)
-        _matrix = np.matrix([list(item) for item in _matrix])
+        _matrix = np.matrix([list(item) for item in _matrix]).astype(np.float32)
         #
         # let's format the map so we don't have an unreasonable amount of data
         #
@@ -210,7 +211,8 @@ class Binary :
             _m[name] = {"start":beg,"end":end}
             beg = end
-        return _m,_matrix.astype(np.float32)
+        # return _m,_matrix.astype(np.float32)
+        return _matrix
     def Import(self,df,values,_map):
         """

View File

@@ -397,17 +397,13 @@ class Train (GNet):
         labels_placeholder = tf.compat.v1.placeholder(shape=self._LABEL.shape, dtype=tf.float32)
         dataset = tf.data.Dataset.from_tensor_slices((features_placeholder, labels_placeholder))
         dataset = dataset.repeat(10000)
-        dataset = dataset.batch(batch_size=self.BATCHSIZE_PER_GPU)
+        dataset = dataset.batch(batch_size=3000)
         dataset = dataset.prefetch(1)
         # iterator = dataset.make_initializable_iterator()
         iterator = tf.compat.v1.data.make_initializable_iterator(dataset)
-        # next_element = iterator.get_next()
-        # init_op = iterator.initializer
         return iterator, features_placeholder, labels_placeholder
     def network(self,**args):
-        # def graph(stage, opt):
-        #     global_step = tf.get_variable(stage+'_step', [], initializer=tf.constant_initializer(0), trainable=False)
         stage = args['stage']
         opt = args['opt']
         tower_grads = []
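
For context, the hunk above hard-codes batch_size=3000 where the batch size previously came from self.BATCHSIZE_PER_GPU. A self-contained sketch of this placeholder / Dataset / initializable-iterator pattern follows, assuming TensorFlow 2.x running through the v1 compatibility layer in graph mode; the function name input_fn is illustrative, the real code is a method of the Train class.

import numpy as np
import tensorflow as tf

tf.compat.v1.disable_eager_execution()  # placeholders require graph mode

def input_fn(features, labels, batch_size=3000):
    features_placeholder = tf.compat.v1.placeholder(shape=features.shape, dtype=tf.float32)
    labels_placeholder = tf.compat.v1.placeholder(shape=labels.shape, dtype=tf.float32)
    dataset = tf.data.Dataset.from_tensor_slices((features_placeholder, labels_placeholder))
    dataset = dataset.repeat(10000)
    dataset = dataset.batch(batch_size=batch_size)
    dataset = dataset.prefetch(1)
    iterator = tf.compat.v1.data.make_initializable_iterator(dataset)
    return iterator, features_placeholder, labels_placeholder

# Usage: feed the real arrays when initializing the iterator, then pull batches
# iterator, x_ph, y_ph = input_fn(X, Y)
# with tf.compat.v1.Session() as sess:
#     sess.run(iterator.initializer, feed_dict={x_ph: X, y_ph: Y})
#     x_batch, y_batch = sess.run(iterator.get_next())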
@@ -540,8 +536,6 @@ class Predict(GNet):
             # The code below will insure we have some acceptable cardinal relationships between id and synthetic values
             #
             df = ( pd.DataFrame(np.round(f).astype(np.int32)))
-            print (df.head())
-            print ()
             p = 0 not in df.sum(axis=1).values
             if p:
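
The removed print statements were debug output around the cardinality guard; the guard itself accepts a batch only when every synthetic row has at least one bit set. A small sketch of what that check computes, using a hypothetical generator output f:

import numpy as np
import pandas as pd

f = np.array([[0.1, 0.9, 0.2],
              [0.8, 0.1, 0.0]])                  # hypothetical generator output
df = pd.DataFrame(np.round(f).astype(np.int32))  # threshold to a 0/1 matrix
p = 0 not in df.sum(axis=1).values               # True only when no row is all zeros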

View File

@@ -12,6 +12,7 @@ import pandas as pd
 import numpy as np
 import data.gan as gan
 from transport import factory
+from data.bridge import Binary
 import threading as thread
 def train (**args) :
     """
@@ -32,9 +33,12 @@ def train (**args) :
     # If we have several columns we will proceed one at a time (it could be done in separate threads)
     # @TODO : Consider performing this task on several threads/GPUs simulataneously
     #
-    args['label'] = pd.get_dummies(df[column_id]).astype(np.float32).values
+    handler = Binary()
+    # args['label'] = pd.get_dummies(df[column_id]).astype(np.float32).values
+    args['label'] = handler.Export(df[[column_id]])
     for col in column :
-        args['real'] = pd.get_dummies(df[col]).astype(np.float32).values
+        # args['real'] = pd.get_dummies(df[col]).astype(np.float32).values
+        args['real'] = handler.Export(df[[col]])
         args['column'] = col
         args['context'] = col
         context = args['context']
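
Note the calling-convention shift in this hunk: pd.get_dummies was given a Series (df[column_id]) while Binary.Export is given a one-column DataFrame (df[[column_id]]). A hedged sketch of the substitution, assuming the data.bridge package is importable and that Export returns a float32 bit matrix as in the first file of this commit; encode_label is an illustrative wrapper, not repository code.

import pandas as pd
from data.bridge import Binary   # requires the data package on the path

def encode_label(df, column_id):
    handler = Binary()
    # before: pd.get_dummies(df[column_id]).astype(np.float32).values  -- Series in, dummy columns out
    # after:  Export takes a one-column DataFrame, hence the double brackets
    return handler.Export(df[[column_id]])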
@@ -77,7 +81,9 @@ def generate(**args):
     #@TODO:
     # If the identifier is not present, we should fine a way to determine or make one
     #
-    args['label'] = pd.get_dummies(df[column_id]).astype(np.float32).values
+    # args['label'] = pd.get_dummies(df[column_id]).astype(np.float32).values
+    bwrangler = Binary()
+    args['label'] = bwrangler.Export(df[[column_id]])
     _df = df.copy()
     for col in column :
         args['context'] = col