Bug fix in binary matrix generation

Steve Nyemba 2020-02-18 02:59:39 -06:00
parent ce55848cc8
commit 0f0c2642c2
3 changed files with 15 additions and 13 deletions

View File

@@ -191,12 +191,13 @@ class Binary :
#
# This will give us a map of how each column was mapped to a bitstream
_map = df.fillna(np.nan).apply(lambda column: self.__stream(column),axis=0)
# _map = df.fillna(np.nan).apply(lambda column: self.__stream(column),axis=0)
_map = df.fillna('').apply(lambda column: self.__stream(column),axis=0)
#
# We will merge this to have a healthy matrix
_matrix = _map.apply(lambda row: list(list(itertools.chain(*row.values.tolist()))),axis=1)
_matrix = np.matrix([list(item) for item in _matrix])
_matrix = np.matrix([list(item) for item in _matrix]).astype(np.float32)
#
# let's format the map so we don't have an unreasonable amount of data
#
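Why the fillna swap matters: fillna(np.nan) is effectively a no-op, so missing cells reach the per-column encoder as NaN, and NaN never compares equal to itself; fillna('') gives every hole one consistent, encodable token. A minimal sketch, with a toy stand-in for the private Binary.__stream encoder:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"code": ["a", None, "b"]})

    def stream(column):
        # toy stand-in for Binary.__stream: one-hot each value as a bit list
        values = column.unique().tolist()
        return column.apply(lambda v: [int(v == u) for u in values])

    # With fillna(np.nan) the hole stays NaN, and NaN != NaN, so the missing
    # row maps to an all-zero bitstream plus a dead NaN column; fillna('')
    # turns every hole into one consistent, encodable token instead.
    bits = df.fillna('').apply(stream, axis=0)
    print(bits["code"].tolist())   # [[1, 0, 0], [0, 1, 0], [0, 0, 1]]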
@@ -210,7 +211,8 @@ class Binary :
_m[name] = {"start":beg,"end":end}
beg = end
return _m,_matrix.astype(np.float32)
# return _m,_matrix.astype(np.float32)
return _matrix
def Import(self,df,values,_map):
"""

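Note the changed contract: Export now returns only the float32 matrix, not the (map, matrix) tuple, so callers that unpacked two values will break, and Import(self,df,values,_map) must now obtain _map some other way. A hypothetical call-site before/after (the frame and column are illustrative):

    import pandas as pd
    from data.bridge import Binary   # as imported in the pipeline file below

    handler = Binary()
    df = pd.DataFrame({"id": ["x", "y", "x"]})

    # before this commit: Export returned (column_map, matrix)
    # _map, matrix = handler.Export(df[["id"]])

    # after this commit: Export returns only the float32 matrix
    matrix = handler.Export(df[["id"]])
    print(matrix.dtype)   # float32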
View File

@@ -397,17 +397,13 @@ class Train (GNet):
labels_placeholder = tf.compat.v1.placeholder(shape=self._LABEL.shape, dtype=tf.float32)
dataset = tf.data.Dataset.from_tensor_slices((features_placeholder, labels_placeholder))
dataset = dataset.repeat(10000)
dataset = dataset.batch(batch_size=self.BATCHSIZE_PER_GPU)
dataset = dataset.batch(batch_size=3000)
dataset = dataset.prefetch(1)
# iterator = dataset.make_initializable_iterator()
iterator = tf.compat.v1.data.make_initializable_iterator(dataset)
# next_element = iterator.get_next()
# init_op = iterator.initializer
return iterator, features_placeholder, labels_placeholder
def network(self,**args):
# def graph(stage, opt):
# global_step = tf.get_variable(stage+'_step', [], initializer=tf.constant_initializer(0), trainable=False)
stage = args['stage']
opt = args['opt']
tower_grads = []
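For reference, the input pipeline above reads as follows outside the class; note the batch size is now hard-coded to 3000 rather than self.BATCHSIZE_PER_GPU, and the iterator comes from the tf.compat.v1 factory instead of the deprecated dataset method. A standalone sketch on synthetic data, assuming TF 1.x graph-mode execution:

    import numpy as np
    import tensorflow as tf

    tf.compat.v1.disable_eager_execution()   # the pipeline is TF1 graph-mode

    features = np.random.rand(6000, 8).astype(np.float32)   # synthetic stand-in

    features_placeholder = tf.compat.v1.placeholder(shape=features.shape, dtype=tf.float32)
    dataset = tf.data.Dataset.from_tensor_slices(features_placeholder)
    dataset = dataset.repeat(10000)
    dataset = dataset.batch(batch_size=3000)   # hard-coded, no longer BATCHSIZE_PER_GPU
    dataset = dataset.prefetch(1)
    iterator = tf.compat.v1.data.make_initializable_iterator(dataset)

    with tf.compat.v1.Session() as sess:
        sess.run(iterator.initializer, feed_dict={features_placeholder: features})
        print(sess.run(iterator.get_next()).shape)   # (3000, 8)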
@@ -540,8 +536,6 @@ class Predict(GNet):
# The code below will ensure we have acceptable cardinality relationships between id and synthetic values
#
df = ( pd.DataFrame(np.round(f).astype(np.int32)))
print (df.head())
print ()
p = 0 not in df.sum(axis=1).values
if p:

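The cardinality check kept here is just a row-sum test: after rounding, every synthetic row should activate at least one bit, so a zero row-sum flags a row that decoded to nothing. A small illustration:

    import numpy as np
    import pandas as pd

    # rounded generator output: each synthetic row should one-hot some value
    f = np.array([[0.9, 0.1], [0.2, 0.8], [0.1, 0.2]])
    df = pd.DataFrame(np.round(f).astype(np.int32))

    # a zero row-sum means a row activated no category at all
    p = 0 not in df.sum(axis=1).values
    print(df.sum(axis=1).tolist(), p)   # [1, 1, 0] False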
View File

@@ -12,6 +12,7 @@ import pandas as pd
import numpy as np
import data.gan as gan
from transport import factory
from data.bridge import Binary
import threading as thread
def train (**args) :
"""
@@ -32,9 +33,12 @@ def train (**args) :
# If we have several columns we will proceed one at a time (it could be done in separate threads)
# @TODO : Consider performing this task on several threads/GPUs simultaneously
#
args['label'] = pd.get_dummies(df[column_id]).astype(np.float32).values
handler = Binary()
# args['label'] = pd.get_dummies(df[column_id]).astype(np.float32).values
args['label'] = handler.Export(df[[column_id]])
for col in column :
args['real'] = pd.get_dummies(df[col]).astype(np.float32).values
# args['real'] = pd.get_dummies(df[col]).astype(np.float32).values
args['real'] = handler.Export(df[[col]])
args['column'] = col
args['context'] = col
context = args['context']
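Both the label and the per-column real values now go through the shared Binary encoder instead of pd.get_dummies, so training and generation encode data on one code path; note that Export takes a DataFrame slice df[[col]], not a Series df[col]. A hypothetical comparison of the two paths (the column name is invented):

    import numpy as np
    import pandas as pd
    from data.bridge import Binary

    df = pd.DataFrame({"race": ["white", "black", "white", "asian"]})
    handler = Binary()

    # old path: pandas one-hot per column, float32 ndarray
    old = pd.get_dummies(df["race"]).astype(np.float32).values

    # new path: the shared Binary encoder, fed a DataFrame slice
    new = handler.Export(df[["race"]])
    print(old.shape, new.shape)   # row counts match; bit widths may differ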
@@ -77,7 +81,9 @@ def generate(**args):
#@TODO:
# If the identifier is not present, we should find a way to determine or make one
#
args['label'] = pd.get_dummies(df[column_id]).astype(np.float32).values
# args['label'] = pd.get_dummies(df[column_id]).astype(np.float32).values
bwrangler = Binary()
args['label'] = bwrangler.Export(df[[column_id]])
_df = df.copy()
for col in column :
args['context'] = col
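generate() mirrors the change in train(): the same encoder produces the label matrix, so the bit layout the generator was trained on is the one it is conditioned on here. A small sketch of that invariant (the identifier column is invented):

    import pandas as pd
    from data.bridge import Binary

    df = pd.DataFrame({"id": [1, 2, 1, 3]})   # illustrative identifier column
    bwrangler = Binary()

    # both phases call the same encoder on the same slice, so the label
    # layout seen at training time matches the one used at generation time
    label_train = bwrangler.Export(df[["id"]])
    label_gen = bwrangler.Export(df[["id"]])
    assert (label_train == label_gen).all()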