Handling of continuous values
parent bd6fb03f8d
commit 3fbd68309f
@@ -604,7 +604,7 @@ class Predict(GNet):
     r = np.zeros(self.ROW_COUNT)
     df.columns = self.values
     if len(found):
-        print (len(found),NTH_VALID_CANDIDATE)
+        # print (len(found),NTH_VALID_CANDIDATE)
         # x = df * self.values
         #
         # let's get the missing rows (if any) ...
@@ -704,10 +704,10 @@ if __name__ == '__main__' :
     p = Predict(context=context,label=LABEL,values=values,column=column)
     p.load_meta(column)
     r = p.apply()
-    print (df)
-    print ()
+    # print (df)
+    # print ()
     df[column] = r[column]
-    print (df)
+    # print (df)

 else:
@@ -14,6 +14,68 @@ import data.gan as gan
 from transport import factory
+from data.bridge import Binary
+import threading as thread
+
+class ContinuousToDiscrete :
+    @staticmethod
+    def binary(X,n=4) :
+        """
+        This function converts a continuous stream of values into a one-hot
+        encoded bit matrix with n bins.
+        """
+        BOUNDS = ContinuousToDiscrete.bounds(X,n)
+        _matrix = []
+        for value in X :
+            x_ = np.zeros(n)
+            _matrix.append(x_)
+            for row in BOUNDS :
+                if value >= row.left and value <= row.right :
+                    index = BOUNDS.index(row)
+                    x_[index] = 1
+                    break
+        return np.array(_matrix)    # stacked so callers can .astype(...) directly
+
+    @staticmethod
+    def bounds(x,n):
+        return list(pd.cut(np.array(x),n).categories)
+
+    @staticmethod
+    def continuous(X,BIN_SIZE=4) :
+        """
+        This function approximates continuous values given bin boundary information
+        :X          continuous values (re-binned internally)
+        :BIN_SIZE   number of bins
+        """
+        BOUNDS = ContinuousToDiscrete.bounds(X,BIN_SIZE)
+        values = []
+        _BINARY = ContinuousToDiscrete.binary(X,BIN_SIZE)
+        for row in _BINARY :
+            index = np.where(row == 1)[0][0]
+            ubound = BOUNDS[ index ].right
+            lbound = BOUNDS[ index ].left
+            # draw a plausible value uniformly within the bin
+            x_ = np.round(np.random.uniform(lbound,ubound),3).astype(float)
+            values.append(x_)
+        return values
+
 def train (**args) :
     """
     This function is intended to train the GAN in order to learn about the distribution of the features
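A quick round-trip check of the class added above (a minimal sketch: the sample array and bin count are hypothetical, and the import path for ContinuousToDiscrete is assumed):

import numpy as np
import pandas as pd
from data.maker import ContinuousToDiscrete   # module path assumed

x = np.array([1.2, 3.4, 2.2, 9.8, 5.5, 7.1, 0.3, 6.6])
bins = list(pd.cut(x, 4).categories)           # same intervals bounds() returns
onehot = ContinuousToDiscrete.binary(x, 4)     # one one-hot row per value
decoded = ContinuousToDiscrete.continuous(x, 4)
# each decoded value is a uniform draw from the pd.cut interval selected by
# its one-hot row, so it stays inside the observed range of x
print(bins)
print(onehot)
print(decoded)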
@@ -24,22 +86,30 @@ def train (**args) :
     :context label of what we are synthesizing
     """
     column = args['column'] if (isinstance(args['column'],list)) else [args['column']]
+    CONTINUOUS = args['continuous'] if 'continuous' in args else []
     # column_id = args['id']
     df = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data'])
     df.columns = [name.lower() for name in df.columns]

     #
     # @TODO:
     # Consider sequential training of sub population for extremely large datasets
     #

     #
     # If we have several columns we will proceed one at a time (it could be done in separate threads)
     # @TODO : Consider performing this task on several threads/GPUs simultaneously
     #
     handler = Binary()
     # args['label'] = pd.get_dummies(df[column_id]).astype(np.float32).values
     # args['label'] = handler.Export(df[[column_id]])
     # args['label'] = np.ones(df.shape[0]).reshape(df.shape[0],1)
-    for col in column :
-        args['real'] = pd.get_dummies(df[col].fillna('')).astype(np.float32).values
-        # args['real'] = handler.Export(df[[col]])
+    for col in column :
+        # continuous float columns are binned into a bit matrix, everything
+        # else is one-hot encoded with pd.get_dummies
+        if 'float' in df[col].dtypes.name and col in CONTINUOUS:
+            BIN_SIZE = 10 if 'bin_size' not in args else int(args['bin_size'])
+            args['real'] = ContinuousToDiscrete.binary(df[col],BIN_SIZE).astype(np.float32)
+        else:
+            args['real'] = pd.get_dummies(df[col].fillna('')).astype(np.float32).values

         args['column'] = col
         args['context'] = col
         context = args['context']
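The loop above decides the encoding the GAN trains on per column. A minimal sketch of what args['real'] ends up holding (the frame and column names are hypothetical; the import path is assumed):

import numpy as np
import pandas as pd
from data.maker import ContinuousToDiscrete   # module path assumed

df = pd.DataFrame({'gender': ['M','F','F','M'],
                   'age':    [23.0, 41.5, 36.2, 58.9]})
CONTINUOUS = ['age']
for col in df.columns :
    if 'float' in df[col].dtypes.name and col in CONTINUOUS:
        real = ContinuousToDiscrete.binary(df[col], 10).astype(np.float32)
    else:
        real = pd.get_dummies(df[col].fillna('')).astype(np.float32).values
    print(col, real.shape)    # gender -> (4, 2), age -> (4, 10)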
@@ -75,7 +145,7 @@ def generate(**args):
     """
     # df = args['data']
     df = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data'])

+    CONTINUOUS = args['continuous'] if 'continuous' in args else []
     column = args['column'] if (isinstance(args['column'],list)) else [args['column']]
     # column_id = args['id']
     #
@@ -86,18 +156,26 @@ def generate(**args):
     for col in column :
         args['context'] = col
         args['column'] = col
-        values = df[col].unique().tolist()
-        args['values'] = values
-        args['row_count'] = df.shape[0]
+        if 'float' in df[col].dtypes.name or col in CONTINUOUS :
+            #
+            # create the bins for the continuous values we are observing here
+            BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size'])
+            values = ContinuousToDiscrete.continuous(df[col].values,BIN_SIZE)
+        else:
+            values = df[col].unique().tolist()
+
+        args['values'] = values
+        args['row_count'] = df.shape[0]
         #
         # we can determine the cardinalities here so we know what to allow or disallow
         handler = gan.Predict (**args)
         handler.load_meta(col)
         r = handler.apply()
+        print ([_df.shape,len(r[col])])
         _df[col] = r[col]
         #
         # @TODO: log basic stats about the synthetic attribute
         #
         # break
     return _df
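On the generation side, continuous columns are decoded through continuous(), so every synthetic candidate value is a uniform draw inside one of the pd.cut bins observed in the original data. A sketch of just that decode step (the frame, column name, and import path are hypothetical):

import pandas as pd
from data.maker import ContinuousToDiscrete   # module path assumed

df = pd.DataFrame({'age': [23.0, 41.5, 36.2, 58.9]})   # hypothetical input
values = ContinuousToDiscrete.continuous(df['age'].values, 4)
# values holds plausible floats bounded by the observed bins; generate()
# passes them to gan.Predict as args['values'] for the column
print(values)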
@@ -17,9 +17,9 @@ if 'config' in SYS_ARGS :
     odf = pd.read_csv (ARGS['data'])
     odf.columns = [name.lower() for name in odf.columns]
     column = ARGS['column'] if isinstance(ARGS['column'],list) else [ARGS['column']]
-    print (odf.head())
-    print (_df.head())
-    # print(pd.merge(odf,_df,rsuffix='_io'))
+    # print (odf.head())
+    # print (_df.head())
+    print(odf.join(_df[column],rsuffix='_io'))
     # print (_df[column].risk.evaluate(flag='synth'))
     # print (odf[column].risk.evaluate(flag='original'))
     # _x = pd.get_dummies(_df[column]).values
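The evaluation script now compares original and synthetic data side by side: DataFrame.join aligns the two frames on the row index, and rsuffix='_io' renames the synthetic copy of any overlapping column. A small sketch with made-up frames:

import pandas as pd

odf = pd.DataFrame({'age': [23.0, 41.5]})      # original data
_df = pd.DataFrame({'age': [24.1, 40.2]})      # synthetic data (hypothetical)
print(odf.join(_df[['age']], rsuffix='_io'))   # columns: age, age_io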