bug fix: zeros matrix and continuous variables
This commit is contained in:
parent
20ee62178a
commit
e0601edea5
|
@ -16,6 +16,9 @@ import numpy as np
|
|||
# import cupy as cp
|
||||
import sys
|
||||
import os
|
||||
#
|
||||
# The following is to address the issue over creating a large matrix ...
|
||||
#
|
||||
# from multiprocessing import Process, Queue
|
||||
|
||||
# if 'GPU' in os.environ :
|
||||
|
@ -230,8 +233,11 @@ class Input :
|
|||
cols = np.array(cols)
|
||||
row_count = len(rows)
|
||||
# if 'GPU' not in os.environ :
|
||||
_matrix = np.zeros([row_count,cols.size])
|
||||
|
||||
# _matrix = np.zeros([row_count,cols.size],dtype=int)
|
||||
#
|
||||
# @NOTE: For some reason, there is an out of memory error created here, this seems to fix it (go figure)
|
||||
#
|
||||
_matrix = np.array([np.zeros(cols.size) for i in np.arange(row_count)])
|
||||
[np.put(_matrix[i], np.where(cols == rows[i]) ,1)for i in np.arange(row_count) if np.where(cols == rows[i])[0].size > 0]
|
||||
# else:
|
||||
# _matrix = cp.zeros([row_count,cols.size])
|
||||
|
|
68
pipeline.py
68
pipeline.py
|
@ -122,10 +122,20 @@ class Components :
|
|||
_args = copy.deepcopy(args)
|
||||
# _args['store'] = args['store']['source']
|
||||
_args['data'] = df
|
||||
#
|
||||
# The columns that are continuous should also be skipped because they don't need to be synthesized (they are approximated instead)
|
||||
if 'continuous' in args :
|
||||
x_cols = args['continuous']
|
||||
else:
|
||||
x_cols = []
|
||||
|
||||
if 'ignore' in args and 'columns' in args['ignore'] :
|
||||
_cols = self.get_ignore(data=df,columns=args['ignore']['columns'])
|
||||
_args['data'] = df[ list(set(df.columns)- set(_cols))]
|
||||
|
||||
#
|
||||
# We need to make sure that continuous columns are removed
|
||||
if x_cols :
|
||||
_args['data'] = df[list(set(df.columns) - set(x_cols))]
|
||||
data.maker.train(**_args)
|
||||
|
||||
if 'autopilot' in ( list(args.keys())) :
|
||||
|
@ -136,7 +146,26 @@ class Components :
|
|||
|
||||
pass
|
||||
|
||||
def post(self,args):
|
||||
def approximate(self, values):
    """
    Perturb a numeric column so the result resembles the input without copying it.

    Every entry v is replaced by v + v*r or v - v*r (the sign is drawn at random),
    where r is that entry's share of a single Dirichlet draw. The perturbed
    entries are then shuffled before being returned.

    :param values   array (or pandas Series) of values to be approximated
    :return         numpy array of perturbed values when the input is integer or
                    floating point; otherwise the input object, unchanged
    """
    # Work on a plain ndarray: positional indexing on a pandas Series is
    # label-based and breaks as soon as the index is not 0..n-1.
    data = np.asarray(values)
    is_integer = np.issubdtype(data.dtype, np.integer)
    if not (is_integer or np.issubdtype(data.dtype, np.floating)):
        return values
    if data.size == 0:
        # Nothing to perturb, and a Dirichlet draw needs at least one weight.
        return data.copy()

    # Dirichlet concentration parameters must be strictly positive, so weight
    # by magnitude and give zero / non-finite entries a neutral weight of 1.
    # For strictly positive input this is the same weighting as before.
    alpha = np.abs(data).astype(float)
    alpha[~np.isfinite(alpha) | (alpha == 0)] = 1.0
    ratios = np.random.dirichlet(alpha)

    # One coin flip per entry decides whether the offset is added or removed.
    signs = np.where(np.random.choice([0, 1], data.size) == 1, 1.0, -1.0)
    perturbed = data + signs * (data * ratios)

    if is_integer:
        # Truncate toward zero, as int() did on each element.
        perturbed = perturbed.astype(int)
    else:
        perturbed = np.round(perturbed, 2)

    np.random.shuffle(perturbed)
    return perturbed
|
||||
|
||||
|
||||
|
@ -179,9 +208,22 @@ class Components :
|
|||
_dc = pd.DataFrame()
|
||||
# for mdf in df :
|
||||
args['data'] = df
|
||||
#
|
||||
# The columns that are continuous should also be skipped because they don't need to be synthesized (they are approximated instead)
|
||||
if 'continuous' in args :
|
||||
x_cols = args['continuous']
|
||||
else:
|
||||
x_cols = []
|
||||
|
||||
if 'ignore' in args and 'columns' in args['ignore'] :
|
||||
_cols = self.get_ignore(data=df,columns=args['ignore']['columns'])
|
||||
args['data'] = df[ list(set(df.columns)- set(_cols))]
|
||||
#
|
||||
# We need to remove the continuous columns from the data-frame
|
||||
# @TODO: Abstract this !!
|
||||
#
|
||||
if x_cols :
|
||||
args['data'] = df[list(set(df.columns) - set(x_cols))]
|
||||
|
||||
args['candidates'] = 1 if 'candidates' not in args else int(args['candidates'])
|
||||
|
||||
|
@ -192,7 +234,10 @@ class Components :
|
|||
_columns = None
|
||||
skip_columns = []
|
||||
_schema = schema
|
||||
cols = [_item['name'] for _item in _schema]
|
||||
if schema :
|
||||
cols = [_item['name'] for _item in _schema]
|
||||
else:
|
||||
cols = df.columns
|
||||
for _df in candidates :
|
||||
#
|
||||
# we need to format the fields here to make sure we have something cohesive
|
||||
|
@ -206,6 +251,9 @@ class Components :
|
|||
# for _name in _df.columns:
|
||||
# if _name in name:
|
||||
# skip_columns.append(_name)
|
||||
if x_cols :
|
||||
for _col in x_cols :
|
||||
_df[_col] = self.approximate(df[_col])
|
||||
#
|
||||
# We perform a series of set operations to ensure that the following conditions are met:
|
||||
# - the synthetic dataset only has fields that need to be synthesized
|
||||
|
@ -222,10 +270,16 @@ class Components :
|
|||
# Let us merge the datasets here and have a comprehensive dataset
|
||||
|
||||
_df = pd.DataFrame.join(df,_df)
|
||||
for _item in _schema :
|
||||
if _item['type'] in ['DATE','TIMESTAMP','DATETIME'] :
|
||||
_df[_item['name']] = _df[_item['name']].astype(str)
|
||||
writer.write(_df[cols],schema=_schema,table=args['from'])
|
||||
if _schema :
|
||||
for _item in _schema :
|
||||
if _item['type'] in ['DATE','TIMESTAMP','DATETIME'] :
|
||||
_df[_item['name']] = _df[_item['name']].astype(str)
|
||||
|
||||
pass
|
||||
if _schema :
|
||||
writer.write(_df[cols],schema=_schema,table=args['from'])
|
||||
else:
|
||||
writer.write(_df[cols],table=args['from'])
|
||||
# writer.write(df,table=table)
|
||||
pass
|
||||
else:
|
||||
|
|
2
setup.py
2
setup.py
|
@ -5,7 +5,7 @@ import sys
|
|||
def read(fname):
    """
    Return the full text of a file located next to this script.

    :param fname    file name, resolved relative to this file's directory
    :return         the file's contents as a string
    """
    path = os.path.join(os.path.dirname(__file__), fname)
    # Use a context manager so the handle is closed even if read() fails.
    with open(path) as handle:
        return handle.read()
|
||||
args = {"name":"data-maker",
|
||||
"version":"1.4.3",
|
||||
"version":"1.4.4",
|
||||
"author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT",
|
||||
"packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]}
|
||||
args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo']
|
||||
|
|
Loading…
Reference in New Issue