bug fix: zeros matrix and continuous variables
This commit is contained in:
parent
20ee62178a
commit
e0601edea5
|
@ -16,6 +16,9 @@ import numpy as np
|
||||||
# import cupy as cp
|
# import cupy as cp
|
||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
|
#
|
||||||
|
# The following is to address the issue over creating a large matrix ...
|
||||||
|
#
|
||||||
# from multiprocessing import Process, Queue
|
# from multiprocessing import Process, Queue
|
||||||
|
|
||||||
# if 'GPU' in os.environ :
|
# if 'GPU' in os.environ :
|
||||||
|
@ -230,8 +233,11 @@ class Input :
|
||||||
cols = np.array(cols)
|
cols = np.array(cols)
|
||||||
row_count = len(rows)
|
row_count = len(rows)
|
||||||
# if 'GPU' not in os.environ :
|
# if 'GPU' not in os.environ :
|
||||||
_matrix = np.zeros([row_count,cols.size])
|
# _matrix = np.zeros([row_count,cols.size],dtype=int)
|
||||||
|
#
|
||||||
|
# @NOTE: For some reason, there is an out of memory error created here, this seems to fix it (go figure)
|
||||||
|
#
|
||||||
|
_matrix = np.array([np.zeros(cols.size) for i in np.arange(row_count)])
|
||||||
[np.put(_matrix[i], np.where(cols == rows[i]) ,1)for i in np.arange(row_count) if np.where(cols == rows[i])[0].size > 0]
|
[np.put(_matrix[i], np.where(cols == rows[i]) ,1)for i in np.arange(row_count) if np.where(cols == rows[i])[0].size > 0]
|
||||||
# else:
|
# else:
|
||||||
# _matrix = cp.zeros([row_count,cols.size])
|
# _matrix = cp.zeros([row_count,cols.size])
|
||||||
|
|
58
pipeline.py
58
pipeline.py
|
@ -122,10 +122,20 @@ class Components :
|
||||||
_args = copy.deepcopy(args)
|
_args = copy.deepcopy(args)
|
||||||
# _args['store'] = args['store']['source']
|
# _args['store'] = args['store']['source']
|
||||||
_args['data'] = df
|
_args['data'] = df
|
||||||
|
#
|
||||||
|
# The columns that are continuous should also be skipped because they don't need to be synthesized (like-that)
|
||||||
|
if 'continuous' in args :
|
||||||
|
x_cols = args['continuous']
|
||||||
|
else:
|
||||||
|
x_cols = []
|
||||||
|
|
||||||
if 'ignore' in args and 'columns' in args['ignore'] :
|
if 'ignore' in args and 'columns' in args['ignore'] :
|
||||||
_cols = self.get_ignore(data=df,columns=args['ignore']['columns'])
|
_cols = self.get_ignore(data=df,columns=args['ignore']['columns'])
|
||||||
_args['data'] = df[ list(set(df.columns)- set(_cols))]
|
_args['data'] = df[ list(set(df.columns)- set(_cols))]
|
||||||
|
#
|
||||||
|
# We need to make sure that continuous columns are removed
|
||||||
|
if x_cols :
|
||||||
|
_args['data'] = df[list(set(df.columns) - set(x_cols))]
|
||||||
data.maker.train(**_args)
|
data.maker.train(**_args)
|
||||||
|
|
||||||
if 'autopilot' in ( list(args.keys())) :
|
if 'autopilot' in ( list(args.keys())) :
|
||||||
|
@ -136,7 +146,26 @@ class Components :
|
||||||
|
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def post(self,args):
|
def approximate(self,values):
    """
    Return a randomly perturbed and shuffled copy of a numeric column.

    Every finite, non-zero entry v becomes v + s * |v| * r, where s is a
    random sign (+1 or -1) and r is that entry's share of a single Dirichlet
    draw whose concentration parameters are the magnitudes of the usable
    entries. Zero, NaN and infinite entries are left unperturbed because
    they cannot serve as Dirichlet concentration parameters.

    :param values   array of values to be approximated (1-D numpy array or
                    pandas Series; positions are used, never index labels)
    :return         numpy array of the same length; integer input is truncated
                    toward zero and keeps its dtype, floating input is rounded
                    to 2 decimals. The result is shuffled, so positions do not
                    correspond to the input rows. Non-numeric input is returned
                    unchanged.
    """
    # Work on a positional numpy view: indexing a Series with 0..n-1 is
    # label-based and breaks as soon as the index is not the default one.
    data = np.asarray(values)
    is_integer = np.issubdtype(data.dtype, np.integer)
    if not (is_integer or np.issubdtype(data.dtype, np.floating)):
        return values
    if data.size == 0:
        return data.copy()

    magnitude = np.abs(data.astype(float))
    # np.random.dirichlet rejects alpha <= 0, so only strictly positive,
    # finite magnitudes take part in the draw.
    usable = np.isfinite(magnitude) & (magnitude > 0)
    delta = np.zeros(data.shape, dtype=float)
    if usable.any():
        delta[usable] = magnitude[usable] * np.random.dirichlet(magnitude[usable])

    # Each entry is independently pushed up or down by its own delta.
    signs = np.random.choice([-1.0, 1.0], size=data.shape)
    result = data + signs * delta

    if is_integer:
        # astype truncates toward zero, matching int(value)
        result = result.astype(data.dtype)
    else:
        result = np.round(result, 2)

    np.random.shuffle(result)
    return result
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
@ -179,9 +208,22 @@ class Components :
|
||||||
_dc = pd.DataFrame()
|
_dc = pd.DataFrame()
|
||||||
# for mdf in df :
|
# for mdf in df :
|
||||||
args['data'] = df
|
args['data'] = df
|
||||||
|
#
|
||||||
|
# The columns that are continuous should also be skipped because they don't need to be synthesized (like-that)
|
||||||
|
if 'continuous' in args :
|
||||||
|
x_cols = args['continuous']
|
||||||
|
else:
|
||||||
|
x_cols = []
|
||||||
|
|
||||||
if 'ignore' in args and 'columns' in args['ignore'] :
|
if 'ignore' in args and 'columns' in args['ignore'] :
|
||||||
_cols = self.get_ignore(data=df,columns=args['ignore']['columns'])
|
_cols = self.get_ignore(data=df,columns=args['ignore']['columns'])
|
||||||
args['data'] = df[ list(set(df.columns)- set(_cols))]
|
args['data'] = df[ list(set(df.columns)- set(_cols))]
|
||||||
|
#
|
||||||
|
# We need to remove the continuous columns from the data-frame
|
||||||
|
# @TODO: Abstract this !!
|
||||||
|
#
|
||||||
|
if x_cols :
|
||||||
|
args['data'] = df[list(set(df.columns) - set(x_cols))]
|
||||||
|
|
||||||
args['candidates'] = 1 if 'candidates' not in args else int(args['candidates'])
|
args['candidates'] = 1 if 'candidates' not in args else int(args['candidates'])
|
||||||
|
|
||||||
|
@ -192,7 +234,10 @@ class Components :
|
||||||
_columns = None
|
_columns = None
|
||||||
skip_columns = []
|
skip_columns = []
|
||||||
_schema = schema
|
_schema = schema
|
||||||
|
if schema :
|
||||||
cols = [_item['name'] for _item in _schema]
|
cols = [_item['name'] for _item in _schema]
|
||||||
|
else:
|
||||||
|
cols = df.columns
|
||||||
for _df in candidates :
|
for _df in candidates :
|
||||||
#
|
#
|
||||||
# we need to format the fields here to make sure we have something cohesive
|
# we need to format the fields here to make sure we have something cohesive
|
||||||
|
@ -206,6 +251,9 @@ class Components :
|
||||||
# for _name in _df.columns:
|
# for _name in _df.columns:
|
||||||
# if _name in name:
|
# if _name in name:
|
||||||
# skip_columns.append(_name)
|
# skip_columns.append(_name)
|
||||||
|
if x_cols :
|
||||||
|
for _col in x_cols :
|
||||||
|
_df[_col] = self.approximate(df[_col])
|
||||||
#
|
#
|
||||||
# We perform a series of set operations to ensure that the following conditions are met:
|
# We perform a series of set operations to ensure that the following conditions are met:
|
||||||
# - the synthetic dataset only has fields that need to be synthesized
|
# - the synthetic dataset only has fields that need to be synthesized
|
||||||
|
@ -222,10 +270,16 @@ class Components :
|
||||||
# Let us merge the dataset here and have a comprehensive dataset
|
# Let us merge the dataset here and have a comprehensive dataset
|
||||||
|
|
||||||
_df = pd.DataFrame.join(df,_df)
|
_df = pd.DataFrame.join(df,_df)
|
||||||
|
if _schema :
|
||||||
for _item in _schema :
|
for _item in _schema :
|
||||||
if _item['type'] in ['DATE','TIMESTAMP','DATETIME'] :
|
if _item['type'] in ['DATE','TIMESTAMP','DATETIME'] :
|
||||||
_df[_item['name']] = _df[_item['name']].astype(str)
|
_df[_item['name']] = _df[_item['name']].astype(str)
|
||||||
|
|
||||||
|
pass
|
||||||
|
if _schema :
|
||||||
writer.write(_df[cols],schema=_schema,table=args['from'])
|
writer.write(_df[cols],schema=_schema,table=args['from'])
|
||||||
|
else:
|
||||||
|
writer.write(_df[cols],table=args['from'])
|
||||||
# writer.write(df,table=table)
|
# writer.write(df,table=table)
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
|
|
2
setup.py
2
setup.py
|
@ -5,7 +5,7 @@ import sys
|
||||||
def read(fname):
|
def read(fname):
|
||||||
return open(os.path.join(os.path.dirname(__file__), fname)).read()
|
return open(os.path.join(os.path.dirname(__file__), fname)).read()
|
||||||
args = {"name":"data-maker",
|
args = {"name":"data-maker",
|
||||||
"version":"1.4.3",
|
"version":"1.4.4",
|
||||||
"author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT",
|
"author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT",
|
||||||
"packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]}
|
"packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]}
|
||||||
args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo']
|
args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo']
|
||||||
|
|
Loading…
Reference in New Issue