bug fix: zeros matrix and continuous variables

Steve Nyemba 2021-04-01 12:14:51 -05:00
parent 20ee62178a
commit e0601edea5
3 changed files with 71 additions and 11 deletions

View File

@@ -16,6 +16,9 @@ import numpy as np
# import cupy as cp
import sys
import os
#
# The following is to address an issue with creating a large matrix ...
#
# from multiprocessing import Process, Queue
# if 'GPU' in os.environ :
@@ -230,8 +233,11 @@ class Input :
cols = np.array(cols)
row_count = len(rows)
# if 'GPU' not in os.environ :
_matrix = np.zeros([row_count,cols.size])
# _matrix = np.zeros([row_count,cols.size],dtype=int)
#
# @NOTE: For some reason, there is an out of memory error created here, this seems to fix it (go figure)
#
_matrix = np.array([np.zeros(cols.size) for i in np.arange(row_count)])
[np.put(_matrix[i], np.where(cols == rows[i]) ,1) for i in np.arange(row_count) if np.where(cols == rows[i])[0].size > 0]
# else:
# _matrix = cp.zeros([row_count,cols.size])
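
For reference, here is the row-by-row construction in isolation; a minimal standalone sketch (the function name and sample data are illustrative, not part of the module):

import numpy as np

def one_hot(rows, cols):
    # One row per input value, one column per vocabulary entry.
    cols = np.array(cols)
    row_count = len(rows)
    # Allocate row by row rather than with a single
    # np.zeros([row_count, cols.size]) call, mirroring the
    # out-of-memory workaround in the hunk above.
    _matrix = np.array([np.zeros(cols.size) for _ in np.arange(row_count)])
    for i in np.arange(row_count):
        hits = np.where(cols == rows[i])[0]
        if hits.size > 0:
            np.put(_matrix[i], hits, 1)
    return _matrix

print(one_hot(['b', 'a', 'c'], ['a', 'b', 'c']))
# [[0. 1. 0.]
#  [1. 0. 0.]
#  [0. 0. 1.]]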

View File

@@ -122,10 +122,20 @@ class Components :
_args = copy.deepcopy(args)
# _args['store'] = args['store']['source']
_args['data'] = df
#
# The columns that are continuous should also be skipped because they don't need to be synthesized
if 'continuous' in args :
x_cols = args['continuous']
else:
x_cols = []
if 'ignore' in args and 'columns' in args['ignore'] :
_cols = self.get_ignore(data=df,columns=args['ignore']['columns'])
_args['data'] = df[ list(set(df.columns)- set(_cols))]
#
# We need to make sure that continuous columns are removed
if x_cols :
_args['data'] = df[list(set(df.columns) - set(x_cols))]
data.maker.train(**_args)
if 'autopilot' in ( list(args.keys())) :
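
The column filtering above, in isolation; a minimal sketch with made-up column names. Note that in the hunk above the continuous-column filter is applied to df rather than to the already-filtered _args['data'], so when both 'ignore' and 'continuous' are present the ignored columns are reintroduced; taking one set difference over both groups, as below, avoids that.

import pandas as pd

df = pd.DataFrame({
    'gender': ['M', 'F'],
    'zip': ['37203', '37212'],   # ignored column
    'age': [34, 51]              # continuous column, not synthesized
})
ignore_cols = ['zip']            # e.g. resolved by self.get_ignore(...)
x_cols = ['age']                 # declared under args['continuous']

# Drop ignored and continuous columns in one pass before training.
train_df = df[list(set(df.columns) - set(ignore_cols) - set(x_cols))]
print(train_df.columns.tolist())  # ['gender']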
@@ -136,7 +146,26 @@ class Components :
pass
def post(self,args):
def approximate(self,values):
"""
:param values array of values to be approximated
"""
if values.dtype in [int,float] :
r = np.random.dirichlet(values)
x = []
_type = values.dtype
for index in np.arange(values.size) :
if np.random.choice([0,1],1)[0] :
value = values[index] + (values[index] * r[index])
else :
value = values[index] - (values[index] * r[index])
value = int(value) if _type == int else np.round(value,2)
x.append( value)
np.random.shuffle(x)
return np.array(x)
else:
return values
pass
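
A quick usage sketch of approximate, assuming an instantiated Components object named component (hypothetical): integer inputs come back as integers, floats are rounded to two decimals, and the result is shuffled.

import numpy as np

ages = np.array([34, 51, 28, 45])
# Each value is nudged up or down by a Dirichlet-weighted
# fraction of itself, then the array is shuffled.
synthetic_ages = component.approximate(ages)
print(synthetic_ages)   # e.g. [38 25 47 55] -- varies per run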
@@ -179,9 +208,22 @@ class Components :
_dc = pd.DataFrame()
# for mdf in df :
args['data'] = df
#
# The columns that are continuous should also be skipped because they don't need to be synthesized
if 'continuous' in args :
x_cols = args['continuous']
else:
x_cols = []
if 'ignore' in args and 'columns' in args['ignore'] :
_cols = self.get_ignore(data=df,columns=args['ignore']['columns'])
args['data'] = df[ list(set(df.columns)- set(_cols))]
#
# We need to remove the continuous columns from the data-frame
# @TODO: Abstract this !!
#
if x_cols :
args['data'] = df[list(set(df.columns) - set(x_cols))]
args['candidates'] = 1 if 'candidates' not in args else int(args['candidates'])
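
Per the @TODO above, the filtering that now appears in both the training and generation paths could be factored out; a sketch only, the helper name is made up:

import pandas as pd

def drop_non_synthesized(df, args, get_ignore):
    # Hypothetical helper: returns df without the columns that
    # should not be synthesized (ignored + continuous).
    skip = set(args.get('continuous', []))
    if 'ignore' in args and 'columns' in args['ignore']:
        skip |= set(get_ignore(data=df, columns=args['ignore']['columns']))
    return df[list(set(df.columns) - skip)]

# e.g. args['data'] = drop_non_synthesized(df, args, self.get_ignore)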
@@ -192,7 +234,10 @@ class Components :
_columns = None
skip_columns = []
_schema = schema
if schema :
cols = [_item['name'] for _item in _schema]
else:
cols = df.columns
for _df in candidates :
#
# we need to format the fields here to make sure we have something cohesive
@@ -206,6 +251,9 @@ class Components :
# for _name in _df.columns:
# if _name in name:
# skip_columns.append(_name)
if x_cols :
for _col in x_cols :
_df[_col] = self.approximate(df[_col])
#
# We perform a series of set operations to ensure that the following conditions are met:
# - the synthetic dataset only has fields that need to be synthesized
@@ -222,10 +270,16 @@ class Components :
# Let us merge the dataset here and have a comprehensive dataset
_df = pd.DataFrame.join(df,_df)
if _schema :
for _item in _schema :
if _item['type'] in ['DATE','TIMESTAMP','DATETIME'] :
_df[_item['name']] = _df[_item['name']].astype(str)
pass
if _schema :
writer.write(_df[cols],schema=_schema,table=args['from'])
else:
writer.write(_df[cols],table=args['from'])
# writer.write(df,table=table)
pass
else:
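
The schema-driven date handling above, in isolation; a minimal sketch (the schema entries and column names are made up, and writer comes from the data-transport dependency, assumed to be configured elsewhere):

import pandas as pd

_schema = [{'name': 'admit_date', 'type': 'DATE'},
           {'name': 'gender', 'type': 'STRING'}]
_df = pd.DataFrame({'admit_date': pd.to_datetime(['2021-03-01']),
                    'gender': ['F']})

# Cast date-like columns to strings before writing, since datetime
# dtypes can clash with the declared DATE/TIMESTAMP schema types
# on some backends.
for _item in _schema:
    if _item['type'] in ['DATE', 'TIMESTAMP', 'DATETIME']:
        _df[_item['name']] = _df[_item['name']].astype(str)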

View File

@@ -5,7 +5,7 @@ import sys
def read(fname):
return open(os.path.join(os.path.dirname(__file__), fname)).read()
args = {"name":"data-maker",
"version":"1.4.3",
"version":"1.4.4",
"author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT",
"packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]}
args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo']