From e0601edea547a28d06c6b82fe313a4f4e5930542 Mon Sep 17 00:00:00 2001
From: Steve Nyemba
Date: Thu, 1 Apr 2021 12:14:51 -0500
Subject: [PATCH] bug fix: zeros matrix and continuous variables

---
 data/maker/prepare/__init__.py | 10 ++++-
 pipeline.py                    | 70 ++++++++++++++++++++++++++++++----
 setup.py                       |  2 +-
 3 files changed, 71 insertions(+), 11 deletions(-)

diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py
index 9fb0fa7..e15c63b 100644
--- a/data/maker/prepare/__init__.py
+++ b/data/maker/prepare/__init__.py
@@ -16,6 +16,9 @@ import numpy as np
 # import cupy as cp
 import sys
 import os
+#
+# The following is to address an issue with creating a large matrix ...
+#
 # from multiprocessing import Process, Queue
 
 # if 'GPU' in os.environ :
@@ -230,8 +233,11 @@ class Input :
         cols = np.array(cols)
         row_count = len(rows)
         # if 'GPU' not in os.environ :
-        _matrix = np.zeros([row_count,cols.size])
-
+        #   _matrix = np.zeros([row_count,cols.size],dtype=int)
+        #
+        # @NOTE: For some reason an out-of-memory error is raised here; this seems to fix it (go figure)
+        #
+        _matrix = np.array([np.zeros(cols.size) for i in np.arange(row_count)])
         [np.put(_matrix[i], np.where(cols == rows[i]) ,1)for i in np.arange(row_count) if np.where(cols == rows[i])[0].size > 0]
         # else:
         #     _matrix = cp.zeros([row_count,cols.size])
diff --git a/pipeline.py b/pipeline.py
index 49b2039..a38029d 100644
--- a/pipeline.py
+++ b/pipeline.py
@@ -122,10 +122,20 @@ class Components :
         _args = copy.deepcopy(args)
         # _args['store'] = args['store']['source']
         _args['data'] = df
+        #
+        # Continuous columns should be skipped here as well, because they don't need to be synthesized
+        if 'continuous' in args :
+            x_cols = args['continuous']
+        else:
+            x_cols = []
+
         if 'ignore' in args and 'columns' in args['ignore'] :
             _cols = self.get_ignore(data=df,columns=args['ignore']['columns'])
             _args['data'] = df[ list(set(df.columns)- set(_cols))]
-
+        #
+        # We need to make sure that continuous columns are removed
+        if x_cols :
+            _args['data'] = df[list(set(df.columns) - set(x_cols))]
         data.maker.train(**_args)
 
         if 'autopilot' in ( list(args.keys())) :
@@ -136,7 +146,26 @@ class Components :
 
         pass
 
-    def post(self,args):
+    def approximate(self,values):
+        """
+        :param values: array of values to be approximated
+        """
+        if values.dtype in [int,float] :
+            r = np.random.dirichlet(values)
+            x = []
+            _type = values.dtype
+            for index in np.arange(values.size) :
+
+                if np.random.choice([0,1],1)[0] :
+                    value = values[index] + (values[index] * r[index])
+                else :
+                    value = values[index] - (values[index] * r[index])
+                value = int(value) if _type == int else np.round(value,2)
+                x.append( value)
+            np.random.shuffle(x)
+            return np.array(x)
+        else:
+            return values
         pass
 
 
@@ -179,10 +208,23 @@ class Components :
         _dc = pd.DataFrame()
         # for mdf in df :
         args['data'] = df
+        #
+        # Continuous columns should be skipped here as well, because they don't need to be synthesized
+        if 'continuous' in args :
+            x_cols = args['continuous']
+        else:
+            x_cols = []
+
         if 'ignore' in args and 'columns' in args['ignore'] :
             _cols = self.get_ignore(data=df,columns=args['ignore']['columns'])
             args['data'] = df[ list(set(df.columns)- set(_cols))]
-
+        #
+        # We need to remove the continuous columns from the data-frame
+        # @TODO: Abstract this !!
+        #
+        if x_cols :
+            args['data'] = df[list(set(df.columns) - set(x_cols))]
+
         args['candidates'] = 1 if 'candidates' not in args else int(args['candidates'])
 
         candidates = (data.maker.generate(**args))
@@ -192,7 +234,10 @@ class Components :
         _columns = None
         skip_columns = []
         _schema = schema
-        cols = [_item['name'] for _item in _schema]
+        if schema :
+            cols = [_item['name'] for _item in _schema]
+        else:
+            cols = df.columns
         for _df in candidates :
             #
             # we need to format the fields here to make sure we have something cohesive
@@ -206,6 +251,9 @@ class Components :
             #     for _name in _df.columns:
             #         if _name in name:
             #             skip_columns.append(_name)
+            if x_cols :
+                for _col in x_cols :
+                    _df[_col] = self.approximate(df[_col])
             #
             # We perform a series of set operations to ensure that the following conditions are met:
             #   - the synthetic dataset only has fields that need to be synthesized
@@ -222,10 +270,16 @@ class Components :
                 #
                 # Let us merge the dataset here and have a comprehensive dataset
                 _df = pd.DataFrame.join(df,_df)
-                for _item in _schema :
-                    if _item['type'] in ['DATE','TIMESTAMP','DATETIME'] :
-                        _df[_item['name']] = _df[_item['name']].astype(str)
-                writer.write(_df[cols],schema=_schema,table=args['from'])
+                if _schema :
+                    for _item in _schema :
+                        if _item['type'] in ['DATE','TIMESTAMP','DATETIME'] :
+                            _df[_item['name']] = _df[_item['name']].astype(str)
+
+                    pass
+                if _schema :
+                    writer.write(_df[cols],schema=_schema,table=args['from'])
+                else:
+                    writer.write(_df[cols],table=args['from'])
                 # writer.write(df,table=table)
                 pass
             else:
diff --git a/setup.py b/setup.py
index 450d0d9..544f4b3 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@ import sys
 def read(fname):
     return open(os.path.join(os.path.dirname(__file__), fname)).read()
 args = {"name":"data-maker",
-    "version":"1.4.3",
+    "version":"1.4.4",
     "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT",
     "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]}
 args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo']
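
Note on the data/maker/prepare/__init__.py change: np.zeros([row_count,cols.size]) was reportedly raising an out-of-memory error, so the patch builds the one-hot matrix one row at a time instead. A minimal sketch of the encoding it produces, on toy data (the names rows/cols mirror the patched function; the values are illustrative):

    import numpy as np

    rows = np.array(['b', 'a', 'c', 'a'])  # observed values, one per record
    cols = np.array(['a', 'b', 'c'])       # vocabulary of distinct values

    row_count = len(rows)
    # patched construction: allocate one row at a time instead of one big block
    _matrix = np.array([np.zeros(cols.size) for i in np.arange(row_count)])
    [np.put(_matrix[i], np.where(cols == rows[i]), 1)
     for i in np.arange(row_count) if np.where(cols == rows[i])[0].size > 0]
    print(_matrix)
    # [[0. 1. 0.]
    #  [1. 0. 0.]
    #  [0. 0. 1.]
    #  [1. 0. 0.]]

Both constructions yield the same float64 matrix; the comprehension merely sidesteps the single large allocation that was failing.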
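
Note on the new Components.approximate helper: continuous columns are no longer fed to the model; each value is instead nudged up or down by a Dirichlet-weighted fraction of itself and the column is shuffled, so the output stays on the original scale. A sketch of the same logic on a plain array (the values are illustrative; np.random.dirichlet expects a positive parameter vector, so a column containing zeros or negatives would need handling first, and the method returns non-numeric columns unchanged):

    import numpy as np

    values = np.array([120, 80, 95, 143])  # hypothetical continuous column
    r = np.random.dirichlet(values)        # per-value weights in (0,1), summing to 1
    x = []
    for index in np.arange(values.size):
        # coin flip: move the value up or down by a fraction r[index] of itself
        if np.random.choice([0, 1], 1)[0]:
            value = values[index] + (values[index] * r[index])
        else:
            value = values[index] - (values[index] * r[index])
        x.append(int(value))               # ints stay ints; floats are rounded to 2 places
    np.random.shuffle(x)
    print(np.array(x))                     # e.g. [ 97 143 118  81] -- noisy and reordered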
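
Note on the configuration driving the new code paths: both the train and generate steps now read an optional 'continuous' list next to the existing 'ignore' block. A hedged sketch of the relevant slice of args (the column names are hypothetical; other keys such as 'data', 'candidates' and 'from' keep their existing meaning):

    args = {
        'continuous': ['age', 'weight'],       # excluded from training, rebuilt via approximate()
        'ignore': {'columns': ['person_id']},  # pre-existing: dropped via get_ignore()
    }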
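
Note on the write path: the schema is now optional, so generation also works against stores that don't supply one (cols falls back to df.columns). When a schema is present, DATE/TIMESTAMP/DATETIME fields are still cast to str before the write. In outline, with writer being whatever data-transport writer the pipeline constructed:

    if _schema:
        writer.write(_df[cols], schema=_schema, table=args['from'])
    else:
        writer.write(_df[cols], table=args['from'])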