From 821cec8dd77be3843503fdb788883fd9ee38a614 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 14 Apr 2020 01:54:11 -0500 Subject: [PATCH] fixed issue around data-types/casting misbehavior with pandas and missing values --- data/gan.py | 11 ++++------- data/maker/__init__.py | 7 ++++++- pipeline.py | 5 +++-- setup.py | 2 +- 4 files changed, 14 insertions(+), 11 deletions(-) diff --git a/data/gan.py b/data/gan.py index 5fc7032..8a0c7a7 100644 --- a/data/gan.py +++ b/data/gan.py @@ -647,13 +647,8 @@ class Predict(GNet): info['ratio'] = __ratio info['partition'] = self.PARTITION self.logger.write({"module":"gan-generate","action":"generate","input":info}) - df.columns = self.values - if len(found) or df.columns.size == len(self.values): - # print (len(found),NTH_VALID_CANDIDATE) - # x = df * self.values - # - # let's get the missing rows (if any) ... - # + # df.columns = self.values + if len(found) or df.columns.size <= len(self.values): ii = df.apply(lambda row: np.sum(row) == 0 ,axis=1) # print ([' **** ',ii.sum()]) @@ -669,6 +664,8 @@ class Predict(GNet): # Log the findings here in terms of ratio, missing, candidate count # print ([np.max(ratio),len(missing),len(found),i]) i = np.where(ii == 0)[0] + + df = pd.DataFrame( df.iloc[i].apply(lambda row: self.values[np.random.choice(np.where(row != 0)[0],1)[0]] ,axis=1)) df.columns = columns df = df[columns[0]].append(pd.Series(missing)) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 378c226..25392f9 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -190,7 +190,7 @@ def generate(**args): # BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size']) NO_VALUE = dict(args['no_value']) if type(args['no_value']) == dict else args['no_value'] - + _df = df.copy() for col in column : args['context'] = col @@ -237,6 +237,11 @@ def generate(**args): _df[col] = r[col] # + # Let's cast the type to the original type (it makes the data more usable) + # + otype = df[col].dtype + _df[col] = _df[col].astype(otype) + # # @TODO: log basic stats about the synthetic attribute # # print (r)s diff --git a/pipeline.py b/pipeline.py index 12746fa..c678a89 100644 --- a/pipeline.py +++ b/pipeline.py @@ -195,8 +195,7 @@ class Components : if name.endswith('_id') : if df[name].isnull().sum() > 0 : - df[name].fillna(0,inplace=True) - else: + df[name].fillna(np.nan_to_num(np.nan),inplace=True) df[name] = df[name].astype(int) @@ -253,9 +252,11 @@ class Components : print (_args['data'].head()) else: Components.lock.acquire() + data_comp.to_gbq(if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000) INSERT_FLAG = 'replace' if 'partition' not in args or 'segment' not in args else 'append' + print (_args['data'].dtypes) _args['data'].to_gbq(if_exists='append',destination_table=complete,credentials=credentials,chunksize=90000) Components.lock.release() _id = 'dataset' diff --git a/setup.py b/setup.py index 71e14e0..207cb6f 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.2.8","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.2.9","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git'