Fixed data-type casting misbehavior in pandas when columns contain missing values

2020-04-14 01:54:11 -05:00 · 2020-04-14 01:54:11 -05:00 · 821cec8dd7
parent 50da909867
commit 821cec8dd7
4 changed files with 14 additions and 11 deletions
--- a/data/gan.py
+++ b/data/gan.py
@@ -647,13 +647,8 @@ class Predict(GNet):
                                        info['ratio'] = __ratio
                                info['partition'] = self.PARTITION
                                self.logger.write({"module":"gan-generate","action":"generate","input":info})
-                        df.columns = self.values
+                        # df.columns = self.values
-                        if len(found) or df.columns.size == len(self.values):
+                        if len(found) or df.columns.size <= len(self.values):
                                # print (len(found),NTH_VALID_CANDIDATE)    
                                # x = df * self.values 
                                #
                                # let's get the missing rows (if any) ...
                                #
                                ii = df.apply(lambda row: np.sum(row) == 0 ,axis=1)
                                # print ([' **** ',ii.sum()])
@@ -669,6 +664,8 @@ class Predict(GNet):
                                #       Log the findings here in terms of ratio, missing, candidate count
                                # print ([np.max(ratio),len(missing),len(found),i])
                                i = np.where(ii == 0)[0]
                                df =  pd.DataFrame( df.iloc[i].apply(lambda row: self.values[np.random.choice(np.where(row != 0)[0],1)[0]] ,axis=1))
                                df.columns = columns
                                df = df[columns[0]].append(pd.Series(missing))
--- a/data/maker/init.py
+++ b/data/maker/init.py
@@ -237,6 +237,11 @@ def generate(**args):
        _df[col]    = r[col]
        #
        # Let's cast the type to the original type (it makes the data more usable)
        #
        otype       = df[col].dtype
        _df[col]    = _df[col].astype(otype)
        #
        # @TODO: log basic stats about the synthetic attribute
        #
        # print (r)s
--- a/pipeline.py
+++ b/pipeline.py
@@ -195,8 +195,7 @@ class Components :
 			if name.endswith('_id') :
 				if df[name].isnull().sum() > 0 :
-					df[name].fillna(0,inplace=True)
+					df[name].fillna(np.nan_to_num(np.nan),inplace=True)					
 				else:
 					df[name] = df[name].astype(int)
@@ -253,9 +252,11 @@ class Components :
 				print (_args['data'].head())
 			else:
 				Components.lock.acquire()
 				data_comp.to_gbq(if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000)	
 				INSERT_FLAG = 'replace' if 'partition' not in args or 'segment' not in args else 'append'	
 				print (_args['data'].dtypes)
 				_args['data'].to_gbq(if_exists='append',destination_table=complete,credentials=credentials,chunksize=90000)
 				Components.lock.release()
 			_id = 'dataset'
--- a/setup.py
+++ b/setup.py
@@ -4,7 +4,7 @@ import sys
 def read(fname):
    return open(os.path.join(os.path.dirname(__file__), fname)).read() 
-args = {"name":"data-maker","version":"1.2.8","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT",
+args = {"name":"data-maker","version":"1.2.9","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT",
        "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]}
 args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo']
 args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git'