fixed issue around data-types/casting misbehavior with pandas and missing values

This commit is contained in:
Steve Nyemba 2020-04-14 01:54:11 -05:00
parent 50da909867
commit 821cec8dd7
4 changed files with 14 additions and 11 deletions

View File

@ -647,13 +647,8 @@ class Predict(GNet):
info['ratio'] = __ratio
info['partition'] = self.PARTITION
self.logger.write({"module":"gan-generate","action":"generate","input":info})
df.columns = self.values
if len(found) or df.columns.size == len(self.values):
# print (len(found),NTH_VALID_CANDIDATE)
# x = df * self.values
#
# let's get the missing rows (if any) ...
#
# df.columns = self.values
if len(found) or df.columns.size <= len(self.values):
ii = df.apply(lambda row: np.sum(row) == 0 ,axis=1)
# print ([' **** ',ii.sum()])
@ -669,6 +664,8 @@ class Predict(GNet):
# Log the findings here in terms of ratio, missing, candidate count
# print ([np.max(ratio),len(missing),len(found),i])
i = np.where(ii == 0)[0]
df = pd.DataFrame( df.iloc[i].apply(lambda row: self.values[np.random.choice(np.where(row != 0)[0],1)[0]] ,axis=1))
df.columns = columns
df = df[columns[0]].append(pd.Series(missing))

View File

@ -237,6 +237,11 @@ def generate(**args):
_df[col] = r[col]
#
# Let's cast the type to the original type (it makes the data more usable)
#
otype = df[col].dtype
_df[col] = _df[col].astype(otype)
#
# @TODO: log basic stats about the synthetic attribute
#
# print (r)

View File

@ -195,8 +195,7 @@ class Components :
if name.endswith('_id') :
if df[name].isnull().sum() > 0 :
df[name].fillna(0,inplace=True)
else:
df[name].fillna(np.nan_to_num(np.nan),inplace=True)
df[name] = df[name].astype(int)
@ -253,9 +252,11 @@ class Components :
print (_args['data'].head())
else:
Components.lock.acquire()
data_comp.to_gbq(if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000)
INSERT_FLAG = 'replace' if 'partition' not in args or 'segment' not in args else 'append'
print (_args['data'].dtypes)
_args['data'].to_gbq(if_exists='append',destination_table=complete,credentials=credentials,chunksize=90000)
Components.lock.release()
_id = 'dataset'

View File

@ -4,7 +4,7 @@ import sys
def read(fname):
    """Return the full text of a file located next to this script.

    :param fname: file name (or relative path) resolved against the
                  directory that contains this file
    :return: the file's content as a string, decoded with the platform's
             default encoding (same as the built-in ``open`` default)
    :raises OSError: if the file does not exist or cannot be read
    """
    path = os.path.join(os.path.dirname(__file__), fname)
    # Use a context manager so the handle is closed as soon as the content
    # has been read, instead of relying on garbage collection to release it.
    with open(path) as handle:
        return handle.read()
args = {"name":"data-maker","version":"1.2.8","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT",
args = {"name":"data-maker","version":"1.2.9","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT",
"packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]}
args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo']
args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git'