Fixed data-type casting misbehavior in pandas when columns contain missing values
This commit is contained in:
parent
50da909867
commit
821cec8dd7
11
data/gan.py
11
data/gan.py
|
@ -647,13 +647,8 @@ class Predict(GNet):
|
||||||
info['ratio'] = __ratio
|
info['ratio'] = __ratio
|
||||||
info['partition'] = self.PARTITION
|
info['partition'] = self.PARTITION
|
||||||
self.logger.write({"module":"gan-generate","action":"generate","input":info})
|
self.logger.write({"module":"gan-generate","action":"generate","input":info})
|
||||||
df.columns = self.values
|
# df.columns = self.values
|
||||||
if len(found) or df.columns.size == len(self.values):
|
if len(found) or df.columns.size <= len(self.values):
|
||||||
# print (len(found),NTH_VALID_CANDIDATE)
|
|
||||||
# x = df * self.values
|
|
||||||
#
|
|
||||||
# let's get the missing rows (if any) ...
|
|
||||||
#
|
|
||||||
ii = df.apply(lambda row: np.sum(row) == 0 ,axis=1)
|
ii = df.apply(lambda row: np.sum(row) == 0 ,axis=1)
|
||||||
# print ([' **** ',ii.sum()])
|
# print ([' **** ',ii.sum()])
|
||||||
|
|
||||||
|
@ -669,6 +664,8 @@ class Predict(GNet):
|
||||||
# Log the findings here in terms of ratio, missing, candidate count
|
# Log the findings here in terms of ratio, missing, candidate count
|
||||||
# print ([np.max(ratio),len(missing),len(found),i])
|
# print ([np.max(ratio),len(missing),len(found),i])
|
||||||
i = np.where(ii == 0)[0]
|
i = np.where(ii == 0)[0]
|
||||||
|
|
||||||
|
|
||||||
df = pd.DataFrame( df.iloc[i].apply(lambda row: self.values[np.random.choice(np.where(row != 0)[0],1)[0]] ,axis=1))
|
df = pd.DataFrame( df.iloc[i].apply(lambda row: self.values[np.random.choice(np.where(row != 0)[0],1)[0]] ,axis=1))
|
||||||
df.columns = columns
|
df.columns = columns
|
||||||
df = df[columns[0]].append(pd.Series(missing))
|
df = df[columns[0]].append(pd.Series(missing))
|
||||||
|
|
|
@ -237,6 +237,11 @@ def generate(**args):
|
||||||
|
|
||||||
_df[col] = r[col]
|
_df[col] = r[col]
|
||||||
#
|
#
|
||||||
|
# Let's cast the type to the original type (it makes the data more usable)
|
||||||
|
#
|
||||||
|
otype = df[col].dtype
|
||||||
|
_df[col] = _df[col].astype(otype)
|
||||||
|
#
|
||||||
# @TODO: log basic stats about the synthetic attribute
|
# @TODO: log basic stats about the synthetic attribute
|
||||||
#
|
#
|
||||||
# print (r)s
|
# print (r)s
|
||||||
|
|
|
@ -195,8 +195,7 @@ class Components :
|
||||||
|
|
||||||
if name.endswith('_id') :
|
if name.endswith('_id') :
|
||||||
if df[name].isnull().sum() > 0 :
|
if df[name].isnull().sum() > 0 :
|
||||||
df[name].fillna(0,inplace=True)
|
df[name].fillna(np.nan_to_num(np.nan),inplace=True)
|
||||||
else:
|
|
||||||
df[name] = df[name].astype(int)
|
df[name] = df[name].astype(int)
|
||||||
|
|
||||||
|
|
||||||
|
@ -253,9 +252,11 @@ class Components :
|
||||||
print (_args['data'].head())
|
print (_args['data'].head())
|
||||||
else:
|
else:
|
||||||
Components.lock.acquire()
|
Components.lock.acquire()
|
||||||
|
|
||||||
data_comp.to_gbq(if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000)
|
data_comp.to_gbq(if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000)
|
||||||
|
|
||||||
INSERT_FLAG = 'replace' if 'partition' not in args or 'segment' not in args else 'append'
|
INSERT_FLAG = 'replace' if 'partition' not in args or 'segment' not in args else 'append'
|
||||||
|
print (_args['data'].dtypes)
|
||||||
_args['data'].to_gbq(if_exists='append',destination_table=complete,credentials=credentials,chunksize=90000)
|
_args['data'].to_gbq(if_exists='append',destination_table=complete,credentials=credentials,chunksize=90000)
|
||||||
Components.lock.release()
|
Components.lock.release()
|
||||||
_id = 'dataset'
|
_id = 'dataset'
|
||||||
|
|
2
setup.py
2
setup.py
|
@ -4,7 +4,7 @@ import sys
|
||||||
|
|
||||||
def read(fname):
|
def read(fname):
|
||||||
return open(os.path.join(os.path.dirname(__file__), fname)).read()
|
return open(os.path.join(os.path.dirname(__file__), fname)).read()
|
||||||
args = {"name":"data-maker","version":"1.2.8","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT",
|
args = {"name":"data-maker","version":"1.2.9","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT",
|
||||||
"packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]}
|
"packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]}
|
||||||
args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo']
|
args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo']
|
||||||
args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git'
|
args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git'
|
||||||
|
|
Loading…
Reference in New Issue