Fixed an issue with data-type casting misbehavior in pandas when values are missing
This commit is contained in:
parent
50da909867
commit
821cec8dd7
11
data/gan.py
11
data/gan.py
|
@ -647,13 +647,8 @@ class Predict(GNet):
|
|||
info['ratio'] = __ratio
|
||||
info['partition'] = self.PARTITION
|
||||
self.logger.write({"module":"gan-generate","action":"generate","input":info})
|
||||
df.columns = self.values
|
||||
if len(found) or df.columns.size == len(self.values):
|
||||
# print (len(found),NTH_VALID_CANDIDATE)
|
||||
# x = df * self.values
|
||||
#
|
||||
# let's get the missing rows (if any) ...
|
||||
#
|
||||
# df.columns = self.values
|
||||
if len(found) or df.columns.size <= len(self.values):
|
||||
ii = df.apply(lambda row: np.sum(row) == 0 ,axis=1)
|
||||
# print ([' **** ',ii.sum()])
|
||||
|
||||
|
@ -669,6 +664,8 @@ class Predict(GNet):
|
|||
# Log the findings here in terms of ratio, missing, candidate count
|
||||
# print ([np.max(ratio),len(missing),len(found),i])
|
||||
i = np.where(ii == 0)[0]
|
||||
|
||||
|
||||
df = pd.DataFrame( df.iloc[i].apply(lambda row: self.values[np.random.choice(np.where(row != 0)[0],1)[0]] ,axis=1))
|
||||
df.columns = columns
|
||||
df = df[columns[0]].append(pd.Series(missing))
|
||||
|
|
|
@ -190,7 +190,7 @@ def generate(**args):
|
|||
#
|
||||
BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size'])
|
||||
NO_VALUE = dict(args['no_value']) if type(args['no_value']) == dict else args['no_value']
|
||||
|
||||
|
||||
_df = df.copy()
|
||||
for col in column :
|
||||
args['context'] = col
|
||||
|
@ -237,6 +237,11 @@ def generate(**args):
|
|||
|
||||
_df[col] = r[col]
|
||||
#
|
||||
# Let's cast the type to the original type (it makes the data more usable)
|
||||
#
|
||||
otype = df[col].dtype
|
||||
_df[col] = _df[col].astype(otype)
|
||||
#
|
||||
# @TODO: log basic stats about the synthetic attribute
|
||||
#
|
||||
# print (r)s
|
||||
|
|
|
@ -195,8 +195,7 @@ class Components :
|
|||
|
||||
if name.endswith('_id') :
|
||||
if df[name].isnull().sum() > 0 :
|
||||
df[name].fillna(0,inplace=True)
|
||||
else:
|
||||
df[name].fillna(np.nan_to_num(np.nan),inplace=True)
|
||||
df[name] = df[name].astype(int)
|
||||
|
||||
|
||||
|
@ -253,9 +252,11 @@ class Components :
|
|||
print (_args['data'].head())
|
||||
else:
|
||||
Components.lock.acquire()
|
||||
|
||||
data_comp.to_gbq(if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000)
|
||||
|
||||
INSERT_FLAG = 'replace' if 'partition' not in args or 'segment' not in args else 'append'
|
||||
print (_args['data'].dtypes)
|
||||
_args['data'].to_gbq(if_exists='append',destination_table=complete,credentials=credentials,chunksize=90000)
|
||||
Components.lock.release()
|
||||
_id = 'dataset'
|
||||
|
|
2
setup.py
2
setup.py
|
@ -4,7 +4,7 @@ import sys
|
|||
|
||||
def read(fname):
    """Return the full text of *fname*, resolved relative to this file's directory.

    The path is built by joining the directory of ``__file__`` with *fname*.
    The file is read inside a ``with`` block so the handle is closed
    deterministically instead of being left for the garbage collector.
    """
    path = os.path.join(os.path.dirname(__file__), fname)
    with open(path) as handle:
        return handle.read()
|
||||
args = {"name":"data-maker","version":"1.2.8","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT",
|
||||
args = {"name":"data-maker","version":"1.2.9","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT",
|
||||
"packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]}
|
||||
args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo']
|
||||
args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git'
|
||||
|
|
Loading…
Reference in New Issue