bug fix: with column count
This commit is contained in:
parent
bddba3d908
commit
b8f59f85d5
|
@ -160,20 +160,17 @@ class Binary :
|
||||||
"""
|
"""
|
||||||
# values = np.unique(column)
|
# values = np.unique(column)
|
||||||
|
|
||||||
values = column.dropna().unique()
|
# values = column.dropna().unique()
|
||||||
values.sort()
|
|
||||||
|
# values.sort()
|
||||||
|
# column = column.values
|
||||||
|
values = self.get_column(column,size)
|
||||||
column = column.values
|
column = column.values
|
||||||
#
|
#
|
||||||
# Let's treat the case of missing values i.e nulls
|
# Let's treat the case of missing values i.e nulls
|
||||||
#
|
#
|
||||||
row_count,col_count = column.size,values.size
|
row_count,col_count = column.size,values.size
|
||||||
# if row_count * col_count > size and row_count < size:
|
# if row_count * col_count > size and row_count < size:
|
||||||
if col_count > size :
|
|
||||||
# N = np.divide(size,row_count).astype(int)
|
|
||||||
# N =
|
|
||||||
i = np.random.choice(col_count,size)
|
|
||||||
values = values[-i]
|
|
||||||
col_count = size
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -196,7 +193,17 @@ class Binary :
|
||||||
return pd.DataFrame(matrix,columns=values)
|
return pd.DataFrame(matrix,columns=values)
|
||||||
def apply(self,column,size):
|
def apply(self,column,size):
|
||||||
return self.__stream(column,size)
|
return self.__stream(column,size)
|
||||||
def get_column_values(self,column,size=-1):
|
def get_column(self,column,size=-1):
    """
    Return the distinct non-null values of a column that are available for processing.

    The values are ranked by frequency (most frequent first); when size is positive
    only the `size` most frequent values are kept. The retained values are then
    returned in sorted order.

    :param column   pandas Series to extract the values from
    :param size     maximum number of values to keep, any value <= 0 keeps every value
    :return         pandas Index of the retained values, sorted
    """
    # value_counts() orders the index by descending frequency, so slicing keeps the most common values
    values = column.dropna().value_counts().index
    if size > 0 :
        values = values[:size]
    #
    # Index.sort_values returns a new Index (it does not sort in place), the result must be kept
    values = values.sort_values()
    return values
|
||||||
|
|
||||||
|
def _get_column_values(self,column,size=-1):
|
||||||
values = column.dropna().unique()
|
values = column.dropna().unique()
|
||||||
values.sort()
|
values.sort()
|
||||||
|
|
||||||
|
@ -204,7 +211,7 @@ class Binary :
|
||||||
# Let's treat the case of missing values i.e nulls
|
# Let's treat the case of missing values i.e nulls
|
||||||
#
|
#
|
||||||
row_count,col_count = column.size,values.size
|
row_count,col_count = column.size,values.size
|
||||||
if col_count > size :
|
if col_count > size and size > 0:
|
||||||
# N = np.divide(size,row_count).astype(int)
|
# N = np.divide(size,row_count).astype(int)
|
||||||
# N =
|
# N =
|
||||||
i = np.random.choice(col_count,size)
|
i = np.random.choice(col_count,size)
|
||||||
|
@ -270,8 +277,8 @@ if __name__ == '__main__' :
|
||||||
--export will export data to a specified location
|
--export will export data to a specified location
|
||||||
"""
|
"""
|
||||||
df = pd.read_csv('sample.csv')
|
df = pd.read_csv('sample.csv')
|
||||||
print ( pd.get_dummies(df.race))
|
print ( df.race.value_counts())
|
||||||
print ( (Binary()).apply(df.race, 2))
|
print ( (Binary()).apply(df['race'], 3))
|
||||||
|
|
||||||
# has_basic = 'dataset' in SYS_ARGS.keys() and 'table' in SYS_ARGS.keys() and 'key' in SYS_ARGS.keys()
|
# has_basic = 'dataset' in SYS_ARGS.keys() and 'table' in SYS_ARGS.keys() and 'key' in SYS_ARGS.keys()
|
||||||
# has_action= 'export' in SYS_ARGS.keys() or 'pseudo' in SYS_ARGS.keys()
|
# has_action= 'export' in SYS_ARGS.keys() or 'pseudo' in SYS_ARGS.keys()
|
||||||
|
|
|
@ -136,7 +136,7 @@ def train (**args) :
|
||||||
# print (df[col].dtypes)
|
# print (df[col].dtypes)
|
||||||
# print (df[col].dropna/(axis=1).unique())
|
# print (df[col].dropna/(axis=1).unique())
|
||||||
# args['real'] = pd.get_dummies(df[col].dropna()).astype(np.float32).values
|
# args['real'] = pd.get_dummies(df[col].dropna()).astype(np.float32).values
|
||||||
msize = args['matrix_size'] if 'matrix_size' in args else 128
|
msize = args['matrix_size'] if 'matrix_size' in args else -1
|
||||||
args['real'] = (Binary()).apply(df[col],msize)
|
args['real'] = (Binary()).apply(df[col],msize)
|
||||||
|
|
||||||
|
|
||||||
|
@ -210,8 +210,8 @@ def generate(**args):
|
||||||
|
|
||||||
# else:
|
# else:
|
||||||
# values = df[col].dropna().unique().tolist()
|
# values = df[col].dropna().unique().tolist()
|
||||||
msize = args['matrix_size'] if 'matrix_size' in args else 128
|
msize = args['matrix_size'] if 'matrix_size' in args else -1
|
||||||
values = bhandler.get_column_values(df[col],msize)
|
values = bhandler.get_column(df[col],msize)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
2
setup.py
2
setup.py
|
@ -4,7 +4,7 @@ import sys
|
||||||
|
|
||||||
def read(fname):
    """
    Return the content of a file located in the same directory as this script.

    :param fname    name of the file, relative to the directory of this script
    :return         the content of the file as a string
    """
    # The context manager closes the file handle as soon as the content has been read
    with open(os.path.join(os.path.dirname(__file__), fname)) as f:
        return f.read()
|
||||||
args = {"name":"data-maker","version":"1.3.0","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT",
|
args = {"name":"data-maker","version":"1.3.1","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT",
|
||||||
"packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]}
|
"packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]}
|
||||||
args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo']
|
args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo']
|
||||||
args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git'
|
args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git'
|
||||||
|
|
Loading…
Reference in New Issue