bug fix: with column count

2020-04-15 09:18:06 -05:00 · 2020-04-15 09:18:06 -05:00 · b8f59f85d5
parent bddba3d908
commit b8f59f85d5
3 changed files with 23 additions and 16 deletions
--- a/data/bridge.py
+++ b/data/bridge.py
@@ -160,20 +160,17 @@ class Binary :
        """
        # values = np.unique(column)    
        
-        values = column.dropna().unique() 
-        values.sort()
+        # values = column.dropna().unique() 
+        
+        # values.sort()
+        # column = column.values
+        values = self.get_column(column,size)
        column = column.values
        #
        # Let's treat the case of missing values i.e nulls 
        #       
        row_count,col_count = column.size,values.size
        # if row_count * col_count > size and row_count < size:
-        if col_count > size :
-            # N = np.divide(size,row_count).astype(int) 
-            # N = 
-            i = np.random.choice(col_count,size)
-            values = values[-i]
-            col_count = size
            

       
@@ -196,7 +193,17 @@ class Binary :
        return pd.DataFrame(matrix,columns=values)
    def apply(self,column,size):  # public entry point: forwards both arguments unchanged to the private __stream method
        return self.__stream(column,size)
-    def get_column_values(self,column,size=-1):
+    def get_column(self,column,size=-1):
+        """
+        Return the distinct non-null values of column as an index, most frequent first.
+        If size > 0 only the size most frequent values are kept, sorted in ascending order.
+        """
+        values = column.dropna().value_counts().index
+        if size > 0 :
+            # Index.sort_values() returns a new Index instead of sorting in place, so keep the result
+            values = values[:size].sort_values()
+        return values
+    def _get_column_values(self,column,size=-1):
        values = column.dropna().unique() 
        values.sort()
        
@@ -204,7 +211,7 @@ class Binary :
        # Let's treat the case of missing values i.e nulls 
        #       
        row_count,col_count = column.size,values.size
-        if col_count > size :
+        if col_count > size and size  > 0:
            # N = np.divide(size,row_count).astype(int) 
            # N = 
            i = np.random.choice(col_count,size)
@@ -270,8 +277,8 @@ if __name__ == '__main__' :
        --export    will export data to a specified location
    """
    df = pd.read_csv('sample.csv')
-    print ( pd.get_dummies(df.race))
-    print ( (Binary()).apply(df.race, 2))
+    print ( df.race.value_counts())
+    print ( (Binary()).apply(df['race'], 3))

    # has_basic = 'dataset' in SYS_ARGS.keys() and 'table' in SYS_ARGS.keys() and 'key' in SYS_ARGS.keys()
    # has_action= 'export' in SYS_ARGS.keys() or 'pseudo' in SYS_ARGS.keys()
--- a/data/maker/init.py
+++ b/data/maker/init.py
@@ -136,7 +136,7 @@ def train (**args) :
            # print (df[col].dtypes)
            # print (df[col].dropna/(axis=1).unique())
        # args['real']  = pd.get_dummies(df[col].dropna()).astype(np.float32).values
-        msize = args['matrix_size'] if 'matrix_size' in args else 128
+        msize = args['matrix_size'] if 'matrix_size' in args else -1        
        args['real'] = (Binary()).apply(df[col],msize)

            
@@ -210,8 +210,8 @@ def generate(**args):
            
        # else:
        # values          = df[col].dropna().unique().tolist()
-        msize = args['matrix_size'] if 'matrix_size' in args else 128
-        values = bhandler.get_column_values(df[col],msize)
+        msize = args['matrix_size'] if 'matrix_size' in args else -1        
+        values = bhandler.get_column(df[col],msize)

        
        
--- a/setup.py
+++ b/setup.py
@@ -4,7 +4,7 @@ import sys

 def read(fname):  # return the full contents of fname, resolved relative to the directory of this file
    return open(os.path.join(os.path.dirname(__file__), fname)).read()  # NOTE(review): the file object is never closed explicitly; a with-block would release it deterministically
-args = {"name":"data-maker","version":"1.3.0","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT",
+args = {"name":"data-maker","version":"1.3.1","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT",
        "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]}
 args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo']
 args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git'