Handling of continuous values
parent bd6fb03f8d
commit 3fbd68309f
@@ -604,7 +604,7 @@ class Predict(GNet):
     r = np.zeros(self.ROW_COUNT)
     df.columns = self.values
     if len(found):
-        print (len(found),NTH_VALID_CANDIDATE)
+        # print (len(found),NTH_VALID_CANDIDATE)
         # x = df * self.values
         #
         # let's get the missing rows (if any) ...
@@ -704,10 +704,10 @@ if __name__ == '__main__' :
     p = Predict(context=context,label=LABEL,values=values,column=column)
     p.load_meta(column)
     r = p.apply()
-    print (df)
-    print ()
+    # print (df)
+    # print ()
     df[column] = r[column]
-    print (df)
+    # print (df)

 else:
@@ -14,6 +14,68 @@ import data.gan as gan
 from transport import factory
+from data.bridge import Binary
+import threading as thread
+
+class ContinuousToDiscrete :
+    @staticmethod
+    def binary(X,n=4) :
+        """
+        This function converts a continuous stream of values into a one-hot
+        encoded bit matrix with n bins.
+        """
+        BOUNDS = ContinuousToDiscrete.bounds(X,n)
+        _matrix = []
+        for value in X :
+            x_ = np.zeros(n)
+            _matrix.append(x_)
+            for row in BOUNDS :
+                if value >= row.left and value <= row.right :
+                    index = BOUNDS.index(row)
+                    x_[index] = 1
+                    break
+        return np.array(_matrix)    # stacked so callers can .astype(...) directly
+
+    @staticmethod
+    def bounds(x,n):
+        return list(pd.cut(np.array(x),n).categories)
+
+    @staticmethod
+    def continuous(X,BIN_SIZE=4) :
+        """
+        This function approximates continuous values given bin boundary information
+        :X          continuous values (re-binned internally)
+        :BIN_SIZE   number of bins
+        """
+        BOUNDS = ContinuousToDiscrete.bounds(X,BIN_SIZE)
+        values = []
+        _BINARY = ContinuousToDiscrete.binary(X,BIN_SIZE)
+        for row in _BINARY :
+            index = np.where(row == 1)[0][0]
+            ubound = BOUNDS[ index ].right
+            lbound = BOUNDS[ index ].left
+            # draw a plausible value uniformly within the bin
+            x_ = np.round(np.random.uniform(lbound,ubound),3).astype(float)
+            values.append(x_)
+        return values
+
 def train (**args) :
     """
     This function is intended to train the GAN in order to learn about the distribution of the features
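A quick round-trip check of the class added above (a minimal sketch: the sample array and bin count are hypothetical, and the import path for ContinuousToDiscrete is assumed):

import numpy as np
import pandas as pd
from data.maker import ContinuousToDiscrete   # module path assumed

x = np.array([1.2, 3.4, 2.2, 9.8, 5.5, 7.1, 0.3, 6.6])
bins = list(pd.cut(x, 4).categories)           # same intervals bounds() returns
onehot = ContinuousToDiscrete.binary(x, 4)     # one one-hot row per value
decoded = ContinuousToDiscrete.continuous(x, 4)
# each decoded value is a uniform draw from the pd.cut interval selected by
# its one-hot row, so it stays inside the observed range of x
print(bins)
print(onehot)
print(decoded)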
@@ -24,22 +86,30 @@ def train (**args) :
     :context label of what we are synthesizing
     """
     column = args['column'] if (isinstance(args['column'],list)) else [args['column']]
+    CONTINUOUS = args['continuous'] if 'continuous' in args else []
     # column_id = args['id']
     df = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data'])
     df.columns = [name.lower() for name in df.columns]

     #
     # @TODO:
     # Consider sequential training of sub population for extremely large datasets
     #

     #
     # If we have several columns we will proceed one at a time (it could be done in separate threads)
     # @TODO : Consider performing this task on several threads/GPUs simultaneously
     #
     handler = Binary()
     # args['label'] = pd.get_dummies(df[column_id]).astype(np.float32).values
     # args['label'] = handler.Export(df[[column_id]])
     # args['label'] = np.ones(df.shape[0]).reshape(df.shape[0],1)
-    for col in column :
-        args['real'] = pd.get_dummies(df[col].fillna('')).astype(np.float32).values
-        # args['real'] = handler.Export(df[[col]])
+    for col in column :
+        # continuous float columns are binned into a bit matrix, everything
+        # else is one-hot encoded with pd.get_dummies
+        if 'float' in df[col].dtypes.name and col in CONTINUOUS:
+            BIN_SIZE = 10 if 'bin_size' not in args else int(args['bin_size'])
+            args['real'] = ContinuousToDiscrete.binary(df[col],BIN_SIZE).astype(np.float32)
+        else:
+            args['real'] = pd.get_dummies(df[col].fillna('')).astype(np.float32).values

         args['column'] = col
         args['context'] = col
         context = args['context']
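The loop above decides the encoding the GAN trains on per column. A minimal sketch of what args['real'] ends up holding (the frame and column names are hypothetical; the import path is assumed):

import numpy as np
import pandas as pd
from data.maker import ContinuousToDiscrete   # module path assumed

df = pd.DataFrame({'gender': ['M','F','F','M'],
                   'age':    [23.0, 41.5, 36.2, 58.9]})
CONTINUOUS = ['age']
for col in df.columns :
    if 'float' in df[col].dtypes.name and col in CONTINUOUS:
        real = ContinuousToDiscrete.binary(df[col], 10).astype(np.float32)
    else:
        real = pd.get_dummies(df[col].fillna('')).astype(np.float32).values
    print(col, real.shape)    # gender -> (4, 2), age -> (4, 10)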
@@ -75,7 +145,7 @@ def generate(**args):
     """
     # df = args['data']
     df = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data'])

+    CONTINUOUS = args['continuous'] if 'continuous' in args else []
     column = args['column'] if (isinstance(args['column'],list)) else [args['column']]
     # column_id = args['id']
     #
@@ -86,18 +156,26 @@ def generate(**args):
     for col in column :
         args['context'] = col
         args['column'] = col
-        values = df[col].unique().tolist()
-        args['values'] = values
-        args['row_count'] = df.shape[0]
+        if 'float' in df[col].dtypes.name or col in CONTINUOUS :
+            #
+            # create the bins for the continuous values we are observing here
+            BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size'])
+            values = ContinuousToDiscrete.continuous(df[col].values,BIN_SIZE)
+        else:
+            values = df[col].unique().tolist()
+
+        args['values'] = values
+        args['row_count'] = df.shape[0]
         #
         # we can determine the cardinalities here so we know what to allow or disallow
         handler = gan.Predict (**args)
         handler.load_meta(col)
         r = handler.apply()
+        print ([_df.shape,len(r[col])])
         _df[col] = r[col]
         #
         # @TODO: log basic stats about the synthetic attribute
         #
         # break
     return _df
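On the generation side, continuous columns are decoded through continuous(), so every synthetic candidate value is a uniform draw inside one of the pd.cut bins observed in the original data. A sketch of just that decode step (the frame, column name, and import path are hypothetical):

import pandas as pd
from data.maker import ContinuousToDiscrete   # module path assumed

df = pd.DataFrame({'age': [23.0, 41.5, 36.2, 58.9]})   # hypothetical input
values = ContinuousToDiscrete.continuous(df['age'].values, 4)
# values holds plausible floats bounded by the observed bins; generate()
# passes them to gan.Predict as args['values'] for the column
print(values)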
@@ -17,9 +17,9 @@ if 'config' in SYS_ARGS :
     odf = pd.read_csv (ARGS['data'])
     odf.columns = [name.lower() for name in odf.columns]
     column = ARGS['column'] if isinstance(ARGS['column'],list) else [ARGS['column']]
-    print (odf.head())
-    print (_df.head())
-    # print(pd.merge(odf,_df,rsuffix='_io'))
+    # print (odf.head())
+    # print (_df.head())
+    print(odf.join(_df[column],rsuffix='_io'))
     # print (_df[column].risk.evaluate(flag='synth'))
     # print (odf[column].risk.evaluate(flag='original'))
     # _x = pd.get_dummies(_df[column]).values
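The evaluation script now compares original and synthetic data side by side: DataFrame.join aligns the two frames on the row index, and rsuffix='_io' renames the synthetic copy of any overlapping column. A small sketch with made-up frames:

import pandas as pd

odf = pd.DataFrame({'age': [23.0, 41.5]})      # original data
_df = pd.DataFrame({'age': [24.1, 40.2]})      # synthetic data (hypothetical)
print(odf.join(_df[['age']], rsuffix='_io'))   # columns: age, age_io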