bug fixes and optimizations

2020-04-01 00:21:51 -05:00 · 2020-04-01 00:21:51 -05:00 · 4c297679dc
parent 459afa2890
commit 4c297679dc
3 changed files with 29 additions and 21 deletions
--- a/data/maker/__init__.py
+++ b/data/maker/__init__.py
@@ -27,22 +27,25 @@ class ContinuousToDiscrete :
        values = np.array(X).astype(np.float32)
        BOUNDS = ContinuousToDiscrete.bounds(values,n)
        # _map = [{"index":BOUNDS.index(i),"ubound":i} for i in BOUNDS]
-        _matrix = []
+        # _matrix = []
-        m = []
+        # m = []
-        for value in X :
+        # for value in X :
-            x_ = np.zeros(n)
+        #     x_ = np.zeros(n)
-            for row in BOUNDS :
+        #     for row in BOUNDS :
        #         if value>= row.left and value <= row.right :
        #             index = BOUNDS.index(row)
        #             x_[index]  = 1
        #             break
        #     _matrix += x_.tolist()
        # #
        # # for items in BOUNDS :
        # #   index = BOUNDS.index(items)
        # return np.array(_matrix).reshape(len(X),n)
        matrix = np.repeat(np.zeros(n),len(X)).reshape(len(X),n)
                if value>= row.left and value <= row.right :
                    index = BOUNDS.index(row)
                    x_[index]  = 1
                    break
            _matrix += x_.tolist()
        #
        # for items in BOUNDS :
        #   index = BOUNDS.index(items)
        return np.array(_matrix).reshape(len(X),n)
    @staticmethod
    def bounds(x,n):
@@ -65,9 +68,15 @@ class ContinuousToDiscrete :
        # _BINARY= ContinuousToDiscrete.binary(X,BIN_SIZE)
        # # # print (BOUNDS)
        l = {}
-        for value in X :
+        for i in np.arange(len(X)): #value in X :
            values += [ np.round(np.random.uniform(item.left,item.right),ContinuousToDiscrete.ROUND_UP) for item in BOUNDS if  value >= item.left and value <= item.right ]
            value = X[i]
            for item in BOUNDS :
                if value >= item.left and value <= item.right :
                    values += [np.round(np.random.uniform(item.left,item.right),ContinuousToDiscrete.ROUND_UP)]
                    break
            # values += [ np.round(np.random.uniform(item.left,item.right),ContinuousToDiscrete.ROUND_UP) for item in BOUNDS if  value >= item.left and value <= item.right ]
        # # values = []
@@ -223,11 +232,10 @@ def generate(**args):
                i = np.where (i == False)[0]                
            else:
                i = np.where( r[col] != None)[0]            
-            _approx = ContinuousToDiscrete.continuous(r[col][i],BIN_SIZE)                        
+            _approx = ContinuousToDiscrete.continuous(r[col][i],BIN_SIZE)  #-- approximating based on arbitrary bins                                
            r[col][i] = _approx
-        _df[col]    = r[col] #ContinuousToDiscrete.continuous(r[col],BIN_SIZE) if col in CONTINUOUS else r[col]
+        _df[col]    = r[col]
        # _df[col]    = r[col]
        #
        # @TODO: log basic stats about the synthetic attribute
        #
--- a/pipeline.py
+++ b/pipeline.py
@@ -47,7 +47,7 @@ class Components :
 		logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']})
 		logger.write({"module":"bigquery","action":"read","input":{"sql":SQL}})
 		credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json')
-		df = pd.read_gbq(SQL,credentials=credentials,dialect='standard').astype(object)
+		df = pd.read_gbq(SQL,credentials=credentials,dialect='standard')
 		return df
 		# return lambda: pd.read_gbq(SQL,credentials=credentials,dialect='standard')[args['columns']].dropna()
--- a/setup.py
+++ b/setup.py
@@ -4,7 +4,7 @@ import sys
 def read(fname):
    return open(os.path.join(os.path.dirname(__file__), fname)).read() 
-args = {"name":"data-maker","version":"1.2.5","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT",
+args = {"name":"data-maker","version":"1.2.6","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT",
        "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]}
 args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo']
 args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git'