Removing label conditioning: it blows up the computational space

Steve Nyemba 2020-02-18 12:25:47 -06:00
parent dab3ab7bf7
commit 4a25af6b13
3 changed files with 62 additions and 36 deletions
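Rough intuition for the "blows up" claim: conditional batch normalization keeps one (offset, scale) pair per label value, so its parameter count scales with the label cardinality. A back-of-the-envelope sketch in Python (illustrative numbers, not measured from this repo):

n_labels, dim, layers = 256, 128, 4
conditional = 2 * n_labels * dim * layers   # per-label offset + scale matrices
plain = 2 * dim * layers                    # unconditional offset + scale vectors
print(conditional, plain)                   # 262144 vs 1024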

View File

@@ -72,7 +72,7 @@ class GNet :
elif 'label' in args and len(args['label']) == 1 :
self.NUM_LABELS = args['label'].shape[0]
else:
self.NUM_LABELS = 8
self.NUM_LABELS = None
# self.Z_DIM = 128 #self.X_SPACE_SIZE
self.Z_DIM = 128 #-- used as rows downstream
self.G_STRUCTURE = [self.Z_DIM,self.Z_DIM]
@@ -180,14 +180,19 @@ class GNet :
shift = [0] if self.__class__.__name__.lower() == 'generator' else [1] #-- not sure what this is doing
mean, var = tf.nn.moments(inputs, shift, keep_dims=True)
shape = inputs.shape[1].value
offset_m = self.get.variables(shape=[n_labels,shape], name='offset'+name,
initializer=tf.zeros_initializer)
scale_m = self.get.variables(shape=[n_labels,shape], name='scale'+name,
initializer=tf.ones_initializer)
offset = tf.nn.embedding_lookup(offset_m, labels)
scale = tf.nn.embedding_lookup(scale_m, labels)
result = tf.nn.batch_normalization(inputs, mean, var, offset, scale, 1e-8)
if labels is not None:
offset_m = self.get.variables(shape=[n_labels,shape], name='offset'+name,
initializer=tf.zeros_initializer)
scale_m = self.get.variables(shape=[n_labels,shape], name='scale'+name,
initializer=tf.ones_initializer)
offset = tf.nn.embedding_lookup(offset_m, labels)
scale = tf.nn.embedding_lookup(scale_m, labels)
else:
offset = None
scale = None
result = tf.nn.batch_normalization(inputs, mean, var, offset, scale, 1e-8)
return result
def _variable_on_cpu(self,**args):
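The hunk above puts conditional batch normalization behind a None check. A minimal self-contained sketch of the same idea (TF 1.x API assumed; tf.compat.v1.get_variable stands in for the repo's self.get.variables helper):

import tensorflow as tf

def cond_batch_norm(inputs, labels=None, n_labels=None, name=''):
    # moments over the batch axis; per-label offset/scale only when labels exist
    mean, var = tf.nn.moments(inputs, [0], keep_dims=True)
    dim = inputs.shape[1].value
    if labels is not None:
        # one (offset, scale) row per label value, selected by embedding lookup
        offset_m = tf.compat.v1.get_variable('offset'+name, [n_labels, dim], initializer=tf.zeros_initializer)
        scale_m = tf.compat.v1.get_variable('scale'+name, [n_labels, dim], initializer=tf.ones_initializer)
        offset = tf.nn.embedding_lookup(offset_m, labels)
        scale = tf.nn.embedding_lookup(scale_m, labels)
    else:
        offset, scale = None, None   # falls back to plain batch normalization
    return tf.nn.batch_normalization(inputs, mean, var, offset, scale, 1e-8)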
@@ -248,7 +253,7 @@ class Generator (GNet):
x = args['inputs']
tmp_dim = self.Z_DIM if 'dim' not in args else args['dim']
label = args['label']
print (self.NUM_LABELS)
with tf.compat.v1.variable_scope('G', reuse=tf.compat.v1.AUTO_REUSE , regularizer=l2_regularizer(0.00001)):
for i, dim in enumerate(self.G_STRUCTURE[:-1]):
kernel = self.get.variables(name='W_' + str(i), shape=[tmp_dim, dim])
@@ -331,7 +336,7 @@ class Train (GNet):
self.generator = Generator(**args)
self.discriminator = Discriminator(**args)
self._REAL = args['real']
self._LABEL= args['label']
self._LABEL= args['label'] if 'label' in args else None
self.column = args['column']
# print ([" *** ",self.BATCHSIZE_PER_GPU])
@@ -340,7 +345,7 @@ class Train (GNet):
self.logger.write( self.meta )
self.log (real_shape=list(self._REAL.shape),label_shape = list(self._LABEL.shape) if self._LABEL is not None else None,meta_data=self.meta)
# self.log (real_shape=list(self._REAL.shape),label_shape = self._LABEL.shape,meta_data=self.meta)
def load_meta(self, column):
"""
This function will delegate the calls to load metadata to its dependents
@@ -363,13 +368,16 @@ class Train (GNet):
stage = args['stage']
real = args['real']
label = args['label']
label = tf.cast(label, tf.int32)
#
# @TODO: Ziqi needs to explain what's going on here
m = [[i] for i in np.arange(self._LABEL.shape[1]-2)]
label = label[:, 1] * len(m) + tf.squeeze(
tf.matmul(label[:, 2:], tf.constant(m, dtype=tf.int32))
)
if label is not None :
label = tf.cast(label, tf.int32)
#
# @TODO: Ziqi needs to explain what's going on here
m = [[i] for i in np.arange(self._LABEL.shape[1]-2)]
label = label[:, 1] * len(m) + tf.squeeze(
tf.matmul(label[:, 2:], tf.constant(m, dtype=tf.int32))
)
# label = label[:,1] * 4 + tf.squeeze( label[:,2]*[[0],[1],[2],[3]] )
z = tf.random.normal(shape=[self.BATCHSIZE_PER_GPU, self.Z_DIM])
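For intuition on the index arithmetic above: label[:, 1] is a binary field, label[:, 2:] is a one-hot block, and the two are folded into a single integer class id. A small numpy illustration (the 5-column layout is hypothetical):

import numpy as np

label = np.array([[0, 1, 0, 0, 1],    # field = 1, one-hot position 2
                  [0, 0, 1, 0, 0]])   # field = 0, one-hot position 0
m = [[i] for i in np.arange(label.shape[1] - 2)]   # [[0],[1],[2]]
idx = label[:, 1] * len(m) + (label[:, 2:] @ np.array(m)).squeeze()
print(idx)   # [5 0], i.e. field * 3 + one-hot index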
@@ -394,8 +402,13 @@ class Train (GNet):
This function builds the multi-GPU training graph: placeholders, dataset iterators and per-tower losses
"""
features_placeholder = tf.compat.v1.placeholder(shape=self._REAL.shape, dtype=tf.float32)
labels_placeholder = tf.compat.v1.placeholder(shape=self._LABEL.shape, dtype=tf.float32)
dataset = tf.data.Dataset.from_tensor_slices((features_placeholder, labels_placeholder))
LABEL_SHAPE = [None,None] if self._LABEL is None else self._LABEL.shape
labels_placeholder = tf.compat.v1.placeholder(shape=LABEL_SHAPE, dtype=tf.float32)
if self._LABEL is not None :
dataset = tf.data.Dataset.from_tensor_slices((features_placeholder, labels_placeholder))
else :
dataset = tf.data.Dataset.from_tensor_slices(features_placeholder)
# labels_placeholder = None
dataset = dataset.repeat(10000)
dataset = dataset.batch(batch_size=3000)
dataset = dataset.prefetch(1)
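The placeholder -> dataset -> initializable-iterator pattern this hunk adjusts, sketched end to end with the label made optional (TF 1.x style; function and variable names are illustrative):

import tensorflow as tf

def make_pipeline(real_shape, label_shape=None):
    feats = tf.compat.v1.placeholder(tf.float32, shape=real_shape)
    labels = None
    if label_shape is not None:
        labels = tf.compat.v1.placeholder(tf.float32, shape=label_shape)
        ds = tf.data.Dataset.from_tensor_slices((feats, labels))
    else:
        ds = tf.data.Dataset.from_tensor_slices(feats)
    ds = ds.repeat(10000).batch(3000).prefetch(1)
    return tf.compat.v1.data.make_initializable_iterator(ds), feats, labels

At session time the initializer is then fed only the placeholders that actually exist, e.g. sess.run(it.initializer, feed_dict={feats: REAL}) in the unconditional case.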
@@ -413,7 +426,10 @@ class Train (GNet):
for i in range(self.NUM_GPUS):
with tf.device('/gpu:%d' % i):
with tf.name_scope('%s_%d' % ('TOWER', i)) as scope:
(real, label) = iterator.get_next()
if self._LABEL is not None :
(real, label) = iterator.get_next()
else:
real = iterator.get_next()
loss, w = self.loss(scope=scope, stage=stage, real=self._REAL, label=self._LABEL)
#tf.get_variable_scope().reuse_variables()
tf.compat.v1.get_variable_scope().reuse_variables()
@@ -450,11 +466,12 @@ class Train (GNet):
#with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess:
with tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess:
sess.run(init)
sess.run(iterator_d.initializer,
feed_dict={features_placeholder_d: REAL, labels_placeholder_d: LABEL})
feed_dict={features_placeholder_d: REAL} if LABEL is None else {features_placeholder_d: REAL, labels_placeholder_d: LABEL})
sess.run(iterator_g.initializer,
feed_dict={features_placeholder_g: REAL, labels_placeholder_g: LABEL})
feed_dict={features_placeholder_g: REAL} if LABEL is None else {features_placeholder_g: REAL, labels_placeholder_g: LABEL})
for epoch in range(1, self.MAX_EPOCHS + 1):
start_time = time.time()
w_sum = 0
@@ -511,9 +528,11 @@ class Predict(GNet):
tf.compat.v1.reset_default_graph()
z = tf.random.normal(shape=[self.BATCHSIZE_PER_GPU, self.Z_DIM])
y = tf.compat.v1.placeholder(shape=[self.BATCHSIZE_PER_GPU, self.NUM_LABELS], dtype=tf.int32)
ma = [[i] for i in np.arange(self.NUM_LABELS - 2)]
label = y[:, 1] * len(ma) + tf.squeeze(tf.matmul(y[:, 2:], tf.constant(ma, dtype=tf.int32)))
if self._LABEL is not None :
ma = [[i] for i in np.arange(self.NUM_LABELS - 2)]
label = y[:, 1] * len(ma) + tf.squeeze(tf.matmul(y[:, 2:], tf.constant(ma, dtype=tf.int32)))
else:
label = None
fake = self.generator.network(inputs=z, label=label)
init = tf.compat.v1.global_variables_initializer()
saver = tf.compat.v1.train.Saver()
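With the label path disabled, sampling in Predict reduces to one unconditional sess.run; a sketch under the same TF 1.x assumptions (batch_size, z_dim and model_dir are illustrative names):

z = tf.random.normal(shape=[batch_size, z_dim])
fake = generator.network(inputs=z, label=None)   # unconditional path
with tf.compat.v1.Session() as sess:
    saver.restore(sess, model_dir)
    sample = sess.run(fake)   # no feed_dict needed without a label placeholder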
@@ -524,13 +543,19 @@ class Predict(GNet):
# sess.run(init)
saver.restore(sess, model_dir)
labels = np.zeros((self.ROW_COUNT,self.NUM_LABELS) )
if self._LABEL is not None :
labels = np.zeros((self.ROW_COUNT,self.NUM_LABELS) )
labels= demo
else:
labels = None
found = []
labels= demo
for i in np.arange(CANDIDATE_COUNT) :
f = sess.run(fake,feed_dict={y:labels})
if labels is not None :
f = sess.run(fake,feed_dict={y:labels})
else:
f = sess.run(fake)
#
# if we are dealing with numeric values only, we can perform a simple marginal sum against the indexes
# The code below will ensure we have some acceptable cardinal relationships between id and synthetic values

View File

@@ -25,7 +25,7 @@ def train (**args) :
"""
column = args['column'] if (isinstance(args['column'],list)) else [args['column']]
column_id = args['id']
# column_id = args['id']
df = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data'])
df.columns = [name.lower() for name in df.columns]
@@ -35,7 +35,8 @@ def train (**args) :
#
handler = Binary()
# args['label'] = pd.get_dummies(df[column_id]).astype(np.float32).values
args['label'] = handler.Export(df[[column_id]])
# args['label'] = handler.Export(df[[column_id]])
# args['label'] = np.ones(df.shape[0]).reshape(df.shape[0],1)
for col in column :
# args['real'] = pd.get_dummies(df[col]).astype(np.float32).values
args['real'] = handler.Export(df[[col]])
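For reference, the commented-out pd.get_dummies call shows the one-hot encoding the Binary handler replaces; a quick pandas illustration with hypothetical data:

import numpy as np
import pandas as pd

df = pd.DataFrame({'id': ['a', 'b', 'a']})
label = pd.get_dummies(df['id']).astype(np.float32).values
# [[1. 0.]
#  [0. 1.]
#  [1. 0.]]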
@@ -83,7 +84,7 @@ def generate(**args):
#
# args['label'] = pd.get_dummies(df[column_id]).astype(np.float32).values
bwrangler = Binary()
args['label'] = bwrangler.Export(df[[column_id]])
# args['label'] = bwrangler.Export(df[[column_id]])
_df = df.copy()
for col in column :
args['context'] = col

View File

@@ -7,7 +7,7 @@ def read(fname):
args = {"name":"data-maker","version":"1.1.0","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT",
"packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]}
args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo']
args['url'] = 'https://hiplab.mc.vanderbilt.edu/aou/data-maker.git'
args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git'
if sys.version_info[0] == 2 :
args['use_2to3'] = False
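Assuming the corrected URL is also the clone URL, the package should install straight from git, e.g.:

pip install "git+https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git"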