From f1e2fe369963c398e1a3b5bbb5784b6b1f86fa87 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 24 Apr 2023 16:36:25 -0500 Subject: [PATCH 1/6] bug fixes: stages, other training/generation --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 8ad1b09..7efb8c5 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.6.4", +args = {"name":"data-maker","version":"1.6.6", "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vumc.org","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://github.com/lnyemba/data-transport.git','tensorflow'] From e5af702ddb4c04fa668f84018879c379f597e26d Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 24 Apr 2023 16:37:08 -0500 Subject: [PATCH 2/6] bug fixes: stages, other training/generation --- data/gan.py | 16 ++++++++++++--- data/maker/__init__.py | 31 +++++++++++++++++++++------- data/maker/apply.py | 2 +- data/maker/state/__init__.py | 39 +++++++++++++++++++++++++++++++----- 4 files changed, 72 insertions(+), 16 deletions(-) diff --git a/data/gan.py b/data/gan.py index d2cc3ea..a794ebe 100644 --- a/data/gan.py +++ b/data/gan.py @@ -103,11 +103,12 @@ class GNet : CHECKPOINT_SKIPS = int(args['checkpoint_skips']) if 'checkpoint_skips' in args else int(self.MAX_EPOCHS/10) CHECKPOINT_SKIPS = 1 if CHECKPOINT_SKIPS < 1 else CHECKPOINT_SKIPS + # if self.MAX_EPOCHS < 2*CHECKPOINT_SKIPS : # CHECKPOINT_SKIPS = 2 # self.CHECKPOINTS = [1,self.MAX_EPOCHS] + np.repeat( np.divide(self.MAX_EPOCHS,CHECKPOINT_SKIPS),CHECKPOINT_SKIPS ).cumsum().astype(int).tolist() self.CHECKPOINTS = np.repeat(CHECKPOINT_SKIPS, self.MAX_EPOCHS/ CHECKPOINT_SKIPS).cumsum().astype(int).tolist() - + self.ROW_COUNT = args['real'].shape[0] if 'real' in args else 100 self.CONTEXT = args['context'] self.ATTRIBUTES = {"id":args['column_id'] if 'column_id' in args else None,"synthetic":args['column'] if 'column' in args else None} @@ -287,8 +288,17 @@ class Generator (GNet): """ def __init__(self,**args): - GNet.__init__(self,**args) - self.discriminator = Discriminator(**args) + if 'trainer' not in args : + GNet.__init__(self,**args) + self.discriminator = Discriminator(**args) + else: + _args = {} + _trainer = args['trainer'] + for key in vars(_trainer) : + value = getattr(_trainer,key) + setattr(self,key,value) + _args[key] = value + self.discriminator = Discriminator(**_args) def loss(self,**args): fake = args['fake'] label = args['label'] diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 7b3a347..5053e9b 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -33,6 +33,7 @@ class Learner(Process): super(Learner, self).__init__() + self._arch = {'init':_args} self.ndx = 0 self._queue = Queue() self.lock = RLock() @@ -44,6 +45,8 @@ class Learner(Process): self.gpu = None self.info = _args['info'] + if 'context' not in self.info : + self.info['context'] = self.info['from'] self.columns = self.info['columns'] if 'columns' in self.info else None self.store = _args['store'] @@ -97,9 +100,12 @@ class Learner(Process): # __info = (pd.DataFrame(self._states)[['name','path','args']]).to_dict(orient='records') if self._states : __info = {} - + # print (self._states) for key in self._states : - __info[key] = [{"name":_item['name'],"args":_item['args'],"path":_item['path']} for _item in self._states[key]] + _pipeline = self._states[key] + + # __info[key] = ([{'name':_payload['name']} for _payload in _pipeline]) + __info[key] = [{"name":_item['name'],"args":_item['args'],"path":_item['path']} for _item in self._states[key] if _item ] self.log(object='state-space',action='load',input=__info) @@ -270,18 +276,23 @@ class Trainer(Learner): # _epochs = [_e for _e in gTrain.logs['epochs'] if _e['path'] != ''] _epochs.sort(key=lambda _item: _item['loss'],reverse=False) - + _args['network_args']['max_epochs'] = _epochs[0]['epochs'] self.log(action='autopilot',input={'epoch':_epochs[0]}) - g = Generator(**_args) + # g.run() end = datetime.now() #.strftime('%Y-%m-%d %H:%M:%S') _min = float((end-beg).seconds/ 60) _logs = {'action':'train','input':{'start':beg.strftime('%Y-%m-%d %H:%M:%S'),'minutes':_min,"unique_counts":self._encoder._io[0]}} self.log(**_logs) - self._g = g - if self.autopilot : + + if self.autopilot : + + # g = Generator(**_args) + + g = Generator(**self._arch['init']) + self._g = g self._g.run() # #@TODO Find a way to have the data in the object .... @@ -300,10 +311,15 @@ class Generator (Learner): # # We need to load the mapping information for the space we are working with ... # + + self.network_args['candidates'] = int(_args['candidates']) if 'candidates' in _args else 1 - filename = os.sep.join([self.network_args['logs'],'output',self.network_args['context'],'map.json']) + # filename = os.sep.join([self.network_args['logs'],'output',self.network_args['context'],'map.json']) + _suffix = self.network_args['context'] + filename = os.sep.join([self.network_args['logs'],'output',self.network_args['context'],'meta-',_suffix,'.json']) self.log(**{'action':'init-map','input':{'filename':filename,'exists':os.path.exists(filename)}}) if os.path.exists(filename): + file = open(filename) self._map = json.loads(file.read()) file.close() @@ -580,6 +596,7 @@ class factory : """ + # if _args['apply'] in [apply.RANDOM] : pthread = Shuffle(**_args) diff --git a/data/maker/apply.py b/data/maker/apply.py index bb6a085..58ae094 100644 --- a/data/maker/apply.py +++ b/data/maker/apply.py @@ -69,7 +69,7 @@ class Date(Post): """ """ - pass + pass class Approximate(Post): def apply(**_args): pass diff --git a/data/maker/state/__init__.py b/data/maker/state/__init__.py index adf9837..f1b8da0 100644 --- a/data/maker/state/__init__.py +++ b/data/maker/state/__init__.py @@ -31,12 +31,22 @@ class State : continue pointer = _item['module'] - _args = _item['args'] + + if type(pointer).__name__ != 'function': + _args = _item['args'] if 'args' in _item else {} + else: + pointer = _item['module'] + + _args = _item['args'] if 'args' in _item else {} + _data = pointer(_data,_args) return _data @staticmethod def instance(_args): + """ + + """ pre = [] post=[] @@ -45,8 +55,20 @@ class State : # # If the item has a path property is should be ignored path = _args[key]['path'] if 'path' in _args[key] else '' - out[key] = [ State._build(dict(_item,**{'path':path})) if 'path' not in _item else State._build(_item) for _item in _args[key]['pipeline']] - + # out[key] = [ State._build(dict(_item,**{'path':path})) if 'path' not in _item else State._build(_item) for _item in _args[key]['pipeline']] + out[key] = [] + for _item in _args[key]['pipeline'] : + + if type(_item).__name__ == 'function': + _stageInfo = {'module':_item,'name':_item.__name__,'args':{},'path':''} + pass + else: + if 'path' in _item : + _stageInfo = State._build(dict(_item,**{'path':path})) + else : + _stageInfo= State._build(_item) + out[key].append(_stageInfo) + # print ([out]) return out # if 'pre' in _args: # path = _args['pre']['path'] if 'path' in _args['pre'] else '' @@ -68,11 +90,18 @@ class State : pass @staticmethod def _build(_args): - + """ + This function builds the object {module,path} where module is extracted from a file (if needed) + :param _args dictionary containing attributes that can be value pair + It can also be a function + """ + # + # In the advent an actual pointer is passed we should do the following + _info = State._extract(_args) # _info = dict(_args,**_info) - _info['module'] = State._instance(_info) + _info['module'] = State._instance(_info) return _info if _info['module'] is not None else None @staticmethod From 78a650b29b4df5ae8e15c404f6eb617e71b6c6f3 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 25 Apr 2023 15:07:35 -0500 Subject: [PATCH 3/6] bug fixes ... --- data/gan.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/data/gan.py b/data/gan.py index a794ebe..7860e0f 100644 --- a/data/gan.py +++ b/data/gan.py @@ -667,7 +667,9 @@ class Predict(GNet): fake = self.generator.network(inputs=z, label=label) init = tf.compat.v1.global_variables_initializer() - saver = tf.compat.v1.train.Saver() + print ([self.CHECKPOINTS]) + # saver = tf.compat.v1.train.Saver() + saver = tf.compat.v1.train.Saver(max_to_keep=len(self.CHECKPOINTS)) df = pd.DataFrame() CANDIDATE_COUNT = args['candidates'] if 'candidates' in args else 1 #0 if self.ROW_COUNT < 1000 else 100 candidates = [] From ef43f20e9c45d6b73edd8e5898131874aa663a67 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 7 Jun 2023 11:59:19 -0500 Subject: [PATCH 4/6] matrix processing: special case --- data/maker/__init__.py | 5 ++++- setup.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 5053e9b..09bdb4c 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -501,7 +501,10 @@ class Generator (Learner): N = 0 for _iodf in _candidates : _df = self._df.copy() - _df[self.columns] = _iodf[self.columns] + if self.columns : + _df[self.columns] = _iodf[self.columns] + + N += _df.shape[0] if self._states and 'post' in self._states: _df = State.apply(_df,self._states['post']) diff --git a/setup.py b/setup.py index 7efb8c5..505dae7 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.6.6", +args = {"name":"data-maker","version":"1.6.8", "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vumc.org","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://github.com/lnyemba/data-transport.git','tensorflow'] From b9596edd8ec6498538b7c1d527a505fe762b9e9d Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 25 Jul 2023 14:30:20 -0500 Subject: [PATCH 5/6] bug fix: random shuffle improvements --- data/maker/__init__.py | 59 ++++++++++++++++++++++++++++++------------ data/maker/version.py | 1 + setup.py | 4 +-- version.py | 1 + 4 files changed, 47 insertions(+), 18 deletions(-) create mode 100644 data/maker/version.py create mode 120000 version.py diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 09bdb4c..8327eea 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -22,7 +22,7 @@ import nujson as json from multiprocessing import Process, RLock from datetime import datetime, timedelta from multiprocessing import Queue - +from version import __version__ import time @@ -179,6 +179,7 @@ class Learner(Process): for name in columns : # # randomly sampling 5 elements to make sense of data-types + if self._df[name].size < 5 : continue _index = np.random.choice(np.arange(self._df[name].size),5,False) @@ -552,27 +553,53 @@ class Shuffle(Generator): """ def __init__(self,**_args): super().__init__(**_args) + if 'data' not in _args : + reader = transport.factory.instance(**self.store['source']) + self._df = reader.read(sql=self.info['sql']) + def run(self): np.random.seed(1) + self.initalize() - _index = np.arange(self._df.shape[0]) - np.random.shuffle(_index) - np.random.shuffle(_index) - _iocolumns = self.info['columns'] - _ocolumns = list(set(self._df.columns) - set(_iocolumns) ) - # _iodf = pd.DataFrame(self._df[_ocolumns],self._df.loc[_index][_iocolumns],index=np.arange(_index.size)) - _iodf = pd.DataFrame(self._df[_iocolumns].copy(),index = np.arange(_index.size)) - # self._df = self._df.loc[_index][_ocolumns].join(_iodf) - self._df = self._df.loc[_index][_ocolumns] - self._df.index = np.arange(self._df.shape[0]) - self._df = self._df.join(_iodf) # - # The following is a full shuffle - self._df = self._df.loc[_index] - self._df.index = np.arange(self._df.shape[0]) - + # If we are given lists of columns instead of a list-of-list + # unpack the list + _invColumns = [] + _colNames = [] + _ucolNames= [] + for _item in self.info['columns'] : + if type(_item) == list : + _invColumns.append(_item) + elif _item in self._df.columns.tolist(): + _colNames.append(_item) + # + # At this point we build the matrix of elements we are interested in considering the any unspecified column + # + if _colNames : + _invColumns.append(_colNames) + _ucolNames = list(set(self._df.columns) - set(_colNames)) + if _ucolNames : + _invColumns += [ [_name] for _name in _ucolNames] + + _xdf = pd.DataFrame() + _xdf = pd.DataFrame() + _index = np.arange(self._df.shape[0]) + for _columns in _invColumns : + + _tmpdf = self._df[_columns].copy()[_columns] + np.random.shuffle(_index) + + _tmpdf = _tmpdf.iloc[_index] + + if _xdf.shape[0] == 0 : + _xdf = _tmpdf + else: + _xdf = _xdf.join(_tmpdf) + + _xdf = _xdf[self._df.columns] + self._df = _xdf _log = {'action':'io-data','input':{'candidates':1,'rows':int(self._df.shape[0])}} self.log(**_log) try: diff --git a/data/maker/version.py b/data/maker/version.py new file mode 100644 index 0000000..6e0eb49 --- /dev/null +++ b/data/maker/version.py @@ -0,0 +1 @@ +__version__='1.7.0' diff --git a/setup.py b/setup.py index 505dae7..a30ac52 100644 --- a/setup.py +++ b/setup.py @@ -1,10 +1,10 @@ from setuptools import setup, find_packages import os import sys - +import version def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.6.8", +args = {"name":"data-maker","version":version.__version__, "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vumc.org","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://github.com/lnyemba/data-transport.git','tensorflow'] diff --git a/version.py b/version.py new file mode 120000 index 0000000..85fd196 --- /dev/null +++ b/version.py @@ -0,0 +1 @@ +data/maker/version.py \ No newline at end of file From c1c17bc59d1ecfe703dbaf67fc69102786d794ef Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 25 Jul 2023 14:40:45 -0500 Subject: [PATCH 6/6] bug fix: random shuffle --- data/maker/__init__.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 8327eea..28d224c 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -559,7 +559,7 @@ class Shuffle(Generator): def run(self): - np.random.seed(1) + self.initalize() # @@ -589,10 +589,12 @@ class Shuffle(Generator): for _columns in _invColumns : _tmpdf = self._df[_columns].copy()[_columns] + np.random.seed(1) np.random.shuffle(_index) - - _tmpdf = _tmpdf.iloc[_index] - + print (_columns,_index) + # _values = _tmpdf.values[_index] + #_tmpdf = _tmpdf.iloc[_index] + _tmpdf = pd.DataFrame(_tmpdf.values[_index],columns=_columns) if _xdf.shape[0] == 0 : _xdf = _tmpdf else: