Compare commits

..

7 Commits

Author SHA1 Message Date
Steve Nyemba c865e59ff4 Merge branch 'dev' 2023-07-25 14:42:50 -05:00
Steve Nyemba c1c17bc59d bug fix: random shuffle 2023-07-25 14:40:45 -05:00
Steve Nyemba b9596edd8e bug fix: random shuffle improvements 2023-07-25 14:30:20 -05:00
Steve Nyemba ef43f20e9c matrix processing: special case 2023-06-07 11:59:19 -05:00
Steve Nyemba 78a650b29b bug fixes ... 2023-04-25 15:07:35 -05:00
Steve Nyemba e5af702ddb bug fixes: stages, other training/generation 2023-04-24 16:37:08 -05:00
Steve Nyemba f1e2fe3699 bug fixes: stages, other training/generation 2023-04-24 16:36:25 -05:00
7 changed files with 129 additions and 37 deletions

View File

@ -103,11 +103,12 @@ class GNet :
CHECKPOINT_SKIPS = int(args['checkpoint_skips']) if 'checkpoint_skips' in args else int(self.MAX_EPOCHS/10) CHECKPOINT_SKIPS = int(args['checkpoint_skips']) if 'checkpoint_skips' in args else int(self.MAX_EPOCHS/10)
CHECKPOINT_SKIPS = 1 if CHECKPOINT_SKIPS < 1 else CHECKPOINT_SKIPS CHECKPOINT_SKIPS = 1 if CHECKPOINT_SKIPS < 1 else CHECKPOINT_SKIPS
# if self.MAX_EPOCHS < 2*CHECKPOINT_SKIPS : # if self.MAX_EPOCHS < 2*CHECKPOINT_SKIPS :
# CHECKPOINT_SKIPS = 2 # CHECKPOINT_SKIPS = 2
# self.CHECKPOINTS = [1,self.MAX_EPOCHS] + np.repeat( np.divide(self.MAX_EPOCHS,CHECKPOINT_SKIPS),CHECKPOINT_SKIPS ).cumsum().astype(int).tolist() # self.CHECKPOINTS = [1,self.MAX_EPOCHS] + np.repeat( np.divide(self.MAX_EPOCHS,CHECKPOINT_SKIPS),CHECKPOINT_SKIPS ).cumsum().astype(int).tolist()
self.CHECKPOINTS = np.repeat(CHECKPOINT_SKIPS, self.MAX_EPOCHS/ CHECKPOINT_SKIPS).cumsum().astype(int).tolist() self.CHECKPOINTS = np.repeat(CHECKPOINT_SKIPS, self.MAX_EPOCHS/ CHECKPOINT_SKIPS).cumsum().astype(int).tolist()
self.ROW_COUNT = args['real'].shape[0] if 'real' in args else 100 self.ROW_COUNT = args['real'].shape[0] if 'real' in args else 100
self.CONTEXT = args['context'] self.CONTEXT = args['context']
self.ATTRIBUTES = {"id":args['column_id'] if 'column_id' in args else None,"synthetic":args['column'] if 'column' in args else None} self.ATTRIBUTES = {"id":args['column_id'] if 'column_id' in args else None,"synthetic":args['column'] if 'column' in args else None}
@ -287,8 +288,17 @@ class Generator (GNet):
""" """
def __init__(self,**args): def __init__(self,**args):
GNet.__init__(self,**args) if 'trainer' not in args :
self.discriminator = Discriminator(**args) GNet.__init__(self,**args)
self.discriminator = Discriminator(**args)
else:
_args = {}
_trainer = args['trainer']
for key in vars(_trainer) :
value = getattr(_trainer,key)
setattr(self,key,value)
_args[key] = value
self.discriminator = Discriminator(**_args)
def loss(self,**args): def loss(self,**args):
fake = args['fake'] fake = args['fake']
label = args['label'] label = args['label']
@ -657,7 +667,9 @@ class Predict(GNet):
fake = self.generator.network(inputs=z, label=label) fake = self.generator.network(inputs=z, label=label)
init = tf.compat.v1.global_variables_initializer() init = tf.compat.v1.global_variables_initializer()
saver = tf.compat.v1.train.Saver() print ([self.CHECKPOINTS])
# saver = tf.compat.v1.train.Saver()
saver = tf.compat.v1.train.Saver(max_to_keep=len(self.CHECKPOINTS))
df = pd.DataFrame() df = pd.DataFrame()
CANDIDATE_COUNT = args['candidates'] if 'candidates' in args else 1 #0 if self.ROW_COUNT < 1000 else 100 CANDIDATE_COUNT = args['candidates'] if 'candidates' in args else 1 #0 if self.ROW_COUNT < 1000 else 100
candidates = [] candidates = []

View File

@ -22,7 +22,7 @@ import nujson as json
from multiprocessing import Process, RLock from multiprocessing import Process, RLock
from datetime import datetime, timedelta from datetime import datetime, timedelta
from multiprocessing import Queue from multiprocessing import Queue
from version import __version__
import time import time
@ -33,6 +33,7 @@ class Learner(Process):
super(Learner, self).__init__() super(Learner, self).__init__()
self._arch = {'init':_args}
self.ndx = 0 self.ndx = 0
self._queue = Queue() self._queue = Queue()
self.lock = RLock() self.lock = RLock()
@ -44,6 +45,8 @@ class Learner(Process):
self.gpu = None self.gpu = None
self.info = _args['info'] self.info = _args['info']
if 'context' not in self.info :
self.info['context'] = self.info['from']
self.columns = self.info['columns'] if 'columns' in self.info else None self.columns = self.info['columns'] if 'columns' in self.info else None
self.store = _args['store'] self.store = _args['store']
@ -97,9 +100,12 @@ class Learner(Process):
# __info = (pd.DataFrame(self._states)[['name','path','args']]).to_dict(orient='records') # __info = (pd.DataFrame(self._states)[['name','path','args']]).to_dict(orient='records')
if self._states : if self._states :
__info = {} __info = {}
# print (self._states)
for key in self._states : for key in self._states :
__info[key] = [{"name":_item['name'],"args":_item['args'],"path":_item['path']} for _item in self._states[key]] _pipeline = self._states[key]
# __info[key] = ([{'name':_payload['name']} for _payload in _pipeline])
__info[key] = [{"name":_item['name'],"args":_item['args'],"path":_item['path']} for _item in self._states[key] if _item ]
self.log(object='state-space',action='load',input=__info) self.log(object='state-space',action='load',input=__info)
@ -173,6 +179,7 @@ class Learner(Process):
for name in columns : for name in columns :
# #
# randomly sampling 5 elements to make sense of data-types # randomly sampling 5 elements to make sense of data-types
if self._df[name].size < 5 : if self._df[name].size < 5 :
continue continue
_index = np.random.choice(np.arange(self._df[name].size),5,False) _index = np.random.choice(np.arange(self._df[name].size),5,False)
@ -270,18 +277,23 @@ class Trainer(Learner):
# #
_epochs = [_e for _e in gTrain.logs['epochs'] if _e['path'] != ''] _epochs = [_e for _e in gTrain.logs['epochs'] if _e['path'] != '']
_epochs.sort(key=lambda _item: _item['loss'],reverse=False) _epochs.sort(key=lambda _item: _item['loss'],reverse=False)
_args['network_args']['max_epochs'] = _epochs[0]['epochs'] _args['network_args']['max_epochs'] = _epochs[0]['epochs']
self.log(action='autopilot',input={'epoch':_epochs[0]}) self.log(action='autopilot',input={'epoch':_epochs[0]})
g = Generator(**_args)
# g.run() # g.run()
end = datetime.now() #.strftime('%Y-%m-%d %H:%M:%S') end = datetime.now() #.strftime('%Y-%m-%d %H:%M:%S')
_min = float((end-beg).seconds/ 60) _min = float((end-beg).seconds/ 60)
_logs = {'action':'train','input':{'start':beg.strftime('%Y-%m-%d %H:%M:%S'),'minutes':_min,"unique_counts":self._encoder._io[0]}} _logs = {'action':'train','input':{'start':beg.strftime('%Y-%m-%d %H:%M:%S'),'minutes':_min,"unique_counts":self._encoder._io[0]}}
self.log(**_logs) self.log(**_logs)
self._g = g
if self.autopilot : if self.autopilot :
# g = Generator(**_args)
g = Generator(**self._arch['init'])
self._g = g
self._g.run() self._g.run()
# #
#@TODO Find a way to have the data in the object .... #@TODO Find a way to have the data in the object ....
@ -300,10 +312,15 @@ class Generator (Learner):
# #
# We need to load the mapping information for the space we are working with ... # We need to load the mapping information for the space we are working with ...
# #
self.network_args['candidates'] = int(_args['candidates']) if 'candidates' in _args else 1 self.network_args['candidates'] = int(_args['candidates']) if 'candidates' in _args else 1
filename = os.sep.join([self.network_args['logs'],'output',self.network_args['context'],'map.json']) # filename = os.sep.join([self.network_args['logs'],'output',self.network_args['context'],'map.json'])
_suffix = self.network_args['context']
filename = os.sep.join([self.network_args['logs'],'output',self.network_args['context'],'meta-',_suffix,'.json'])
self.log(**{'action':'init-map','input':{'filename':filename,'exists':os.path.exists(filename)}}) self.log(**{'action':'init-map','input':{'filename':filename,'exists':os.path.exists(filename)}})
if os.path.exists(filename): if os.path.exists(filename):
file = open(filename) file = open(filename)
self._map = json.loads(file.read()) self._map = json.loads(file.read())
file.close() file.close()
@ -485,7 +502,10 @@ class Generator (Learner):
N = 0 N = 0
for _iodf in _candidates : for _iodf in _candidates :
_df = self._df.copy() _df = self._df.copy()
_df[self.columns] = _iodf[self.columns] if self.columns :
_df[self.columns] = _iodf[self.columns]
N += _df.shape[0] N += _df.shape[0]
if self._states and 'post' in self._states: if self._states and 'post' in self._states:
_df = State.apply(_df,self._states['post']) _df = State.apply(_df,self._states['post'])
@ -533,27 +553,55 @@ class Shuffle(Generator):
""" """
def __init__(self,**_args): def __init__(self,**_args):
super().__init__(**_args) super().__init__(**_args)
if 'data' not in _args :
reader = transport.factory.instance(**self.store['source'])
self._df = reader.read(sql=self.info['sql'])
def run(self): def run(self):
np.random.seed(1)
self.initalize() self.initalize()
_index = np.arange(self._df.shape[0])
np.random.shuffle(_index)
np.random.shuffle(_index)
_iocolumns = self.info['columns']
_ocolumns = list(set(self._df.columns) - set(_iocolumns) )
# _iodf = pd.DataFrame(self._df[_ocolumns],self._df.loc[_index][_iocolumns],index=np.arange(_index.size))
_iodf = pd.DataFrame(self._df[_iocolumns].copy(),index = np.arange(_index.size))
# self._df = self._df.loc[_index][_ocolumns].join(_iodf)
self._df = self._df.loc[_index][_ocolumns]
self._df.index = np.arange(self._df.shape[0])
self._df = self._df.join(_iodf)
# #
# The following is a full shuffle # If we are given lists of columns instead of a list-of-list
self._df = self._df.loc[_index] # unpack the list
self._df.index = np.arange(self._df.shape[0]) _invColumns = []
_colNames = []
_ucolNames= []
for _item in self.info['columns'] :
if type(_item) == list :
_invColumns.append(_item)
elif _item in self._df.columns.tolist():
_colNames.append(_item)
#
# At this point we build the matrix of elements we are interested in, taking any unspecified column into account
#
if _colNames :
_invColumns.append(_colNames)
_ucolNames = list(set(self._df.columns) - set(_colNames))
if _ucolNames :
_invColumns += [ [_name] for _name in _ucolNames]
_xdf = pd.DataFrame()
_xdf = pd.DataFrame()
_index = np.arange(self._df.shape[0])
for _columns in _invColumns :
_tmpdf = self._df[_columns].copy()[_columns]
np.random.seed(1)
np.random.shuffle(_index)
print (_columns,_index)
# _values = _tmpdf.values[_index]
#_tmpdf = _tmpdf.iloc[_index]
_tmpdf = pd.DataFrame(_tmpdf.values[_index],columns=_columns)
if _xdf.shape[0] == 0 :
_xdf = _tmpdf
else:
_xdf = _xdf.join(_tmpdf)
_xdf = _xdf[self._df.columns]
self._df = _xdf
_log = {'action':'io-data','input':{'candidates':1,'rows':int(self._df.shape[0])}} _log = {'action':'io-data','input':{'candidates':1,'rows':int(self._df.shape[0])}}
self.log(**_log) self.log(**_log)
try: try:
@ -580,6 +628,7 @@ class factory :
""" """
#
if _args['apply'] in [apply.RANDOM] : if _args['apply'] in [apply.RANDOM] :
pthread = Shuffle(**_args) pthread = Shuffle(**_args)

View File

@ -69,7 +69,7 @@ class Date(Post):
""" """
""" """
pass pass
class Approximate(Post): class Approximate(Post):
def apply(**_args): def apply(**_args):
pass pass

View File

@ -31,12 +31,22 @@ class State :
continue continue
pointer = _item['module'] pointer = _item['module']
_args = _item['args']
if type(pointer).__name__ != 'function':
_args = _item['args'] if 'args' in _item else {}
else:
pointer = _item['module']
_args = _item['args'] if 'args' in _item else {}
_data = pointer(_data,_args) _data = pointer(_data,_args)
return _data return _data
@staticmethod @staticmethod
def instance(_args): def instance(_args):
"""
"""
pre = [] pre = []
post=[] post=[]
@ -45,8 +55,20 @@ class State :
# #
# If the item has a path property it should be ignored # If the item has a path property it should be ignored
path = _args[key]['path'] if 'path' in _args[key] else '' path = _args[key]['path'] if 'path' in _args[key] else ''
out[key] = [ State._build(dict(_item,**{'path':path})) if 'path' not in _item else State._build(_item) for _item in _args[key]['pipeline']] # out[key] = [ State._build(dict(_item,**{'path':path})) if 'path' not in _item else State._build(_item) for _item in _args[key]['pipeline']]
out[key] = []
for _item in _args[key]['pipeline'] :
if type(_item).__name__ == 'function':
_stageInfo = {'module':_item,'name':_item.__name__,'args':{},'path':''}
pass
else:
if 'path' in _item :
_stageInfo = State._build(dict(_item,**{'path':path}))
else :
_stageInfo= State._build(_item)
out[key].append(_stageInfo)
# print ([out])
return out return out
# if 'pre' in _args: # if 'pre' in _args:
# path = _args['pre']['path'] if 'path' in _args['pre'] else '' # path = _args['pre']['path'] if 'path' in _args['pre'] else ''
@ -68,11 +90,18 @@ class State :
pass pass
@staticmethod @staticmethod
def _build(_args): def _build(_args):
"""
This function builds the object {module,path} where module is extracted from a file (if needed)
:param _args dictionary containing attributes given as key/value pairs
It can also be a function
"""
#
# In the event an actual pointer is passed we should do the following
_info = State._extract(_args) _info = State._extract(_args)
# _info = dict(_args,**_info) # _info = dict(_args,**_info)
_info['module'] = State._instance(_info) _info['module'] = State._instance(_info)
return _info if _info['module'] is not None else None return _info if _info['module'] is not None else None
@staticmethod @staticmethod

1
data/maker/version.py Normal file
View File

@ -0,0 +1 @@
__version__='1.7.0'

View File

@ -1,10 +1,10 @@
from setuptools import setup, find_packages from setuptools import setup, find_packages
import os import os
import sys import sys
import version
def read(fname): def read(fname):
return open(os.path.join(os.path.dirname(__file__), fname)).read() return open(os.path.join(os.path.dirname(__file__), fname)).read()
args = {"name":"data-maker","version":"1.6.4", args = {"name":"data-maker","version":version.__version__,
"author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vumc.org","license":"MIT", "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vumc.org","license":"MIT",
"packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]}
args["install_requires"] = ['data-transport@git+https://github.com/lnyemba/data-transport.git','tensorflow'] args["install_requires"] = ['data-transport@git+https://github.com/lnyemba/data-transport.git','tensorflow']

1
version.py Symbolic link
View File

@ -0,0 +1 @@
data/maker/version.py