Compare commits

..

7 Commits

Author SHA1 Message Date
Steve Nyemba c865e59ff4 Merge branch 'dev' 2023-07-25 14:42:50 -05:00
Steve Nyemba c1c17bc59d bug fix: random shuffle 2023-07-25 14:40:45 -05:00
Steve Nyemba b9596edd8e bug fix: random shuffle improvements 2023-07-25 14:30:20 -05:00
Steve Nyemba ef43f20e9c matrix processing: special case 2023-06-07 11:59:19 -05:00
Steve Nyemba 78a650b29b bug fixes ... 2023-04-25 15:07:35 -05:00
Steve Nyemba e5af702ddb bug fixes: stages, other training/generation 2023-04-24 16:37:08 -05:00
Steve Nyemba f1e2fe3699 bug fixes: stages, other training/generation 2023-04-24 16:36:25 -05:00
7 changed files with 129 additions and 37 deletions

View File

@ -103,11 +103,12 @@ class GNet :
CHECKPOINT_SKIPS = int(args['checkpoint_skips']) if 'checkpoint_skips' in args else int(self.MAX_EPOCHS/10)
CHECKPOINT_SKIPS = 1 if CHECKPOINT_SKIPS < 1 else CHECKPOINT_SKIPS
# if self.MAX_EPOCHS < 2*CHECKPOINT_SKIPS :
# CHECKPOINT_SKIPS = 2
# self.CHECKPOINTS = [1,self.MAX_EPOCHS] + np.repeat( np.divide(self.MAX_EPOCHS,CHECKPOINT_SKIPS),CHECKPOINT_SKIPS ).cumsum().astype(int).tolist()
self.CHECKPOINTS = np.repeat(CHECKPOINT_SKIPS, self.MAX_EPOCHS/ CHECKPOINT_SKIPS).cumsum().astype(int).tolist()
self.ROW_COUNT = args['real'].shape[0] if 'real' in args else 100
self.CONTEXT = args['context']
self.ATTRIBUTES = {"id":args['column_id'] if 'column_id' in args else None,"synthetic":args['column'] if 'column' in args else None}
@ -287,8 +288,17 @@ class Generator (GNet):
"""
def __init__(self,**args):
GNet.__init__(self,**args)
self.discriminator = Discriminator(**args)
if 'trainer' not in args :
GNet.__init__(self,**args)
self.discriminator = Discriminator(**args)
else:
_args = {}
_trainer = args['trainer']
for key in vars(_trainer) :
value = getattr(_trainer,key)
setattr(self,key,value)
_args[key] = value
self.discriminator = Discriminator(**_args)
def loss(self,**args):
fake = args['fake']
label = args['label']
@ -657,7 +667,9 @@ class Predict(GNet):
fake = self.generator.network(inputs=z, label=label)
init = tf.compat.v1.global_variables_initializer()
saver = tf.compat.v1.train.Saver()
print ([self.CHECKPOINTS])
# saver = tf.compat.v1.train.Saver()
saver = tf.compat.v1.train.Saver(max_to_keep=len(self.CHECKPOINTS))
df = pd.DataFrame()
CANDIDATE_COUNT = args['candidates'] if 'candidates' in args else 1 #0 if self.ROW_COUNT < 1000 else 100
candidates = []

View File

@ -22,7 +22,7 @@ import nujson as json
from multiprocessing import Process, RLock
from datetime import datetime, timedelta
from multiprocessing import Queue
from version import __version__
import time
@ -33,6 +33,7 @@ class Learner(Process):
super(Learner, self).__init__()
self._arch = {'init':_args}
self.ndx = 0
self._queue = Queue()
self.lock = RLock()
@ -44,6 +45,8 @@ class Learner(Process):
self.gpu = None
self.info = _args['info']
if 'context' not in self.info :
self.info['context'] = self.info['from']
self.columns = self.info['columns'] if 'columns' in self.info else None
self.store = _args['store']
@ -97,9 +100,12 @@ class Learner(Process):
# __info = (pd.DataFrame(self._states)[['name','path','args']]).to_dict(orient='records')
if self._states :
__info = {}
# print (self._states)
for key in self._states :
__info[key] = [{"name":_item['name'],"args":_item['args'],"path":_item['path']} for _item in self._states[key]]
_pipeline = self._states[key]
# __info[key] = ([{'name':_payload['name']} for _payload in _pipeline])
__info[key] = [{"name":_item['name'],"args":_item['args'],"path":_item['path']} for _item in self._states[key] if _item ]
self.log(object='state-space',action='load',input=__info)
@ -173,6 +179,7 @@ class Learner(Process):
for name in columns :
#
# randomly sampling 5 elements to make sense of data-types
if self._df[name].size < 5 :
continue
_index = np.random.choice(np.arange(self._df[name].size),5,False)
@ -270,18 +277,23 @@ class Trainer(Learner):
#
_epochs = [_e for _e in gTrain.logs['epochs'] if _e['path'] != '']
_epochs.sort(key=lambda _item: _item['loss'],reverse=False)
_args['network_args']['max_epochs'] = _epochs[0]['epochs']
self.log(action='autopilot',input={'epoch':_epochs[0]})
g = Generator(**_args)
# g.run()
end = datetime.now() #.strftime('%Y-%m-%d %H:%M:%S')
_min = float((end-beg).seconds/ 60)
_logs = {'action':'train','input':{'start':beg.strftime('%Y-%m-%d %H:%M:%S'),'minutes':_min,"unique_counts":self._encoder._io[0]}}
self.log(**_logs)
self._g = g
if self.autopilot :
if self.autopilot :
# g = Generator(**_args)
g = Generator(**self._arch['init'])
self._g = g
self._g.run()
#
#@TODO Find a way to have the data in the object ....
@ -300,10 +312,15 @@ class Generator (Learner):
#
# We need to load the mapping information for the space we are working with ...
#
self.network_args['candidates'] = int(_args['candidates']) if 'candidates' in _args else 1
filename = os.sep.join([self.network_args['logs'],'output',self.network_args['context'],'map.json'])
# filename = os.sep.join([self.network_args['logs'],'output',self.network_args['context'],'map.json'])
_suffix = self.network_args['context']
filename = os.sep.join([self.network_args['logs'],'output',self.network_args['context'],'meta-',_suffix,'.json'])
self.log(**{'action':'init-map','input':{'filename':filename,'exists':os.path.exists(filename)}})
if os.path.exists(filename):
file = open(filename)
self._map = json.loads(file.read())
file.close()
@ -485,7 +502,10 @@ class Generator (Learner):
N = 0
for _iodf in _candidates :
_df = self._df.copy()
_df[self.columns] = _iodf[self.columns]
if self.columns :
_df[self.columns] = _iodf[self.columns]
N += _df.shape[0]
if self._states and 'post' in self._states:
_df = State.apply(_df,self._states['post'])
@ -533,27 +553,55 @@ class Shuffle(Generator):
"""
def __init__(self,**_args):
super().__init__(**_args)
if 'data' not in _args :
reader = transport.factory.instance(**self.store['source'])
self._df = reader.read(sql=self.info['sql'])
def run(self):
np.random.seed(1)
self.initalize()
_index = np.arange(self._df.shape[0])
np.random.shuffle(_index)
np.random.shuffle(_index)
_iocolumns = self.info['columns']
_ocolumns = list(set(self._df.columns) - set(_iocolumns) )
# _iodf = pd.DataFrame(self._df[_ocolumns],self._df.loc[_index][_iocolumns],index=np.arange(_index.size))
_iodf = pd.DataFrame(self._df[_iocolumns].copy(),index = np.arange(_index.size))
# self._df = self._df.loc[_index][_ocolumns].join(_iodf)
self._df = self._df.loc[_index][_ocolumns]
self._df.index = np.arange(self._df.shape[0])
self._df = self._df.join(_iodf)
#
# The following is a full shuffle
self._df = self._df.loc[_index]
self._df.index = np.arange(self._df.shape[0])
# If we are given lists of columns instead of a list-of-list
# unpack the list
_invColumns = []
_colNames = []
_ucolNames= []
for _item in self.info['columns'] :
if type(_item) == list :
_invColumns.append(_item)
elif _item in self._df.columns.tolist():
_colNames.append(_item)
#
# At this point we build the matrix of elements we are interested in considering the any unspecified column
#
if _colNames :
_invColumns.append(_colNames)
_ucolNames = list(set(self._df.columns) - set(_colNames))
if _ucolNames :
_invColumns += [ [_name] for _name in _ucolNames]
_xdf = pd.DataFrame()
_xdf = pd.DataFrame()
_index = np.arange(self._df.shape[0])
for _columns in _invColumns :
_tmpdf = self._df[_columns].copy()[_columns]
np.random.seed(1)
np.random.shuffle(_index)
print (_columns,_index)
# _values = _tmpdf.values[_index]
#_tmpdf = _tmpdf.iloc[_index]
_tmpdf = pd.DataFrame(_tmpdf.values[_index],columns=_columns)
if _xdf.shape[0] == 0 :
_xdf = _tmpdf
else:
_xdf = _xdf.join(_tmpdf)
_xdf = _xdf[self._df.columns]
self._df = _xdf
_log = {'action':'io-data','input':{'candidates':1,'rows':int(self._df.shape[0])}}
self.log(**_log)
try:
@ -580,6 +628,7 @@ class factory :
"""
#
if _args['apply'] in [apply.RANDOM] :
pthread = Shuffle(**_args)

View File

@ -69,7 +69,7 @@ class Date(Post):
"""
"""
pass
pass
class Approximate(Post):
def apply(**_args):
pass

View File

@ -31,12 +31,22 @@ class State :
continue
pointer = _item['module']
_args = _item['args']
if type(pointer).__name__ != 'function':
_args = _item['args'] if 'args' in _item else {}
else:
pointer = _item['module']
_args = _item['args'] if 'args' in _item else {}
_data = pointer(_data,_args)
return _data
@staticmethod
def instance(_args):
"""
"""
pre = []
post=[]
@ -45,8 +55,20 @@ class State :
#
# If the item has a path property is should be ignored
path = _args[key]['path'] if 'path' in _args[key] else ''
out[key] = [ State._build(dict(_item,**{'path':path})) if 'path' not in _item else State._build(_item) for _item in _args[key]['pipeline']]
# out[key] = [ State._build(dict(_item,**{'path':path})) if 'path' not in _item else State._build(_item) for _item in _args[key]['pipeline']]
out[key] = []
for _item in _args[key]['pipeline'] :
if type(_item).__name__ == 'function':
_stageInfo = {'module':_item,'name':_item.__name__,'args':{},'path':''}
pass
else:
if 'path' in _item :
_stageInfo = State._build(dict(_item,**{'path':path}))
else :
_stageInfo= State._build(_item)
out[key].append(_stageInfo)
# print ([out])
return out
# if 'pre' in _args:
# path = _args['pre']['path'] if 'path' in _args['pre'] else ''
@ -68,11 +90,18 @@ class State :
pass
@staticmethod
def _build(_args):
"""
This function builds the object {module,path} where module is extracted from a file (if needed)
:param _args dictionary containing attributes that can be value pair
It can also be a function
"""
#
# In the advent an actual pointer is passed we should do the following
_info = State._extract(_args)
# _info = dict(_args,**_info)
_info['module'] = State._instance(_info)
_info['module'] = State._instance(_info)
return _info if _info['module'] is not None else None
@staticmethod

1
data/maker/version.py Normal file
View File

@ -0,0 +1 @@
__version__='1.7.0'

View File

@ -1,10 +1,10 @@
from setuptools import setup, find_packages
import os
import sys
import version
def read(fname):
return open(os.path.join(os.path.dirname(__file__), fname)).read()
args = {"name":"data-maker","version":"1.6.4",
args = {"name":"data-maker","version":version.__version__,
"author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vumc.org","license":"MIT",
"packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]}
args["install_requires"] = ['data-transport@git+https://github.com/lnyemba/data-transport.git','tensorflow']

1
version.py Symbolic link
View File

@ -0,0 +1 @@
data/maker/version.py