bug fix: misc. improvements

Steve Nyemba 2022-01-13 15:05:00 -06:00
parent 157df9334c
commit f99af3655d
3 changed files with 30 additions and 14 deletions

View File

@@ -20,7 +20,9 @@ EMBEDDED IN CODE :
 """
 import tensorflow as tf
-from tensorflow.contrib.layers import l2_regularizer
+# from tensorflow.contrib.layers import l2_regularizer
+from tensorflow.keras import layers
+from tensorflow.keras.regularizers import L2 as l2_regularizer
 import numpy as np
 import pandas as pd
 import time
@@ -34,7 +36,7 @@ import pickle
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 os.environ['CUDA_VISIBLE_DEVICES'] = "0"
 os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
+tf.compat.v1.disable_eager_execution()
 # STEPS_PER_EPOCH = int(SYS_ARGS['epoch']) if 'epoch' in SYS_ARGS else 256
 # NUM_GPUS = 1 if 'num_gpu' not in SYS_ARGS else int(SYS_ARGS['num_gpu'])
 # BATCHSIZE_PER_GPU = 2000
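
Note: the two hunks above move this module off the TF1-only tf.contrib API. The removed tensorflow.contrib.layers.l2_regularizer is replaced by tf.keras.regularizers.L2 (aliased to the old name so call sites keep working), and tf.compat.v1.disable_eager_execution() lets the existing graph-mode code run under TensorFlow 2. A minimal sketch of the pattern, assuming TF 2.x; the weight shape and the 0.01 penalty factor are illustrative, not taken from this repository:

    import tensorflow as tf
    from tensorflow.keras.regularizers import L2 as l2_regularizer

    tf.compat.v1.disable_eager_execution()          # keep graph-mode code working under TF2

    # hypothetical weight matrix and penalty factor, for illustration only
    w = tf.compat.v1.get_variable('w', shape=[128, 64])
    penalty = l2_regularizer(0.01)(w)               # fills the role tf.contrib.layers.l2_regularizer played in TF1

    with tf.compat.v1.Session() as sess:
        sess.run(tf.compat.v1.global_variables_initializer())
        print(sess.run(penalty))                    # scalar L2 penalty on w
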
@@ -211,13 +213,14 @@ class GNet :
         labels = None if 'labels' not in args else args['labels']
         n_labels= None if 'n_labels' not in args else args['n_labels']
         shift = [0] if self.__class__.__name__.lower() == 'generator' else [1] #-- not sure what this is doing
-        mean, var = tf.nn.moments(inputs, shift, keep_dims=True)
-        shape = inputs.shape[1].value
+        # mean, var = tf.nn.moments(inputs, shift, keep_dims=True)
+        mean, var = tf.nn.moments(inputs, shift,keepdims=True)
+        # shape = inputs.shape[1].value
+        shape = inputs.shape[1]
         if labels is not None:
-            offset_m = self.get.variables(shape=[1,shape], name='offset'+name,
-                initializer=tf.zeros_initializer)
-            scale_m = self.get.variables(shape=[n_labels,shape], name='scale'+name,
-                initializer=tf.ones_initializer)
+            offset_m = self.get.variables(shape=[1,shape], name='offset'+name,initializer=tf.zeros_initializer)
+            scale_m = self.get.variables(shape=[n_labels,shape], name='scale'+name,initializer=tf.ones_initializer)
             offset = tf.nn.embedding_lookup(offset_m, labels)
             scale = tf.nn.embedding_lookup(scale_m, labels)
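
Note: the hunk above tracks two TensorFlow 2 API renames. tf.nn.moments now takes keepdims rather than keep_dims, and TensorShape dimensions are plain ints, so inputs.shape[1] replaces inputs.shape[1].value. A minimal sketch, with an illustrative tensor shape:

    import tensorflow as tf

    x = tf.random.normal([4, 8])                            # illustrative: 4 rows, 8 features
    mean, var = tf.nn.moments(x, axes=[0], keepdims=True)   # TF2 spelling; TF1 used keep_dims
    n_cols = x.shape[1]                                     # already an int in TF2; .value was the TF1 accessor
    print(n_cols, mean.shape, var.shape)                    # 8 (1, 8) (1, 8)
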
@@ -595,7 +598,7 @@ class Predict(GNet):
         df = pd.DataFrame()
         CANDIDATE_COUNT = args['candidates'] if 'candidates' in args else 1 #0 if self.ROW_COUNT < 1000 else 100
         candidates = []
         with tf.compat.v1.Session() as sess:
             saver.restore(sess, model_dir)
             if self._LABEL is not None :

View File

@@ -106,6 +106,8 @@ def train (**_args):
         values = _inputhandler._map[key]['values'].tolist()
         _map[key] = {"beg":beg,"end":end,"values":np.array(values).astype(str).tolist()}
     info = {"rows":_matrix.shape[0],"cols":_matrix.shape[1],"map":_map}
+    print()
+    # print ([_args['context'],_inputhandler._io])
     logger.write({"module":"gan-train","action":"data-prep","context":_args['context'],"input":_inputhandler._io})
     args['logs'] = _args['logs'] if 'logs' in _args else 'logs'
@@ -142,9 +144,10 @@ def generate(**_args):
     :param context
     :param logs
     """
+    _args['logs'] = _args['logs'] if 'logs' in _args else 'logs'
     partition = _args['partition'] if 'partition' in _args else None
     if not partition :
-        MAP_FLDER = os.sep.join([_args['logs'],'output',_args['context']])
+        MAP_FOLDER = os.sep.join([_args['logs'],'output',_args['context']])
         # f = open(os.sep.join([_args['logs'],'output',_args['context'],'map.json']))
     else:
         MAP_FOLDER = os.sep.join([_args['logs'],'output',_args['context'],str(partition)])
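
Note: the hunk above fixes the MAP_FLDER typo and gives generate() a default logs directory before it is used to build MAP_FOLDER. The "x if 'k' in d else default" idiom is equivalent to dict.get; a minimal sketch with illustrative arguments:

    import os

    _args = {'context': 'demo'}                                     # illustrative call with no 'logs' supplied
    _args['logs'] = _args['logs'] if 'logs' in _args else 'logs'    # same effect as _args.get('logs', 'logs')
    MAP_FOLDER = os.sep.join([_args['logs'], 'output', _args['context']])
    print(MAP_FOLDER)                                               # logs/output/demo on POSIX systems
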

View File

@@ -151,6 +151,7 @@ class Components :
         if df.shape[0] and df.shape[0] :
             #
             # We have a full blown matrix to be processed
+            print ('-- Training --')
             data.maker.train(**_args)
         else:
             print ("... skipping training !!")
@@ -259,16 +260,23 @@ class Components :
                 _df[name] = _df[name].apply(lambda value: '' if str(value) == 'NaT' else str(value)[:10])
                 #_df[name] = _df[name].dt.date
                 # _df[name] = pd.to_datetime(_df[name].fillna(''),errors='coerce')
+            else:
+                pass
+                _df[name] = pd.to_datetime(_df[name])
         else:
+            value = 0
             if _item['type'] == 'INTEGER' :
                 _type = np.int64
             elif _item['type'] in ['FLOAT','NUMERIC']:
                 _type = np.float64
             else:
                 _value = ''
-            _df[name] = _df[name].fillna(_value).astype(_type)
+            _df[name] = _df[name].fillna(_value) #.astype(_type)
         columns.append(name)
-    writer.write(_df,schema=_schema,table=args['from'])
+    print ()
+    print (_df)
+    writer.write(_df.astype(object),schema=_schema,table=args['from'])
 else:
     writer.write(_df,table=args['from'])
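
Note: the hunk above relaxes the type coercion done before writing. Date-like columns other than DATE are now parsed with pd.to_datetime, a value = 0 default is introduced for the numeric branch, the .astype(_type) cast is commented out, and the frame is written as object so the writer backend decides the final types. A minimal sketch of that coercion pattern, with a hypothetical schema entry and frame (names and types are illustrative, not from this repository):

    import pandas as pd

    # hypothetical schema and data, for illustration only
    _schema = [{'name': 'visit_date', 'type': 'TIMESTAMP'}, {'name': 'age', 'type': 'INTEGER'}]
    _df = pd.DataFrame({'visit_date': ['2020-01-01 10:00:00', None], 'age': [34, None]})

    for _item in _schema:
        name = _item['name']
        if _item['type'] in ('DATE', 'TIMESTAMP', 'DATETIME'):
            _df[name] = pd.to_datetime(_df[name], errors='coerce')
        else:
            _value = 0 if _item['type'] in ('INTEGER', 'FLOAT', 'NUMERIC') else ''
            _df[name] = _df[name].fillna(_value)     # no .astype here, mirroring the change above

    print(_df.astype(object))                        # handed to the writer as object columns
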
@@ -350,7 +358,7 @@ class Components :
         for _item in schema :
             dtype = str
             name = _item['name']
-            novalue = -1
+            novalue = 0
             if _item['type'] in ['INTEGER','NUMERIC']:
                 dtype = np.int64
@@ -550,7 +558,7 @@ if __name__ == '__main__' :
     index = f[0] if f else 0
     #
-    print ("..::: ",PIPELINE[index]['context'])
+    print ("..::: ",PIPELINE[index]['context'],':::..')
     args = (PIPELINE[index])
     for key in _config :
         if key == 'pipeline' or key in args:
@@ -567,6 +575,7 @@ if __name__ == '__main__' :
     args['batch_size'] = 2000 #if 'batch_size' not in args else int(args['batch_size'])
     if 'dataset' not in args :
         args['dataset'] = 'combined20191004v2_deid'
+    args['logs'] = args['logs'] if 'logs' in args else 'logs'
     PART_SIZE = int(args['part_size']) if 'part_size' in args else 8
     #
     # @TODO:
@@ -599,6 +608,7 @@ if __name__ == '__main__' :
             jobs.append(job)
             pass
     else:
         generator = Components()
         generator.generate(args)
 elif 'shuffle' in SYS_ARGS :