diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 59a7ff0..3e42419 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -81,7 +81,6 @@ class ContinuousToDiscrete : return values - def train (**_args): """ :params sql @@ -126,7 +125,7 @@ def train (**_args): args['matrix_size'] = _matrix.shape[0] args['batch_size'] = 2000 args['partition'] = 0 if 'partition' not in _args else _args['partition'] - os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) if 'gpu' in args else '0' + # os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) if 'gpu' in args else '0' trainer = gan.Train(**args) # @@ -215,8 +214,7 @@ def generate(**_args): _inputhandler = prepare.Input(**_args) values,_matrix = _inputhandler.convert() args['values'] = np.array(values) - if 'gpu' in _args : - os.environ['CUDA_VISIBLE_DEVICES'] = str(_args['gpu']) + handler = gan.Predict (**args) handler.load_meta(None) # @@ -226,76 +224,3 @@ def generate(**_args): candidates = handler.apply(candidates=args['candidates']) return [_inputhandler.revert(matrix=_matrix) for _matrix in candidates] - - -def _generate(**args): - """ - This function will generate a synthetic dataset on the basis of a model that has been learnt for the dataset - @return pandas.DataFrame - - :data data-frame to be synthesized - :column columns that need to be synthesized (discrete) - :id column identifying an entity - :logs location on disk where the learnt knowledge of the dataset is - """ - # df = args['data'] - df = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data']) - - CONTINUOUS = args['continuous'] if 'continuous' in args else [] - column = args['column'] if (isinstance(args['column'],list)) else [args['column']] - # column_id = args['id'] - # - #@TODO: - # If the identifier is not present, we should fine a way to determine or make one - # - BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size']) - # NO_VALUE = dict(args['no_value']) if type(args['no_value']) == 
def set_gpu(self, **_args):
    """
    Pin this process to a single GPU by exporting CUDA_VISIBLE_DEVICES.

    :gpu    device index (int or str) or a sequence of device indexes;
            when a sequence is given only its first entry is exported

    @return the gpu specification as a sequence (a scalar is wrapped in a list),
            or None when no 'gpu' argument was supplied
    """
    if 'gpu' not in _args:
        # Nothing was requested: leave the environment untouched and give the
        # caller an explicit None instead of failing on an unbound local.
        return None

    gpu = _args['gpu']
    # A string is indexable, but gpu[0] would only yield its first character
    # ('12' -> '1'); a bare int is not indexable at all. Both are treated as
    # a single device index and wrapped so the code below sees a sequence.
    if isinstance(gpu, str) or not hasattr(gpu, '__getitem__'):
        gpu = [gpu]

    # An empty sequence carries no device to select; keep the current setting
    # rather than raising IndexError.
    if len(gpu) > 0:
        os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu[0])
    return gpu
set(x_cols))] if 'gpu' in args : - _args['gpu'] = args['gpu'] + _args['gpu'] = self.set_gpu(gpu=args['gpu']) data.maker.train(**_args) if 'autopilot' in ( list(args.keys())) : @@ -228,6 +233,8 @@ class Components : args['data'] = df[list(set(df.columns) - set(x_cols))] args['candidates'] = 1 if 'candidates' not in args else int(args['candidates']) + if 'gpu' in args : + args['gpu'] = self.set_gpu(gpu=args['gpu']) candidates = (data.maker.generate(**args)) if 'sql.BQWriter' in ostore['type'] :