post processing features with dates

This commit is contained in:
Steve Nyemba 2022-04-11 23:27:25 -05:00
parent ee518316c0
commit 0797e3dba1
1 changed files with 63 additions and 15 deletions

View File

@ -12,14 +12,14 @@ import pandas as pd
import numpy as np import numpy as np
import data.gan as gan import data.gan as gan
import transport import transport
from data.bridge import Binary # from data.bridge import Binary
import threading as thread import threading as thread
from data.maker import prepare from data.maker import prepare
import copy import copy
import os import os
import json import json
from multiprocessing import Process, RLock from multiprocessing import Process, RLock
from datetime import datetime, timedelta
class ContinuousToDiscrete : class ContinuousToDiscrete :
ROUND_UP = 2 ROUND_UP = 2
@ -229,7 +229,11 @@ class Learner(Process):
# self.logpath= _args['logpath'] if 'logpath' in _args else 'logs' # self.logpath= _args['logpath'] if 'logpath' in _args else 'logs'
# sel.max_epoc # sel.max_epoc
def get_schema(self): def get_schema(self):
return [{'name':self._df.dtypes.index.tolist()[i],'type':self._df.dtypes.astype(str).tolist()[i]}for i in range(self._df.dtypes.shape[0])] if self.store['source']['provider'] != 'bigquery' :
return [{'name':self._df.dtypes.index.tolist()[i],'type':self._df.dtypes.astype(str).tolist()[i]}for i in range(self._df.dtypes.shape[0])]
else:
reader = transport.factory.instance(**self.store['source'])
return reader.meta(table=self.info['from'])
def initalize(self): def initalize(self):
reader = transport.factory.instance(**self.store['source']) reader = transport.factory.instance(**self.store['source'])
_read_args= self.info _read_args= self.info
@ -319,21 +323,56 @@ class Generator (Learner):
_iomatrix = gHandler.apply() _iomatrix = gHandler.apply()
_candidates= [ self._encoder.revert(matrix=_item) for _item in _iomatrix] _candidates= [ self._encoder.revert(matrix=_item) for _item in _iomatrix]
self.post(_candidates) self.post(_candidates)
def appriximate(self,_df): def approximate(self,_df):
_columns = self.info['approximate'] _columns = self.info['approximate']
_schema = {} # _schema = {}
for _info in self.get_schema() : # for _info in self.get_schema() :
_schema[_info['name']] = _info['type'] # _schema[_info['name']] = _info['type']
for name in _columns : for name in _columns :
batches = np.array_split(_df[name].values,10) batches = np.array_split(_df[name].fillna(np.nan).values,2)
_type = np.int64 if 'int' in self.info['approximate'][name]else np.float64
x = [] x = []
for values in batches : for values in batches :
_values = np.random.dirichlet(values)
x += list(values + _values )if np.random.randint(0,2) else list(values - _values) index = np.where(values != '')
_df[name] = np.int64(x) if 'int' in _schema[name] else np.float64(x) _values = np.random.dirichlet(values[index].astype(_type))
values[index] = list(values[index] + _values )if np.random.randint(0,2) else list(values[index] - _values)
values[index] = values[index].astype(_type)
x += values.tolist()
if x :
_df[name] = x #np.array(x,dtype=np.int64) if 'int' in _type else np.arry(x,dtype=np.float64)
return _df return _df
def make_date(self,**_args) :
    """
    Build a random synthetic date that falls within a given year.

    :param year     initial value (int, float or numeric string); an empty string,
                    None or NaN is treated as missing
    :param offset   optional list of day increments; each increment is added to the
                    previously generated date (i.e. increments are cumulative)
    :param format   optional strftime format, defaults to '%Y-%m-%d'
    :return         None when the year is missing; a formatted date string when no
                    offset is given; otherwise a list holding the initial date
                    followed by one formatted date per offset entry
    """
    # local import so this method does not depend on file-level imports being edited
    import calendar

    year = _args['year']
    # NaN values coming out of a data-frame are not the np.nan singleton, so an
    # identity/equality test against np.nan misses them; test for NaN explicitly
    _missing = year is None \
        or (isinstance(year,str) and year == '') \
        or (isinstance(year,(float,np.floating)) and np.isnan(year))
    if _missing :
        return None
    year = int(year)
    offset = _args['offset'] if 'offset' in _args else 0
    FORMAT = _args['format'] if 'format' in _args else '%Y-%m-%d'

    month = int(np.random.randint(1,13))
    # calendar.monthrange applies the full leap-year rule (century years included)
    _end = calendar.monthrange(year,month)[1]
    # the upper bound of randint is exclusive, hence +1 to make the last day reachable
    day = int(np.random.randint(1,_end + 1))
    #-- synthetic date
    _date = datetime(year=year,month=month,day=day)
    if not offset :
        return _date.strftime(FORMAT)

    r = [_date.strftime(FORMAT)]
    for _delta in offset :
        _date = _date + timedelta(_delta)
        r.append(_date.strftime(FORMAT))
    return r
def format(self,_df): def format(self,_df):
pass pass
def post(self,_candidates): def post(self,_candidates):
@ -346,9 +385,18 @@ class Generator (Learner):
_df = self._df.copy() _df = self._df.copy()
_df[self.columns] = _iodf[self.columns] _df[self.columns] = _iodf[self.columns]
if 'approximate' in self.info : if 'approximate' in self.info :
_df = self.approximate(_df)
if 'make_date' in self.info :
for name in self.info['make_date'] :
# iname = self.info['make_date']['init_field']
iname = self.info['make_date'][name]
_df = self.appriximate(_df) years = _df[iname]
writer.write(_df,schema=self.get_schema()) _dates = [self.make_date(year=year) for year in years]
if _dates :
_df[name] = _dates
writer.write(_df[['birth_datetime']+self.columns],schema=self.get_schema())
pass pass
class factory : class factory :
_infocache = {} _infocache = {}