post processing features with dates

This commit is contained in:
Steve Nyemba 2022-04-11 23:27:25 -05:00
parent ee518316c0
commit 0797e3dba1
1 changed file with 63 additions and 15 deletions

View File

@ -12,14 +12,14 @@ import pandas as pd
import numpy as np
import data.gan as gan
import transport
from data.bridge import Binary
# from data.bridge import Binary
import threading as thread
from data.maker import prepare
import copy
import os
import json
from multiprocessing import Process, RLock
from datetime import datetime, timedelta
class ContinuousToDiscrete :
ROUND_UP = 2
@ -229,7 +229,11 @@ class Learner(Process):
# self.logpath= _args['logpath'] if 'logpath' in _args else 'logs'
# sel.max_epoc
def get_schema(self):
    """
    Describe the columns of the data this object works on.

    For the 'bigquery' provider the schema is whatever the transport reader's
    meta() returns for the table named in self.info['from']; for every other
    provider it is derived from the dtypes of self._df.

    :return     for non-bigquery providers, a list of {'name': <column>, 'type': <dtype as str>}
                in column order; for bigquery, the result of reader.meta(...)
    """
    if self.store['source']['provider'] == 'bigquery':
        reader = transport.factory.instance(**self.store['source'])
        return reader.meta(table=self.info['from'])
    #
    # Build the name and type lists once instead of once per column
    _dtypes = self._df.dtypes
    _names = _dtypes.index.tolist()
    _types = _dtypes.astype(str).tolist()
    return [{'name': _name, 'type': _type} for _name, _type in zip(_names, _types)]
def initalize(self):
reader = transport.factory.instance(**self.store['source'])
_read_args= self.info
@ -319,21 +323,56 @@ class Generator (Learner):
_iomatrix = gHandler.apply()
_candidates= [ self._encoder.revert(matrix=_item) for _item in _iomatrix]
self.post(_candidates)
# NOTE(review): this span comes from a diff view whose +/- markers were lost, so it
# appears to interleave two versions of the same method (two `def` lines, two
# `batches = ...` lines, two perturbation variants). Presumably the misspelled
# `appriximate` and the `_schema`-based lines are the pre-change version -- confirm
# against the repository before relying on either reading.
#
# Purpose: for every column named in self.info['approximate'], split the column's
# values into batches, perturb each batch with Dirichlet noise that is either added
# or subtracted, write the result back into the column and return the data-frame.
def appriximate(self,_df):
def approximate(self,_df):
_columns = self.info['approximate']
_schema = {}
for _info in self.get_schema() :
_schema[_info['name']] = _info['type']
# _schema = {}
# for _info in self.get_schema() :
# _schema[_info['name']] = _info['type']
for name in _columns :
# one variant splits the column into 10 batches, the other fills missing values with NaN and splits into 2
batches = np.array_split(_df[name].values,10)
batches = np.array_split(_df[name].fillna(np.nan).values,2)
# target dtype: int64 when the configured entry for this column contains 'int', float64 otherwise
_type = np.int64 if 'int' in self.info['approximate'][name]else np.float64
x = []
for values in batches :
# the batch values themselves are used as the Dirichlet concentration parameters
# NOTE(review): numpy documents alpha > 0 for dirichlet -- confirm the inputs never contain zeros, negatives or NaN
_values = np.random.dirichlet(values)
# randint(0,2) acts as a coin flip: the noise is added to the whole batch or subtracted from it
x += list(values + _values )if np.random.randint(0,2) else list(values - _values)
_df[name] = np.int64(x) if 'int' in _schema[name] else np.float64(x)
# only the positions whose value is not the empty string are perturbed
index = np.where(values != '')
_values = np.random.dirichlet(values[index].astype(_type))
values[index] = list(values[index] + _values )if np.random.randint(0,2) else list(values[index] - _values)
values[index] = values[index].astype(_type)
x += values.tolist()
# the column is only overwritten when at least one value was produced
if x :
_df[name] = x #np.array(x,dtype=np.int64) if 'int' in _type else np.arry(x,dtype=np.float64)
return _df
def make_date(self, **_args):
    """
    Build a synthetic date (or a chain of dates) that falls within a given year.

    The month and the day are drawn at random; the day is bounded by the real
    length of the drawn month, so the last day of a month and February 29th of
    a leap year can be produced.

    :param year     initial value (int, float or numeric string); None, a blank string or NaN yields None
    :param offset   optional iterable of day counts, each one is added to the previously generated date
    :param format   optional strftime pattern, defaults to '%Y-%m-%d'
    :return         None when the year is missing;
                    a formatted date string when no offset is provided;
                    otherwise a list with the initial date followed by one date per offset
    """
    import calendar     # local import, month lengths are only needed here

    year = _args.get('year')
    if year is None or (isinstance(year, str) and not year.strip()):
        return None
    #
    # NaN read from a data-frame is not the np.nan object, so membership in a list misses it
    if isinstance(year, (float, np.floating)) and np.isnan(year):
        return None
    year = int(year)
    offset = _args.get('offset', 0)
    FORMAT = _args.get('format', '%Y-%m-%d')

    month = np.random.randint(1, 13)
    _end = calendar.monthrange(year, month)[1]
    #
    # the upper bound of randint is exclusive, hence the +1 to reach the last day of the month
    day = np.random.randint(1, _end + 1)
    #-- synthetic date
    _date = datetime(year=year, month=month, day=day)
    if not offset:
        return _date.strftime(FORMAT)
    r = [_date.strftime(FORMAT)]
    for _delta in offset:
        # offsets are cumulative: each one is relative to the date generated before it
        _date = _date + timedelta(_delta)
        r.append(_date.strftime(FORMAT))
    return r
def format(self, _df):
    """
    Placeholder hook: the data-frame is currently left untouched and nothing is returned.

    :param _df  data-frame handed to the hook (unused for now)
    """
    return None
def post(self,_candidates):
@ -346,9 +385,18 @@ class Generator (Learner):
_df = self._df.copy()
_df[self.columns] = _iodf[self.columns]
if 'approximate' in self.info :
_df = self.approximate(_df)
if 'make_date' in self.info :
for name in self.info['make_date'] :
# iname = self.info['make_date']['init_field']
iname = self.info['make_date'][name]
_df = self.appriximate(_df)
writer.write(_df,schema=self.get_schema())
years = _df[iname]
_dates = [self.make_date(year=year) for year in years]
if _dates :
_df[name] = _dates
writer.write(_df[['birth_datetime']+self.columns],schema=self.get_schema())
pass
class factory :
_infocache = {}