post processing features with dates
This commit is contained in:
parent
ee518316c0
commit
0797e3dba1
|
@ -12,14 +12,14 @@ import pandas as pd
|
|||
import numpy as np
|
||||
import data.gan as gan
|
||||
import transport
|
||||
from data.bridge import Binary
|
||||
# from data.bridge import Binary
|
||||
import threading as thread
|
||||
from data.maker import prepare
|
||||
import copy
|
||||
import os
|
||||
import json
|
||||
from multiprocessing import Process, RLock
|
||||
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
class ContinuousToDiscrete :
|
||||
ROUND_UP = 2
|
||||
|
@ -229,7 +229,11 @@ class Learner(Process):
|
|||
# self.logpath= _args['logpath'] if 'logpath' in _args else 'logs'
|
||||
# sel.max_epoc
|
||||
def get_schema(self):
    """
    Describe the columns of the data handled by this object.

    When the source provider is anything but 'bigquery' the schema is derived from the
    dtypes of self._df; otherwise it is whatever the reader's meta() reports for the
    source table named in self.info['from'].

    :return list of {'name':<column name>,'type':<dtype as a string>} for the
            data-frame case, or the value returned by reader.meta(...) for bigquery
    """
    if self.store['source']['provider'] != 'bigquery':
        # The dtypes are read once and both lists are built a single time; indexing
        # freshly built lists for every column made this quadratic in the column count.
        _dtypes = self._df.dtypes
        _names = _dtypes.index.tolist()
        _types = _dtypes.astype(str).tolist()
        return [{'name': _name, 'type': _type} for _name, _type in zip(_names, _types)]
    reader = transport.factory.instance(**self.store['source'])
    return reader.meta(table=self.info['from'])
|
||||
def initalize(self):
|
||||
reader = transport.factory.instance(**self.store['source'])
|
||||
_read_args= self.info
|
||||
|
@ -319,21 +323,56 @@ class Generator (Learner):
|
|||
_iomatrix = gHandler.apply()
|
||||
_candidates= [ self._encoder.revert(matrix=_item) for _item in _iomatrix]
|
||||
self.post(_candidates)
|
||||
def appriximate(self,_df):
|
||||
def approximate(self,_df):
|
||||
_columns = self.info['approximate']
|
||||
_schema = {}
|
||||
for _info in self.get_schema() :
|
||||
_schema[_info['name']] = _info['type']
|
||||
# _schema = {}
|
||||
# for _info in self.get_schema() :
|
||||
# _schema[_info['name']] = _info['type']
|
||||
|
||||
|
||||
for name in _columns :
|
||||
batches = np.array_split(_df[name].values,10)
|
||||
batches = np.array_split(_df[name].fillna(np.nan).values,2)
|
||||
_type = np.int64 if 'int' in self.info['approximate'][name]else np.float64
|
||||
x = []
|
||||
for values in batches :
|
||||
_values = np.random.dirichlet(values)
|
||||
x += list(values + _values )if np.random.randint(0,2) else list(values - _values)
|
||||
_df[name] = np.int64(x) if 'int' in _schema[name] else np.float64(x)
|
||||
|
||||
index = np.where(values != '')
|
||||
_values = np.random.dirichlet(values[index].astype(_type))
|
||||
values[index] = list(values[index] + _values )if np.random.randint(0,2) else list(values[index] - _values)
|
||||
values[index] = values[index].astype(_type)
|
||||
x += values.tolist()
|
||||
if x :
|
||||
_df[name] = x #np.array(x,dtype=np.int64) if 'int' in _type else np.arry(x,dtype=np.float64)
|
||||
return _df
|
||||
def make_date(self, **_args):
    """
    Build a synthetic date (or a chain of dates) that falls within a given year.

    Only the year comes from the caller; the month and the day are drawn at random.

    :param year     initial value; '', None or NaN yield None (no date can be made)
    :param offset   optional iterable of day counts; each one is added to the previously
                    generated date (the offsets accumulate)
    :param format   optional strftime pattern, '%Y-%m-%d' by default
    :return         None when the year is missing, a formatted string when no offset is
                    given, otherwise a list holding the initial date followed by one
                    formatted date per offset entry
    """
    # Function-scope import so the block does not depend on the module header
    import calendar

    year = _args['year']
    # NaN has to be matched by value: a NaN read back from a data-frame is not the
    # np.nan object itself, so a list-membership test misses it and int() then fails.
    if year is None or (isinstance(year, str) and year == ''):
        return None
    if isinstance(year, (float, np.floating)) and np.isnan(year):
        return None
    year = int(year)
    offset = _args['offset'] if 'offset' in _args else 0
    FORMAT = _args['format'] if 'format' in _args else '%Y-%m-%d'

    month = int(np.random.randint(1, 13))
    # monthrange applies the full leap-year rule (century years included)
    _end = calendar.monthrange(year, month)[1]
    # randint excludes its upper bound, hence +1 so the last day of the month can be drawn
    day = int(np.random.randint(1, _end + 1))

    #-- synthetic date
    _date = datetime(year=year, month=month, day=day)
    if not offset:
        return _date.strftime(FORMAT)

    r = [_date.strftime(FORMAT)]
    for _delta in offset:
        _date = _date + timedelta(_delta)
        r.append(_date.strftime(FORMAT))
    return r
|
||||
def format(self, _df):
    """
    Formatting hook that is not implemented yet.

    :param _df  value handed to the hook (currently unused)
    :return     None
    """
    return None
|
||||
def post(self,_candidates):
|
||||
|
@ -345,10 +384,19 @@ class Generator (Learner):
|
|||
for _iodf in _candidates :
|
||||
_df = self._df.copy()
|
||||
_df[self.columns] = _iodf[self.columns]
|
||||
if 'approximate' in self.info :
|
||||
|
||||
_df = self.appriximate(_df)
|
||||
writer.write(_df,schema=self.get_schema())
|
||||
if 'approximate' in self.info :
|
||||
_df = self.approximate(_df)
|
||||
if 'make_date' in self.info :
|
||||
for name in self.info['make_date'] :
|
||||
# iname = self.info['make_date']['init_field']
|
||||
iname = self.info['make_date'][name]
|
||||
|
||||
years = _df[iname]
|
||||
_dates = [self.make_date(year=year) for year in years]
|
||||
if _dates :
|
||||
_df[name] = _dates
|
||||
|
||||
writer.write(_df[['birth_datetime']+self.columns],schema=self.get_schema())
|
||||
pass
|
||||
class factory :
|
||||
_infocache = {}
|
||||
|
|
Loading…
Reference in New Issue