post processing features with dates
This commit is contained in:
parent
ee518316c0
commit
0797e3dba1
|
@ -12,14 +12,14 @@ import pandas as pd
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import data.gan as gan
|
import data.gan as gan
|
||||||
import transport
|
import transport
|
||||||
from data.bridge import Binary
|
# from data.bridge import Binary
|
||||||
import threading as thread
|
import threading as thread
|
||||||
from data.maker import prepare
|
from data.maker import prepare
|
||||||
import copy
|
import copy
|
||||||
import os
|
import os
|
||||||
import json
|
import json
|
||||||
from multiprocessing import Process, RLock
|
from multiprocessing import Process, RLock
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
|
||||||
class ContinuousToDiscrete :
|
class ContinuousToDiscrete :
|
||||||
ROUND_UP = 2
|
ROUND_UP = 2
|
||||||
|
@ -229,7 +229,11 @@ class Learner(Process):
|
||||||
# self.logpath= _args['logpath'] if 'logpath' in _args else 'logs'
|
# self.logpath= _args['logpath'] if 'logpath' in _args else 'logs'
|
||||||
# sel.max_epoc
|
# sel.max_epoc
|
||||||
def get_schema(self):
|
def get_schema(self):
|
||||||
return [{'name':self._df.dtypes.index.tolist()[i],'type':self._df.dtypes.astype(str).tolist()[i]}for i in range(self._df.dtypes.shape[0])]
|
if self.store['source']['provider'] != 'bigquery' :
|
||||||
|
return [{'name':self._df.dtypes.index.tolist()[i],'type':self._df.dtypes.astype(str).tolist()[i]}for i in range(self._df.dtypes.shape[0])]
|
||||||
|
else:
|
||||||
|
reader = transport.factory.instance(**self.store['source'])
|
||||||
|
return reader.meta(table=self.info['from'])
|
||||||
def initalize(self):
|
def initalize(self):
|
||||||
reader = transport.factory.instance(**self.store['source'])
|
reader = transport.factory.instance(**self.store['source'])
|
||||||
_read_args= self.info
|
_read_args= self.info
|
||||||
|
@ -319,21 +323,56 @@ class Generator (Learner):
|
||||||
_iomatrix = gHandler.apply()
|
_iomatrix = gHandler.apply()
|
||||||
_candidates= [ self._encoder.revert(matrix=_item) for _item in _iomatrix]
|
_candidates= [ self._encoder.revert(matrix=_item) for _item in _iomatrix]
|
||||||
self.post(_candidates)
|
self.post(_candidates)
|
||||||
def appriximate(self,_df):
|
def approximate(self,_df):
|
||||||
_columns = self.info['approximate']
|
_columns = self.info['approximate']
|
||||||
_schema = {}
|
# _schema = {}
|
||||||
for _info in self.get_schema() :
|
# for _info in self.get_schema() :
|
||||||
_schema[_info['name']] = _info['type']
|
# _schema[_info['name']] = _info['type']
|
||||||
|
|
||||||
|
|
||||||
for name in _columns :
|
for name in _columns :
|
||||||
batches = np.array_split(_df[name].values,10)
|
batches = np.array_split(_df[name].fillna(np.nan).values,2)
|
||||||
|
_type = np.int64 if 'int' in self.info['approximate'][name]else np.float64
|
||||||
x = []
|
x = []
|
||||||
for values in batches :
|
for values in batches :
|
||||||
_values = np.random.dirichlet(values)
|
|
||||||
x += list(values + _values )if np.random.randint(0,2) else list(values - _values)
|
index = np.where(values != '')
|
||||||
_df[name] = np.int64(x) if 'int' in _schema[name] else np.float64(x)
|
_values = np.random.dirichlet(values[index].astype(_type))
|
||||||
|
values[index] = list(values[index] + _values )if np.random.randint(0,2) else list(values[index] - _values)
|
||||||
|
values[index] = values[index].astype(_type)
|
||||||
|
x += values.tolist()
|
||||||
|
if x :
|
||||||
|
_df[name] = x #np.array(x,dtype=np.int64) if 'int' in _type else np.arry(x,dtype=np.float64)
|
||||||
return _df
|
return _df
|
||||||
|
def make_date(self,**_args) :
    """
    Build a synthetic calendar date that falls inside a given year; the month
    and the day are drawn at random with numpy's global random generator.

    :param year     initial value (the year of the date). '', blank strings, None and NaN
                    are treated as missing and yield None
    :param offset   optional list of day counts (a single number is accepted too). Each
                    count is added to the previously produced date, i.e. offsets are cumulative
    :param format   optional strftime pattern, '%Y-%m-%d' by default
    :return         None when the year is missing; a formatted string when no offset is given;
                    otherwise a list holding the initial date followed by one date per offset
    """
    import calendar     #-- local import, only this method needs month lengths

    _year = _args['year']
    if _year is None or (isinstance(_year,str) and _year.strip() == '') :
        return None
    #-- NaN never compares equal to itself, so a membership test against [np.nan] misses
    #-- NaN values that are not the np.nan object (e.g. values pulled out of a data-frame)
    if isinstance(_year,(float,np.floating)) and np.isnan(_year) :
        return None

    year = int(_year)
    offset = _args['offset'] if 'offset' in _args else 0
    FORMAT = _args['format'] if 'format' in _args else '%Y-%m-%d'

    month = int(np.random.randint(1,13))
    #-- monthrange applies the full leap year rule (1900 is not a leap year, 2000 is)
    _end = calendar.monthrange(year,month)[1]
    #-- the upper bound of randint is exclusive, hence +1 to be able to draw the last day
    day = int(np.random.randint(1,_end + 1))

    #-- synthetic date
    _date = datetime(year=year,month=month,day=day)
    if not offset :
        return _date.strftime(FORMAT)

    if np.isscalar(offset) :
        #-- tolerate a single number of days in place of a list
        offset = [offset]
    r = [_date.strftime(FORMAT)]
    for _delta in offset :
        _date = _date + timedelta(days=_delta)
        r.append(_date.strftime(FORMAT))
    return r
|
||||||
def format(self,_df):
|
def format(self,_df):
|
||||||
pass
|
pass
|
||||||
def post(self,_candidates):
|
def post(self,_candidates):
|
||||||
|
@ -345,10 +384,19 @@ class Generator (Learner):
|
||||||
for _iodf in _candidates :
|
for _iodf in _candidates :
|
||||||
_df = self._df.copy()
|
_df = self._df.copy()
|
||||||
_df[self.columns] = _iodf[self.columns]
|
_df[self.columns] = _iodf[self.columns]
|
||||||
if 'approximate' in self.info :
|
if 'approximate' in self.info :
|
||||||
|
_df = self.approximate(_df)
|
||||||
_df = self.appriximate(_df)
|
if 'make_date' in self.info :
|
||||||
writer.write(_df,schema=self.get_schema())
|
for name in self.info['make_date'] :
|
||||||
|
# iname = self.info['make_date']['init_field']
|
||||||
|
iname = self.info['make_date'][name]
|
||||||
|
|
||||||
|
years = _df[iname]
|
||||||
|
_dates = [self.make_date(year=year) for year in years]
|
||||||
|
if _dates :
|
||||||
|
_df[name] = _dates
|
||||||
|
|
||||||
|
writer.write(_df[['birth_datetime']+self.columns],schema=self.get_schema())
|
||||||
pass
|
pass
|
||||||
class factory :
|
class factory :
|
||||||
_infocache = {}
|
_infocache = {}
|
||||||
|
|
Loading…
Reference in New Issue