post processing features with dates

2022-04-11 23:27:25 -05:00 · 2022-04-11 23:27:25 -05:00 · 0797e3dba1
parent ee518316c0
commit 0797e3dba1
1 changed file with 63 additions and 15 deletions
--- a/data/maker/init.py
+++ b/data/maker/init.py
@ -12,14 +12,14 @@ import pandas as pd
 import numpy as np
 import data.gan as gan
 import transport
-from data.bridge import Binary
+# from data.bridge import Binary
 import threading as thread
 from data.maker import prepare
 import copy
 import os
 import json
 from multiprocessing import Process, RLock
-
+from datetime import datetime, timedelta
 class ContinuousToDiscrete :
    ROUND_UP = 2
@ -229,7 +229,11 @@ class Learner(Process):
        # self.logpath= _args['logpath'] if 'logpath' in _args else 'logs'
        # sel.max_epoc
    def get_schema(self):
-        return [{'name':self._df.dtypes.index.tolist()[i],'type':self._df.dtypes.astype(str).tolist()[i]}for i in range(self._df.dtypes.shape[0])]
+        if self.store['source']['provider'] != 'bigquery' :
            return [{'name':self._df.dtypes.index.tolist()[i],'type':self._df.dtypes.astype(str).tolist()[i]}for i in range(self._df.dtypes.shape[0])]
        else:
            reader  = transport.factory.instance(**self.store['source'])
            return reader.meta(table=self.info['from'])
    def initalize(self):
        reader  = transport.factory.instance(**self.store['source'])
        _read_args= self.info
@ -319,21 +323,56 @@ class Generator (Learner):
        _iomatrix = gHandler.apply()
        _candidates= [ self._encoder.revert(matrix=_item) for _item in _iomatrix]
        self.post(_candidates)
-    def appriximate(self,_df):
+    def approximate(self,_df):
        _columns = self.info['approximate']
-        _schema = {}
+        # _schema = {}
-        for _info in self.get_schema() :
+        # for _info in self.get_schema() :
-            _schema[_info['name']] = _info['type']
+        #     _schema[_info['name']] = _info['type'] 
        for name in _columns :
-            batches = np.array_split(_df[name].values,10)
+            batches = np.array_split(_df[name].fillna(np.nan).values,2)
            _type = np.int64 if 'int' in self.info['approximate'][name]else np.float64
            x = []
            for values in batches :
-                _values = np.random.dirichlet(values)
+                
-                x += list(values + _values )if np.random.randint(0,2) else list(values - _values)
+                index = np.where(values != '')
-            _df[name] = np.int64(x) if 'int' in _schema[name] else np.float64(x)
+                _values = np.random.dirichlet(values[index].astype(_type))                                
                values[index] = list(values[index] + _values )if np.random.randint(0,2) else list(values[index] - _values)
                values[index] = values[index].astype(_type)
                x += values.tolist()
            if x :                
                _df[name] = x  #np.array(x,dtype=np.int64) if 'int' in _type  else np.arry(x,dtype=np.float64)
        return _df
    def make_date(self,**_args) :
        """
        Build a synthetic date (or a series of dates) that falls within a given year.
        The month and the day are drawn at random; the day is always valid for the drawn month.

        :param year     initial value (int or int-like string); '' (or blank), None and NaN yield None
        :param offset   optional list of day counts; each one is added cumulatively to the previous date
        :param format   optional strftime format, defaults to '%Y-%m-%d'
        :return         None if the year is missing, a list of formatted dates if an offset is given,
                        otherwise a single formatted date
        """
        import calendar     #-- local import, keeps this block self-contained

        year = _args['year']
        #
        # A missing year can't be turned into a date. NaN must be tested explicitly because
        # a float NaN is never equal to itself and int(NaN) would raise
        if year is None :
            return None
        if isinstance(year,str) and not year.strip() :
            return None
        if isinstance(year,(float,np.floating)) and np.isnan(year) :
            return None
        year = int(year)
        offset = _args['offset'] if 'offset' in _args else 0
        FORMAT = _args['format'] if 'format' in _args else '%Y-%m-%d'

        month = np.random.randint(1,13)
        #
        # monthrange applies the full gregorian leap-year rule (century years included)
        _end = calendar.monthrange(year,month)[1]
        #
        # the upper bound of randint is exclusive, +1 makes the last day of the month reachable
        day = np.random.randint(1,_end + 1)

        #-- synthetic date
        _date = datetime(year=year,month=month,day=day)
        if not offset :
            return _date.strftime(FORMAT)
        #
        # The first entry is the synthetic date, every offset is applied to the previous entry
        r = [_date.strftime(FORMAT)]
        for _delta in offset :
            _date = _date + timedelta(_delta)
            r.append(_date.strftime(FORMAT))
        return r
    def format(self,_df):
        """
        Placeholder hook: accepts a data-frame, performs no work and returns None

        :param _df  data-frame (currently unused)
        """
        return None
    def post(self,_candidates):
@ -345,10 +384,19 @@ class Generator (Learner):
        for _iodf in _candidates :
            _df = self._df.copy()
            _df[self.columns] = _iodf[self.columns]
-            if 'approximate' in self.info :
+            if 'approximate' in self.info :                
-                
+                _df = self.approximate(_df)
-                _df = self.appriximate(_df)
+            if 'make_date' in self.info :
-            writer.write(_df,schema=self.get_schema())
+                for name in self.info['make_date'] :
                    # iname = self.info['make_date']['init_field']
                    iname = self.info['make_date'][name]
                    years = _df[iname]
                    _dates = [self.make_date(year=year) for year in years]
                    if _dates :
                        _df[name] = _dates
            writer.write(_df[['birth_datetime']+self.columns],schema=self.get_schema())
        pass
 class factory :
    _infocache = {}