post processing features with dates

2022-04-11 23:27:25 -05:00 · 2022-04-11 23:27:25 -05:00 · 0797e3dba1
parent ee518316c0
commit 0797e3dba1
1 changed files with 63 additions and 15 deletions
--- a/data/maker/init.py
+++ b/data/maker/init.py
@ -12,14 +12,14 @@ import pandas as pd
 import numpy as np
 import data.gan as gan
 import transport
-from data.bridge import Binary
+# from data.bridge import Binary
 import threading as thread
 from data.maker import prepare
 import copy
 import os
 import json
 from multiprocessing import Process, RLock
-
+from datetime import datetime, timedelta

 class ContinuousToDiscrete :
    ROUND_UP = 2
@ -229,7 +229,11 @@ class Learner(Process):
        # self.logpath= _args['logpath'] if 'logpath' in _args else 'logs'
        # sel.max_epoc
    def get_schema(self):
+        if self.store['source']['provider'] != 'bigquery' :
            return [{'name':self._df.dtypes.index.tolist()[i],'type':self._df.dtypes.astype(str).tolist()[i]}for i in range(self._df.dtypes.shape[0])]
+        else:
+            reader  = transport.factory.instance(**self.store['source'])
+            return reader.meta(table=self.info['from'])
    def initalize(self):
        reader  = transport.factory.instance(**self.store['source'])
        _read_args= self.info
@ -319,21 +323,56 @@ class Generator (Learner):
        _iomatrix = gHandler.apply()
        _candidates= [ self._encoder.revert(matrix=_item) for _item in _iomatrix]
        self.post(_candidates)
-    def appriximate(self,_df):
+    def approximate(self,_df):
        _columns = self.info['approximate']
-        _schema = {}
-        for _info in self.get_schema() :
-            _schema[_info['name']] = _info['type']
+        # _schema = {}
+        # for _info in self.get_schema() :
+        #     _schema[_info['name']] = _info['type'] 

        
        for name in _columns :
-            batches = np.array_split(_df[name].values,10)
+            batches = np.array_split(_df[name].fillna(np.nan).values,2)
+            _type = np.int64 if 'int' in self.info['approximate'][name]else np.float64
            x = []
            for values in batches :
-                _values = np.random.dirichlet(values)
-                x += list(values + _values )if np.random.randint(0,2) else list(values - _values)
-            _df[name] = np.int64(x) if 'int' in _schema[name] else np.float64(x)
+                
+                index = np.where(values != '')
+                _values = np.random.dirichlet(values[index].astype(_type))                                
+                values[index] = list(values[index] + _values )if np.random.randint(0,2) else list(values[index] - _values)
+                values[index] = values[index].astype(_type)
+                x += values.tolist()
+            if x :                
+                _df[name] = x  #np.array(x,dtype=np.int64) if 'int' in _type  else np.arry(x,dtype=np.float64)
        return _df
+    def make_date(self,**_args) :
+        """
+        Build a synthetic date string inside a given year (random month/day).
+        :param year     year to draw the date from; '', None or NaN returns None
+        :param offset   optional iterable of day deltas, applied cumulatively
+        :param format   strftime format of the output (default '%Y-%m-%d')
+        :return None, a formatted date, or a list of dates when offset is set
+        """
+        import calendar
+        year = _args['year']
+        # NaN is the only value that differs from itself; `in [np.nan]` misses it
+        if year is None or (isinstance(year,str) and year == '') or year != year :
+            return None
+        year    = int(year)
+        offset  = _args['offset'] if 'offset' in _args else 0
+        FORMAT  = _args['format'] if 'format' in _args else '%Y-%m-%d'
+        month   = np.random.randint(1,13)
+        # monthrange applies the full Gregorian leap-year rule (1900 is not leap)
+        _end    = calendar.monthrange(year,month)[1]
+        # randint excludes the upper bound, hence _end + 1 to reach the last day
+        day     = np.random.randint(1,_end + 1)
+        _date   = datetime(year=year,month=month,day=day)
+        if not offset :
+            return _date.strftime(FORMAT)
+        r = [_date.strftime(FORMAT)]
+        for _delta in offset :
+            _date = _date + timedelta(_delta)
+            r.append(_date.strftime(FORMAT))
+        return r
    def format(self,_df):
        pass
    def post(self,_candidates):
@ -346,9 +385,18 @@ class Generator (Learner):
            _df = self._df.copy()
            _df[self.columns] = _iodf[self.columns]
            if 'approximate' in self.info :                
+                _df = self.approximate(_df)
+            if 'make_date' in self.info :
+                for name in self.info['make_date'] :
+                    # iname = self.info['make_date']['init_field']
+                    iname = self.info['make_date'][name]

-                _df = self.appriximate(_df)
-            writer.write(_df,schema=self.get_schema())
+                    years = _df[iname]
+                    _dates = [self.make_date(year=year) for year in years]
+                    if _dates :
+                        _df[name] = _dates
+
+            writer.write(_df[['birth_datetime']+self.columns],schema=self.get_schema())
        pass
 class factory :
    _infocache = {}