Bug fix: missing data — handle an additional missing-value type, pandas._libs.missing.NAType, in addition to np.nan and np.na

2022-05-17 13:24:24 -05:00 · 2022-05-17 13:24:24 -05:00 · 6841ccbd5e
parent 2b228f6075
commit 6841ccbd5e
1 changed files with 15 additions and 2 deletions
--- a/data/maker/init.py
+++ b/data/maker/init.py
@ -52,6 +52,7 @@ class Learner(Process):
        self._encoder = None
        self._map = None
        self._df = _args['data'] if 'data' in _args else None
+        
        self.name   =  self.__class__.__name__
        
        #
@ -92,10 +93,22 @@ class Learner(Process):
        if self._df is None :
            self._df     = reader.read(**_read_args)
        columns = self.columns if self.columns else self._df.columns
+        #
+        # The loop below is a source of inefficiency; unfortunately, Python's type inference doesn't work well in certain cases.
+        # - The code below tries to address the issue (it is perhaps better suited to the reading components).
+        for name in columns :
+            _index = np.random.choice(np.arange(self._df[name].size),5,False)
+            no_value = [type(value) in [int,float,np.int64,np.int32,np.float32,np.float64] for value in self._df[name].values[_index]]
+            print ([name,np.sum(no_value)])
+            no_value = 0 if np.sum(no_value) > 0 else ''
+            
+            self._df[name] = self._df[name].fillna(no_value)
+
+
        #
        # convert the data to binary here ...
-        
-        _args = {"schema":self.get_schema(),"data":self._df,"columns":columns}
+        _schema = self.get_schema()       
+        _args = {"schema":_schema,"data":self._df,"columns":columns}
        if self._map :
            _args['map'] = self._map
        self._encoder = prepare.Input(**_args)  if self._df.shape[0] > 0 else None