From 6841ccbd5e4abb8322df6da8b55904f99bcae89c Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 17 May 2022 13:24:24 -0500 Subject: [PATCH] bug fix: missing data, adding an additional type: pandas._libs.missing.NAType in addition to np.nan, np.na --- data/maker/__init__.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 24fabe8..b9b48e4 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -52,6 +52,7 @@ class Learner(Process): self._encoder = None self._map = None self._df = _args['data'] if 'data' in _args else None + self.name = self.__class__.__name__ # @@ -92,10 +93,22 @@ class Learner(Process): if self._df is None : self._df = reader.read(**_read_args) columns = self.columns if self.columns else self._df.columns + # + # Below is a source of inefficiency, unfortunately python's type inference doesn't work well in certain cases + # - The code below tries to address the issue (Perhaps better suited for the reading components) + for name in columns : + _index = np.random.choice(np.arange(self._df[name].size),5,False) + no_value = [type(value) in [int,float,np.int64,np.int32,np.float32,np.float64] for value in self._df[name].values[_index]] + print ([name,np.sum(no_value)]) + no_value = 0 if np.sum(no_value) > 0 else '' + + self._df[name] = self._df[name].fillna(no_value) + + # # convert the data to binary here ... - - _args = {"schema":self.get_schema(),"data":self._df,"columns":columns} + _schema = self.get_schema() + _args = {"schema":_schema,"data":self._df,"columns":columns} if self._map : _args['map'] = self._map self._encoder = prepare.Input(**_args) if self._df.shape[0] > 0 else None