Bug fix: missing-data handling now recognizes an additional type, pandas._libs.missing.NAType, in addition to np.nan, np.na

This commit is contained in:
Steve Nyemba 2022-05-17 13:24:24 -05:00
parent 2b228f6075
commit 6841ccbd5e
1 changed file with 15 additions and 2 deletions

View File

@ -52,6 +52,7 @@ class Learner(Process):
self._encoder = None self._encoder = None
self._map = None self._map = None
self._df = _args['data'] if 'data' in _args else None self._df = _args['data'] if 'data' in _args else None
self.name = self.__class__.__name__ self.name = self.__class__.__name__
# #
@ -93,9 +94,21 @@ class Learner(Process):
self._df = reader.read(**_read_args) self._df = reader.read(**_read_args)
columns = self.columns if self.columns else self._df.columns columns = self.columns if self.columns else self._df.columns
# #
# convert the data to binary here ... # Below is a source of inefficiency; unfortunately, Python's type inference doesn't work well in certain cases.
# - The code below tries to address the issue (perhaps this is better suited to the reading components)
for name in columns :
_index = np.random.choice(np.arange(self._df[name].size),5,False)
no_value = [type(value) in [int,float,np.int64,np.int32,np.float32,np.float64] for value in self._df[name].values[_index]]
print ([name,np.sum(no_value)])
no_value = 0 if np.sum(no_value) > 0 else ''
_args = {"schema":self.get_schema(),"data":self._df,"columns":columns} self._df[name] = self._df[name].fillna(no_value)
#
# convert the data to binary here ...
_schema = self.get_schema()
_args = {"schema":_schema,"data":self._df,"columns":columns}
if self._map : if self._map :
_args['map'] = self._map _args['map'] = self._map
self._encoder = prepare.Input(**_args) if self._df.shape[0] > 0 else None self._encoder = prepare.Input(**_args) if self._df.shape[0] > 0 else None