bug fix with type inference

2022-05-17 03:05:44 -05:00 · 2022-05-17 03:05:44 -05:00 · 2b228f6075
parent 1e3e0eac45
commit 2b228f6075
2 changed files with 26 additions and 7 deletions
--- a/data/maker/init.py
+++ b/data/maker/init.py
@ -282,9 +282,11 @@ class Generator (Learner):
            if _item['type'].upper() in ['DATE','DATETIME','TIMESTAMP'] :
                FORMAT = '%Y-%m-%d'
                try:
                    #
                    #-- Sometimes data isn't all it's meant to be
                    SIZE = -1
                    if 'format' in self.info and name in self.info['format'] :
                        FORMAT = self.info['format'][name]
                        SIZE = 10
@ -292,20 +294,34 @@ class Generator (Learner):
                            FORMAT = '%Y-%m-%d %H:%M:%S'
                            SIZE = 19
                    if SIZE > 0 :
                        values = pd.to_datetime(_df[name], format=FORMAT).astype(str)
                        _df[name] = [_date[:SIZE] for _date in values]
                    r[name] = FORMAT
-                    _df[name] = pd.to_datetime(_df[name], format=FORMAT) #.astype('datetime64[ns]')
+                    # _df[name] = pd.to_datetime(_df[name], format=FORMAT) #.astype('datetime64[ns]')
                    if _item['type'] in ['DATETIME','TIMESTAMP']:                   
                        pass #;_df[name] = _df[name].fillna('').astype('datetime64[ns]')
-                    else:
+                    
                        _df[name] = _df[name].astype(str)
                except Exception as e:
                    pass
                finally:
                    pass
            else:
-                # print (_item)
+                
-                pass
+                #
-        _df = _df.replace('NaT','').replace('NA','')
+                # Because types are inferred from the sample being processed, they can sometimes be wrong.
                #   To help disambiguate, we also use the schema information.
                _type = None
                if 'int' in _df[name].dtypes.name or 'int' in _item['type'].lower():                    
                    _type = np.int
                elif 'float' in _df[name].dtypes.name or 'float' in _item['type'].lower():
                    _type = np.float
                if _type :
                    _df[name] = _df[name].fillna(0).replace('',0).astype(_type)
        # _df = _df.replace('NaT','').replace('NA','')
        if r :
            self.log(**{'action':'format','input':r})
@ -319,7 +335,7 @@ class Generator (Learner):
        _store['context'] = 'write' #-- Just in case
        if 'table' not in _store :
            _store['table'] = self.info['from']
-        writer = transport.factory.instance(**_store)
+        
        N = 0
        for _iodf in _candidates :
            _df = self._df.copy()
@ -346,7 +362,9 @@ class Generator (Learner):
            _schema = [{'name':_item.name,'type':_item.field_type} for _item in _schema]
            _df = self.format(_df,_schema)
            writer = transport.factory.instance(**_store)
            writer.write(_df,schema=_schema)
            # _df.to_csv('foo.csv')
        self.log(**{'action':'write','input':{'rows':N,'candidates':len(_candidates)}})
 class Shuffle(Generator):    
--- a/data/maker/prepare/init.py
+++ b/data/maker/prepare/init.py
@ -209,6 +209,7 @@ class Input :
        # @NOTE: For some reason an out-of-memory error occurs here; this seems to fix it (go figure)
        #
        _matrix = np.array([np.repeat(0,cols.size) for i in range(0,row_count)])
        [np.put(_matrix[i], np.where(cols ==  rows[i])  ,1)for i in np.arange(row_count) if np.where(cols == rows[i])[0].size > 0]
        # else:
        #     _matrix = cp.zeros([row_count,cols.size])