bug fix: uploading data

2022-05-17 18:04:05 -05:00 · 2022-05-17 18:04:05 -05:00 · 377e84daea
parent 1dae4ffba8
commit 377e84daea
1 changed files with 45 additions and 27 deletions
--- a/data/maker/init.py
+++ b/data/maker/init.py
@ -96,6 +96,7 @@ class Learner(Process):
        #
        # The code below is a source of inefficiency: unfortunately, Python's type inference doesn't work well in certain cases
        # - The code below tries to address the issue (perhaps better suited for the reading components)
+        _log = {}
        for name in columns :
            _index = np.random.choice(np.arange(self._df[name].size),5,False)
            no_value = [type(value) in [int,float,np.int64,np.int32,np.float32,np.float64] for value in self._df[name].values[_index]]            
@ -103,7 +104,9 @@ class Learner(Process):
            
            self._df[name] = self._df[name].fillna(no_value)
            
-
+            _log[name] = self._df[name].dtypes.name
+        _log = {'action':'structure','input':_log}
+        self.log(**_log)
        #
        # convert the data to binary here ...
        _schema = self.get_schema()       
@ -293,46 +296,52 @@ class Generator (Learner):
            name = _item['name']
            
            if _item['type'].upper() in ['DATE','DATETIME','TIMESTAMP'] :
-                FORMAT = '%Y-%m-%d'
+                FORMAT = '%m-%d-%Y'
                
-                try:
-                    #
-                    #-- Sometimes data isn't all it's meant to be
-                    SIZE = -1
-                    if 'format' in self.info and name in self.info['format'] :
-                        FORMAT = self.info['format'][name]
-                        SIZE = 10
-                    elif _item['type'] in ['DATETIME','TIMESTAMP'] :
-                            FORMAT = '%Y-%m-%d %H:%M:%S'
-                            SIZE = 19
+                # try:
+                #     #
+                #     #-- Sometimes data isn't all it's meant to be
+                #     SIZE = -1
+                #     if 'format' in self.info and name in self.info['format'] :
+                #         FORMAT = self.info['format'][name]
+                #         SIZE = 10
+                #     elif _item['type'] in ['DATETIME','TIMESTAMP'] :
+                #             FORMAT = '%m-%d-%Y %H:%M:%S'
+                #             SIZE = 19
                    
-                    if SIZE > 0 :
+                #     if SIZE > 0 :
                        
-                        values = pd.to_datetime(_df[name], format=FORMAT).astype(str)
-                        _df[name] = [_date[:SIZE] for _date in values]
+                #         values = pd.to_datetime(_df[name], format=FORMAT).astype(str)
+                #         _df[name] = [_date[:SIZE].strip() for _date in values]
                        
                       
-                    r[name] = FORMAT
-                    # _df[name] = pd.to_datetime(_df[name], format=FORMAT) #.astype('datetime64[ns]')
-                    if _item['type'] in ['DATETIME','TIMESTAMP']:                   
-                        pass #;_df[name] = _df[name].fillna('').astype('datetime64[ns]')
+                #     # _df[name] = _df[name].astype(str)
+                #     r[name] = FORMAT
+                #     # _df[name] = pd.to_datetime(_df[name], format=FORMAT) #.astype('datetime64[ns]')
+                #     if _item['type'] in ['DATETIME','TIMESTAMP']:                   
+                #         pass #;_df[name] = _df[name].fillna('').astype('datetime64[ns]')
                    
-                except Exception as e:
-                    pass
-                finally:
-                    pass
+                # except Exception as e:
+                #     pass
+                # finally:
+                #     pass
            else:
                
                #
                # Because types are inferred on the basis of the sample being processed, they can sometimes be wrong
                #   To help disambiguate, we add the schema information
                _type = None
+                
                if 'int' in _df[name].dtypes.name or 'int' in _item['type'].lower():                    
                    _type = np.int
+                    
                elif 'float' in _df[name].dtypes.name or 'float' in _item['type'].lower():
                    _type = np.float
                if _type :
-                    _df[name] = _df[name].fillna(0).replace('',0).astype(_type)
+                    
+                    _df[name] = _df[name].fillna(0).replace('',0).replace('NA',0).replace('nan',0).astype(_type)
+                # else:
+                #     _df[name] = _df[name].astype(str)
        # _df = _df.replace('NaT','').replace('NA','')
        
        if r :
@ -373,10 +382,19 @@ class Generator (Learner):
            _schema = self.get_schema()
            _schema = [{'name':_item.name,'type':_item.field_type} for _item in _schema]
            _df = self.format(_df,_schema)
+            _log = [{"name":_schema[i]['name'],"dataframe":_df[_df.columns[i]].dtypes.name,"schema":_schema[i]['type']} for i in np.arange(len(_schema)) ]
+            self.log(**{"action":"consolidate","input":_log})
+
+            # w = transport.factory.instance(doc='observation',provider='mongodb',context='write',db='IOV01_LOGS',auth_file='/home/steve/dev/transport/mongo.json')
+            # w.write(_df)
+            # print (_df[cols])
            
            writer = transport.factory.instance(**_store)
            writer.write(_df,schema=_schema)
-            # _df.to_csv('foo.csv')
+            
+            
+
+           
        
        self.log(**{'action':'write','input':{'rows':N,'candidates':len(_candidates)}})
 class Shuffle(Generator):