diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 1eea945..24fabe8 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -282,9 +282,11 @@ class Generator (Learner): if _item['type'].upper() in ['DATE','DATETIME','TIMESTAMP'] : FORMAT = '%Y-%m-%d' + try: # #-- Sometimes data isn't all it's meant to be + SIZE = -1 if 'format' in self.info and name in self.info['format'] : FORMAT = self.info['format'][name] SIZE = 10 @@ -292,20 +294,34 @@ class Generator (Learner): FORMAT = '%Y-%m-%d %H:%M:%S' SIZE = 19 + if SIZE > 0 : + + values = pd.to_datetime(_df[name], format=FORMAT).astype(str) + _df[name] = [_date[:SIZE] for _date in values] + + r[name] = FORMAT - _df[name] = pd.to_datetime(_df[name], format=FORMAT) #.astype('datetime64[ns]') + # _df[name] = pd.to_datetime(_df[name], format=FORMAT) #.astype('datetime64[ns]') if _item['type'] in ['DATETIME','TIMESTAMP']: pass #;_df[name] = _df[name].fillna('').astype('datetime64[ns]') - else: - _df[name] = _df[name].astype(str) + except Exception as e: pass finally: pass else: - # print (_item) - pass - _df = _df.replace('NaT','').replace('NA','') + + # + # Because types are inferred on the basis of the sample being processed they can sometimes be wrong + # To help disambiguate we add the schema information + _type = None + if 'int' in _df[name].dtypes.name or 'int' in _item['type'].lower(): + _type = np.int + elif 'float' in _df[name].dtypes.name or 'float' in _item['type'].lower(): + _type = np.float + if _type : + _df[name] = _df[name].fillna(0).replace('',0).astype(_type) + # _df = _df.replace('NaT','').replace('NA','') if r : self.log(**{'action':'format','input':r}) @@ -319,7 +335,7 @@ class Generator (Learner): _store['context'] = 'write' #-- Just in case if 'table' not in _store : _store['table'] = self.info['from'] - writer = transport.factory.instance(**_store) + N = 0 for _iodf in _candidates : _df = self._df.copy() @@ -346,7 +362,9 @@ class Generator (Learner): _schema = [{'name':_item.name,'type':_item.field_type} for _item in _schema] _df = self.format(_df,_schema) + writer = transport.factory.instance(**_store) writer.write(_df,schema=_schema) + # _df.to_csv('foo.csv') self.log(**{'action':'write','input':{'rows':N,'candidates':len(_candidates)}}) class Shuffle(Generator): diff --git a/data/maker/prepare/__init__.py b/data/maker/prepare/__init__.py index 50fcfdf..17da778 100644 --- a/data/maker/prepare/__init__.py +++ b/data/maker/prepare/__init__.py @@ -209,6 +209,7 @@ class Input : # @NOTE: For some reason, there is an out of memory error created here, this seems to fix it (go figure) # _matrix = np.array([np.repeat(0,cols.size) for i in range(0,row_count)]) + [np.put(_matrix[i], np.where(cols == rows[i]) ,1)for i in np.arange(row_count) if np.where(cols == rows[i])[0].size > 0] # else: # _matrix = cp.zeros([row_count,cols.size])