bug fix: ETL logging and rabbitmq-server listener
This commit is contained in:
parent
67cb7de861
commit
8cd34d902a
|
@ -68,11 +68,13 @@ class factory :
|
||||||
"mariadb":{"port":3306,"host":"localhost","default":{"type":"VARCHAR(256)"},"driver":my},
|
"mariadb":{"port":3306,"host":"localhost","default":{"type":"VARCHAR(256)"},"driver":my},
|
||||||
"mongo":{"port":27017,"host":"localhost","class":{"read":mongo.MongoReader,"write":mongo.MongoWriter}},
|
"mongo":{"port":27017,"host":"localhost","class":{"read":mongo.MongoReader,"write":mongo.MongoWriter}},
|
||||||
"couch":{"port":5984,"host":"localhost","class":{"read":couch.CouchReader,"write":couch.CouchWriter}},
|
"couch":{"port":5984,"host":"localhost","class":{"read":couch.CouchReader,"write":couch.CouchWriter}},
|
||||||
"netezza":{"port":5480,"driver":nz,"default":{"type":"VARCHAR(256)"}}}
|
"netezza":{"port":5480,"driver":nz,"default":{"type":"VARCHAR(256)"}},
|
||||||
|
"rabbitmq":{"port":5672,"host":"localhost","class":{"read":queue.QueueReader,"write":queue.QueueWriter,"listen":queue.QueueListener},"default":{"type":"application/json"}}}
|
||||||
#
|
#
|
||||||
# creating synonyms
|
# creating synonyms
|
||||||
PROVIDERS['mongodb'] = PROVIDERS['mongo']
|
PROVIDERS['mongodb'] = PROVIDERS['mongo']
|
||||||
PROVIDERS['couchdb'] = PROVIDERS['couch']
|
PROVIDERS['couchdb'] = PROVIDERS['couch']
|
||||||
|
PROVIDERS['bq'] = PROVIDERS['bigquery']
|
||||||
PROVIDERS['sqlite3'] = PROVIDERS['sqlite']
|
PROVIDERS['sqlite3'] = PROVIDERS['sqlite']
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
@ -124,7 +126,7 @@ def instance(**_args):
|
||||||
|
|
||||||
provider = _args['provider']
|
provider = _args['provider']
|
||||||
context = _args['context']if 'context' in _args else None
|
context = _args['context']if 'context' in _args else None
|
||||||
_id = context if context in ['read','write'] else 'read'
|
_id = context if context in list(factory.PROVIDERS[provider]['class'].keys()) else 'read'
|
||||||
if _id :
|
if _id :
|
||||||
args = {'provider':_id}
|
args = {'provider':_id}
|
||||||
for key in factory.PROVIDERS[provider] :
|
for key in factory.PROVIDERS[provider] :
|
||||||
|
@ -147,7 +149,7 @@ def instance(**_args):
|
||||||
try:
|
try:
|
||||||
|
|
||||||
host = ''
|
host = ''
|
||||||
if provider not in ['bigquery','mongodb','couchdb','sqlite','console','etl','file'] :
|
if provider not in ['bigquery','mongodb','couchdb','sqlite','console','etl','file','rabbitmq'] :
|
||||||
#
|
#
|
||||||
# In these cases we are assuming RDBMS and thus would exclude NoSQL and BigQuery
|
# In these cases we are assuming RDBMS and thus would exclude NoSQL and BigQuery
|
||||||
username = args['username'] if 'username' in args else ''
|
username = args['username'] if 'username' in args else ''
|
||||||
|
@ -165,7 +167,7 @@ def instance(**_args):
|
||||||
account = ''
|
account = ''
|
||||||
host = ''
|
host = ''
|
||||||
database = args['path'] if 'path' in args else args['database']
|
database = args['path'] if 'path' in args else args['database']
|
||||||
if provider not in ['mongodb','couchdb','bigquery','console','etl','file'] :
|
if provider not in ['mongodb','couchdb','bigquery','console','etl','file','rabbitmq'] :
|
||||||
uri = ''.join([provider,"://",account,host,'/',database])
|
uri = ''.join([provider,"://",account,host,'/',database])
|
||||||
|
|
||||||
e = sqlalchemy.create_engine (uri,future=True)
|
e = sqlalchemy.create_engine (uri,future=True)
|
||||||
|
|
|
@ -98,15 +98,15 @@ class Console(Writer):
|
||||||
self.debug = self.write
|
self.debug = self.write
|
||||||
self.log = self.write
|
self.log = self.write
|
||||||
pass
|
pass
|
||||||
def write (self,info,**_args):
|
def write (self,**_args):
|
||||||
if self.lock :
|
if self.lock :
|
||||||
Console.lock.acquire()
|
Console.lock.acquire()
|
||||||
try:
|
try:
|
||||||
if type(info) == list:
|
if type(_args) == list:
|
||||||
for row in info :
|
for row in _args :
|
||||||
print (row)
|
print (row)
|
||||||
else:
|
else:
|
||||||
print (info)
|
print (_args)
|
||||||
except Exception as e :
|
except Exception as e :
|
||||||
print (e)
|
print (e)
|
||||||
finally:
|
finally:
|
||||||
|
|
|
@ -54,41 +54,46 @@ if len(sys.argv) > 1:
|
||||||
i += 2
|
i += 2
|
||||||
|
|
||||||
class Post(Process):
|
class Post(Process):
|
||||||
def __init__(self,**args):
|
def __init__(self,**args):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
|
||||||
|
if 'provider' not in args['target'] :
|
||||||
|
self.PROVIDER = args['target']['type']
|
||||||
|
self.writer = transport.factory.instance(**args['target'])
|
||||||
|
else:
|
||||||
|
self.PROVIDER = args['target']['provider']
|
||||||
|
args['target']['context'] = 'write'
|
||||||
|
self.store = args['target']
|
||||||
|
self.store['lock'] = True
|
||||||
|
# self.writer = transport.instance(**args['target'])
|
||||||
|
#
|
||||||
|
# If the table doesn't exists maybe create it ?
|
||||||
|
#
|
||||||
|
self.rows = args['rows'].fillna('')
|
||||||
|
|
||||||
if 'provider' not in args['target'] :
|
def log(self,**_args) :
|
||||||
self.PROVIDER = args['target']['type']
|
if ETL.logger :
|
||||||
self.writer = transport.factory.instance(**args['target'])
|
ETL.logger.info(**_args)
|
||||||
else:
|
|
||||||
self.PROVIDER = args['target']['provider']
|
|
||||||
args['target']['context'] = 'write'
|
|
||||||
self.store = args['target']
|
|
||||||
# self.writer = transport.instance(**args['target'])
|
|
||||||
#
|
|
||||||
# If the table doesn't exists maybe create it ?
|
|
||||||
#
|
|
||||||
self.rows = args['rows'].fillna('')
|
|
||||||
|
|
||||||
|
def run(self):
|
||||||
def run(self):
|
_info = {"values":self.rows} if 'couch' in self.PROVIDER else self.rows
|
||||||
_info = {"values":self.rows} if 'couch' in self.PROVIDER else self.rows
|
ltypes = self.rows.dtypes.values
|
||||||
ltypes = self.rows.dtypes.values
|
columns = self.rows.dtypes.index.tolist()
|
||||||
columns = self.rows.dtypes.index.tolist()
|
# if not self.writer.has() :
|
||||||
# if not self.writer.has() :
|
|
||||||
|
|
||||||
|
|
||||||
# self.writer.make(fields=columns)
|
# self.writer.make(fields=columns)
|
||||||
# ETL.logger.info(module='write',action='make-table',input={"name":self.writer.table})
|
# ETL.logger.info(module='write',action='make-table',input={"name":self.writer.table})
|
||||||
for name in columns :
|
self.log(module='write',action='make-table',input={"schema":columns})
|
||||||
if _info[name].dtype in ['int32','int64','int','float','float32','float64'] :
|
for name in columns :
|
||||||
value = 0
|
if _info[name].dtype in ['int32','int64','int','float','float32','float64'] :
|
||||||
else:
|
value = 0
|
||||||
value = ''
|
else:
|
||||||
_info[name] = _info[name].fillna(value)
|
value = ''
|
||||||
writer = transport.factory.instance(**self.store)
|
_info[name] = _info[name].fillna(value)
|
||||||
writer.write(_info)
|
writer = transport.factory.instance(**self.store)
|
||||||
writer.close()
|
writer.write(_info)
|
||||||
|
writer.close()
|
||||||
|
|
||||||
|
|
||||||
class ETL (Process):
|
class ETL (Process):
|
||||||
|
@ -115,8 +120,9 @@ class ETL (Process):
|
||||||
self.jobs = []
|
self.jobs = []
|
||||||
# self.logger = transport.factory.instance(**_args['logger'])
|
# self.logger = transport.factory.instance(**_args['logger'])
|
||||||
def log(self,**_args) :
|
def log(self,**_args) :
|
||||||
_args['name'] = self.name
|
if ETL.logger :
|
||||||
print (_args)
|
ETL.logger.info(**_args)
|
||||||
|
|
||||||
def run(self):
|
def run(self):
|
||||||
if self.cmd :
|
if self.cmd :
|
||||||
idf = self.reader.read(**self.cmd)
|
idf = self.reader.read(**self.cmd)
|
||||||
|
@ -126,7 +132,7 @@ class ETL (Process):
|
||||||
# idf = idf.replace({np.nan: None}, inplace = True)
|
# idf = idf.replace({np.nan: None}, inplace = True)
|
||||||
|
|
||||||
idf.columns = [str(name).replace("b'",'').replace("'","").strip() for name in idf.columns.tolist()]
|
idf.columns = [str(name).replace("b'",'').replace("'","").strip() for name in idf.columns.tolist()]
|
||||||
# ETL.logger.info(rows=idf.shape[0],cols=idf.shape[1],jobs=self.JOB_COUNT)
|
self.log(rows=idf.shape[0],cols=idf.shape[1],jobs=self.JOB_COUNT)
|
||||||
|
|
||||||
#
|
#
|
||||||
# writing the data to a designated data source
|
# writing the data to a designated data source
|
||||||
|
@ -134,7 +140,7 @@ class ETL (Process):
|
||||||
try:
|
try:
|
||||||
|
|
||||||
|
|
||||||
# ETL.logger.info(module='write',action='partitioning')
|
self.log(module='write',action='partitioning',jobs=self.JOB_COUNT)
|
||||||
rows = np.array_split(np.arange(0,idf.shape[0]),self.JOB_COUNT)
|
rows = np.array_split(np.arange(0,idf.shape[0]),self.JOB_COUNT)
|
||||||
|
|
||||||
#
|
#
|
||||||
|
@ -148,10 +154,10 @@ class ETL (Process):
|
||||||
proc = Post(target = self._oargs,rows = segment,name=str(i))
|
proc = Post(target = self._oargs,rows = segment,name=str(i))
|
||||||
self.jobs.append(proc)
|
self.jobs.append(proc)
|
||||||
proc.start()
|
proc.start()
|
||||||
|
|
||||||
# ETL.logger.info(module='write',action='working',segment=str(id),table=self.name,rows=segment.shape[0])
|
self.log(module='write',action='working',segment=str(self.name),table=self.name,rows=segment.shape[0])
|
||||||
# while poc :
|
# while self.jobs :
|
||||||
# proc = [job for job in proc if job.is_alive()]
|
# jobs = [job for job in proc if job.is_alive()]
|
||||||
# time.sleep(1)
|
# time.sleep(1)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print (e)
|
print (e)
|
||||||
|
@ -166,9 +172,9 @@ def instance(**_args):
|
||||||
"""
|
"""
|
||||||
logger = _args['logger'] if 'logger' in _args else None
|
logger = _args['logger'] if 'logger' in _args else None
|
||||||
_info = _args['info']
|
_info = _args['info']
|
||||||
if logger :
|
if logger and type(logger) != str:
|
||||||
ETL.logger = logger
|
ETL.logger = logger
|
||||||
else:
|
elif logger == 'console':
|
||||||
ETL.logger = transport.factory.instance(provider='console',lock=True)
|
ETL.logger = transport.factory.instance(provider='console',lock=True)
|
||||||
if type(_info) in [list,dict] :
|
if type(_info) in [list,dict] :
|
||||||
_config = _info if type(_info) != dict else [_info]
|
_config = _info if type(_info) != dict else [_info]
|
||||||
|
@ -195,8 +201,6 @@ if __name__ == '__main__' :
|
||||||
_config['source'] = {"type":"disk.DiskReader","args":{"path":SYS_ARGS['source'],"delimiter":","}}
|
_config['source'] = {"type":"disk.DiskReader","args":{"path":SYS_ARGS['source'],"delimiter":","}}
|
||||||
|
|
||||||
_config['jobs'] = 3 if 'jobs' not in SYS_ARGS else int(SYS_ARGS['jobs'])
|
_config['jobs'] = 3 if 'jobs' not in SYS_ARGS else int(SYS_ARGS['jobs'])
|
||||||
print (_config)
|
|
||||||
print ()
|
|
||||||
etl = ETL (**_config)
|
etl = ETL (**_config)
|
||||||
if index is None:
|
if index is None:
|
||||||
|
|
||||||
|
|
|
@ -222,22 +222,21 @@ class QueueListener(MessageQueue):
|
||||||
def __init__(self,**args):
|
def __init__(self,**args):
|
||||||
MessageQueue.__init__(self,**args)
|
MessageQueue.__init__(self,**args)
|
||||||
self.listen = self.read
|
self.listen = self.read
|
||||||
# def init(self,qid):
|
self.apply = args['apply'] if 'apply' in args else print
|
||||||
# properties = pika.ConnectionParameters(host=self.host)
|
|
||||||
# self.connection = pika.BlockingConnection(properties)
|
|
||||||
# self.channel = self.connection.channel()
|
|
||||||
# self.channel.exchange_declare(exchange=self.exchange,type='direct',durable=True )
|
|
||||||
|
|
||||||
# self.info = self.channel.queue_declare(passive=True,exclusive=True,queue=qid)
|
|
||||||
|
|
||||||
# self.channel.queue_bind(exchange=self.exchange,queue=self.info.method.queue,routing_key=qid)
|
|
||||||
#self.callback = callback
|
|
||||||
|
|
||||||
def finalize(self,channel,ExceptionReason):
|
def finalize(self,channel,ExceptionReason):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def callback(self,channel,method,header,stream) :
|
def callback(self,channel,method,header,stream) :
|
||||||
raise Exception("....")
|
_info= {}
|
||||||
|
# if re.match("^\{|\[",stream) is not None:
|
||||||
|
|
||||||
|
if stream.startswith(b"[") or stream.startswith(b"{"):
|
||||||
|
_info = json.loads(stream)
|
||||||
|
else:
|
||||||
|
|
||||||
|
_info = stream
|
||||||
|
self.apply(_info)
|
||||||
def read(self):
|
def read(self):
|
||||||
|
|
||||||
self.init(self.queue)
|
self.init(self.queue)
|
||||||
|
|
|
@ -312,9 +312,11 @@ class BigQuery:
|
||||||
:param sql sql query to be pulled,
|
:param sql sql query to be pulled,
|
||||||
"""
|
"""
|
||||||
table = _args['table']
|
table = _args['table']
|
||||||
|
try:
|
||||||
ref = self.client.dataset(self.dataset).table(table)
|
ref = self.client.dataset(self.dataset).table(table)
|
||||||
return self.client.get_table(ref).schema
|
return self.client.get_table(ref).schema
|
||||||
|
except Exception as e:
|
||||||
|
return []
|
||||||
def has(self,**_args):
|
def has(self,**_args):
|
||||||
found = False
|
found = False
|
||||||
try:
|
try:
|
||||||
|
|
Loading…
Reference in New Issue