bug fix: etl engine, sqlite inserts
commit ccc05acc01
parent 2de6e51bdb
@@ -16,7 +16,17 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLI
 Usage :
     transport --config <path-to-file.json> --procs <number-procs>
 @TODO: Create tables if they don't exist for relational databases
+example of configuration :
+
+1. Move data from a folder to a data-store
+    transport [--folder <path> ] --config <config.json> #-- assuming the configuration doesn't have folder
+    transport --folder <path> --provider <postgresql|mongo|sqlite> --<database|db> <name> --table|doc <document_name>
+    In this case the configuration should look like :
+    {folder:..., target:{}}
+2. Move data from one source to another
+    transport --config <file.json>
+    {source:{..},target:{..}} or [{source:{..},target:{..}},{source:{..},target:{..}}]
 """
 import pandas as pd
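Note: as a concrete illustration of the configuration shapes sketched in the docstring above, a config.json for case 2 might look like the following. The file name and field values are hypothetical; provider, database and table are the keys this diff actually reads.

    [
        {
            "source": {"provider":"postgresql", "database":"analytics", "table":"events"},
            "target": {"provider":"sqlite", "database":"/tmp/analytics.db3", "table":"events"}
        }
    ]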
@@ -46,11 +56,23 @@ if len(sys.argv) > 1:
 class Post(Process):
     def __init__(self,**args):
         super().__init__()
-        self.PROVIDER = args['target']['type']
-        self.writer = transport.factory.instance(**args['target'])
+        if 'provider' not in args['target'] :
+            self.PROVIDER = args['target']['type']
+            self.writer = transport.factory.instance(**args['target'])
+        else:
+            self.PROVIDER = args['target']['provider']
+            args['target']['context'] = 'write'
+
+            self.writer = transport.instance(**args['target'])
+        #
+        # If the table doesn't exists maybe create it ?
+        #
         self.rows = args['rows']

     def run(self):
         _info = {"values":self.rows} if 'couch' in self.PROVIDER else self.rows

         self.writer.write(_info)
         self.writer.close()
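Note: the new branch in Post.__init__ keeps the legacy target shape working while routing new-style targets through transport.instance. A sketch of the two shapes (values are hypothetical; only the 'type' and 'provider' keys are what the dispatch actually tests):

    # legacy shape: no 'provider' key, so factory.instance handles it
    target = {"type": "disk.SQLiteWriter"}
    # new shape: 'provider' key present, so transport.instance is used with context='write'
    target = {"provider": "sqlite", "database": "/tmp/demo.db3", "table": "logs"}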
@@ -59,7 +81,19 @@ class ETL (Process):
     def __init__(self,**_args):
         super().__init__()
         self.name = _args['id']
-        self.reader = transport.factory.instance(**_args['source'])
+        if 'provider' not in _args['source'] :
+            #@deprecate
+            self.reader = transport.factory.instance(**_args['source'])
+        else:
+            #
+            # This is the new interface
+            _args['source']['context'] = 'read'
+
+            self.reader = transport.instance(**_args['source'])
+        #
+        # do we have an sql query provided or not ....
+        # self.sql = _args['source']['sql'] if 'sql' in _args['source'] else None
+        self.cmd = _args['source']['cmd'] if 'cmd' in _args['source'] else None
         self._oargs = _args['target'] #transport.factory.instance(**_args['target'])
         self.JOB_COUNT = _args['jobs']
         self.jobs = []
@@ -68,8 +102,11 @@ class ETL (Process):
         _args['name'] = self.name
         print (_args)

     def run(self):
-        idf = self.reader.read()
-        idf = pd.DataFrame(idf)
+        if self.cmd :
+            idf = self.reader.read(**self.cmd)
+        else:
+            idf = self.reader.read()
+        idf = pd.DataFrame(idf)
         idf.columns = [str(name).replace("b'",'').replace("'","").strip() for name in idf.columns.tolist()]
         self.log(rows=idf.shape[0],cols=idf.shape[1],jobs=self.JOB_COUNT)
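Note: ETL.run now splats an optional cmd mapping from the source into the reader's read() call. Which keys read() accepts depends on the reader; the commented-out 'sql' lookup in __init__ suggests a query, so a source could plausibly look like this (an assumption, not part of the commit):

    source = {"provider": "postgresql", "database": "analytics", "table": "events",
              "cmd": {"sql": "SELECT id, ts FROM events"}}  # the 'sql' kwarg is assumed
    # inside ETL.run(): idf = self.reader.read(**source['cmd'])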
@@ -79,7 +116,8 @@ class ETL (Process):
         try:
             self.log(module='write',action='partitioning')
             rows = np.array_split(np.arange(idf.shape[0]),self.JOB_COUNT)
+            #
+            # @TODO: locks
             for i in rows :
                 _id = 'segment #'.join([str(rows.index(i)),self.name])
                 segment = idf.loc[i,:] #.to_dict(orient='records')
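Note: np.array_split divides the row indices as evenly as possible across JOB_COUNT partitions, so every Post worker gets a near-equal slice. For instance:

    import numpy as np
    np.array_split(np.arange(10), 3)
    # -> [array([0, 1, 2, 3]), array([4, 5, 6]), array([7, 8, 9])]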
setup.py
@@ -8,7 +8,7 @@ def read(fname):
     return open(os.path.join(os.path.dirname(__file__), fname)).read()
 args = {
     "name":"data-transport",
-    "version":"1.4.0",
+    "version":"1.4.1",
     "author":"The Phi Technology LLC","author_email":"info@the-phi.com",
     "license":"MIT",
     "packages":["transport"]}
@@ -71,16 +71,7 @@ from google.cloud import bigquery as bq
 import nzpy as nz #--- netezza drivers
 import os

-RDBMS = {
-    "postgresql":{"port":"5432","driver":pg},
-    "redshift":{"port":"5432","driver":pg},
-    "netezza":{"port":"5480","driver":nz},
-    "mysql":{"port":"3306","driver":my},
-    "mariadb":{"port":"3306","driver":my},
-    "mongodb":{"port":"27017","class":{"read"}},
-    "couchdb":{"port":"5984"}
-}
 class factory :
     TYPE = {"sql":{"providers":["postgresql","mysql","neteeza","bigquery","mariadb","redshift"]}}
     PROVIDERS = {
@@ -91,9 +82,14 @@ class factory :
         "bigquery":{"class":{"read":sql.BQReader,"write":sql.BQWriter}},
         "mysql":{"port":3306,"host":"localhost","default":{"type":"VARCHAR(256)"}},
         "mariadb":{"port":3306,"host":"localhost","default":{"type":"VARCHAR(256)"}},
         "mongo":{"port":27017,"host":"localhost","class":{"read":mongo.MongoReader,"write":mongo.MongoWriter}},
         "couch":{"port":5984,"host":"localhost","class":{"read":couch.CouchReader,"write":couch.CouchWriter}},
         "netezza":{"port":5480,"driver":nz,"default":{"type":"VARCHAR(256)"}}}
+    #
+    # creating synonyms
+    PROVIDERS['mongodb'] = PROVIDERS['mongo']
+    PROVIDERS['couchdb'] = PROVIDERS['couch']
+    PROVIDERS['sqlite3'] = PROVIDERS['sqlite']

     @staticmethod
     def instance(**args):
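Note: the synonym entries alias the same dict objects, so a lookup through either spelling resolves to one configuration; PROVIDERS['sqlite3'] = PROVIDERS['sqlite'] presumes a 'sqlite' entry defined in a part of PROVIDERS this hunk does not show.

    # both names point at the identical entry
    assert factory.PROVIDERS['mongodb'] is factory.PROVIDERS['mongo']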
@@ -126,14 +122,17 @@ class factory :
         return anObject

 import time
-def instance(provider,context,**_args):
+def instance(**_args):
     """
     @param provider {file,sqlite,postgresql,redshift,bigquery,netezza,mongo,couch ...}
     @param context read|write|rw
     @param _args argument to got with the datastore (username,password,host,port ...)
     """
-    _id = context if context in ['read','write'] else None
+    provider = _args['provider']
+    context = _args['context']
+    _id = context if context in ['read','write'] else 'read'
     if _id :
         args = {'provider':_id}
         for key in factory.PROVIDERS[provider] :
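Note: the module-level instance() now takes provider and context as keyword arguments instead of positionals, and an unrecognized context falls back to 'read' rather than skipping instantiation (context itself is still required, since _args['context'] is read unconditionally). A call site would look roughly like this; kwargs beyond provider and context are hypothetical connection details:

    import transport
    reader = transport.instance(provider='postgresql', context='read',
                                database='analytics', table='events')
    df = reader.read()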
@@ -114,6 +114,7 @@ class DiskWriter(Writer):
 class SQLiteReader (DiskReader):
     def __init__(self,**args):
         DiskReader.__init__(self,**args)
+        self.path = args['database'] if 'database' in args else args['path']
         self.conn = sqlite3.connect(self.path,isolation_level=None)
         self.conn.row_factory = sqlite3.Row
         self.table = args['table']
@@ -145,7 +146,7 @@ class SQLiteWriter(DiskWriter) :
         DiskWriter.__init__(self,**args)
         self.table = args['table']

-        self.conn = sqlite3.connect(self.path,isolation_level=None)
+        self.conn = sqlite3.connect(self.path,isolation_level="IMMEDIATE")
         self.conn.row_factory = sqlite3.Row
         self.fields = args['fields'] if 'fields' in args else []
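Note: in Python's sqlite3 module, isolation_level=None means autocommit, while a string such as "IMMEDIATE" makes the driver open each implicit transaction with BEGIN IMMEDIATE, so the writer takes the database write lock up front and batches its inserts into one transaction until commit(). A minimal sketch (the path is hypothetical):

    import sqlite3
    conn = sqlite3.connect('/tmp/demo.db3', isolation_level="IMMEDIATE")
    conn.execute("CREATE TABLE IF NOT EXISTS logs (name TEXT)")
    conn.execute("INSERT INTO logs (name) VALUES ('alpha')")  # runs inside BEGIN IMMEDIATE
    conn.commit()  # changes become visible to other connections here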
@@ -184,20 +185,27 @@ class SQLiteWriter(DiskWriter) :
         if not self.fields :
             self.init(list(info.keys()))

-        if type(info) != list :
+        if type(info) == object :
             info = [info]
+        elif type(info) == pd.DataFrame :
+            info = info.to_dict(orient='records')

         SQLiteWriter.LOCK.acquire()
         try:
-            cursor = self.conn.cursor()
-            sql = " " .join(["INSERT INTO ",self.table,"(", ",".join(self.fields) ,")", "values(':values')"])
-            for row in info :
-                stream = json.dumps(row)
-                stream = stream.replace("'","''")
-                cursor.execute(sql.replace(":values",stream) )
-
-            # self.conn.commit()
+            cursor = self.conn.cursor()
+            sql = " " .join(["INSERT INTO ",self.table,"(", ",".join(self.fields) ,")", "values(:values)"])
+            for row in info :
+                stream =["".join(["",value,""]) if type(value) == str else value for value in row.values()]
+                stream = json.dumps(stream).replace("[","").replace("]","")
+
+                self.conn.execute(sql.replace(":values",stream) )
+                # cursor.commit()
+
+            self.conn.commit()
             # print (sql)
         except Exception as e :
+            print (e)
             pass
         SQLiteWriter.LOCK.release()
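Note: the rewritten block fixes the malformed values(':values') template, builds the VALUES list by JSON-encoding each row's values and stripping the brackets (so strings arrive double-quoted), commits once after the loop, and prints exceptions instead of silently swallowing them. Two caveats: type(info) == object is never true for a plain dict, so single rows are expected to arrive as lists or DataFrames, and splicing values into the SQL string still breaks on strings containing quotes, brackets or commas. The conventional fix is sqlite3 parameter binding, sketched below under the same field/row assumptions (not what this commit does):

    import sqlite3

    def write_rows(conn, table, fields, rows):
        # rows: list of dicts keyed by field name
        placeholders = ",".join("?" for _ in fields)
        sql = "INSERT INTO %s (%s) VALUES (%s)" % (table, ",".join(fields), placeholders)
        conn.executemany(sql, [tuple(row[f] for f in fields) for row in rows])
        conn.commit()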