From e7838f5de121e0e26604d3a87aa919a92c429f09 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 28 Mar 2024 15:34:39 -0500 Subject: [PATCH 01/23] refactoring version 2.0 --- bin/transport | 47 +-- info/__init__.py | 2 +- transport/__init__.py | 421 +++++-------------------- transport/cloud/__init__.py | 6 + transport/cloud/bigquery.py | 156 ++++++++++ transport/cloud/databricks.py | 111 +++++++ transport/cloud/nextcloud.py | 80 +++++ transport/cloud/s3.py | 127 ++++++++ transport/nosql/__init__.py | 10 + transport/nosql/couchdb.py | 213 +++++++++++++ transport/nosql/mongodb.py | 242 +++++++++++++++ transport/other/__init__.py | 1 + transport/other/callback.py | 45 +++ transport/other/console.py | 7 + transport/other/files.py | 68 +++++ transport/other/http.py | 88 ++++++ transport/other/rabbitmq.py | 272 +++++++++++++++++ transport/providers.py | 105 ------- transport/providers/__init__.py | 44 +++ transport/sql.py | 526 -------------------------------- transport/sql/__init__.py | 18 ++ transport/sql/common.py | 125 ++++++++ transport/sql/mysql.py | 18 ++ transport/sql/netezza.py | 15 + transport/sql/postgresql.py | 22 ++ transport/sql/sqlite.py | 25 ++ 26 files changed, 1773 insertions(+), 1021 deletions(-) create mode 100644 transport/cloud/__init__.py create mode 100644 transport/cloud/bigquery.py create mode 100644 transport/cloud/databricks.py create mode 100644 transport/cloud/nextcloud.py create mode 100644 transport/cloud/s3.py create mode 100644 transport/nosql/__init__.py create mode 100644 transport/nosql/couchdb.py create mode 100644 transport/nosql/mongodb.py create mode 100644 transport/other/__init__.py create mode 100644 transport/other/callback.py create mode 100644 transport/other/console.py create mode 100644 transport/other/files.py create mode 100644 transport/other/http.py create mode 100644 transport/other/rabbitmq.py delete mode 100644 transport/providers.py create mode 100644 transport/providers/__init__.py delete mode 100644 
transport/sql.py create mode 100644 transport/sql/__init__.py create mode 100644 transport/sql/common.py create mode 100644 transport/sql/mysql.py create mode 100644 transport/sql/netezza.py create mode 100644 transport/sql/postgresql.py create mode 100644 transport/sql/sqlite.py diff --git a/bin/transport b/bin/transport index dd424a2..363d2d9 100755 --- a/bin/transport +++ b/bin/transport @@ -48,24 +48,8 @@ import typer import os import transport from transport import etl -from transport import providers +# from transport import providers -# SYS_ARGS = {} -# if len(sys.argv) > 1: - -# N = len(sys.argv) -# for i in range(1,N): -# value = None -# if sys.argv[i].startswith('--'): -# key = sys.argv[i][2:] #.replace('-','') -# SYS_ARGS[key] = 1 -# if i + 1 < N: -# value = sys.argv[i + 1] = sys.argv[i+1].strip() -# if key and value and not value.startswith('--'): -# SYS_ARGS[key] = value - - -# i += 2 app = typer.Typer() @@ -77,7 +61,7 @@ def wait(jobs): jobs = [thread for thread in jobs if thread.is_alive()] time.sleep(1) -@app.command() +@app.command(name="apply") def move (path,index=None): _proxy = lambda _object: _object.write(_object.read()) @@ -90,27 +74,14 @@ def move (path,index=None): etl.instance(**_config) else: etl.instance(config=_config) - - # - # if type(_config) == dict : - # _object = transport.etl.instance(**_config) - # _proxy(_object) - # else: - # # - # # here we are dealing with a list of objects (long ass etl job) - # jobs = [] - # failed = [] - # for _args in _config : - # if index and _config.index(_args) != index : - # continue +@app.command(name="providers") +def supported (format:str="table") : + """ + This function will print supported providers and their associated classifications + """ + _df = (transport.supported()) + print (json.dumps(_df.to_dict(orient="list"))) - # _object=transport.etl.instance(**_args) - # thread = Process(target=_proxy,args=(_object,)) - # thread.start() - # jobs.append(thread()) - # if _config.index(_args) == 0 : 
- # thread.join() - # wait(jobs) @app.command() def version(): print (transport.version.__version__) diff --git a/info/__init__.py b/info/__init__.py index 57f7289..2d27032 100644 --- a/info/__init__.py +++ b/info/__init__.py @@ -1,5 +1,5 @@ __author__ = 'The Phi Technology' -__version__= '1.9.8.20' +__version__= '2.0.0' __license__=""" diff --git a/transport/__init__.py b/transport/__init__.py index 9e12b3f..288f646 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -11,360 +11,79 @@ This library is designed to serve as a wrapper to a set of supported data stores - s3 - sqlite The supported operations are read/write and providing meta data to the calling code -Requirements : - pymongo - boto - couldant -The configuration for the data-store is as follows : - e.g: - mongodb - provider:'mongodb',[port:27017],[host:localhost],db:,doc:<_name>,context: +We separated reads from writes to mitigate accidents associated with writes. +Source Code is available under MIT License: + https://healthcareio.the-phi.com/data-transport + https://hiplab.mc.vanderbilt.edu/git/hiplab/data-transport """ - -# import pandas as pd -# import numpy as np -import json -import importlib -import sys -import sqlalchemy -from datetime import datetime -if sys.version_info[0] > 2 : - # from transport.common import Reader, Writer,Console #, factory - from transport import disk - - from transport import s3 as s3 - from transport import rabbitmq as queue - from transport import couch as couch - from transport import mongo as mongo - from transport import sql as sql - from transport import etl as etl - # from transport.version import __version__ - from info import __version__,__author__ - from transport import providers -else: - from common import Reader, Writer,Console #, factory - import disk - import queue - import couch - import mongo - import s3 - import sql - import etl - from info import __version__,__author__ - import providers - import numpy as np -from psycopg2.extensions import 
register_adapter, AsIs -register_adapter(np.int64, AsIs) - -# import psycopg2 as pg -# import mysql.connector as my -# from google.cloud import bigquery as bq -# import nzpy as nz #--- netezza drivers +from transport import sql, nosql, cloud, other +import pandas as pd +import json import os +from info import __version__,__author__ -# class providers : -# POSTGRESQL = 'postgresql' -# MONGODB = 'mongodb' - -# BIGQUERY ='bigquery' -# FILE = 'file' -# ETL = 'etl' -# SQLITE = 'sqlite' -# SQLITE3= 'sqlite' -# REDSHIFT = 'redshift' -# NETEZZA = 'netezza' -# MYSQL = 'mysql' -# RABBITMQ = 'rabbitmq' -# MARIADB = 'mariadb' -# COUCHDB = 'couch' -# CONSOLE = 'console' -# ETL = 'etl' -# # -# # synonyms of the above -# BQ = BIGQUERY -# MONGO = MONGODB -# FERRETDB= MONGODB -# PG = POSTGRESQL -# PSQL = POSTGRESQL -# PGSQL = POSTGRESQL -# import providers +PROVIDERS = {} +def init(): + global PROVIDERS + for _module in [cloud,sql,nosql,other] : + for _provider_name in dir(_module) : + if _provider_name.startswith('__') : + continue + PROVIDERS[_provider_name] = {'module':getattr(_module,_provider_name),'type':_module.__name__} +# print ([ {name:getattr(sql,name)} for name in dir(sql) if not name.startswith('__')]) -# class IEncoder (json.JSONEncoder): -# def IEncoder (self,object): -# if type(object) == np.integer : -# return int(object) -# elif type(object) == np.floating: -# return float(object) -# elif type(object) == np.ndarray : -# return object.tolist() -# elif type(object) == datetime : -# return o.isoformat() -# else: -# return super(IEncoder,self).default(object) - +def instance (**_args): + """ + type: + read: true|false (default true) + auth_file + """ + global PROVIDERS + if 'auth_file' in _args: + if os.path.exists(_args['auth_file']) : + f = open(_args['auth_file']) + _args = dict (_args,** json.loads(f.read()) ) + f.close() + else: + filename = _args['auth_file'] + raise Exception(f" {filename} was not found or is invalid") + if _args['provider'] in PROVIDERS : + 
_info = PROVIDERS[_args['provider']] + _module = _info['module'] + if 'context' in _args : + _context = _args['context'] + else: + _context = 'read' + _pointer = getattr(_module,'Reader') if _context == 'read' else getattr(_module,'Writer') + return _pointer (**_args) + pass + else: + raise Exception ("Missing or Unknown provider") + pass +def supported (): + _info = {} + for _provider in PROVIDERS : + _item = PROVIDERS[_provider] + if _item['type'] not in _info : + _info[_item['type']] = [] + _info[_item['type']].append(_provider) + _df = pd.DataFrame() + for _id in _info : + if not _df.shape[0] : + _df = pd.DataFrame(_info[_id],columns=[_id.replace('transport.','')]) + else: + _df = pd.DataFrame(_info[_id],columns=[_id.replace('transport.','')]).join(_df, how='outer') + return _df.fillna('') class factory : - # TYPE = {"sql":{"providers":["postgresql","mysql","neteeza","bigquery","mariadb","redshift"]}} - # PROVIDERS = { - # "etl":{"class":{"read":etl.instance,"write":etl.instance}}, - # # "console":{"class":{"write":Console,"read":Console}}, - # "file":{"class":{"read":disk.DiskReader,"write":disk.DiskWriter}}, - # "sqlite":{"class":{"read":disk.SQLiteReader,"write":disk.SQLiteWriter}}, - # "postgresql":{"port":5432,"host":"localhost","database":None,"driver":pg,"default":{"type":"VARCHAR"},"class":{"read":sql.SQLReader,"write":sql.SQLWriter}}, - # "redshift":{"port":5432,"host":"localhost","database":None,"driver":pg,"default":{"type":"VARCHAR"},"class":{"read":sql.SQLReader,"write":sql.SQLWriter}}, - # "bigquery":{"class":{"read":sql.BQReader,"write":sql.BQWriter}}, - # "mysql":{"port":3306,"host":"localhost","default":{"type":"VARCHAR(256)"},"driver":my,"class":{"read":sql.SQLReader,"write":sql.SQLWriter}}, - # "mariadb":{"port":3306,"host":"localhost","default":{"type":"VARCHAR(256)"},"driver":my,"class":{"read":sql.SQLReader,"write":sql.SQLWriter}}, - # "mongo":{"port":27017,"host":"localhost","class":{"read":mongo.MongoReader,"write":mongo.MongoWriter}}, - 
# "couch":{"port":5984,"host":"localhost","class":{"read":couch.CouchReader,"write":couch.CouchWriter}}, - # "netezza":{"port":5480,"driver":nz,"default":{"type":"VARCHAR(256)"},"class":{"read":sql.SQLReader,"write":sql.SQLWriter}}, - # "rabbitmq":{"port":5672,"host":"localhost","class":{"read":queue.QueueReader,"write":queue.QueueWriter,"listen":queue.QueueListener,"listener":queue.QueueListener},"default":{"type":"application/json"}}} - # # - # # creating synonyms - # PROVIDERS['mongodb'] = PROVIDERS['mongo'] - # PROVIDERS['couchdb'] = PROVIDERS['couch'] - # PROVIDERS['bq'] = PROVIDERS['bigquery'] - # PROVIDERS['sqlite3'] = PROVIDERS['sqlite'] - # PROVIDERS['rabbit'] = PROVIDERS['rabbitmq'] - # PROVIDERS['rabbitmq-server'] = PROVIDERS['rabbitmq'] - - @staticmethod - def instance(**_args): - if 'type' in _args : - # - # Legacy code being returned - return factory._instance(**_args); - - - - else: - return instance(**_args) - @staticmethod - def _instance(**args): - """ - This class will create an instance of a transport when providing - :type name of the type we are trying to create - :args The arguments needed to create the instance - """ - source = args['type'] - params = args['args'] - anObject = None - - if source in ['HttpRequestReader','HttpSessionWriter']: - # - # @TODO: Make sure objects are serializable, be smart about them !! 
- # - aClassName = ''.join([source,'(**params)']) - - - else: - - stream = json.dumps(params) - aClassName = ''.join([source,'(**',stream,')']) - - try: - anObject = eval( aClassName) - #setattr(anObject,'name',source) - except Exception as e: - print(['Error ',e]) - return anObject - -import time -def instance(**_pargs): - """ - creating an instance given the provider, we should have an idea of :class, :driver - :provider - :read|write = {connection to the database} - """ - # - # @TODO: provide authentication file that will hold all the parameters, that will later on be used - # - _args = dict(_pargs,**{}) - if 'auth_file' in _args : - path = _args['auth_file'] - file = open(path) - _config = json.loads( file.read()) - _args = dict(_args,**_config) - file.close() - - _provider = _args['provider'] - _context = list( set(['read','write','listen']) & set(_args.keys()) ) - if _context : - _context = _context[0] - else: - _context = _args['context'] if 'context' in _args else 'read' - # _group = None - - - # for _id in providers.CATEGORIES : - # if _provider in providers.CATEGORIES[_id] : - # _group = _id - # break - # if _group : - - if _provider in providers.PROVIDERS and _context in providers.PROVIDERS[_provider]: - - # _classPointer = _getClassInstance(_group,**_args) - _classPointer = providers.PROVIDERS[_provider][_context] - # - # Let us reformat the arguments - # if 'read' in _args or 'write' in _args : - # _args = _args['read'] if 'read' in _args else _args['write'] - # _args['provider'] = _provider - # if _group == 'sql' : - if _provider in providers.CATEGORIES['sql'] : - _info = _get_alchemyEngine(**_args) - - _args = dict(_args,**_info) - _args['driver'] = providers.DRIVERS[_provider] - - else: - if _provider in providers.DEFAULT : - _default = providers.DEFAULT[_provider] - _defkeys = list(set(_default.keys()) - set(_args.keys())) - if _defkeys : - for key in _defkeys : - _args[key] = _default[key] - pass - # - # get default values from - - return 
_classPointer(**_args) - # - # Let us determine the category of the provider that has been given -def _get_alchemyEngine(**_args): - """ - This function returns the SQLAlchemy engine associated with parameters, This is only applicable for SQL _items - :_args arguments passed to the factory {provider and other} - """ - _provider = _args['provider'] - _pargs = {} - if _provider == providers.SQLITE3 : - _path = _args['database'] if 'database' in _args else _args['path'] - uri = ''.join([_provider,':///',_path]) - - else: - - #@TODO: Enable authentication files (private_key) - _username = _args['username'] if 'username' in _args else '' - _password = _args['password'] if 'password' in _args else '' - _account = _args['account'] if 'account' in _args else '' - _database = _args['database'] if 'database' in _args else _args['path'] - - if _username != '': - _account = _username + ':'+_password+'@' - _host = _args['host'] if 'host' in _args else '' - _port = _args['port'] if 'port' in _args else '' - if _provider in providers.DEFAULT : - _default = providers.DEFAULT[_provider] - _host = _host if _host != '' else (_default['host'] if 'host' in _default else '') - _port = _port if _port != '' else (_default['port'] if 'port' in _default else '') - if _port == '': - _port = providers.DEFAULT['port'] if 'port' in providers.DEFAULT else '' - # - - if _host != '' and _port != '' : - _fhost = _host+":"+str(_port) #--formatted hostname - else: - _fhost = _host - # Let us update the parameters we have thus far - # - - - uri = ''.join([_provider,"://",_account,_fhost,'/',_database]) - _pargs = {'host':_host,'port':_port,'username':_username,'password':_password} - _engine = sqlalchemy.create_engine (uri,future=True) - _out = {'sqlalchemy':_engine} - - for key in _pargs : - if _pargs[key] != '' : - _out[key] = _pargs[key] - return _out -@DeprecationWarning -def _getClassInstance(_group,**_args): - """ - This function returns the class instance we are attempting to instanciate - 
:_group items in providers.CATEGORIES.keys() - :_args arguments passed to the factory class - """ - # if 'read' in _args or 'write' in _args : - # _context = 'read' if 'read' in _args else _args['write'] - # _info = _args[_context] - # else: - # _context = _args['context'] if 'context' in _args else 'read' - # _class = providers.READ[_group] if _context == 'read' else providers.WRITE[_group] - # if type(_class) == dict and _args['provider'] in _class: - # _class = _class[_args['provider']] - - # return _class - -@DeprecationWarning -def __instance(**_args): - """ - - @param provider {file,sqlite,postgresql,redshift,bigquery,netezza,mongo,couch ...} - @param context read|write|rw - @param _args argument to got with the datastore (username,password,host,port ...) - """ - - provider = _args['provider'] - context = _args['context']if 'context' in _args else None - _id = context if context in list(factory.PROVIDERS[provider]['class'].keys()) else 'read' - if _id : - args = {'provider':_id} - for key in factory.PROVIDERS[provider] : - if key == 'class' : - continue - value = factory.PROVIDERS[provider][key] - args[key] = value - # - # - - args = dict(args,**_args) - - # print (provider in factory.PROVIDERS) - if 'class' in factory.PROVIDERS[provider]: - pointer = factory.PROVIDERS[provider]['class'][_id] - else: - pointer = sql.SQLReader if _id == 'read' else sql.SQLWriter - # - # Let us try to establish an sqlalchemy wrapper - try: - account = '' - host = '' - if provider not in [providers.BIGQUERY,providers.MONGODB, providers.COUCHDB, providers.SQLITE, providers.CONSOLE,providers.ETL, providers.FILE, providers.RABBITMQ] : - # if provider not in ['bigquery','mongodb','mongo','couchdb','sqlite','console','etl','file','rabbitmq'] : - # - # In these cases we are assuming RDBMS and thus would exclude NoSQL and BigQuery - username = args['username'] if 'username' in args else '' - password = args['password'] if 'password' in args else '' - if username == '' : - account = '' 
- else: - account = username + ':'+password+'@' - host = args['host'] - if 'port' in args : - host = host+":"+str(args['port']) - - database = args['database'] - elif provider in [providers.SQLITE,providers.FILE]: - account = '' - host = '' - database = args['path'] if 'path' in args else args['database'] - - if provider not in [providers.MONGODB, providers.COUCHDB, providers.BIGQUERY, providers.CONSOLE, providers.ETL,providers.FILE,providers.RABBITMQ] : - # if provider not in ['mongodb','mongo','couchdb','bigquery','console','etl','file','rabbitmq'] : - uri = ''.join([provider,"://",account,host,'/',database]) - - e = sqlalchemy.create_engine (uri,future=True) - args['sqlalchemy'] = e - - # - # @TODO: Include handling of bigquery with SQLAlchemy - except Exception as e: - print (_args) - print (e) - - return pointer(**args) - - return None + pass +factory.instance = instance +init() +# if __name__ == '__main__' : +# # if not PROVIDERS : +# init() +# print (list(PROVIDERS.keys())) +# pgr = instance(provider='postgresql',database='io',table='foo',write=True) +# print (pgr.read()) +# print () +# print (supported()) \ No newline at end of file diff --git a/transport/cloud/__init__.py b/transport/cloud/__init__.py new file mode 100644 index 0000000..e741ed0 --- /dev/null +++ b/transport/cloud/__init__.py @@ -0,0 +1,6 @@ +""" +Steve L. Nyemba, nyemba@gmail.com +This namespace implements support for cloud databases databricks,bigquery ... +""" +from . 
import bigquery, databricks, nextcloud, s3 + diff --git a/transport/cloud/bigquery.py b/transport/cloud/bigquery.py new file mode 100644 index 0000000..479c060 --- /dev/null +++ b/transport/cloud/bigquery.py @@ -0,0 +1,156 @@ +""" +Implementing support for google's bigquery + - cloud.bigquery.Read + - cloud.bigquery.Write +""" +import json +from google.oauth2 import service_account +from google.cloud import bigquery as bq + +from multiprocessing import Lock, RLock +import pandas as pd +import pandas_gbq as pd_gbq +import numpy as np +import time + +MAX_CHUNK = 2000000 +class BigQuery: + def __init__(self,**_args): + path = _args['service_key'] if 'service_key' in _args else _args['private_key'] + self.credentials = service_account.Credentials.from_service_account_file(path) + self.dataset = _args['dataset'] if 'dataset' in _args else None + self.path = path + self.dtypes = _args['dtypes'] if 'dtypes' in _args else None + self.table = _args['table'] if 'table' in _args else None + self.client = bq.Client.from_service_account_json(self.path) + def meta(self,**_args): + """ + This function returns meta data for a given table or query with dataset/table properly formatted + :param table name of the name WITHOUT including dataset + :param sql sql query to be pulled, + """ + table = _args['table'] if 'table' in _args else self.table + + try: + if table : + _dataset = self.dataset if 'dataset' not in _args else _args['dataset'] + sql = f"""SELECT column_name as name, data_type as type FROM {_dataset}.INFORMATION_SCHEMA.COLUMNS WHERE table_name = '{table}' """ + _info = {'credentials':self.credentials,'dialect':'standard'} + return pd_gbq.read_gbq(sql,**_info).to_dict(orient='records') + # return self.read(sql=sql).to_dict(orient='records') + # ref = self.client.dataset(self.dataset).table(table) + + # _schema = self.client.get_table(ref).schema + # return [{"name":_item.name,"type":_item.field_type,"description":( "" if not hasattr(_item,"description") else 
_item.description )} for _item in _schema] + else : + return [] + except Exception as e: + + return [] + def has(self,**_args): + found = False + try: + _has = self.meta(**_args) + found = _has is not None and len(_has) > 0 + except Exception as e: + pass + return found +class Reader (BigQuery): + """ + Implementing support for reading from bigquery, This class acts as a wrapper around google's API + """ + def __init__(self,**_args): + + super().__init__(**_args) + def apply(self,sql): + return self.read(sql=sql) + + def read(self,**_args): + SQL = None + table = self.table if 'table' not in _args else _args['table'] + if 'sql' in _args : + SQL = _args['sql'] + elif table: + + table = "".join(["`",table,"`"]) if '.' in table else "".join(["`:dataset.",table,"`"]) + SQL = "SELECT * FROM :table ".replace(":table",table) + if not SQL : + return None + if SQL and 'limit' in _args: + SQL += " LIMIT "+str(_args['limit']) + if (':dataset' in SQL or ':DATASET' in SQL) and self.dataset: + SQL = SQL.replace(':dataset',self.dataset).replace(':DATASET',self.dataset) + _info = {'credentials':self.credentials,'dialect':'standard'} + return pd_gbq.read_gbq(SQL,**_info) if SQL else None + # return self.client.query(SQL).to_dataframe() if SQL else None + +class Writer (BigQuery): + """ + This class implements support for writing against bigquery + """ + lock = RLock() + def __init__(self,**_args): + super().__init__(**_args) + + self.parallel = False if 'lock' not in _args else _args['lock'] + self.table = _args['table'] if 'table' in _args else None + self.mode = {'if_exists':'append','chunksize':900000,'destination_table':self.table,'credentials':self.credentials} + self._chunks = 1 if 'chunks' not in _args else int(_args['chunks']) + self._location = 'US' if 'location' not in _args else _args['location'] + def write(self,_data,**_args) : + """ + This function will perform a write to bigquery + :_data data-frame to be written to bigquery + """ + try: + if self.parallel or 'lock' 
in _args : + Write.lock.acquire() + _args['table'] = self.table if 'table' not in _args else _args['table'] + self._write(_data,**_args) + finally: + if self.parallel: + Write.lock.release() + def submit(self,_sql): + """ + Write the output of a massive query to a given table, biquery will handle this as a job + This function will return the job identifier + """ + _config = bq.QueryJobConfig() + _config.destination = self.client.dataset(self.dataset).table(self.table) + _config.allow_large_results = True + # _config.write_disposition = bq.bq_consts.WRITE_APPEND + _config.dry_run = False + # _config.priority = 'BATCH' + _resp = self.client.query(_sql,location=self._location,job_config=_config) + return _resp.job_id + def status (self,_id): + return self.client.get_job(_id,location=self._location) + def _write(self,_info,**_args) : + _df = None + if type(_info) in [list,pd.DataFrame] : + if type(_info) == list : + _df = pd.DataFrame(_info) + elif type(_info) == pd.DataFrame : + _df = _info + + if '.' not in _args['table'] : + self.mode['destination_table'] = '.'.join([self.dataset,_args['table']]) + else: + + self.mode['destination_table'] = _args['table'].strip() + if 'schema' in _args : + self.mode['table_schema'] = _args['schema'] + # + # Let us insure that the types are somewhat compatible ... 
+ # _map = {'INTEGER':np.int64,'DATETIME':'datetime64[ns]','TIMESTAMP':'datetime64[ns]','FLOAT':np.float64,'DOUBLE':np.float64,'STRING':str} + # _mode = copy.deepcopy(self.mode) + _mode = self.mode + # _df.to_gbq(**self.mode) #if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000) + # + # Let us adjust the chunking here + self._chunks = 10 if _df.shape[0] > MAX_CHUNK and self._chunks == 1 else self._chunks + _indexes = np.array_split(np.arange(_df.shape[0]),self._chunks) + for i in _indexes : + _df.iloc[i].to_gbq(**self.mode) + time.sleep(1) + pass \ No newline at end of file diff --git a/transport/cloud/databricks.py b/transport/cloud/databricks.py new file mode 100644 index 0000000..5c1ee0d --- /dev/null +++ b/transport/cloud/databricks.py @@ -0,0 +1,111 @@ +""" +This file implements databricks handling, This functionality will rely on databricks-sql-connector +LICENSE (MIT) +Copyright 2016-2020, The Phi Technology LLC + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + +@TODO: + - Migrate SQLite to SQL hierarchy + - Include Write in Chunks from pandas +""" +import os +import sqlalchemy +# from transport.common import Reader,Writer +import pandas as pd + + +class Bricks: + """ + :host + :token + :database + :cluster_path + :table + """ + def __init__(self,**_args): + _host = _args['host'] + _token= _args['token'] + _cluster_path = _args['cluster_path'] + self._schema = _args['schema'] if 'schema' in _args else _args['database'] + _catalog = _args['catalog'] + self._table = _args['table'] if 'table' in _args else None + + # + # @TODO: + # Sometimes when the cluster isn't up and running it takes a while, the user should be alerted of this + # + + _uri = f'''databricks+connector://token:{_token}@{_host}?http_path={_cluster_path}&catalog={_catalog}&schema={self._schema}''' + self._engine = sqlalchemy.create_engine (_uri) + pass + def meta(self,**_args): + table = _args['table'] if 'table' in _args else self._table + if not table : + return [] + else: + if sqlalchemy.__version__.startswith('1.') : + _m = sqlalchemy.MetaData(bind=self._engine) + _m.reflect(only=[table]) + else: + _m = sqlalchemy.MetaData() + _m.reflect(bind=self._engine) + # + # Let's retrieve te information associated with a table + # + return [{'name':_attr.name,'type':_attr.type} for _attr in _m.tables[table].columns] + + def has(self,**_args): + return self.meta(**_args) + def apply(self,_sql): + try: + if _sql.lower().startswith('select') : + return pd.read_sql(_sql,self._engine) + except Exception as e: + pass + +class Reader(Bricks): + """ + This class is designed for reads and will execute reads against a table name or a select SQL statement + """ + def __init__(self,**_args): + super().__init__(**_args) + def read(self,**_args): + limit = None if 'limit' not in _args else str(_args['limit']) + + if 'sql' in _args : + sql = _args['sql'] + elif 'table' in _args : + table = _args['table'] + sql = f'SELECT * FROM {table}' + if limit : + sql = sql + f' LIMIT 
{limit}' + + if 'sql' in _args or 'table' in _args : + return self.apply(sql) + else: + return pd.DataFrame() + pass +class Writer(Bricks): + def __init__(self,**_args): + super().__init__(**_args) + def write(self,_data,**_args): + """ + This data will write data to data-bricks against a given table. If the table is not specified upon initiazation, it can be specified here + _data: data frame to push to databricks + _args: chunks, table, schema + """ + _schema = self._schema if 'schema' not in _args else _args['schema'] + _table = self._table if 'table' not in _args else _args['table'] + _df = _data if type(_data) == pd.DataFrame else _data + if type(_df) == dict : + _df = [_df] + if type(_df) == list : + _df = pd.DataFrame(_df) + _df.to_sql( + name=_table,schema=_schema, + con=self._engine,if_exists='append',index=False); + pass diff --git a/transport/cloud/nextcloud.py b/transport/cloud/nextcloud.py new file mode 100644 index 0000000..ebb44d3 --- /dev/null +++ b/transport/cloud/nextcloud.py @@ -0,0 +1,80 @@ +""" +We are implementing transport to and from nextcloud (just like s3) +""" +import os +import sys +from transport.common import IEncoder +import pandas as pd +from io import StringIO +import json +import nextcloud_client as nextcloud + +class Nextcloud : + def __init__(self,**_args): + pass + self._delimiter = None + self._handler = nextcloud.Client(_args['url']) + _uid = _args['uid'] + _token = _args['token'] + self._uri = _args['folder'] if 'folder' in _args else './' + if self._uri.endswith('/') : + self._uri = self._uri[:-1] + self._file = None if 'file' not in _args else _args['file'] + self._handler.login(_uid,_token) + def close(self): + try: + self._handler.logout() + except Exception as e: + pass + + +class Reader(Nextcloud): + def __init__(self,**_args): + # self._file = [] if 'file' not in _args else _args['file'] + super().__init__(**_args) + pass + def read(self,**_args): + _filename = self._file if 'file' not in _args else _args['file'] + # + 
# @TODO: if _filename is none, an exception should be raised + # + _uri = '/'.join([self._uri,_filename]) + if self._handler.get_file(_uri) : + # + # + _info = self._handler.file_info(_uri) + _content = self._handler.get_file_contents(_uri).decode('utf8') + if _info.get_content_type() == 'text/csv' : + # + # @TODO: enable handling of csv, xls, parquet, pickles + _file = StringIO(_content) + return pd.read_csv(_file) + else: + # + # if it is neither a structured document like csv, we will return the content as is + return _content + return None +class Writer (Nextcloud): + """ + This class will write data to an instance of nextcloud + """ + def __init__(self,**_args) : + super().__init__(**_args) + self + def write(self,_data,**_args): + """ + This function will upload a file to a given destination + :file has the uri of the location of the file + """ + _filename = self._file if 'file' not in _args else _args['file'] + _uri = '/'.join([self._uri,_filename]) + if type(_data) == pd.DataFrame : + f = StringIO() + _data.to_csv(f,index=False) + _content = f.getvalue() + elif type(_data) == dict : + _content = json.dumps(_data,cls=IEncoder) + else: + _content = str(_data) + self._handler.put_file_contents(_uri,_content) + diff --git a/transport/cloud/s3.py b/transport/cloud/s3.py new file mode 100644 index 0000000..4e230e8 --- /dev/null +++ b/transport/cloud/s3.py @@ -0,0 +1,127 @@ +""" +Data Transport - 1.0 +Steve L. Nyemba, The Phi Technology LLC + +This file is a wrapper around s3 bucket provided by AWS for reading and writing content +""" +from datetime import datetime +import boto +from boto.s3.connection import S3Connection, OrdinaryCallingFormat +import numpy as np +import botocore +from smart_open import smart_open +import sys + +import json +from io import StringIO +import json + +class s3 : + """ + @TODO: Implement a search function for a file given a bucket?? 
+ """ + def __init__(self,**args) : + """ + This function will extract a file or set of files from s3 bucket provided + @param access_key + @param secret_key + @param path location of the file + @param filter filename or filtering elements + """ + try: + self.s3 = S3Connection(args['access_key'],args['secret_key'],calling_format=OrdinaryCallingFormat()) + self.bucket = self.s3.get_bucket(args['bucket'].strip(),validate=False) if 'bucket' in args else None + # self.path = args['path'] + self.filter = args['filter'] if 'filter' in args else None + self.filename = args['file'] if 'file' in args else None + self.bucket_name = args['bucket'] if 'bucket' in args else None + + except Exception as e : + self.s3 = None + self.bucket = None + print (e) + def meta(self,**args): + """ + :name name of the bucket + """ + info = self.list(**args) + [item.open() for item in info] + return [{"name":item.name,"size":item.size} for item in info] + def list(self,**args): + """ + This function will list the content of a bucket, the bucket must be provided by the name + :name name of the bucket + """ + return list(self.s3.get_bucket(args['name']).list()) + + + def buckets(self): + # + # This function will return all buckets, not sure why but it should be used cautiously + # based on why the s3 infrastructure is used + # + return [item.name for item in self.s3.get_all_buckets()] + + # def buckets(self): + pass + # """ + # This function is a wrapper around the bucket list of buckets for s3 + # """ + # return self.s3.get_all_buckets() + + +class Reader(s3) : + """ + Because s3 contains buckets and files, reading becomes a tricky proposition : + - list files if file is None + - stream content if file is Not None + @TODO: support read from all buckets, think about it + """ + def __init__(self,**args) : + s3.__init__(self,**args) + def files(self): + r = [] + try: + return [item.name for item in self.bucket if item.size > 0] + except Exception as e: + pass + return r + def 
stream(self,limit=-1): + """ + At this point we should stream a file from a given bucket + """ + key = self.bucket.get_key(self.filename.strip()) + if key is None : + yield None + else: + count = 0 + with smart_open(key) as remote_file: + for line in remote_file: + if count == limit and limit > 0 : + break + yield line + count += 1 + def read(self,**args) : + if self.filename is None : + # + # returning the list of files because no one file was specified. + return self.files() + else: + limit = args['size'] if 'size' in args else -1 + return self.stream(limit) + +class Writer(s3) : + + def __init__(self,**args) : + s3.__init__(self,**args) + def mkdir(self,name): + """ + This function will create a folder in a bucket + :name name of the folder + """ + self.s3.put_object(Bucket=self.bucket_name,key=(name+'/')) + def write(self,content): + file = StringIO(content.decode("utf8")) + self.s3.upload_fileobj(file,self.bucket_name,self.filename) + pass + diff --git a/transport/nosql/__init__.py b/transport/nosql/__init__.py new file mode 100644 index 0000000..465b912 --- /dev/null +++ b/transport/nosql/__init__.py @@ -0,0 +1,10 @@ +""" +Steve L. Nyemba, nyemba@gmail.com +This namespace implements support for cloud databases couchdb,mongodb, cloudant ... +""" +from transport.nosql import couchdb +from transport.nosql import mongodb +# from . import mongodb +# from . import couchdb + +cloudant = couchdb \ No newline at end of file diff --git a/transport/nosql/couchdb.py b/transport/nosql/couchdb.py new file mode 100644 index 0000000..aa503fb --- /dev/null +++ b/transport/nosql/couchdb.py @@ -0,0 +1,213 @@ +""" +Data-Transport +Steve L. Nyemba, The Phi Technology + +This file is a wrapper around couchdb using IBM Cloudant SDK that has an interface to couchdb + +""" +import cloudant +import json +import sys +# from transport.common import Reader, Writer +from datetime import datetime + + +class Couch: + """ + This class is a wrapper for read/write against couchdb. 
The class captures common operations for read/write. + @param url host & port reference default http://localhost:5984 + @param doc user id involved + @param dbname database name (target) + """ + def __init__(self,**args): + url = args['url'] if 'url' in args else 'http://localhost:5984' + self._id = args['doc'] + dbname = args['dbname'] + if 'username' not in args and 'password' not in args : + self.server = cloudant.CouchDB(None,None,url=url) + else: + self.server = cloudant.CouchDB(args['username'],args['password'],url=url) + self.server.connect() + + if dbname in self.server.all_dbs() : + self.dbase = self.server.get(dbname,dbname,True) + # + # @TODO Check if the database exists ... + # + doc = cloudant.document.Document(self.dbase,self._id) #self.dbase.get(self._id) + if not doc.exists(): + doc = self.dbase.create_document({"_id":self._id}) + doc.save() + else: + self.dbase = None + """ + Insuring the preconditions are met for processing + """ + def isready(self): + p = self.server.metadata() != {} + if p == False or not self.dbase: + return False + # + # At this point we are sure that the server is connected + # We are also sure that the database actually exists + # + doc = cloudant.document.Document(self.dbase,self._id) + # q = self.dbase.all_docs(key=self._id)['rows'] + # if not q : + if not doc.exists(): + return False + return True + + def view(self,**args): + """ + The function will execute a view (provivded a user is authenticated) + :id design document _design/xxxx (provide full name with _design prefix) + :view_name name of the view i.e + :key(s) key(s) to be used to filter the content + """ + document = cloudant.design_document.DesignDocument(self.dbase,args['id']) + document.fetch() + params = {'group_level':1,'group':True} + if 'key' in args : + params ['key'] = args['key'] + elif 'keys' in args : + params['keys'] = args['keys'] + return document.get_view(args['view_name'])(**params)['rows'] + + + + +class Reader(Couch): + """ + This function will 
read an attachment from couchdb and return it to calling code. The attachment must have been placed before hand (otherwise oops) + @T: Account for security & access control + """ + def __init__(self,**args): + """ + @param filename filename (attachment) + """ + # + # setting the basic parameters for + Couch.__init__(self,**args) + if 'filename' in args : + self.filename = args['filename'] + else: + self.filename = None + + + def stream(self): + # + # @TODO Need to get this working ... + # + document = cloudant.document.Document(self.dbase,self._id) + # content = self.dbase.fetch_attachment(self._id,self.filename).split('\n') ; + content = self.get_attachment(self.filename) + for row in content: + yield row + + def read(self,**args): + if self.filename is not None: + self.stream() + else: + return self.basic_read() + def basic_read(self): + document = cloudant.document.Document(self.dbase,self._id) + + # document = self.dbase.get(self._id) + if document.exists() : + document.fetch() + document = dict(document) + del document['_rev'] + else: + document = {} + return document + +class Writer(Couch): + """ + This class will write on a couchdb document provided a scope + The scope is the attribute that will be on the couchdb document + """ + def __init__(self,**args): + """ + @param uri host & port reference + @param uid user id involved + @param filename filename (attachment) + @param dbname database name (target) + """ + + super().__init__(self,**args) + def set (self,info): + document = cloudant.document.Document(self.dbase,self._id) + if document.exists() : + keys = list(set(document.keys()) - set(['_id','_rev','_attachments'])) + for id in keys : + document.field_set(document,id,None) + for id in info : + value = info[id] + document.info(document,id,value) + + document.save() + pass + else: + _document = dict({"_id":self._id},**args) + document.create_document(_document) + def write(self,info): + """ + write a given attribute to a document database + @info object 
to be written to the to an attribute. this + """ + + # document = self.dbase.get(self._id) + document = cloudant.document.Document(self.dbase,self._id) #.get(self._id) + if document.exists() is False : + document = self.dbase.create_document({"_id":self._id}) + # label = params['label'] + # row = params['row'] + # if label not in document : + # document[label] = [] + # document[label].append(row) + for key in info : + if key in document and type(document[key]) == list : + document[key] += info[key] + else: + document[key] = info[key] + + document.save() + # self.dbase.bulk_docs([document]) + # self.dbase.save_doc(document) + + def upload(self,**args): + """ + :param name name of the file to be uploaded + :param data content of the file (binary or text) + :param content_type (default) + """ + mimetype = args['content_type'] if 'content_type' in args else 'text/plain' + document = cloudant.document.Document(self.dbase,self.uid) + document.put_attachment(self.dbase,args['filename'],mimetype,args['content']) + document.save() + + def archive(self,params=None): + """ + This function will archive the document onto itself. 
+ """ + # document = self.dbase.all_docs(self._id,include_docs=True) + document = cloudant.document.Document(self.dbase,self.filename) + document.fetch() + content = {} + # _doc = {} + for id in document: + if id not in ['_id','_rev','_attachments'] : + content[id] = document[id] + del document[id] + + content = json.dumps(content) + # document= _doc + now = str(datetime.today()) + + name = '-'.join([document['_id'] , now,'.json']) + self.upload(filename=name,data=content,content_type='application/json') + # self.dbase.bulk_docs([document]) + # self.dbase.put_attachment(document,content,name,'application/json') + # document.put_attachment(self.dbase,name,'application/json',content) + # document.save() diff --git a/transport/nosql/mongodb.py b/transport/nosql/mongodb.py new file mode 100644 index 0000000..2784cd2 --- /dev/null +++ b/transport/nosql/mongodb.py @@ -0,0 +1,242 @@ +""" +Data Transport - 1.0 +Steve L. Nyemba, The Phi Technology LLC + +This file is a wrapper around mongodb for reading/writing content against a mongodb server and executing views (mapreduce) +""" +from pymongo import MongoClient +import bson +from bson.objectid import ObjectId +from bson.binary import Binary +# import nujson as json +from datetime import datetime +import pandas as pd +import numpy as np +import gridfs +import sys +import json +import re +from multiprocessing import Lock, RLock +from transport.common import IEncoder + +class Mongo : + lock = RLock() + """ + Basic mongodb functions are captured here + """ + def __init__(self,**args): + """ + :dbname database name/identifier + :host host and port of the database by default localhost:27017 + :username username for authentication + :password password for current user + """ + self.host = 'localhost' if 'host' not in args else args['host'] + self.mechanism= 'SCRAM-SHA-256' if 'mechanism' not in args else args['mechanism'] + # authSource=(args['authSource'] if 'authSource' in args else self.dbname) + self._lock = False if 'lock' 
not in args else args['lock'] + self.dbname = None + username = password = None + if 'auth_file' in args : + _info = json.loads((open(args['auth_file'])).read()) + + + else: + _info = {} + _args = dict(args,**_info) + _map = {'dbname':'db','database':'db','table':'uid','collection':'uid','col':'uid','doc':'uid'} + for key in _args : + if key in ['username','password'] : + username = _args['username'] if key=='username' else username + password = _args['password'] if key == 'password' else password + continue + value = _args[key] + if key in _map : + key = _map[key] + + self.setattr(key,value) + # + # Let us perform aliasing in order to remain backwards compatible + + self.dbname = self.db if hasattr(self,'db')else self.dbname + self.collection = _args['table'] if 'table' in _args else (_args['doc'] if 'doc' in _args else (_args['collection'] if 'collection' in _args else None)) + if username and password : + self.client = MongoClient(self.host, + username=username, + password=password , + authSource=self.authSource, + authMechanism=self.mechanism) + + else: + self.client = MongoClient(self.host,maxPoolSize=10000) + + self.db = self.client[self.dbname] + + def isready(self): + p = self.dbname in self.client.list_database_names() + q = self.collection in self.client[self.dbname].list_collection_names() + return p and q + def setattr(self,key,value): + _allowed = ['host','port','db','doc','collection','authSource','mechanism'] + if key in _allowed : + setattr(self,key,value) + pass + def close(self): + self.client.close() + def meta(self,**_args): + return [] +class Reader(Mongo): + """ + This class will read from a mongodb data store and return the content of a document (not a collection) + """ + def __init__(self,**args): + Mongo.__init__(self,**args) + def read(self,**args): + + if 'mongo' in args or 'cmd' in args or 'pipeline' in args: + # + # @TODO: + cmd = {} + if 'aggregate' not in cmd and 'aggregate' not in args: + cmd['aggregate'] = self.collection + elif 
'aggregate' in args : + cmd['aggregate'] = args['aggregate'] + if 'pipeline' in args : + cmd['pipeline']= args['pipeline'] + + if 'pipeline' not in args or 'aggregate' not in cmd : + cmd = args['mongo'] if 'mongo' in args else args['cmd'] + if "aggregate" in cmd : + if "allowDiskUse" not in cmd : + cmd["allowDiskUse"] = True + if "cursor" not in cmd : + cmd["cursor"] = {} + r = [] + out = self.db.command(cmd) + #@TODO: consider using a yield (generator) works wonders + while True : + if 'values' in out : + r += out['values'] + if 'cursor' in out : + key = 'firstBatch' if 'firstBatch' in out['cursor'] else 'nextBatch' + else: + key = 'n' + if 'cursor' in out and out['cursor'][key] : + r += list(out['cursor'][key]) + elif key in out and out[key]: + r.append (out[key]) + # yield out['cursor'][key] + if key not in ['firstBatch','nextBatch'] or ('cursor' in out and out['cursor']['id'] == 0) : + break + else: + out = self.db.command({"getMore":out['cursor']['id'],"collection":out['cursor']['ns'].split(".")[-1]}) + + + return pd.DataFrame(r) + else: + + + if 'table' in args or 'collection' in args : + if 'table' in args: + _uid = args['table'] + elif 'collection' in args : + _uid = args['collection'] + else: + _uid = self.collection + else: + _uid = self.collection + collection = self.db[_uid] + _filter = args['filter'] if 'filter' in args else {} + _df = pd.DataFrame(collection.find(_filter)) + columns = _df.columns.tolist()[1:] + return _df[columns] + def view(self,**args): + """ + This function is designed to execute a view (map/reduce) operation + """ + pass +class Writer(Mongo): + """ + This class is designed to write to a mongodb collection within a database + """ + def __init__(self,**args): + Mongo.__init__(self,**args) + def upload(self,**args) : + """ + This function will upload a file to the current database (using GridFS) + :param data binary stream/text to be stored + :param filename filename to be used + :param encoding content_encoding (default utf-8) + + 
""" + if 'encoding' not in args : + args['encoding'] = 'utf-8' + gfs = GridFS(self.db) + gfs.put(**args) + + def archive(self): + """ + This function will archive documents to the + """ + collection = self.db[self.collection] + rows = list(collection.find()) + for row in rows : + if type(row['_id']) == ObjectId : + row['_id'] = str(row['_id']) + stream = Binary(json.dumps(collection,cls=IEncoder).encode()) + collection.delete_many({}) + now = "-".join([str(datetime.now().year()),str(datetime.now().month), str(datetime.now().day)]) + name = ".".join([self.collection,'archive',now])+".json" + description = " ".join([self.collection,'archive',str(len(rows))]) + self.upload(filename=name,data=stream,description=description,content_type='application/json') + # gfs = GridFS(self.db) + # gfs.put(filename=name,description=description,data=stream,encoding='utf-8') + # self.write({{"filename":name,"file":stream,"description":descriptions}}) + + + pass + + def write(self,info,**_args): + """ + This function will write to a given collection i.e add a record to a collection (no updates) + @param info new record in the collection to be added + """ + # document = self.db[self.collection].find() + #collection = self.db[self.collection] + # if type(info) == list : + # self.db[self.collection].insert_many(info) + # else: + try: + if 'table' in _args or 'collection' in _args : + _uid = _args['table'] if 'table' in _args else _args['collection'] + else: + _uid = self.collection if 'doc' not in _args else _args['doc'] + if self._lock : + Mongo.lock.acquire() + if type(info) == list or type(info) == pd.DataFrame : + info if type(info) == list else info.to_dict(orient='records') + info = json.loads(json.dumps(info,cls=IEncoder)) + self.db[_uid].insert_many(info) + else: + self.db[_uid].insert_one(json.loads(json.dumps(info,cls=IEncoder))) + finally: + if self._lock : + Mongo.lock.release() + def set(self,document): + """ + if no identifier is provided the function will delete the entire 
collection and set the new document. + Please use this function with great care (archive the content first before using it... for safety) + """ + + collection = self.db[self.collection] + if collection.count_document() > 0 and '_id' in document: + id = document['_id'] + del document['_id'] + collection.find_one_and_replace({'_id':id},document) + else: + collection.delete_many({}) + self.write(info) + def close(self): + Mongo.close(self) + # collecton.update_one({"_id":self.collection},document,True) + diff --git a/transport/other/__init__.py b/transport/other/__init__.py new file mode 100644 index 0000000..ea26d80 --- /dev/null +++ b/transport/other/__init__.py @@ -0,0 +1 @@ +from . import files, http, rabbitmq, callback, files \ No newline at end of file diff --git a/transport/other/callback.py b/transport/other/callback.py new file mode 100644 index 0000000..29b03fc --- /dev/null +++ b/transport/other/callback.py @@ -0,0 +1,45 @@ +import queue +from threading import Thread, Lock +from transport.common import Reader,Writer +import numpy as np +import pandas as pd + +class Writer : + lock = Lock() + _queue = {'default':queue.Queue()} + def __init__(self,**_args): + self._cache = {} + self._callback = _args['callback'] if 'callback' in _args else None + self._id = _args['id'] if 'id' in _args else 'default' + if self._id not in Writer._queue : + Writer._queue[self._id] = queue.Queue() + thread = Thread(target=self._forward) + thread.start() + def _forward(self): + _q = Writer._queue[self._id] + _data = _q.get() + _q.task_done() + self._callback(_data) + + def has(self,**_args) : + return self._callback is not None + + + def close(self): + """ + This will empty the queue and have it ready for another operation + """ + _q = Writer._queue[self._id] + with _q.mutex: + _q.queue.clear() + _q.all_tasks_done.notify_all() + + def write(self,_data,**_args): + _id = _args['id'] if 'id' in _args else self._id + + _q = Writer._queue[_id] + _q.put(_data) + _q.join() + + + # 
self.callback = print \ No newline at end of file diff --git a/transport/other/console.py b/transport/other/console.py new file mode 100644 index 0000000..16f589a --- /dev/null +++ b/transport/other/console.py @@ -0,0 +1,7 @@ +from . import callback + + +class Writer (callback.Writer): + def __init__(self,**_args): + super().__init__(callback=print) + \ No newline at end of file diff --git a/transport/other/files.py b/transport/other/files.py new file mode 100644 index 0000000..a4e8a08 --- /dev/null +++ b/transport/other/files.py @@ -0,0 +1,68 @@ +""" +This file is a wrapper around pandas built-in functionalities to handle character delimited files +""" +import pandas as pd +import numpy as np +import os +class File : + def __init__(self,**params): + """ + + @param path absolute path of the file to be read + """ + self.path = params['path'] if 'path' in params else None + self.delimiter = params['delimiter'] if 'delimiter' in params else ',' + + def isready(self): + return os.path.exists(self.path) + def meta(self,**_args): + return [] + +class Reader (File): + """ + This class is designed to read data from disk (location on hard drive) + @pre : isready() == True + """ + + def __init__(self,**_args): + super().__init__(**_args) + + def read(self,**args): + _path = self.path if 'path' not in args else args['path'] + _delimiter = self.delimiter if 'delimiter' not in args else args['delimiter'] + return pd.read_csv(_path,delimiter=self.delimiter) + def stream(self,**args): + raise Exception ("streaming needs to be implemented") +class Writer (File): + + """ + This function writes output to disk in a designated location. 
The function will write a text to a text file + - If a delimiter is provided it will use that to generate a xchar-delimited file + - If not then the object will be dumped as is + """ + # THREAD_LOCK = RLock() + def __init__(self,**_args): + super().__init__(**_args) + self._mode = 'w' if 'mode' not in _args else _args['mode'] + + def write(self,info,**_args): + """ + This function writes a record to a designated file + @param label + @param row row to be written + """ + try: + + _delim = self._delimiter if 'delimiter' not in _args else _args['delimiter'] + _path = self._path if 'path' not in _args else _args['path'] + _mode = self._mode if 'mode' not in _args else _args['mode'] + info.to_csv(_path,index=False,sep=_delim) + + pass + except Exception as e: + # + # Not sure what should be done here ... + pass + finally: + # DiskWriter.THREAD_LOCK.release() + pass \ No newline at end of file diff --git a/transport/other/http.py b/transport/other/http.py new file mode 100644 index 0000000..d92e334 --- /dev/null +++ b/transport/other/http.py @@ -0,0 +1,88 @@ +from flask import request, session +from datetime import datetime +import re +# from transport.common import Reader, Writer +import json +import requests +from io import StringIO +import pandas as pd + + +class Reader: + """ + This class is designed to read data from an Http request file handler provided to us by flask + The file will be heald in memory and processed accordingly + NOTE: This is inefficient and can crash a micro-instance (becareful) + """ + + def __init__(self,**_args): + self._url = _args['url'] + self._headers = None if 'headers' not in _args else _args['headers'] + + # def isready(self): + # return self.file_length > 0 + def format(self,_response): + _mimetype= _response.headers['Content-Type'] + if _mimetype == 'text/csv' or 'text/csv': + _content = _response.text + return pd.read_csv(StringIO(_content)) + # + # @TODO: Add support for excel, JSON and other file formats that fit into a data-frame 
+ # + + return _response.text + def read(self,**_args): + if self._headers : + r = requests.get(self._url,headers = self._headers) + else: + r = requests.get(self._url,headers = self._headers) + return self.format(r) + +class Writer: + """ + This class is designed to submit data to an endpoint (url) + """ + def __init__(self,**_args): + """ + @param key required session key + """ + self._url = _args['url'] + self._name = _args['name'] + self._method = 'post' if 'method' not in _args else _args['method'] + + # self.session = params['queue'] + # self.session['sql'] = [] + # self.session['csv'] = [] + # self.tablename = re.sub('..+$','',params['filename']) + # self.session['uid'] = params['uid'] + #self.xchar = params['xchar'] + + + def format_sql(self,row): + values = "','".join([col.replace('"','').replace("'",'') for col in row]) + return "".join(["INSERT INTO :table VALUES('",values,"');\n"]).replace(':table',self.tablename) + def isready(self): + return True + def write(self,_data,**_args): + # + # + _method = self._method if 'method' not in _args else _args['method'] + _method = _method.lower() + _mimetype = 'text/csv' + if type(_data) == dict : + _mimetype = 'application/json' + _content = _data + else: + _content = _data.to_dict(orient='records') + _headers = {'Content-Type':_mimetype} + _pointer = getattr(requests,_method) + + _pointer ({self._name:_content},headers=_headers) + + + # label = params['label'] + # row = params ['row'] + + # if label == 'usable': + # self.session['csv'].append(self.format(row,',')) + # self.session['sql'].append(self.format_sql(row)) diff --git a/transport/other/rabbitmq.py b/transport/other/rabbitmq.py new file mode 100644 index 0000000..f56800d --- /dev/null +++ b/transport/other/rabbitmq.py @@ -0,0 +1,272 @@ +""" +Data Transport - 1.0 +Steve L. 
Nyemba, The Phi Technology LLC + +This file is a wrapper around rabbitmq server for reading and writing content to a queue (exchange) + +""" +import pika +from datetime import datetime +import re +import json +import os +import sys +# if sys.version_info[0] > 2 : +# from transport.common import Reader, Writer +# else: +# from common import Reader, Writer +import json +from multiprocessing import RLock +class MessageQueue: + """ + This class hierarchy is designed to handle interactions with a queue server using pika framework (our tests are based on rabbitmq) + :host + :xid identifier of the exchange + :qid identifier of the queue + """ + def __init__(self,**params): + self.host= 'localhost' if 'host' not in params else params['host'] #-- location of the queue server + self.port= 5672 if 'port' not in params else params['port'] + self.virtual_host = '/' if 'vhost' not in params else params['vhost'] + self.exchange = params['exchange'] if 'exchange' in params else 'amq.direct' #-- exchange + self.queue = params['queue'] if 'queue' in params else 'demo' + self.connection = None + self.channel = None + + self.name = self.__class__.__name__.lower() if 'name' not in params else params['name'] + + username = password = None + if 'username' in params : + username = params['username'] + password = params['password'] + if 'auth_file' in params : + _info = json.loads((open(params['auth_file'])).read()) + username=_info['username'] + password=_info['password'] + self.virtual_host = _info['virtual_host'] if 'virtual_host' in _info else self.virtual_host + self.exchange = _info['exchange'] if 'exchange' in _info else self.exchange + self.queue = _info['queue'] if 'queue' in _info else self.queue + + self.credentials= pika.PlainCredentials('guest','guest') + if 'username' in params : + self.credentials = pika.PlainCredentials( + params['username'], + ('' if 'password' not in params else params['password']) + ) + + def init(self,label=None): + properties = 
pika.ConnectionParameters(host=self.host,port=self.port,virtual_host=self.virtual_host, + client_properties={'connection_name':self.name}, + credentials=self.credentials) + self.connection = pika.BlockingConnection(properties) + self.channel = self.connection.channel() + self.info = self.channel.exchange_declare(exchange=self.exchange,exchange_type='direct',durable=True) + if label is None: + self.qhandler = self.channel.queue_declare(queue=self.queue,durable=True) + else: + self.qhandler = self.channel.queue_declare(queue=label,durable=True) + + self.channel.queue_bind(exchange=self.exchange,queue=self.qhandler.method.queue) + + def isready(self): + #self.init() + resp = self.connection is not None and self.connection.is_open + # self.close() + return resp + def finalize(self): + pass + def close(self): + if self.connection.is_closed == False : + self.channel.close() + self.connection.close() + +class Writer(MessageQueue): + """ + This class is designed to publish content to an AMQP (Rabbitmq) + The class will rely on pika to implement this functionality + + We will publish information to a given queue for a given exchange + """ + def __init__(self,**params): + #self.host= params['host'] + #self.exchange = params['uid'] + #self.queue = params['queue'] + MessageQueue.__init__(self,**params); + self.init() + def write(self,data,_type='text/plain'): + """ + This function writes a stream of data to the a given queue + @param object object to be written (will be converted to JSON) + @TODO: make this less chatty + """ + + stream = json.dumps(data) if isinstance(data,dict) else data + self.channel.basic_publish( + exchange=self.exchange, + routing_key=self.queue, + body=stream, + properties=pika.BasicProperties(content_type=_type,delivery_mode=2) + ); + # self.close() + + def flush(self): + self.init() + _mode = 1 #-- Non persistent + self.channel.queue_delete( queue=self.queue); + self.close() + +class Reader(MessageQueue): + """ + This class will read from a queue 
provided an exchange, queue and host + @TODO: Account for security and virtualhosts + """ + + def __init__(self,**params): + """ + @param host host + @param uid exchange identifier + @param qid queue identifier + """ + + #self.host= params['host'] + #self.exchange = params['uid'] + #self.queue = params['qid'] + MessageQueue.__init__(self,**params); + # self.init() + self.durable = False if 'durable' not in params else params['durable'] + # if 'durable' in params : + # self.durable = True + # else: + # self.durable = False + self.size = -1 + self.data = {} + # def init(self,qid): + + # properties = pika.ConnectionParameters(host=self.host) + # self.connection = pika.BlockingConnection(properties) + # self.channel = self.connection.channel() + # self.channel.exchange_declare(exchange=self.exchange,type='direct',durable=True) + + # self.info = self.channel.queue_declare(queue=qid,durable=True) + + + def callback(self,channel,method,header,stream): + """ + This is the callback function designed to process the data stream from the queue + + """ + + r = [] + # if re.match("^\{|\[",stream) is not None: + if stream.startswith(b'{') or stream.startswith(b'['): + r = json.loads(stream) + else: + + r = stream + + qid = self.qhandler.method.queue + if qid not in self.data : + self.data[qid] = [] + + self.data[qid].append(r) + # + # We stop reading when the all the messages of the queue are staked + # + if self.size == len(self.data[qid]) or len(self.data[qid]) == self.info.method.message_count: + self.close() + + def read(self,**args): + """ + This function will read, the first message from a queue + @TODO: + Implement channel.basic_get in order to retrieve a single message at a time + Have the number of messages retrieved be specified by size (parameter) + """ + r = {} + self.size = -1 if 'size' in args else int(args['size']) + # + # We enabled the reader to be able to read from several queues (sequentially for now) + # The qid parameter will be an array of queues the reader 
will be reading from + # + if isinstance(self.queue,str) : + self.queue = [self.queue] + + for qid in self.queue: + self.init(qid) + # r[qid] = [] + + if self.qhandler.method.message_count > 0: + + self.channel.basic_consume(queue=qid,on_message_callback=self.callback,auto_ack=False); + self.channel.start_consuming() + else: + + pass + #self.close() + # r[qid].append( self.data) + + return self.data +class QueueListener(MessageQueue): + lock = RLock() + """ + This class is designed to have an active listener (worker) against a specified Exchange/Queue + It is initialized as would any other object and will require a callback function to address the objects returned. + """ + def __init__(self,**args): + MessageQueue.__init__(self,**args) + self.listen = self.read + self.apply = args['apply'] if 'apply' in args else print + self.lock = False if 'lock' not in args else args['lock'] + + def finalize(self,channel,ExceptionReason): + pass + + def callback(self,channel,method,header,stream) : + _info= {} + # if re.match("^\{|\[",stream) is not None: + + + if stream.startswith(b"[") or stream.startswith(b"{"): + _info = json.loads(stream) + else: + + _info = stream + # + # At this point we should invoke the apply function with a lock if need be + # @TODO: Establish a vocabulary + + if stream == b'QUIT' : + # channel.exit() + self.close() + if self.lock == True : + QueueListener.lock.acquire() + try: + # + # In case the user has not specified a function to apply the data against, it will simply be printed + # + self.apply(_info) + except Exception as e: + pass + if self.lock == True : + QueueListener.lock.release() + def read(self): + + self.init(self.queue) + + self.channel.basic_consume(self.queue,self.callback,auto_ack=True); + self.channel.start_consuming() + + + +class Factory : + @staticmethod + def instance(**_args): + """ + :param count number of workers + :param apply function workers + """ + _apply = _args['apply'] + _count = _args['count'] + for i in 
np.arange(_count) : + _name = _args['name'] if 'name' in _args else 'worker_'+str(i) + transport.factory.instance(provider="rabbit",context="listener",apply=_apply,auth_file=_args['auth_file']) \ No newline at end of file diff --git a/transport/providers.py b/transport/providers.py deleted file mode 100644 index ddb2fcb..0000000 --- a/transport/providers.py +++ /dev/null @@ -1,105 +0,0 @@ -# from transport.common import Reader, Writer,Console #, factory -from transport import disk -import sqlite3 -from transport import s3 as s3 -from transport import rabbitmq as queue -from transport import couch as couch -from transport import mongo as mongo -from transport import sql as sql -from transport import etl as etl -from transport import qlistener -from transport import bricks -from transport import session -from transport import nextcloud -import psycopg2 as pg -import mysql.connector as my -from google.cloud import bigquery as bq -import nzpy as nz #--- netezza drivers -import os - -from info import __version__ - -POSTGRESQL = 'postgresql' -MONGODB = 'mongodb' -HTTP='http' -BIGQUERY ='bigquery' -FILE = 'file' -ETL = 'etl' -SQLITE = 'sqlite' -SQLITE3= 'sqlite' -REDSHIFT = 'redshift' -NETEZZA = 'netezza' -MYSQL = 'mysql+mysqlconnector' -RABBITMQ = 'rabbitmq' -MARIADB = 'mariadb' -COUCHDB = 'couch' -CONSOLE = 'console' -ETL = 'etl' -TRANSPORT = ETL -NEXTCLOUD = 'nextcloud' - -# -# synonyms of the above -BQ = BIGQUERY -MONGO = MONGODB -FERRETDB= MONGODB -PG = POSTGRESQL -PSQL = POSTGRESQL -PGSQL = POSTGRESQL -S3 = 's3' -AWS_S3 = 's3' -RABBIT = RABBITMQ - -QLISTENER = 'qlistener' -QUEUE = QLISTENER -CALLBACK = QLISTENER -DATABRICKS= 'databricks+connector' -DRIVERS = {PG:pg,REDSHIFT:pg,MYSQL:my,MARIADB:my,NETEZZA:nz,SQLITE:sqlite3} -CATEGORIES ={'sql':[NETEZZA,PG,MYSQL,REDSHIFT,SQLITE,MARIADB],'nosql':[MONGODB,COUCHDB],'cloud':[NEXTCLOUD,S3,BIGQUERY,DATABRICKS],'file':[FILE], - 'queue':[RABBIT,QLISTENER],'memory':[CONSOLE,QUEUE],'http':[HTTP]} - -READ = 
{'sql':sql.SQLReader,'nosql':{MONGODB:mongo.MongoReader,COUCHDB:couch.CouchReader}, - 'cloud':{BIGQUERY:sql.BigQueryReader,DATABRICKS:bricks.BricksReader,NEXTCLOUD:nextcloud.NextcloudReader}, - 'file':disk.DiskReader,'queue':{RABBIT:queue.QueueReader,QLISTENER:qlistener.qListener}, - # 'cli':{CONSOLE:Console},'memory':{CONSOLE:Console},'http':session.HttpReader - } -WRITE = {'sql':sql.SQLWriter,'nosql':{MONGODB:mongo.MongoWriter,COUCHDB:couch.CouchWriter}, - 'cloud':{BIGQUERY:sql.BigQueryWriter,DATABRICKS:bricks.BricksWriter,NEXTCLOUD:nextcloud.NextcloudWriter}, - 'file':disk.DiskWriter,'queue':{RABBIT:queue.QueueWriter,QLISTENER:qlistener.qListener}, - # 'cli':{CONSOLE:Console}, - # 'memory':{CONSOLE:Console}, 'http':session.HttpReader - - } -# SQL_PROVIDERS = [POSTGRESQL,MYSQL,NETEZZA,MARIADB,SQLITE] -PROVIDERS = { - FILE:{'read':disk.DiskReader,'write':disk.DiskWriter}, - SQLITE:{'read':disk.SQLiteReader,'write':disk.SQLiteWriter,'driver':sqlite3}, - 'sqlite3':{'read':disk.SQLiteReader,'write':disk.SQLiteWriter,'driver':sqlite3}, - - POSTGRESQL:{'read':sql.SQLReader,'write':sql.SQLWriter,'driver':pg,'default':{'host':'localhost','port':5432}}, - NETEZZA:{'read':sql.SQLReader,'write':sql.SQLWriter,'driver':nz,'default':{'port':5480}}, - REDSHIFT:{'read':sql.SQLReader,'write':sql.SQLWriter,'driver':pg,'default':{'host':'localhost','port':5432}}, - RABBITMQ:{'read':queue.QueueReader,'writer':queue.QueueWriter,'context':queue.QueueListener,'default':{'host':'localhost','port':5432}}, - - MYSQL:{'read':sql.SQLReader,'write':sql.SQLWriter,'driver':my,'default':{'host':'localhost','port':3306}}, - MARIADB:{'read':sql.SQLReader,'write':sql.SQLWriter,'driver':my,'default':{'host':'localhost','port':3306}}, - - S3:{'read':s3.s3Reader,'write':s3.s3Writer}, - BIGQUERY:{'read':sql.BigQueryReader,'write':sql.BigQueryWriter}, - DATABRICKS:{'read':bricks.BricksReader,'write':bricks.BricksWriter}, - NEXTCLOUD:{'read':nextcloud.NextcloudReader,'write':nextcloud.NextcloudWriter}, 
- - QLISTENER:{'read':qlistener.qListener,'write':qlistener.qListener,'default':{'host':'localhost','port':5672}}, - CONSOLE:{'read':qlistener.Console,"write":qlistener.Console}, - HTTP:{'read':session.HttpReader,'write':session.HttpWriter}, - - MONGODB:{'read':mongo.MongoReader,'write':mongo.MongoWriter,'default':{'port':27017,'host':'localhost'}}, - COUCHDB:{'read':couch.CouchReader,'writer':couch.CouchWriter,'default':{'host':'localhost','port':5984}}, -# ETL :{'read':etl.Transporter,'write':etl.Transporter} - ETL :{'read':etl.instance,'write':etl.instance} -} -DEFAULT = {PG:{'host':'localhost','port':5432},MYSQL:{'host':'localhost','port':3306}} -DEFAULT[MONGODB] = {'port':27017,'host':'localhost'} -DEFAULT[REDSHIFT] = DEFAULT[PG] -DEFAULT[MARIADB] = DEFAULT[MYSQL] -DEFAULT[NETEZZA] = {'port':5480} diff --git a/transport/providers/__init__.py b/transport/providers/__init__.py new file mode 100644 index 0000000..fc0f1e7 --- /dev/null +++ b/transport/providers/__init__.py @@ -0,0 +1,44 @@ +""" +This file is intended to aggregate all we can about the framework in terms of support +""" + +BIGQUERY='bigquery' + +POSTGRESQL = 'postgresql' +MONGODB = 'mongodb' +HTTP='http' +BIGQUERY ='bigquery' +FILE = 'file' +ETL = 'etl' +SQLITE = 'sqlite' +SQLITE3= 'sqlite3' +REDSHIFT = 'redshift' +NETEZZA = 'netezza' +MYSQL = 'mysql' +MARIADB= MYSQL + +COUCHDB = 'couchdb' +CONSOLE = 'console' +ETL = 'etl' +TRANSPORT = ETL +NEXTCLOUD = 'nextcloud' +S3 = 's3' +CALLBACK = 'callback' +CONSOLE = 'console' +RABBITMQ = 'rabbitmq' +DATABRICKS= 'databricks' + +# +# synonyms of the above +BQ = BIGQUERY +MONGO = MONGODB +FERRETDB= MONGODB +PG = POSTGRESQL +PSQL = POSTGRESQL +PGSQL = POSTGRESQL + +AWS_S3 = 's3' +RABBIT = RABBITMQ + +# QLISTENER = 'qlistener' + \ No newline at end of file diff --git a/transport/sql.py b/transport/sql.py deleted file mode 100644 index c5b52d4..0000000 --- a/transport/sql.py +++ /dev/null @@ -1,526 +0,0 @@ -""" -This file is intended to perform read/writes 
against an SQL database such as PostgreSQL, Redshift, Mysql, MsSQL ... - -LICENSE (MIT) -Copyright 2016-2020, The Phi Technology LLC - -Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: -The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - -@TODO: - - Migrate SQLite to SQL hierarchy - - Include Write in Chunks from pandas -""" -import psycopg2 as pg -import mysql.connector as my -import sys - -import sqlalchemy -if sys.version_info[0] > 2 : - from transport.common import Reader, Writer #, factory -else: - from common import Reader,Writer -import json -from google.oauth2 import service_account -from google.cloud import bigquery as bq -# import constants.bq_utils as bq_consts - -from multiprocessing import Lock, RLock -import pandas as pd -import pandas_gbq as pd_gbq -import numpy as np -import nzpy as nz #--- netezza drivers -import sqlite3 -import copy -import os -import time - -class SQLRW : - lock = RLock() - MAX_CHUNK = 2000000 - DRIVERS = {"postgresql":pg,"redshift":pg,"mysql":my,"mariadb":my,"netezza":nz} - REFERENCE = { - "netezza":{"port":5480,"handler":nz,"dtype":"VARCHAR(512)"}, - "postgresql":{"port":5432,"handler":pg,"dtype":"VARCHAR"}, - "redshift":{"port":5432,"handler":pg,"dtype":"VARCHAR"}, - "mysql":{"port":3360,"handler":my,"dtype":"VARCHAR(256)"}, - "mariadb":{"port":3360,"handler":my,"dtype":"VARCHAR(256)"}, - } - def __init__(self,**_args): - - - _info = {} - _info['dbname'] = _args['db'] if 'db' in _args else _args['database'] - self.table = _args['table'] if 'table' in _args else None - self.fields = _args['fields'] if 'fields' in _args else [] - self.schema = _args['schema'] if 'schema' in _args else '' - self._chunks = 1 if 'chunks' not in _args else int(_args['chunks']) - - self._provider = _args['provider'] if 'provider' in _args else None - # _info['host'] = 'localhost' if 'host' not in _args else _args['host'] - # _info['port'] = SQLWriter.REFERENCE[_provider]['port'] if 'port' not in _args else _args['port'] - - _info['host'] = _args['host'] if 'host' in _args else '' - _info['port'] = _args['port'] if 'port' in _args else '' - - # if 'host' in _args : - # _info['host'] = 'localhost' if 'host' not in _args else _args['host'] - # # _info['port'] = 
SQLWriter.PROVIDERS[_args['provider']] if 'port' not in _args else _args['port'] - # _info['port'] = SQLWriter.REFERENCE[_provider]['port'] if 'port' not in _args else _args['port'] - self.lock = False if 'lock' not in _args else _args['lock'] - if 'username' in _args or 'user' in _args: - key = 'username' if 'username' in _args else 'user' - _info['user'] = _args[key] - _info['password'] = _args['password'] if 'password' in _args else '' - if 'auth_file' in _args : - _auth = json.loads( open(_args['auth_file']).read() ) - key = 'username' if 'username' in _auth else 'user' - _info['user'] = _auth[key] - _info['password'] = _auth['password'] if 'password' in _auth else '' - - _info['host'] = _auth['host'] if 'host' in _auth else _info['host'] - _info['port'] = _auth['port'] if 'port' in _auth else _info['port'] - if 'database' in _auth: - _info['dbname'] = _auth['database'] - self.table = _auth['table'] if 'table' in _auth else self.table - # - # We need to load the drivers here to see what we are dealing with ... 
- - - # _handler = SQLWriter.REFERENCE[_provider]['handler'] - _handler = _args['driver'] #-- handler to the driver - self._dtype = _args['default']['type'] if 'default' in _args and 'type' in _args['default'] else 'VARCHAR(256)' - # self._provider = _args['provider'] - # self._dtype = SQLWriter.REFERENCE[_provider]['dtype'] if 'dtype' not in _args else _args['dtype'] - # self._provider = _provider - if _handler == nz : - _info['database'] = _info['dbname'] - _info['securityLevel'] = 0 - del _info['dbname'] - if _handler == my : - _info['database'] = _info['dbname'] - del _info['dbname'] - if _handler == sqlite3 : - _info = {'path':_info['dbname'],'isolation_level':'IMMEDIATE'} - if _handler != sqlite3 : - self.conn = _handler.connect(**_info) - else: - self.conn = _handler.connect(_info['path'],isolation_level='IMMEDIATE') - self._engine = _args['sqlalchemy'] if 'sqlalchemy' in _args else None - def meta(self,**_args): - schema = [] - try: - if self._engine : - table = _args['table'] if 'table' in _args else self.table - if sqlalchemy.__version__.startswith('1.') : - _m = sqlalchemy.MetaData(bind=self._engine) - _m.reflect() - else: - - _m = sqlalchemy.MetaData() - _m.reflect(bind=self._engine) - schema = [{"name":_attr.name,"type":str(_attr.type)} for _attr in _m.tables[table].columns] - # - # Some house keeping work - _m = {'BIGINT':'INTEGER','TEXT':'STRING','DOUBLE_PRECISION':'FLOAT','NUMERIC':'FLOAT','DECIMAL':'FLOAT','REAL':'FLOAT'} - for _item in schema : - if _item['type'] in _m : - _item['type'] = _m[_item['type']] - - except Exception as e: - print (e) - pass - return schema - def _tablename(self,name) : - - return self.schema +'.'+name if self.schema not in [None, ''] and '.' 
not in name else name - def has(self,**_args): - return self.meta(**_args) - # found = False - # try: - - # table = self._tablename(_args['table'])if 'table' in _args else self._tablename(self.table) - # sql = "SELECT * FROM :table LIMIT 1".replace(":table",table) - # if self._engine : - # _conn = self._engine.connect() - # else: - # _conn = self.conn - # found = pd.read_sql(sql,_conn).shape[0] - # found = True - - # except Exception as e: - # print (e) - # pass - # finally: - # if not self._engine : - # _conn.close() - # return found - def isready(self): - _sql = "SELECT * FROM :table LIMIT 1".replace(":table",self.table) - try: - _conn = self.conn if not hasattr(self,'_engine') else self._engine - return pd.read_sql(_sql,_conn).columns.tolist() - except Exception as e: - pass - return False - def apply(self,_sql): - """ - This function applies a command and/or a query against the current relational data-store - :param _sql insert/select statement - @TODO: Store procedure calls - """ - # - _out = None - try: - if _sql.lower().startswith('select') : - - _conn = self._engine if self._engine else self.conn - return pd.read_sql(_sql,_conn) - else: - # Executing a command i.e no expected return values ... 
- cursor = self.conn.cursor() - cursor.execute(_sql) - self.conn.commit() - except Exception as e : - print (e) - finally: - if not self._engine : - self.conn.commit() - # cursor.close() - def close(self): - try: - self.conn.close() - except Exception as error : - print (error) - pass -class SQLReader(SQLRW,Reader) : - def __init__(self,**_args): - super().__init__(**_args) - - def read(self,**_args): - if 'sql' in _args : - _sql = (_args['sql']) - else: - if 'table' in _args : - table = _args['table'] - else: - table = self.table - # table = self.table if self.table is not None else _args['table'] - _sql = "SELECT :fields FROM "+self._tablename(table) - if 'filter' in _args : - _sql = _sql +" WHERE "+_args['filter'] - if 'fields' in _args : - _fields = _args['fields'] - else: - _fields = '*' if not self.fields else ",".join(self.fields) - _sql = _sql.replace(":fields",_fields) - # - # At this point we have a query we can execute gracefully - if 'limit' in _args : - _sql = _sql + " LIMIT "+str(_args['limit']) - # - # @TODO: - # It is here that we should inspect to see if there are any pre/post conditions - # - return self.apply(_sql) - def close(self) : - try: - self.conn.close() - except Exception as error : - print (error) - pass - -class SQLWriter(SQLRW,Writer): - def __init__(self,**_args) : - super().__init__(**_args) - # - # In the advent that data typing is difficult to determine we can inspect and perform a default case - # This slows down the process but improves reliability of the data - # NOTE: Proper data type should be set on the target system if their source is unclear. 
- - self._cast = False if 'cast' not in _args else _args['cast'] - - def init(self,fields=None): - # if not fields : - # try: - # table = self._tablename(self.table) - # self.fields = pd.read_sql_query("SELECT * FROM :table LIMIT 1".replace(":table",table),self.conn).columns.tolist() - # except Exception as e: - # pass - # finally: - # pass - # else: - self.fields = fields; - - def make(self,**_args): - table = self._tablename(self.table) if 'table' not in _args else self._tablename(_args['table']) - if 'fields' in _args : - fields = _args['fields'] - # table = self._tablename(self.table) - sql = " ".join(["CREATE TABLE",table," (", ",".join([ name +' '+ self._dtype for name in fields]),")"]) - - else: - schema = _args['schema'] if 'schema' in _args else [] - - _map = _args['map'] if 'map' in _args else {} - sql = [] # ["CREATE TABLE ",_args['table'],"("] - for _item in schema : - _type = _item['type'] - if _type in _map : - _type = _map[_type] - sql = sql + [" " .join([_item['name'], ' ',_type])] - sql = ",".join(sql) - # table = self._tablename(_args['table']) - sql = ["CREATE TABLE ",table,"( ",sql," )"] - sql = " ".join(sql) - - cursor = self.conn.cursor() - try: - - cursor.execute(sql) - except Exception as e : - print (e) - # print (sql) - pass - finally: - # cursor.close() - self.conn.commit() - pass - def write(self,info,**_args): - """ - :param info writes a list of data to a given set of fields - """ - # inspect = False if 'inspect' not in _args else _args['inspect'] - # cast = False if 'cast' not in _args else _args['cast'] - # if not self.fields : - # if type(info) == list : - # _fields = info[0].keys() - # elif type(info) == dict : - # _fields = info.keys() - # elif type(info) == pd.DataFrame : - # _fields = info.columns.tolist() - - # # _fields = info.keys() if type(info) == dict else info[0].keys() - # # _fields = list (_fields) - # self.init(_fields) - - try: - table = _args['table'] if 'table' in _args else self.table - # - # In SQL, schema can 
stand for namespace or the structure of a table - # In case we have a list, we are likely dealing with table structure - # - if 'schema' in _args : - if type(_args['schema']) == str : - self.schema = _args['schema'] if 'schema' in _args else self.schema - elif type(_args['schema']) == list and len(_args['schema']) > 0 and not self.has(table=table): - # - # There is a messed up case when an empty array is passed (no table should be created) - # - self.make(table=table,schema=_args['schema']) - pass - # self.schema = _args['schema'] if 'schema' in _args else self.schema - table = self._tablename(table) - - _sql = "INSERT INTO :table (:fields) VALUES (:values)".replace(":table",table) #.replace(":table",self.table).replace(":fields",_fields) - - if type(info) == list : - _info = pd.DataFrame(info) - elif type(info) == dict : - _info = pd.DataFrame([info]) - else: - _info = pd.DataFrame(info) - - - if _info.shape[0] == 0 : - - return - if self.lock : - SQLRW.lock.acquire() - # - # we will adjust the chunks here in case we are not always sure of the - if self._chunks == 1 and _info.shape[0] > SQLRW.MAX_CHUNK : - self._chunks = 10 - _indexes = np.array_split(np.arange(_info.shape[0]),self._chunks) - for i in _indexes : - # - # In case we have an invalid chunk ... - if _info.iloc[i].shape[0] == 0 : - continue - # - # We are enabling writing by chunks/batches because some persistent layers have quotas or limitations on volume of data - - if self._engine is not None: - # pd.to_sql(_info,self._engine) - if self.schema in ['',None] : - rows = _info.iloc[i].to_sql(table,self._engine,if_exists='append',index=False) - else: - # - # Writing with schema information ... 
- rows = _info.iloc[i].to_sql(self.table,self._engine,schema=self.schema,if_exists='append',index=False) - time.sleep(1) - else: - _fields = ",".join(self.fields) - _sql = _sql.replace(":fields",_fields) - values = ", ".join("?"*len(self.fields)) if self._provider == 'netezza' else ",".join(["%s" for name in self.fields]) - _sql = _sql.replace(":values",values) - cursor = self.conn.cursor() - cursor.executemany(_sql,_info.iloc[i].values.tolist()) - cursor.close() - # cursor.commit() - - # self.conn.commit() - except Exception as e: - print(e) - pass - finally: - - if self._engine is None : - self.conn.commit() - if self.lock : - SQLRW.lock.release() - # cursor.close() - pass - def close(self): - try: - self.conn.close() - finally: - pass -class BigQuery: - def __init__(self,**_args): - path = _args['service_key'] if 'service_key' in _args else _args['private_key'] - self.credentials = service_account.Credentials.from_service_account_file(path) - self.dataset = _args['dataset'] if 'dataset' in _args else None - self.path = path - self.dtypes = _args['dtypes'] if 'dtypes' in _args else None - self.table = _args['table'] if 'table' in _args else None - self.client = bq.Client.from_service_account_json(self.path) - def meta(self,**_args): - """ - This function returns meta data for a given table or query with dataset/table properly formatted - :param table name of the name WITHOUT including dataset - :param sql sql query to be pulled, - """ - table = _args['table'] if 'table' in _args else self.table - - try: - if table : - _dataset = self.dataset if 'dataset' not in _args else _args['dataset'] - sql = f"""SELECT column_name as name, data_type as type FROM {_dataset}.INFORMATION_SCHEMA.COLUMNS WHERE table_name = '{table}' """ - _info = {'credentials':self.credentials,'dialect':'standard'} - return pd_gbq.read_gbq(sql,**_info).to_dict(orient='records') - # return self.read(sql=sql).to_dict(orient='records') - # ref = self.client.dataset(self.dataset).table(table) - - # 
_schema = self.client.get_table(ref).schema - # return [{"name":_item.name,"type":_item.field_type,"description":( "" if not hasattr(_item,"description") else _item.description )} for _item in _schema] - else : - return [] - except Exception as e: - - return [] - def has(self,**_args): - found = False - try: - _has = self.meta(**_args) - found = _has is not None and len(_has) > 0 - except Exception as e: - pass - return found -class BQReader(BigQuery,Reader) : - def __init__(self,**_args): - - super().__init__(**_args) - def apply(self,sql): - return self.read(sql=sql) - - def read(self,**_args): - SQL = None - table = self.table if 'table' not in _args else _args['table'] - if 'sql' in _args : - SQL = _args['sql'] - elif table: - - table = "".join(["`",table,"`"]) if '.' in table else "".join(["`:dataset.",table,"`"]) - SQL = "SELECT * FROM :table ".replace(":table",table) - if not SQL : - return None - if SQL and 'limit' in _args: - SQL += " LIMIT "+str(_args['limit']) - if (':dataset' in SQL or ':DATASET' in SQL) and self.dataset: - SQL = SQL.replace(':dataset',self.dataset).replace(':DATASET',self.dataset) - _info = {'credentials':self.credentials,'dialect':'standard'} - return pd_gbq.read_gbq(SQL,**_info) if SQL else None - # return self.client.query(SQL).to_dataframe() if SQL else None - - -class BQWriter(BigQuery,Writer): - lock = Lock() - def __init__(self,**_args): - super().__init__(**_args) - - self.parallel = False if 'lock' not in _args else _args['lock'] - self.table = _args['table'] if 'table' in _args else None - self.mode = {'if_exists':'append','chunksize':900000,'destination_table':self.table,'credentials':self.credentials} - self._chunks = 1 if 'chunks' not in _args else int(_args['chunks']) - self._location = 'US' if 'location' not in _args else _args['location'] - def write(self,_info,**_args) : - try: - if self.parallel or 'lock' in _args : - BQWriter.lock.acquire() - _args['table'] = self.table if 'table' not in _args else _args['table'] - 
self._write(_info,**_args) - finally: - if self.parallel: - BQWriter.lock.release() - def submit(self,_sql): - """ - Write the output of a massive query to a given table, biquery will handle this as a job - This function will return the job identifier - """ - _config = bq.QueryJobConfig() - _config.destination = self.client.dataset(self.dataset).table(self.table) - _config.allow_large_results = True - # _config.write_disposition = bq.bq_consts.WRITE_APPEND - _config.dry_run = False - # _config.priority = 'BATCH' - _resp = self.client.query(_sql,location=self._location,job_config=_config) - return _resp.job_id - def status (self,_id): - return self.client.get_job(_id,location=self._location) - def _write(self,_info,**_args) : - _df = None - if type(_info) in [list,pd.DataFrame] : - if type(_info) == list : - _df = pd.DataFrame(_info) - elif type(_info) == pd.DataFrame : - _df = _info - - if '.' not in _args['table'] : - self.mode['destination_table'] = '.'.join([self.dataset,_args['table']]) - else: - - self.mode['destination_table'] = _args['table'].strip() - if 'schema' in _args : - self.mode['table_schema'] = _args['schema'] - # - # Let us insure that the types are somewhat compatible ... 
- # _map = {'INTEGER':np.int64,'DATETIME':'datetime64[ns]','TIMESTAMP':'datetime64[ns]','FLOAT':np.float64,'DOUBLE':np.float64,'STRING':str} - # _mode = copy.deepcopy(self.mode) - _mode = self.mode - # _df.to_gbq(**self.mode) #if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000) - # - # Let us adjust the chunking here - self._chunks = 10 if _df.shape[0] > SQLRW.MAX_CHUNK and self._chunks == 1 else self._chunks - _indexes = np.array_split(np.arange(_df.shape[0]),self._chunks) - for i in _indexes : - _df.iloc[i].to_gbq(**self.mode) - time.sleep(1) - pass -# -# Aliasing the big query classes allowing it to be backward compatible -# -BigQueryReader = BQReader -BigQueryWriter = BQWriter \ No newline at end of file diff --git a/transport/sql/__init__.py b/transport/sql/__init__.py new file mode 100644 index 0000000..557d36d --- /dev/null +++ b/transport/sql/__init__.py @@ -0,0 +1,18 @@ +""" +This namespace/package wrap the sql functionalities for a certain data-stores + - netezza, postgresql, mysql and sqlite + - mariadb, redshift (also included) +""" +from . 
import postgresql, mysql, netezza, sqlite + + +# +# Creating aliases for support of additional data-store providerss +# +mariadb = mysql +redshift = postgresql +sqlite3 = sqlite + + +# from transport import sql + diff --git a/transport/sql/common.py b/transport/sql/common.py new file mode 100644 index 0000000..89dcefb --- /dev/null +++ b/transport/sql/common.py @@ -0,0 +1,125 @@ +""" +This file encapsulates common operations associated with SQL databases via SQLAlchemy + +""" +import sqlalchemy as sqa +import pandas as pd + +class Base: + def __init__(self,**_args): + self._host = _args['host'] if 'host' in _args else 'localhost' + self._port = None + self._database = _args['database'] + self._table = _args['table'] if 'table' in _args else None + self._engine= sqa.create_engine(self._get_uri(**_args),future=True) + def _set_uri(self,**_args) : + """ + :provider provider + :host host and port + :account account user/pwd + """ + _account = _args['account'] if 'account' in _args else None + _host = _args['host'] + _provider = _args['provider'].replace(':','').replace('/','').strip() + def _get_uri(self,**_args): + """ + This function will return the formatted uri for the sqlAlchemy engine + """ + raise Exception ("Function Needs to be implemented ") + def meta (self,**_args): + """ + This function returns the schema (table definition) of a given table + :table optional name of the table (can be fully qualified) + """ + _table = self._table if 'table' not in _args else _args['table'] + _schema = [] + if _table : + if sqa.__version__.startswith('1.') : + _handler = sqa.MetaData(bind=self._engine) + _handler.reflect() + else: + # + # sqlalchemy's version 2.+ + _handler = sqa.MetaData() + _handler.reflect(bind=self._engine) + # + # Let us extract the schema with the native types + _map = {'BIGINT':'INTEGER','TEXT':'STRING','DOUBLE_PRECISION':'FLOAT','NUMERIC':'FLOAT','DECIMAL':'FLOAT','REAL':'FLOAT'} + _schema = 
[{"name":_attr.name,"type":_map.get(str(_attr.type),str(_attr.type))} for _attr in _handler.tables[_table].columns] + return _schema + def has(self,**_args): + return self.meta(**_args) + def apply(self,sql): + """ + Executing sql statement that returns query results (hence the restriction on sql and/or with) + :sql SQL query to be exectued + + @TODO: Execution of stored procedures + """ + return pd.read_sql(sql,self._engine) if sql.lower().startswith('select') or sql.lower().startswith('with') else None + +class SQLBase(Base): + def __init__(self,**_args): + super().__init__(**_args) + def get_provider(self): + raise Exception ("Provider Needs to be set ...") + def get_default_port(self) : + raise Exception ("default port needs to be set") + + def _get_uri(self,**_args): + _host = self._host + _account = '' + if self._port : + _port = self._port + else: + _port = self.get_default_port() + + _host = f'{_host}:{_port}' + + if 'username' in _args : + _account = ''.join([_args['username'],':',_args['password'],'@']) + _database = self._database + _provider = self.get_provider().replace(':','').replace('/','') + # _uri = [f'{_provider}:/',_account,_host,_database] + # _uri = [_item.strip() for _item in _uri if _item.strip()] + # return '/'.join(_uri) + return f'{_provider}://{_host}/{_database}' if _account == '' else f'{_provider}://{_account}{_host}/{_database}' + +class BaseReader(SQLBase): + def __init__(self,**_args): + super().__init__(**_args) + def read(self,**_args): + """ + This function will read a query or table from the specific database + """ + if 'sql' in _args : + sql = _args['sql'] + else: + _table = _args['table'] if 'table' in _args else self._table + sql = f'SELECT * FROM {_table}' + return self.apply(sql) + + +class BaseWriter (SQLBase): + """ + This class implements SQLAlchemy support for Writting to a data-store (RDBMS) + """ + def __init__(self,**_args): + super().__init__(**_args) + def write(self,_data,**_args): + if type(_data) == dict : + 
_df = pd.DataFrame(_data) + elif type(_data) == list : + _df = pd.DataFrame(_data) + else: + _df = _data.copy() + # + # We are assuming we have a data-frame at this point + # + _table = _args['table'] if 'table' in _args else self._table + _mode = {'chunksize':2000000,'if_exists':'append','index':False} + if 'schema' in _args : + _mode['schema'] = _args['schema'] + if 'if_exists' in _args : + _mode['if_exists'] = _args['if_exists'] + _df.to_sql(_table,self._engine,**_args,index=False) \ No newline at end of file diff --git a/transport/sql/mysql.py b/transport/sql/mysql.py new file mode 100644 index 0000000..320eb68 --- /dev/null +++ b/transport/sql/mysql.py @@ -0,0 +1,18 @@ +""" +This file implements support for mysql and maria db (with drivers mysql+mysql) +""" +from transport.sql.common import BaseReader, BaseWriter +# import mysql.connector as my +class MYSQL: + + def get_provider(self): + return "mysql+mysqlconnector" + def get_default_port(self): + return "3306" +class Reader(MYSQL,BaseReader) : + def __init__(self,**_args): + super().__init__(**_args) + +class Writer(MYSQL,BaseWriter) : + def __init__(self,**_args): + super().__init__(**_args) \ No newline at end of file diff --git a/transport/sql/netezza.py b/transport/sql/netezza.py new file mode 100644 index 0000000..6d53164 --- /dev/null +++ b/transport/sql/netezza.py @@ -0,0 +1,15 @@ +import nzpy as nz +from transport.sql.common import BaseReader, BaseWriter + +class Netezza: + def get_provider(self): + return 'netezza+nzpy' + def get_default_port(self): + return '5480' + +class Reader(Netezza,BaseReader) : + def __init__(self,**_args): + super().__init__(**_args) +class Writer(Netezza,BaseWriter): + def __init__(self,**_args): + super().__init__(**_args) \ No newline at end of file diff --git a/transport/sql/postgresql.py b/transport/sql/postgresql.py new file mode 100644 index 0000000..0831291 --- /dev/null +++ b/transport/sql/postgresql.py @@ -0,0 +1,22 @@ + +from transport.sql.common import 
BaseReader , BaseWriter +from psycopg2.extensions import register_adapter, AsIs +import numpy as np + +register_adapter(np.int64, AsIs) + +class PG: + def __init__(self,**_args): + super().__init__(**_args) + def get_provider(self): + return "postgresql" + + def get_default_port(self): + return "5432" +class Reader(PG,BaseReader) : + def __init__(self,**_args): + super().__init__(**_args) +class Writer(PG,BaseWriter): + def __init__(self,**_args): + super().__init__(**_args) + diff --git a/transport/sql/sqlite.py b/transport/sql/sqlite.py new file mode 100644 index 0000000..734ab24 --- /dev/null +++ b/transport/sql/sqlite.py @@ -0,0 +1,25 @@ +import sqlalchemy +import pandas as pd +from transport.sql.common import Base, BaseReader, BaseWriter +class SQLite (BaseReader): + def __init__(self,**_args): + super().__init__(**_args) + if 'path' in _args : + self._database = _args['path'] + if 'database' in _args : + self._database = _args['database'] + def _get_uri(self,**_args): + path = self._database + return f'sqlite:///{path}' # ensure this is the correct path for the sqlite file. 
+ +class Reader(SQLite,BaseReader): + def __init__(self,**_args): + super().__init__(**_args) + # def read(self,**_args): + # sql = _args['sql'] + # return pd.read_sql(sql,self._engine) + + +class Writer (SQLite,BaseWriter): + def __init__(self,**_args): + super().__init__(**_args) \ No newline at end of file From 6feae101b0097ef28183cfa4df0d087ce333f449 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 1 Apr 2024 12:52:06 -0500 Subject: [PATCH 02/23] bug fixes: and support for plugins --- transport/__init__.py | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/transport/__init__.py b/transport/__init__.py index 288f646..387161d 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -17,12 +17,14 @@ Source Code is available under MIT License: https://hiplab.mc.vanderbilt.edu/git/hiplab/data-transport """ import numpy as np + from transport import sql, nosql, cloud, other import pandas as pd import json import os from info import __version__,__author__ - +from transport.iowrapper import IWriter, IReader +from transport.plugins import PluginLoader PROVIDERS = {} def init(): global PROVIDERS @@ -31,7 +33,6 @@ def init(): if _provider_name.startswith('__') : continue PROVIDERS[_provider_name] = {'module':getattr(_module,_provider_name),'type':_module.__name__} -# print ([ {name:getattr(sql,name)} for name in dir(sql) if not name.startswith('__')]) def instance (**_args): """ @@ -55,9 +56,23 @@ def instance (**_args): _context = _args['context'] else: _context = 'read' - _pointer = getattr(_module,'Reader') if _context == 'read' else getattr(_module,'Writer') - return _pointer (**_args) - pass + _pointer = getattr(_module,'Reader') if _context == 'read' else getattr(_module,'Writer') + _agent = _pointer (**_args) + # + loader = None + if 'plugins' in _args : + _params = _args['plugins'] + + if 'path' in _params and 'names' in _params : + loader = PluginLoader(**_params) + elif type(_params) == list: + 
loader = PluginLoader() + for _delegate in _params : + loader.set(_delegate) + + + return IReader(_agent,loader) if _context == 'read' else IWriter(_agent,loader) + else: raise Exception ("Missing or Unknown provider") pass @@ -79,11 +94,3 @@ class factory : pass factory.instance = instance init() -# if __name__ == '__main__' : -# # if not PROVIDERS : -# init() -# print (list(PROVIDERS.keys())) -# pgr = instance(provider='postgresql',database='io',table='foo',write=True) -# print (pgr.read()) -# print () -# print (supported()) \ No newline at end of file From fd899f554985cecdfd5da04e34decd33063f1bcb Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 1 Apr 2024 12:52:42 -0500 Subject: [PATCH 03/23] adding wrapper class/design pattern to support plugins --- transport/iowrapper.py | 47 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 transport/iowrapper.py diff --git a/transport/iowrapper.py b/transport/iowrapper.py new file mode 100644 index 0000000..f113d85 --- /dev/null +++ b/transport/iowrapper.py @@ -0,0 +1,47 @@ +""" +This class is a wrapper around read/write classes of cloud,sql,nosql,other packages +The wrapper allows for application of plugins as pre-post conditions +""" +class IO: + """ + Base wrapper class for read/write + """ + def __init__(self,_agent,loader): + self._agent = _agent + self._loader = loader + def meta (self,**_args): + if hasattr(self._agent,'meta') : + return self._agent.meta(**_args) + return [] + + def close(self): + if hasattr(self._agent,'close') : + self._agent.close() + def apply(self): + """ + applying pre/post conditions given a pipeline expression + """ + for _pointer in self._loader : + _data = _pointer(_data) + def apply(self,_query): + if hasattr(self._agent,'apply') : + return self._agent.apply(_query) + return None +class IReader(IO): + def __init__(self,_agent,pipeline=None): + super().__init__(_agent,pipeline) + def read(self,**_args): + _data = self._agent.read(**_args) + 
if self._loader and self._loader.ratio() > 0 : + _data = self._loader.apply(_data) + # + # output data + return _data +class IWriter(IO): + def __init__(self,_agent,pipeline=None): + super().__init__(_agent,pipeline) + def write(self,_data,**_args): + if self._loader and self._loader.ratio() > 0 : + _data = self._loader.apply(_data) + + self._agent.write(_data,**_args) From b160d0a295ed19deef01d885b1e93c8774b7897e Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 1 Apr 2024 13:27:14 -0500 Subject: [PATCH 04/23] housekeeping work --- transport/couch.py | 234 --------------------------------------- transport/mongo.py | 241 ----------------------------------------- transport/nextcloud.py | 80 -------------- transport/qlistener.py | 47 -------- transport/session.py | 88 --------------- 5 files changed, 690 deletions(-) delete mode 100644 transport/couch.py delete mode 100644 transport/mongo.py delete mode 100644 transport/nextcloud.py delete mode 100644 transport/qlistener.py delete mode 100644 transport/session.py diff --git a/transport/couch.py b/transport/couch.py deleted file mode 100644 index 8e02a4e..0000000 --- a/transport/couch.py +++ /dev/null @@ -1,234 +0,0 @@ -""" -Data-Transport -Steve L. Nyemba, The Phi Technology - -This file is a wrapper around couchdb using IBM Cloudant SDK that has an interface to couchdb - -""" -import cloudant -import json -import sys -if sys.version_info[0] > 2 : - from transport.common import Reader, Writer -else: - from common import Reader, Writer -class Couch: - """ - This class is a wrapper for read/write against couchdb. The class captures common operations for read/write. 
- @param url host & port reference default http://localhost:5984 - @param doc user id involved - @param dbname database name (target) - """ - def __init__(self,**args): - url = args['url'] if 'url' in args else 'http://localhost:5984' - self._id = args['doc'] - dbname = args['dbname'] - if 'username' not in args and 'password' not in args : - self.server = cloudant.CouchDB(None,None,url=url) - else: - self.server = cloudant.CouchDB(args['username'],args['password'],url=url) - self.server.connect() - - if dbname in self.server.all_dbs() : - self.dbase = self.server.get(dbname,dbname,True) - # - # @TODO Check if the database exists ... - # - doc = cloudant.document.Document(self.dbase,self._id) #self.dbase.get(self._id) - if not doc.exists(): - doc = self.dbase.create_document({"_id":self._id}) - doc.save() - else: - self.dbase = None - """ - Insuring the preconditions are met for processing - """ - def isready(self): - p = self.server.metadata() != {} - if p == False or not self.dbase: - return False - # - # At this point we are sure that the server is connected - # We are also sure that the database actually exists - # - doc = cloudant.document.Document(self.dbase,self._id) - # q = self.dbase.all_docs(key=self._id)['rows'] - # if not q : - if not doc.exists(): - return False - return True - - def view(self,**args): - """ - The function will execute a view (provivded a user is authenticated) - :id design document _design/xxxx (provide full name with _design prefix) - :view_name name of the view i.e - :key(s) key(s) to be used to filter the content - """ - document = cloudant.design_document.DesignDocument(self.dbase,args['id']) - document.fetch() - params = {'group_level':1,'group':True} - if 'key' in args : - params ['key'] = args['key'] - elif 'keys' in args : - params['keys'] = args['keys'] - return document.get_view(args['view_name'])(**params)['rows'] - - - - -class CouchReader(Couch,Reader): - """ - This function will read an attachment from couchdb and return 
it to calling code. The attachment must have been placed before hand (otherwise oops) - @T: Account for security & access control - """ - def __init__(self,**args): - """ - @param filename filename (attachment) - """ - # - # setting the basic parameters for - Couch.__init__(self,**args) - if 'filename' in args : - self.filename = args['filename'] - else: - self.filename = None - - # def isready(self): - # # - # # Is the basic information about the database valid - # # - # p = Couchdb.isready(self) - - # if p == False: - # return False - # # - # # The database name is set and correct at this point - # # We insure the document of the given user has the requested attachment. - # # - - # doc = self.dbase.get(self._id) - - # if '_attachments' in doc: - # r = self.filename in doc['_attachments'].keys() - - # else: - # r = False - - # return r - def stream(self): - # - # @TODO Need to get this working ... - # - document = cloudant.document.Document(self.dbase,self._id) - # content = self.dbase.fetch_attachment(self._id,self.filename).split('\n') ; - content = self.get_attachment(self.filename) - for row in content: - yield row - - def read(self,**args): - if self.filename is not None: - self.stream() - else: - return self.basic_read() - def basic_read(self): - document = cloudant.document.Document(self.dbase,self._id) - - # document = self.dbase.get(self._id) - if document.exists() : - document.fetch() - document = dict(document) - del document['_rev'] - else: - document = {} - return document - -class CouchWriter(Couch,Writer): - """ - This class will write on a couchdb document provided a scope - The scope is the attribute that will be on the couchdb document - """ - def __init__(self,**args): - """ - @param uri host & port reference - @param uid user id involved - @param filename filename (attachment) - @param dbname database name (target) - """ - - Couch.__init__(self,**args) - def set (self,info): - document = cloudand.document.Document(self.dbase,self._id) - if 
document.exists() : - keys = list(set(document.keys()) - set(['_id','_rev','_attachments'])) - for id in keys : - document.field_set(document,id,None) - for id in args : - value = args[id] - document.field_set(document,id,value) - - document.save() - pass - else: - _document = dict({"_id":self._id},**args) - document.create_document(_document) - def write(self,info): - """ - write a given attribute to a document database - @info object to be written to the to an attribute. this - """ - - # document = self.dbase.get(self._id) - document = cloudant.document.Document(self.dbase,self._id) #.get(self._id) - if document.exists() is False : - document = self.dbase.create_document({"_id":self._id}) - # label = params['label'] - # row = params['row'] - # if label not in document : - # document[label] = [] - # document[label].append(row) - for key in info : - if key in document and type(document[key]) == list : - document[key] += info[key] - else: - document[key] = info[key] - - document.save() - # self.dbase.bulk_docs([document]) - # self.dbase.save_doc(document) - - def upload(self,**args): - """ - :param name name of the file to be uploaded - :param data content of the file (binary or text) - :param content_type (default) - """ - mimetype = args['content_type'] if 'content_type' in args else 'text/plain' - document = cloudant.document.Document(self.dbase,self.uid) - document.put_attachment(self.dbase,args['filename'],mimetype,args['content']) - document.save() - - def archive(self,params=None): - """ - This function will archive the document onto itself. 
- """ - # document = self.dbase.all_docs(self._id,include_docs=True) - document = cloudant.document.Document(self.dbase,self.filename) - document.fetch() - content = {} - # _doc = {} - for id in document: - if id not in ['_id','_rev','_attachments'] : - content[id] = document[id] - del document[id] - - content = json.dumps(content) - # document= _doc - now = str(datetime.today()) - - name = '-'.join([document['_id'] , now,'.json']) - self.upload(filename=name,data=content,content_type='application/json') - # self.dbase.bulk_docs([document]) - # self.dbase.put_attachment(document,content,name,'application/json') - # document.put_attachment(self.dbase,name,'application/json',content) - # document.save() diff --git a/transport/mongo.py b/transport/mongo.py deleted file mode 100644 index c7b5ed8..0000000 --- a/transport/mongo.py +++ /dev/null @@ -1,241 +0,0 @@ -""" -Data Transport - 1.0 -Steve L. Nyemba, The Phi Technology LLC - -This file is a wrapper around mongodb for reading/writing content against a mongodb server and executing views (mapreduce) -""" -from pymongo import MongoClient -from bson.objectid import ObjectId -from bson.binary import Binary -# import nujson as json -from datetime import datetime -import pandas as pd -import numpy as np -import gridfs -# from transport import Reader,Writer -import sys -if sys.version_info[0] > 2 : - from transport.common import Reader, Writer, IEncoder -else: - from common import Reader, Writer -import json -import re -from multiprocessing import Lock, RLock -class Mongo : - lock = RLock() - """ - Basic mongodb functions are captured here - """ - def __init__(self,**args): - """ - :dbname database name/identifier - :host host and port of the database by default localhost:27017 - :username username for authentication - :password password for current user - """ - - self.mechanism= 'SCRAM-SHA-256' if 'mechanism' not in args else args['mechanism'] - # authSource=(args['authSource'] if 'authSource' in args else self.dbname) - 
self._lock = False if 'lock' not in args else args['lock'] - self.dbname = None - username = password = None - if 'auth_file' in args : - _info = json.loads((open(args['auth_file'])).read()) - - - else: - _info = {} - _args = dict(args,**_info) - _map = {'dbname':'db','database':'db','table':'uid','collection':'uid','col':'uid','doc':'uid'} - for key in _args : - if key in ['username','password'] : - username = _args['username'] if key=='username' else username - password = _args['password'] if key == 'password' else password - continue - value = _args[key] - if key in _map : - key = _map[key] - - self.setattr(key,value) - # - # Let us perform aliasing in order to remain backwards compatible - - self.dbname = self.db if hasattr(self,'db')else self.dbname - self.uid = _args['table'] if 'table' in _args else (_args['doc'] if 'doc' in _args else (_args['collection'] if 'collection' in _args else None)) - if username and password : - self.client = MongoClient(self.host, - username=username, - password=password , - authSource=self.authSource, - authMechanism=self.mechanism) - - else: - self.client = MongoClient(self.host,maxPoolSize=10000) - - self.db = self.client[self.dbname] - - def isready(self): - p = self.dbname in self.client.list_database_names() - q = self.uid in self.client[self.dbname].list_collection_names() - return p and q - def setattr(self,key,value): - _allowed = ['host','port','db','doc','collection','authSource','mechanism'] - if key in _allowed : - setattr(self,key,value) - pass - def close(self): - self.client.close() - def meta(self,**_args): - return [] -class MongoReader(Mongo,Reader): - """ - This class will read from a mongodb data store and return the content of a document (not a collection) - """ - def __init__(self,**args): - Mongo.__init__(self,**args) - def read(self,**args): - - if 'mongo' in args or 'cmd' in args or 'pipeline' in args: - # - # @TODO: - cmd = {} - if 'aggregate' not in cmd and 'aggregate' not in args: - cmd['aggregate'] = 
self.uid - elif 'aggregate' in args : - cmd['aggregate'] = args['aggregate'] - if 'pipeline' in args : - cmd['pipeline']= args['pipeline'] - - if 'pipeline' not in args or 'aggregate' not in cmd : - cmd = args['mongo'] if 'mongo' in args else args['cmd'] - if "aggregate" in cmd : - if "allowDiskUse" not in cmd : - cmd["allowDiskUse"] = True - if "cursor" not in cmd : - cmd["cursor"] = {} - r = [] - out = self.db.command(cmd) - #@TODO: consider using a yield (generator) works wonders - while True : - if 'values' in out : - r += out['values'] - if 'cursor' in out : - key = 'firstBatch' if 'firstBatch' in out['cursor'] else 'nextBatch' - else: - key = 'n' - if 'cursor' in out and out['cursor'][key] : - r += list(out['cursor'][key]) - elif key in out and out[key]: - r.append (out[key]) - # yield out['cursor'][key] - if key not in ['firstBatch','nextBatch'] or ('cursor' in out and out['cursor']['id'] == 0) : - break - else: - out = self.db.command({"getMore":out['cursor']['id'],"collection":out['cursor']['ns'].split(".")[-1]}) - - - return pd.DataFrame(r) - else: - - - if 'table' in args or 'collection' in args : - if 'table' in args: - _uid = args['table'] - elif 'collection' in args : - _uid = args['collection'] - else: - _uid = self.uid - else: - _uid = self.uid - collection = self.db[_uid] - _filter = args['filter'] if 'filter' in args else {} - _df = pd.DataFrame(collection.find(_filter)) - columns = _df.columns.tolist()[1:] - return _df[columns] - def view(self,**args): - """ - This function is designed to execute a view (map/reduce) operation - """ - pass -class MongoWriter(Mongo,Writer): - """ - This class is designed to write to a mongodb collection within a database - """ - def __init__(self,**args): - Mongo.__init__(self,**args) - def upload(self,**args) : - """ - This function will upload a file to the current database (using GridFS) - :param data binary stream/text to be stored - :param filename filename to be used - :param encoding content_encoding 
(default utf-8) - - """ - if 'encoding' not in args : - args['encoding'] = 'utf-8' - gfs = GridFS(self.db) - gfs.put(**args) - - def archive(self): - """ - This function will archive documents to the - """ - collection = self.db[self.uid] - rows = list(collection.find()) - for row in rows : - if type(row['_id']) == ObjectId : - row['_id'] = str(row['_id']) - stream = Binary(json.dumps(collection,cls=IEncoder).encode()) - collection.delete_many({}) - now = "-".join([str(datetime.now().year()),str(datetime.now().month), str(datetime.now().day)]) - name = ".".join([self.uid,'archive',now])+".json" - description = " ".join([self.uid,'archive',str(len(rows))]) - self.upload(filename=name,data=stream,description=description,content_type='application/json') - # gfs = GridFS(self.db) - # gfs.put(filename=name,description=description,data=stream,encoding='utf-8') - # self.write({{"filename":name,"file":stream,"description":descriptions}}) - - - pass - def write(self,info,**_args): - """ - This function will write to a given collection i.e add a record to a collection (no updates) - @param info new record in the collection to be added - """ - # document = self.db[self.uid].find() - #collection = self.db[self.uid] - # if type(info) == list : - # self.db[self.uid].insert_many(info) - # else: - try: - if 'table' in _args or 'collection' in _args : - _uid = _args['table'] if 'table' in _args else _args['collection'] - else: - _uid = self.uid if 'doc' not in _args else _args['doc'] - if self._lock : - Mongo.lock.acquire() - if type(info) == list or type(info) == pd.DataFrame : - self.db[_uid].insert_many(info if type(info) == list else info.to_dict(orient='records')) - else: - self.db[_uid].insert_one(info) - finally: - if self._lock : - Mongo.lock.release() - def set(self,document): - """ - if no identifier is provided the function will delete the entire collection and set the new document. - Please use this function with great care (archive the content first before using it... 
for safety) - """ - - collection = self.db[self.uid] - if collection.count_document() > 0 and '_id' in document: - id = document['_id'] - del document['_id'] - collection.find_one_and_replace({'_id':id},document) - else: - collection.delete_many({}) - self.write(info) - def close(self): - Mongo.close(self) - # collecton.update_one({"_id":self.uid},document,True) - diff --git a/transport/nextcloud.py b/transport/nextcloud.py deleted file mode 100644 index 2eefd51..0000000 --- a/transport/nextcloud.py +++ /dev/null @@ -1,80 +0,0 @@ -""" -We are implementing transport to and from nextcloud (just like s3) -""" -import os -import sys -from transport.common import Reader,Writer, IEncoder -import pandas as pd -from io import StringIO -import json -import nextcloud_client as nextcloud - -class Nextcloud : - def __init__(self,**_args): - pass - self._delimiter = None - self._handler = nextcloud.Client(_args['url']) - _uid = _args['uid'] - _token = _args['token'] - self._uri = _args['folder'] if 'folder' in _args else './' - if self._uri.endswith('/') : - self._uri = self._uri[:-1] - self._file = None if 'file' not in _args else _args['file'] - self._handler.login(_uid,_token) - def close(self): - try: - self._handler.logout() - except Exception as e: - pass - - -class NextcloudReader(Nextcloud,Reader): - def __init__(self,**_args): - # self._file = [] if 'file' not in _args else _args['file'] - super().__init__(**_args) - pass - def read(self,**_args): - _filename = self._file if 'file' not in _args else _args['file'] - # - # @TODO: if _filename is none, an exception should be raised - # - _uri = '/'.join([self._uri,_filename]) - if self._handler.get_file(_uri) : - # - # - _info = self._handler.file_info(_uri) - _content = self._handler.get_file_contents(_uri).decode('utf8') - if _info.get_content_type() == 'text/csv' : - # - # @TODO: enable handling of csv, xls, parquet, pickles - _file = StringIO(_content) - return pd.read_csv(_file) - else: - # - # if it is neither a 
structured document like csv, we will return the content as is - return _content - return None -class NextcloudWriter (Nextcloud,Writer): - """ - This class will write data to an instance of nextcloud - """ - def __init__(self,**_args) : - super().__init__(**_args) - self - def write(self,_data,**_args): - """ - This function will upload a file to a given destination - :file has the uri of the location of the file - """ - _filename = self._file if 'file' not in _args else _args['file'] - _uri = '/'.join([self._uri,_filename]) - if type(_data) == pd.DataFrame : - f = StringIO() - _data.to_csv(f,index=False) - _content = f.getvalue() - elif type(_data) == dict : - _content = json.dumps(_data,cls=IEncoder) - else: - _content = str(_data) - self._handler.put_file_contents(_uri,_content) - diff --git a/transport/qlistener.py b/transport/qlistener.py deleted file mode 100644 index 26f0ba8..0000000 --- a/transport/qlistener.py +++ /dev/null @@ -1,47 +0,0 @@ -import queue -from threading import Thread, Lock -from transport.common import Reader,Writer -import numpy as np -import pandas as pd - -class qListener : - lock = Lock() - _queue = {'default':queue.Queue()} - def __init__(self,**_args): - self._cache = {} - self._callback = _args['callback'] if 'callback' in _args else None - self._id = _args['id'] if 'id' in _args else 'default' - if self._id not in qListener._queue : - qListener._queue[self._id] = queue.Queue() - thread = Thread(target=self._forward) - thread.start() - def _forward(self): - _q = qListener._queue[self._id] - _data = _q.get() - _q.task_done() - self._callback(_data) - - def has(self,**_args) : - return self._callback is not None - - - def close(self): - """ - This will empty the queue and have it ready for another operation - """ - _q = qListener._queue[self._id] - with _q.mutex: - _q.queue.clear() - _q.all_tasks_done.notify_all() - - def write(self,_data,**_args): - _id = _args['id'] if 'id' in _args else self._id - - _q = qListener._queue[_id] - 
_q.put(_data) - _q.join() -class Console (qListener): - def __init__(self,**_args): - super().__init__(callback=print) - - # self.callback = print \ No newline at end of file diff --git a/transport/session.py b/transport/session.py deleted file mode 100644 index d74669a..0000000 --- a/transport/session.py +++ /dev/null @@ -1,88 +0,0 @@ -from flask import request, session -from datetime import datetime -import re -from transport.common import Reader, Writer -import json -import requests -from io import StringIO -import pandas as pd - - -class HttpReader(Reader): - """ - This class is designed to read data from an Http request file handler provided to us by flask - The file will be heald in memory and processed accordingly - NOTE: This is inefficient and can crash a micro-instance (becareful) - """ - - def __init__(self,**_args): - self._url = _args['url'] - self._headers = None if 'headers' not in _args else _args['headers'] - - # def isready(self): - # return self.file_length > 0 - def format(self,_response): - _mimetype= _response.headers['Content-Type'] - if _mimetype == 'text/csv' or 'text/csv': - _content = _response.text - return pd.read_csv(StringIO(_content)) - # - # @TODO: Add support for excel, JSON and other file formats that fit into a data-frame - # - - return _response.text - def read(self,**_args): - if self._headers : - r = requests.get(self._url,headers = self._headers) - else: - r = requests.get(self._url,headers = self._headers) - return self.format(r) - -class HttpWriter(Writer): - """ - This class is designed to submit data to an endpoint (url) - """ - def __init__(self,**_args): - """ - @param key required session key - """ - self._url = _args['url'] - self._name = _args['name'] - self._method = 'post' if 'method' not in _args else _args['method'] - - # self.session = params['queue'] - # self.session['sql'] = [] - # self.session['csv'] = [] - # self.tablename = re.sub('..+$','',params['filename']) - # self.session['uid'] = params['uid'] - 
#self.xchar = params['xchar'] - - - def format_sql(self,row): - values = "','".join([col.replace('"','').replace("'",'') for col in row]) - return "".join(["INSERT INTO :table VALUES('",values,"');\n"]).replace(':table',self.tablename) - def isready(self): - return True - def write(self,_data,**_args): - # - # - _method = self._method if 'method' not in _args else _args['method'] - _method = _method.lower() - _mimetype = 'text/csv' - if type(_data) == dict : - _mimetype = 'application/json' - _content = _data - else: - _content = _data.to_dict(orient='records') - _headers = {'Content-Type':_mimetype} - _pointer = getattr(requests,_method) - - _pointer ({self._name:_content},headers=_headers) - - - # label = params['label'] - # row = params ['row'] - - # if label == 'usable': - # self.session['csv'].append(self.format(row,',')) - # self.session['sql'].append(self.format_sql(row)) From 0cf56f3e8f1c4b1909a1ce2f5f88f086bda7bce5 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 1 Apr 2024 13:28:13 -0500 Subject: [PATCH 05/23] refactoring V2.0 --- transport/bricks.py | 111 ----------------- transport/common.py | 151 ----------------------- transport/disk.py | 269 ---------------------------------------- transport/rabbitmq.py | 279 ------------------------------------------ transport/s3.py | 130 -------------------- 5 files changed, 940 deletions(-) delete mode 100644 transport/bricks.py delete mode 100644 transport/common.py delete mode 100644 transport/disk.py delete mode 100644 transport/rabbitmq.py delete mode 100644 transport/s3.py diff --git a/transport/bricks.py b/transport/bricks.py deleted file mode 100644 index 0aa4383..0000000 --- a/transport/bricks.py +++ /dev/null @@ -1,111 +0,0 @@ -""" -This file implements databricks handling, This functionality will rely on databricks-sql-connector -LICENSE (MIT) -Copyright 2016-2020, The Phi Technology LLC - -Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 
documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: -The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -@TODO: - - Migrate SQLite to SQL hierarchy - - Include Write in Chunks from pandas -""" -import os -import sqlalchemy -from transport.common import Reader,Writer -import pandas as pd - - -class Bricks: - """ - :host - :token - :database - :cluster_path - :table - """ - def __init__(self,**_args): - _host = _args['host'] - _token= _args['token'] - _cluster_path = _args['cluster_path'] - self._schema = _args['schema'] if 'schema' in _args else _args['database'] - _catalog = _args['catalog'] - self._table = _args['table'] if 'table' in _args else None - - # - # @TODO: - # Sometimes when the cluster isn't up and running it takes a while, the user should be alerted of this - # - - _uri = f'''databricks://token:{_token}@{_host}?http_path={_cluster_path}&catalog={_catalog}&schema={self._schema}''' - self._engine = sqlalchemy.create_engine (_uri) - pass - def meta(self,**_args): - table = _args['table'] if 'table' in _args else self._table - if not table : - return [] - else: - if sqlalchemy.__version__.startswith('1.') : - _m = sqlalchemy.MetaData(bind=self._engine) - 
_m.reflect(only=[table]) - else: - _m = sqlalchemy.MetaData() - _m.reflect(bind=self._engine) - # - # Let's retrieve te information associated with a table - # - return [{'name':_attr.name,'type':_attr.type} for _attr in _m.tables[table].columns] - - def has(self,**_args): - return self.meta(**_args) - def apply(self,_sql): - try: - if _sql.lower().startswith('select') : - return pd.read_sql(_sql,self._engine) - except Exception as e: - pass - -class BricksReader(Bricks,Reader): - """ - This class is designed for reads and will execute reads against a table name or a select SQL statement - """ - def __init__(self,**_args): - super().__init__(**_args) - def read(self,**_args): - limit = None if 'limit' not in _args else str(_args['limit']) - - if 'sql' in _args : - sql = _args['sql'] - elif 'table' in _args : - table = _args['table'] - sql = f'SELECT * FROM {table}' - if limit : - sql = sql + f' LIMIT {limit}' - - if 'sql' in _args or 'table' in _args : - return self.apply(sql) - else: - return pd.DataFrame() - pass -class BricksWriter(Bricks,Writer): - def __init__(self,**_args): - super().__init__(**_args) - def write(self,_data,**_args): - """ - This data will write data to data-bricks against a given table. If the table is not specified upon initiazation, it can be specified here - _data: data frame to push to databricks - _args: chunks, table, schema - """ - _schema = self._schema if 'schema' not in _args else _args['schema'] - _table = self._table if 'table' not in _args else _args['table'] - _df = _data if type(_data) == pd.DataFrame else _data - if type(_df) == dict : - _df = [_df] - if type(_df) == list : - _df = pd.DataFrame(_df) - _df.to_sql( - name=_table,schema=_schema, - con=self._engine,if_exists='append',index=False); - pass diff --git a/transport/common.py b/transport/common.py deleted file mode 100644 index 8b9f718..0000000 --- a/transport/common.py +++ /dev/null @@ -1,151 +0,0 @@ -""" -Data Transport - 1.0 -Steve L. 
Nyemba, The Phi Technology LLC - -This module is designed to serve as a wrapper to a set of supported data stores : - - couchdb - - mongodb - - Files (character delimited) - - Queues (Rabbmitmq) - - Session (Flask) - - s3 -The supported operations are read/write and providing meta data to the calling code -Requirements : - pymongo - boto - couldant -@TODO: - Enable read/writing to multiple reads/writes -""" -__author__ = 'The Phi Technology' -import numpy as np -import json -import importlib -from multiprocessing import RLock -import queue -# import couch -# import mongo -from datetime import datetime - -class IO: - def init(self,**args): - """ - This function enables attributes to be changed at runtime. Only the attributes defined in the class can be changed - Adding attributes will require sub-classing otherwise we may have an unpredictable class ... - """ - allowed = list(vars(self).keys()) - for field in args : - if field not in allowed : - continue - value = args[field] - setattr(self,field,value) -class IEncoder (json.JSONEncoder): - def default (self,object): - if type(object) == np.integer : - return int(object) - elif type(object) == np.floating: - return float(object) - elif type(object) == np.ndarray : - return object.tolist() - elif type(object) == datetime : - return object.isoformat() - else: - return super(IEncoder,self).default(object) - -class Reader (IO): - """ - This class is an abstraction of a read functionalities of a data store - """ - def __init__(self): - pass - def meta(self,**_args): - """ - This function is intended to return meta-data associated with what has just been read - @return object of meta data information associated with the content of the store - """ - raise Exception ("meta function needs to be implemented") - def read(self,**args): - """ - This function is intended to read the content of a store provided parameters to be used at the discretion of the subclass - """ - raise Exception ("read function needs to be implemented") 
- - -class Writer(IO): - def __init__(self): - self.cache = {"default":[]} - def log(self,**args): - self.cache[id] = args - def meta (self,id="default",**args): - raise Exception ("meta function needs to be implemented") - def format(self,row,xchar): - if xchar is not None and isinstance(row,list): - return xchar.join(row)+'\n' - elif xchar is None and isinstance(row,dict): - row = json.dumps(row) - return row - def write(self,**args): - """ - This function will write content to a store given parameters to be used at the discretion of the sub-class - """ - raise Exception ("write function needs to be implemented") - - def archive(self): - """ - It is important to be able to archive data so as to insure that growth is controlled - Nothing in nature grows indefinitely neither should data being handled. - """ - raise Exception ("archive function needs to be implemented") - def close(self): - """ - This function will close the persistent storage connection/handler - """ - pass -class ReadWriter(Reader,Writer) : - """ - This class implements the read/write functions aggregated - """ - pass -# class Console(Writer): -# lock = RLock() -# def __init__(self,**_args): -# self.lock = _args['lock'] if 'lock' in _args else False -# self.info = self.write -# self.debug = self.write -# self.log = self.write -# pass -# def write (self,logs=None,**_args): -# if self.lock : -# Console.lock.acquire() -# try: -# _params = _args if logs is None and _args else logs -# if type(_params) == list: -# for row in _params : -# print (row) -# else: -# print (_params) -# except Exception as e : -# print (e) -# finally: -# if self.lock : -# Console.lock.release() - - -""" -@NOTE : Experimental !! 
-""" -class Proxy : - """ - This class will forward a call to a function that is provided by the user code - """ - def __init__(self,**_args): - self.callback = _args['callback'] - def read(self,**_args) : - try: - return self.callback(**_args) - except Exception as e: - return self.callback() - - pass - def write(self,data,**_args): - self.callback(data,**_args) diff --git a/transport/disk.py b/transport/disk.py deleted file mode 100644 index 5e43b69..0000000 --- a/transport/disk.py +++ /dev/null @@ -1,269 +0,0 @@ -import os -import sys - - -if sys.version_info[0] > 2 : - from transport.common import Reader, Writer #, factory -else: - from common import Reader,Writer -# import nujson as json -import json -# from threading import Lock -import sqlite3 -import pandas as pd -from multiprocessing import Lock -from transport.common import Reader, Writer, IEncoder -import sqlalchemy -from sqlalchemy import create_engine -class DiskReader(Reader) : - """ - This class is designed to read data from disk (location on hard drive) - @pre : isready() == True - """ - - def __init__(self,**params): - """ - - @param path absolute path of the file to be read - """ - - Reader.__init__(self) - self.path = params['path'] if 'path' in params else None - self.delimiter = params['delimiter'] if 'delimiter' in params else ',' - - def isready(self): - return os.path.exists(self.path) - def meta(self,**_args): - return [] - def read(self,**args): - _path = self.path if 'path' not in args else args['path'] - _delimiter = self.delimiter if 'delimiter' not in args else args['delimiter'] - return pd.read_csv(_path,delimiter=self.delimiter) - def stream(self,**args): - """ - This function reads the rows from a designated location on disk - @param size number of rows to be read, -1 suggests all rows - """ - - size = -1 if 'size' not in args else int(args['size']) - f = open(self.path,'rU') - i = 1 - for row in f: - - i += 1 - if size == i: - break - if self.delimiter : - yield 
row.split(self.delimiter) - yield row - f.close() -class DiskWriter(Writer): - - """ - This function writes output to disk in a designated location. The function will write a text to a text file - - If a delimiter is provided it will use that to generate a xchar-delimited file - - If not then the object will be dumped as is - """ - THREAD_LOCK = Lock() - def __init__(self,**params): - super().__init__() - self._path = params['path'] - self._delimiter = params['delimiter'] if 'delimiter' in params else None - self._mode = 'w' if 'mode' not in params else params['mode'] - # def meta(self): - # return self.cache['meta'] - # def isready(self): - # """ - # This function determines if the class is ready for execution or not - # i.e it determines if the preconditions of met prior execution - # """ - # return True - # # p = self.path is not None and os.path.exists(self.path) - # # q = self.name is not None - # # return p and q - # def format (self,row): - # self.cache['meta']['cols'] += len(row) if isinstance(row,list) else len(row.keys()) - # self.cache['meta']['rows'] += 1 - # return (self.delimiter.join(row) if self.delimiter else json.dumps(row))+"\n" - def write(self,info,**_args): - """ - This function writes a record to a designated file - @param label - @param row row to be written - """ - try: - - - DiskWriter.THREAD_LOCK.acquire() - - _delim = self._delimiter if 'delimiter' not in _args else _args['delimiter'] - _path = self._path if 'path' not in _args else _args['path'] - _mode = self._mode if 'mode' not in _args else _args['mode'] - info.to_csv(_path,index=False,sep=_delim) - pass - except Exception as e: - # - # Not sure what should be done here ... 
- pass - finally: - DiskWriter.THREAD_LOCK.release() -class SQLite : - def __init__(self,**_args) : - self.path = _args['database'] if 'database' in _args else _args['path'] - self.conn = sqlite3.connect(self.path,isolation_level="IMMEDIATE") - self.conn.row_factory = sqlite3.Row - self.fields = _args['fields'] if 'fields' in _args else [] - def has (self,**_args): - found = False - try: - if 'table' in _args : - table = _args['table'] - sql = "SELECT * FROM :table limit 1".replace(":table",table) - _df = pd.read_sql(sql,self.conn) - found = _df.columns.size > 0 - except Exception as e: - pass - return found - def close(self): - try: - self.conn.close() - except Exception as e : - print(e) - def apply(self,sql): - try: - if not sql.lower().startswith('select'): - cursor = self.conn.cursor() - cursor.execute(sql) - cursor.close() - self.conn.commit() - else: - return pd.read_sql(sql,self.conn) - except Exception as e: - print (e) -class SQLiteReader (SQLite,DiskReader): - def __init__(self,**args): - super().__init__(**args) - # DiskReader.__init__(self,**args) - # self.path = args['database'] if 'database' in args else args['path'] - # self.conn = sqlite3.connect(self.path,isolation_level=None) - # self.conn.row_factory = sqlite3.Row - self.table = args['table'] if 'table' in args else None - def read(self,**args): - if 'sql' in args : - sql = args['sql'] - elif 'filter' in args : - sql = "SELECT :fields FROM ",self.table, "WHERE (:filter)".replace(":filter",args['filter']) - sql = sql.replace(":fields",args['fields']) if 'fields' in args else sql.replace(":fields","*") - else: - sql = ' '.join(['SELECT * FROM ',self.table]) - if 'limit' in args : - sql = sql + " LIMIT "+args['limit'] - return pd.read_sql(sql,self.conn) - def close(self): - try: - self.conn.close() - except Exception as e : - pass - -class SQLiteWriter(SQLite,DiskWriter) : - connection = None - LOCK = Lock() - def __init__(self,**args): - """ - :path - :fields json|csv - """ - # 
DiskWriter.__init__(self,**args) - super().__init__(**args) - self.table = args['table'] if 'table' in args else None - path = self.path - self._engine = create_engine(f'sqlite:///{path}') - - # self.conn = sqlite3.connect(self.path,isolation_level="IMMEDIATE") - # self.conn.row_factory = sqlite3.Row - # self.fields = args['fields'] if 'fields' in args else [] - - if self.fields and not self.isready() and self.table: - self.init(self.fields) - SQLiteWriter.connection = self.conn - def init(self,fields): - self.fields = fields; - sql = " ".join(["CREATE TABLE IF NOT EXISTS ",self.table," (", ",".join(self.fields),")"]) - - cursor = self.conn.cursor() - cursor.execute(sql) - cursor.close() - self.conn.commit() - def isready(self): - try: - sql = "SELECT count(*) FROM sqlite_master where name=':table'" - sql = sql.replace(":table",self.table) - cursor = self.conn.cursor() - - r = cursor.execute(sql) - r = r.fetchall() - cursor.close() - - return r[0][0] != 0 - except Exception as e: - pass - return 0 - # - # If the table doesn't exist we should create it - # - # def write(self,_data,**_args): - # SQLiteWriter.LOCK.acquire() - # try: - # if type(_data) == dict : - # _data = [_data] - # _table = self.table if 'table' not in _args else _args['table'] - # _df = pd.DataFrame(_data) - # _df.to_sql(_table,self._engine.connect(),if_exists='append',index=False) - # except Exception as e: - # print (e) - # SQLiteWriter.LOCK.release() - def write(self,info,**_args): - """ - """ - - #if not self.fields : - # #if type(info) == pd.DataFrame : - # # _columns = list(info.columns) - # #self.init(list(info.keys())) - - if type(info) == dict : - info = [info] - elif type(info) == pd.DataFrame : - info = info.fillna('') - info = info.to_dict(orient='records') - - if not self.fields : - _rec = info[0] - self.init(list(_rec.keys())) - - SQLiteWriter.LOCK.acquire() - try: - - cursor = self.conn.cursor() - sql = " " .join(["INSERT INTO ",self.table,"(", ",".join(self.fields) ,")", 
"values(:values)"]) - for row in info : - values = [ str(row[field]) if type(row[field]) not in [list,dict] else json.dumps(row[field],cls=IEncoder) for field in self.fields] - values = ["".join(["'",value,"'"]) for value in values] - - # stream =["".join(["",value,""]) if type(value) == str else value for value in row.values()] - # stream = json.dumps(stream,cls=IEncoder) - # stream = stream.replace("[","").replace("]","") - - # print (sql.replace(":values",stream)) - # self.conn.execute(sql.replace(":values",stream) ) - self.conn.execute(sql.replace(":values", ",".join(values)) ) - # cursor.commit() - - self.conn.commit() - # print (sql) - except Exception as e : - print () - - print (e) - pass - SQLiteWriter.LOCK.release() diff --git a/transport/rabbitmq.py b/transport/rabbitmq.py deleted file mode 100644 index a56393b..0000000 --- a/transport/rabbitmq.py +++ /dev/null @@ -1,279 +0,0 @@ -""" -Data Transport - 1.0 -Steve L. Nyemba, The Phi Technology LLC - -This file is a wrapper around rabbitmq server for reading and writing content to a queue (exchange) - -""" -import pika -from datetime import datetime -import re -import json -import os -import sys -if sys.version_info[0] > 2 : - from transport.common import Reader, Writer -else: - from common import Reader, Writer -import json -from multiprocessing import RLock -class MessageQueue: - """ - This class hierarchy is designed to handle interactions with a queue server using pika framework (our tests are based on rabbitmq) - :host - :xid identifier of the exchange - :qid identifier of the queue - """ - def __init__(self,**params): - self.host= 'localhost' if 'host' not in params else params['host'] #-- location of the queue server - self.port= 5672 if 'port' not in params else params['port'] - self.virtual_host = '/' if 'vhost' not in params else params['vhost'] - self.exchange = params['exchange'] if 'exchange' in params else 'amq.direct' #-- exchange - self.queue = params['queue'] if 'queue' in params else 
'demo' - self.connection = None - self.channel = None - - self.name = self.__class__.__name__.lower() if 'name' not in params else params['name'] - - username = password = None - if 'username' in params : - username = params['username'] - password = params['password'] - if 'auth_file' in params : - _info = json.loads((open(params['auth_file'])).read()) - username=_info['username'] - password=_info['password'] - self.virtual_host = _info['virtual_host'] if 'virtual_host' in _info else self.virtual_host - self.exchange = _info['exchange'] if 'exchange' in _info else self.exchange - self.queue = _info['queue'] if 'queue' in _info else self.queue - - self.credentials= pika.PlainCredentials('guest','guest') - if 'username' in params : - self.credentials = pika.PlainCredentials( - params['username'], - ('' if 'password' not in params else params['password']) - ) - - def init(self,label=None): - properties = pika.ConnectionParameters(host=self.host,port=self.port,virtual_host=self.virtual_host, - client_properties={'connection_name':self.name}, - credentials=self.credentials) - self.connection = pika.BlockingConnection(properties) - self.channel = self.connection.channel() - self.info = self.channel.exchange_declare(exchange=self.exchange,exchange_type='direct',durable=True) - if label is None: - self.qhandler = self.channel.queue_declare(queue=self.queue,durable=True) - else: - self.qhandler = self.channel.queue_declare(queue=label,durable=True) - - self.channel.queue_bind(exchange=self.exchange,queue=self.qhandler.method.queue) - - def isready(self): - #self.init() - resp = self.connection is not None and self.connection.is_open - # self.close() - return resp - def finalize(self): - pass - def close(self): - if self.connection.is_closed == False : - self.channel.close() - self.connection.close() - -class QueueWriter(MessageQueue,Writer): - """ - This class is designed to publish content to an AMQP (Rabbitmq) - The class will rely on pika to implement this functionality 
- - We will publish information to a given queue for a given exchange - """ - def __init__(self,**params): - #self.host= params['host'] - #self.exchange = params['uid'] - #self.queue = params['queue'] - MessageQueue.__init__(self,**params); - self.init() - - - - - - - - def write(self,data,_type='text/plain'): - """ - This function writes a stream of data to the a given queue - @param object object to be written (will be converted to JSON) - @TODO: make this less chatty - """ - - stream = json.dumps(data) if isinstance(data,dict) else data - self.channel.basic_publish( - exchange=self.exchange, - routing_key=self.queue, - body=stream, - properties=pika.BasicProperties(content_type=_type,delivery_mode=2) - ); - # self.close() - - def flush(self): - self.init() - _mode = 1 #-- Non persistent - self.channel.queue_delete( queue=self.queue); - self.close() - -class QueueReader(MessageQueue,Reader): - """ - This class will read from a queue provided an exchange, queue and host - @TODO: Account for security and virtualhosts - """ - - def __init__(self,**params): - """ - @param host host - @param uid exchange identifier - @param qid queue identifier - """ - - #self.host= params['host'] - #self.exchange = params['uid'] - #self.queue = params['qid'] - MessageQueue.__init__(self,**params); - # self.init() - self.durable = False if 'durable' not in params else params['durable'] - # if 'durable' in params : - # self.durable = True - # else: - # self.durable = False - self.size = -1 - self.data = {} - # def init(self,qid): - - # properties = pika.ConnectionParameters(host=self.host) - # self.connection = pika.BlockingConnection(properties) - # self.channel = self.connection.channel() - # self.channel.exchange_declare(exchange=self.exchange,type='direct',durable=True) - - # self.info = self.channel.queue_declare(queue=qid,durable=True) - - - def callback(self,channel,method,header,stream): - """ - This is the callback function designed to process the data stream from the queue - 
- """ - - r = [] - # if re.match("^\{|\[",stream) is not None: - if stream.startswith(b'{') or stream.startswith(b'['): - r = json.loads(stream) - else: - - r = stream - - qid = self.qhandler.method.queue - if qid not in self.data : - self.data[qid] = [] - - self.data[qid].append(r) - # - # We stop reading when the all the messages of the queue are staked - # - if self.size == len(self.data[qid]) or len(self.data[qid]) == self.info.method.message_count: - self.close() - - def read(self,**args): - """ - This function will read, the first message from a queue - @TODO: - Implement channel.basic_get in order to retrieve a single message at a time - Have the number of messages retrieved be specified by size (parameter) - """ - r = {} - self.size = -1 if 'size' in args else int(args['size']) - # - # We enabled the reader to be able to read from several queues (sequentially for now) - # The qid parameter will be an array of queues the reader will be reading from - # - if isinstance(self.queue,str) : - self.queue = [self.queue] - - for qid in self.queue: - self.init(qid) - # r[qid] = [] - - if self.qhandler.method.message_count > 0: - - self.channel.basic_consume(queue=qid,on_message_callback=self.callback,auto_ack=False); - self.channel.start_consuming() - else: - - pass - #self.close() - # r[qid].append( self.data) - - return self.data -class QueueListener(MessageQueue): - lock = RLock() - """ - This class is designed to have an active listener (worker) against a specified Exchange/Queue - It is initialized as would any other object and will require a callback function to address the objects returned. 
- """ - def __init__(self,**args): - MessageQueue.__init__(self,**args) - self.listen = self.read - self.apply = args['apply'] if 'apply' in args else print - self.lock = False if 'lock' not in args else args['lock'] - - def finalize(self,channel,ExceptionReason): - pass - - def callback(self,channel,method,header,stream) : - _info= {} - # if re.match("^\{|\[",stream) is not None: - - - if stream.startswith(b"[") or stream.startswith(b"{"): - _info = json.loads(stream) - else: - - _info = stream - # - # At this point we should invoke the apply function with a lock if need be - # @TODO: Establish a vocabulary - - if stream == b'QUIT' : - # channel.exit() - self.close() - if self.lock == True : - QueueListener.lock.acquire() - try: - # - # In case the user has not specified a function to apply the data against, it will simply be printed - # - self.apply(_info) - except Exception as e: - pass - if self.lock == True : - QueueListener.lock.release() - def read(self): - - self.init(self.queue) - - self.channel.basic_consume(self.queue,self.callback,auto_ack=True); - self.channel.start_consuming() - - - -class Factory : - @staticmethod - def instance(**_args): - """ - :param count number of workers - :param apply function workers - """ - _apply = _args['apply'] - _count = _args['count'] - for i in np.arange(_count) : - _name = _args['name'] if 'name' in _args else 'worker_'+str(i) - transport.factory.instance(provider="rabbit",context="listener",apply=_apply,auth_file=_args['auth_file']) \ No newline at end of file diff --git a/transport/s3.py b/transport/s3.py deleted file mode 100644 index 339cb5c..0000000 --- a/transport/s3.py +++ /dev/null @@ -1,130 +0,0 @@ -""" -Data Transport - 1.0 -Steve L. 
Nyemba, The Phi Technology LLC - -This file is a wrapper around s3 bucket provided by AWS for reading and writing content -""" -from datetime import datetime -import boto -from boto.s3.connection import S3Connection, OrdinaryCallingFormat -import numpy as np -import botocore -from smart_open import smart_open -import sys -if sys.version_info[0] > 2 : - from transport.common import Reader, Writer -else: - from common import Reader, Writer -import json -from io import StringIO -import json - -class s3 : - """ - @TODO: Implement a search function for a file given a bucket?? - """ - def __init__(self,**args) : - """ - This function will extract a file or set of files from s3 bucket provided - @param access_key - @param secret_key - @param path location of the file - @param filter filename or filtering elements - """ - try: - self.s3 = S3Connection(args['access_key'],args['secret_key'],calling_format=OrdinaryCallingFormat()) - self.bucket = self.s3.get_bucket(args['bucket'].strip(),validate=False) if 'bucket' in args else None - # self.path = args['path'] - self.filter = args['filter'] if 'filter' in args else None - self.filename = args['file'] if 'file' in args else None - self.bucket_name = args['bucket'] if 'bucket' in args else None - - except Exception as e : - self.s3 = None - self.bucket = None - print (e) - def meta(self,**args): - """ - :name name of the bucket - """ - info = self.list(**args) - [item.open() for item in info] - return [{"name":item.name,"size":item.size} for item in info] - def list(self,**args): - """ - This function will list the content of a bucket, the bucket must be provided by the name - :name name of the bucket - """ - return list(self.s3.get_bucket(args['name']).list()) - - - def buckets(self): - # - # This function will return all buckets, not sure why but it should be used cautiously - # based on why the s3 infrastructure is used - # - return [item.name for item in self.s3.get_all_buckets()] - - # def buckets(self): - pass - # """ - 
# This function is a wrapper around the bucket list of buckets for s3 - # """ - # return self.s3.get_all_buckets() - - -class s3Reader(s3,Reader) : - """ - Because s3 contains buckets and files, reading becomes a tricky proposition : - - list files if file is None - - stream content if file is Not None - @TODO: support read from all buckets, think about it - """ - def __init__(self,**args) : - s3.__init__(self,**args) - def files(self): - r = [] - try: - return [item.name for item in self.bucket if item.size > 0] - except Exception as e: - pass - return r - def stream(self,limit=-1): - """ - At this point we should stream a file from a given bucket - """ - key = self.bucket.get_key(self.filename.strip()) - if key is None : - yield None - else: - count = 0 - with smart_open(key) as remote_file: - for line in remote_file: - if count == limit and limit > 0 : - break - yield line - count += 1 - def read(self,**args) : - if self.filename is None : - # - # returning the list of files because no one file was specified. 
- return self.files() - else: - limit = args['size'] if 'size' in args else -1 - return self.stream(limit) - -class s3Writer(s3,Writer) : - - def __init__(self,**args) : - s3.__init__(self,**args) - def mkdir(self,name): - """ - This function will create a folder in a bucket - :name name of the folder - """ - self.s3.put_object(Bucket=self.bucket_name,key=(name+'/')) - def write(self,content): - file = StringIO(content.decode("utf8")) - self.s3.upload_fileobj(file,self.bucket_name,self.filename) - pass - From 6f7d912e20a4f134287f4db1560c4efe49afff57 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 1 Apr 2024 14:06:34 -0500 Subject: [PATCH 06/23] bug fix/refactoring commong IEncoder --- transport/common.py | 17 +++++++++++++++++ transport/nosql/mongodb.py | 16 +++++++++++++++- transport/other/callback.py | 2 +- 3 files changed, 33 insertions(+), 2 deletions(-) create mode 100644 transport/common.py diff --git a/transport/common.py b/transport/common.py new file mode 100644 index 0000000..e17c615 --- /dev/null +++ b/transport/common.py @@ -0,0 +1,17 @@ +import json + + +class IEncoder (json.JSONEncoder): + def default (self,object): + if type(object) == np.integer : + return int(object) + elif type(object) == np.floating: + return float(object) + elif type(object) == np.ndarray : + return object.tolist() + elif type(object) == datetime : + return object.isoformat() + else: + return super(IEncoder,self).default(object) + + diff --git a/transport/nosql/mongodb.py b/transport/nosql/mongodb.py index 2784cd2..00d20ba 100644 --- a/transport/nosql/mongodb.py +++ b/transport/nosql/mongodb.py @@ -17,7 +17,21 @@ import sys import json import re from multiprocessing import Lock, RLock -from transport.common import IEncoder +# from transport.common import IEncoder + +class IEncoder (json.JSONEncoder): + def default (self,object): + if type(object) == np.integer : + return int(object) + elif type(object) == np.floating: + return float(object) + elif type(object) == 
np.ndarray : + return object.tolist() + elif type(object) == datetime : + return object.isoformat() + else: + return super(IEncoder,self).default(object) + class Mongo : lock = RLock() diff --git a/transport/other/callback.py b/transport/other/callback.py index 29b03fc..c56c175 100644 --- a/transport/other/callback.py +++ b/transport/other/callback.py @@ -1,6 +1,6 @@ import queue from threading import Thread, Lock -from transport.common import Reader,Writer +# from transport.common import Reader,Writer import numpy as np import pandas as pd From eed612b3969e93f36fef242bfd23be1ce55ade4c Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 1 Apr 2024 14:09:45 -0500 Subject: [PATCH 07/23] bug fix: import --- transport/nosql/mongodb.py | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/transport/nosql/mongodb.py b/transport/nosql/mongodb.py index 00d20ba..2784cd2 100644 --- a/transport/nosql/mongodb.py +++ b/transport/nosql/mongodb.py @@ -17,21 +17,7 @@ import sys import json import re from multiprocessing import Lock, RLock -# from transport.common import IEncoder - -class IEncoder (json.JSONEncoder): - def default (self,object): - if type(object) == np.integer : - return int(object) - elif type(object) == np.floating: - return float(object) - elif type(object) == np.ndarray : - return object.tolist() - elif type(object) == datetime : - return object.isoformat() - else: - return super(IEncoder,self).default(object) - +from transport.common import IEncoder class Mongo : lock = RLock() From 383f887db68faa78b3a6b004561a2fee19a58ae8 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 1 Apr 2024 14:30:00 -0500 Subject: [PATCH 08/23] V2.0 plugin support --- transport/plugins/__init__.py | 128 ++++++++++++++++++++++++++++++++++ 1 file changed, 128 insertions(+) create mode 100644 transport/plugins/__init__.py diff --git a/transport/plugins/__init__.py b/transport/plugins/__init__.py new file mode 100644 index 0000000..6117664 --- 
/dev/null +++ b/transport/plugins/__init__.py @@ -0,0 +1,128 @@ +""" +The functions within are designed to load external files and apply functions against the data +The plugins are applied as + - post-processing if we are reading data + - and pre-processing if we are writing data + +The plugin will use a decorator to identify meaningful functions +@TODO: This should work in tandem with loggin (otherwise we don't have visibility into what is going on) +""" +import importlib as IL +import importlib.util +import sys +import os + +class plugin : + """ + Implementing function decorator for data-transport plugins (post-pre)-processing + """ + def __init__(self,**_args): + """ + :name name of the plugin + :mode restrict to reader/writer + :about tell what the function is about + """ + self._name = _args['name'] + self._about = _args['about'] + self._mode = _args['mode'] if 'mode' in _args else 'rw' + def __call__(self,pointer): + def wrapper(_args): + return pointer(_args) + # + # @TODO: + # add attributes to the wrapper object + # + setattr(wrapper,'transport',True) + setattr(wrapper,'name',self._name) + setattr(wrapper,'mode',self._mode) + setattr(wrapper,'about',self._about) + return wrapper + + +class PluginLoader : + """ + This class is intended to load a plugin and make it available and assess the quality of the developed plugin + """ + def __init__(self,**_args): + """ + :path location of the plugin (should be a single file) + :_names of functions to load + """ + _names = _args['names'] if 'names' in _args else None + path = _args['path'] if 'path' in _args else None + self._names = _names if type(_names) == list else [_names] + self._modules = {} + self._names = [] + if path and os.path.exists(path) and _names: + for _name in self._names : + spec = importlib.util.spec_from_file_location('private', path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) #--loads it into sys.modules + if hasattr(module,_name) : + if 
self.isplugin(module,_name) : + self._modules[_name] = getattr(module,_name) + else: + print ([f'Found {_name}', 'not plugin']) + else: + # + # @TODO: We should log this somewhere some how + print (['skipping ',_name, hasattr(module,_name)]) + pass + else: + # + # Initialization is empty + self._names = [] + pass + def set(self,_pointer) : + """ + This function will set a pointer to the list of modules to be called + This should be used within the context of using the framework as a library + """ + _name = _pointer.__name__ + + self._modules[_name] = _pointer + self._names.append(_name) + def isplugin(self,module,name): + """ + This function determines if a module is a recognized plugin + :module module object loaded from importlib + :name name of the functiion of interest + """ + + p = type(getattr(module,name)).__name__ =='function' + q = hasattr(getattr(module,name),'transport') + # + # @TODO: add a generated key, and more indepth validation + return p and q + def has(self,_name): + """ + This will determine if the module name is loaded or not + """ + return _name in self._modules + def ratio (self): + """ + how many modules loaded vs unloaded given the list of names + """ + + _n = len(self._names) + return len(set(self._modules.keys()) & set (self._names)) / _n + def apply(self,_data): + for _name in self._modules : + _pointer = self._modules[_name] + # + # @TODO: add exception handling + _data = _pointer(_data) + return _data + # def apply(self,_data,_name): + # """ + # This function applies an external module function against the data. 
+ # The responsibility is on the plugin to properly return data, thus responsibility is offloaded + # """ + # try: + + # _pointer = self._modules[_name] + # _data = _pointer(_data) + + # except Exception as e: + # pass + # return _data From edd3efd3286d795365fc7ffcdc43a0967b91d66c Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 1 Apr 2024 15:41:39 -0500 Subject: [PATCH 09/23] bug fixes: imports --- setup.py | 3 ++- transport/__init__.py | 3 ++- transport/nosql/__init__.py | 10 ++++++---- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/setup.py b/setup.py index 40ba3fb..743746e 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,8 @@ args = { "version":__version__, "author":__author__,"author_email":"info@the-phi.com", "license":"MIT", - "packages":["transport","info"]} + # "packages":["transport","info","transport/sql"]}, + "packages": find_packages(include=['info', 'transport.*'])} args["keywords"]=['mongodb','couchdb','rabbitmq','file','read','write','s3','sqlite'] args["install_requires"] = ['pyncclient','pymongo','sqlalchemy','pandas','typer','pandas-gbq','numpy','cloudant','pika','nzpy','boto3','boto','pyarrow','google-cloud-bigquery','google-cloud-bigquery-storage','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python','numpy'] args["url"] = "https://healthcareio.the-phi.com/git/code/transport.git" diff --git a/transport/__init__.py b/transport/__init__.py index 387161d..2e2897a 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -18,13 +18,14 @@ Source Code is available under MIT License: """ import numpy as np -from transport import sql, nosql, cloud, other +import sql, nosql, cloud, other import pandas as pd import json import os from info import __version__,__author__ from transport.iowrapper import IWriter, IReader from transport.plugins import PluginLoader +from transport import providers PROVIDERS = {} def init(): global PROVIDERS diff --git a/transport/nosql/__init__.py 
b/transport/nosql/__init__.py index 465b912..c89b212 100644 --- a/transport/nosql/__init__.py +++ b/transport/nosql/__init__.py @@ -2,9 +2,11 @@ Steve L. Nyemba, nyemba@gmail.com This namespace implements support for cloud databases couchdb,mongodb, cloudant ... """ -from transport.nosql import couchdb -from transport.nosql import mongodb -# from . import mongodb -# from . import couchdb +# from transport.nosql import couchdb +# from transport.nosql import mongodb +from . import mongodb +from . import couchdb +# import mongodb +# import couchdb cloudant = couchdb \ No newline at end of file From 165f9913b519c89ac9061b8fbfd486a8d39daa7e Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 1 Apr 2024 16:04:00 -0500 Subject: [PATCH 10/23] bug fix: imports providers (backward compatibility) --- transport/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/transport/__init__.py b/transport/__init__.py index 2e2897a..a28b7d9 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -26,6 +26,7 @@ from info import __version__,__author__ from transport.iowrapper import IWriter, IReader from transport.plugins import PluginLoader from transport import providers + PROVIDERS = {} def init(): global PROVIDERS From 90ac26e53e239a99f296a2621090c289581e898e Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 1 Apr 2024 16:08:19 -0500 Subject: [PATCH 11/23] bug fixes --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 743746e..be572b0 100644 --- a/setup.py +++ b/setup.py @@ -19,6 +19,7 @@ args = { "author":__author__,"author_email":"info@the-phi.com", "license":"MIT", # "packages":["transport","info","transport/sql"]}, + "packages": find_packages(include=['info', 'transport.*'])} args["keywords"]=['mongodb','couchdb','rabbitmq','file','read','write','s3','sqlite'] args["install_requires"] = 
['pyncclient','pymongo','sqlalchemy','pandas','typer','pandas-gbq','numpy','cloudant','pika','nzpy','boto3','boto','pyarrow','google-cloud-bigquery','google-cloud-bigquery-storage','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python','numpy'] From ed5acec4724e3761afb07bd6de660fc40766c08e Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 1 Apr 2024 16:09:51 -0500 Subject: [PATCH 12/23] bug fixes --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index be572b0..3df143d 100644 --- a/setup.py +++ b/setup.py @@ -20,7 +20,7 @@ args = { "license":"MIT", # "packages":["transport","info","transport/sql"]}, - "packages": find_packages(include=['info', 'transport.*'])} + "packages": find_packages(include=['info','transport', 'transport.*'])} args["keywords"]=['mongodb','couchdb','rabbitmq','file','read','write','s3','sqlite'] args["install_requires"] = ['pyncclient','pymongo','sqlalchemy','pandas','typer','pandas-gbq','numpy','cloudant','pika','nzpy','boto3','boto','pyarrow','google-cloud-bigquery','google-cloud-bigquery-storage','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python','numpy'] args["url"] = "https://healthcareio.the-phi.com/git/code/transport.git" From 549cc2082434deab857f968ff5bd3697dab674aa Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 1 Apr 2024 16:12:04 -0500 Subject: [PATCH 13/23] bug fix ... 
--- transport/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transport/__init__.py b/transport/__init__.py index a28b7d9..333931b 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -18,7 +18,7 @@ Source Code is available under MIT License: """ import numpy as np -import sql, nosql, cloud, other +from transport import sql, nosql, cloud, other import pandas as pd import json import os From 4b97994ec19fde14ffda010b8d6c977b883651e8 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 1 Apr 2024 18:37:47 -0500 Subject: [PATCH 14/23] bug fix: layout providers --- bin/transport | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/bin/transport b/bin/transport index 363d2d9..6d5710d 100755 --- a/bin/transport +++ b/bin/transport @@ -80,7 +80,11 @@ def supported (format:str="table") : This function will print supported providers and their associated classifications """ _df = (transport.supported()) - print (json.dumps(_df.to_dict(orient="list"))) + if format in ['list','json'] : + print (json.dumps(_df.to_dict(orient="list"))) + else: + print (_df) + print () @app.command() def version(): From eb81f5a4d208598979c168e79b2043b7e7a6220d Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 2 Apr 2024 12:31:41 -0500 Subject: [PATCH 15/23] bug fix: mongodb inserts of structured objects with lists as elements --- transport/common.py | 3 ++- transport/nosql/mongodb.py | 20 +++++++++++++++++--- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/transport/common.py b/transport/common.py index e17c615..f439ea7 100644 --- a/transport/common.py +++ b/transport/common.py @@ -1,5 +1,6 @@ import json - +import numpy as np +from datetime import datetime class IEncoder (json.JSONEncoder): def default (self,object): diff --git a/transport/nosql/mongodb.py b/transport/nosql/mongodb.py index 2784cd2..2b94311 100644 --- a/transport/nosql/mongodb.py +++ b/transport/nosql/mongodb.py @@ -213,12 +213,26 @@ class 
Writer(Mongo): _uid = self.collection if 'doc' not in _args else _args['doc'] if self._lock : Mongo.lock.acquire() + if type(info) == list or type(info) == pd.DataFrame : - info if type(info) == list else info.to_dict(orient='records') - info = json.loads(json.dumps(info,cls=IEncoder)) + if type(info) == pd.DataFrame : + info = info.to_dict(orient='records') + # info if type(info) == list else info.to_dict(orient='records') + info = json.loads(json.dumps(info)) self.db[_uid].insert_many(info) else: - self.db[_uid].insert_one(json.loads(json.dumps(info,cls=IEncoder))) + # + # sometimes a dictionary can have keys with arrays (odd shaped) + # + _keycount = len(info.keys()) + _arraycount = [len(info[key]) for key in info if type(info[key]) in (list,np.array,np.ndarray)] + if _arraycount and len(_arraycount) == _keycount and np.max(_arraycount) == np.min(_arraycount) : + # + # In case an object with consistent structure is passed, we store it accordingly + # + self.write(pd.DataFrame(info),**_args) + else: + self.db[_uid].insert_one(json.loads(json.dumps(info,cls=IEncoder))) finally: if self._lock : Mongo.lock.release() From 9d75d420178eb3e6031c0e1972bb73b35f09c0d2 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 2 Apr 2024 12:59:26 -0500 Subject: [PATCH 16/23] bug fix: append mode/replace or truncate upon insert --- transport/sql/common.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/transport/sql/common.py b/transport/sql/common.py index 89dcefb..4c9d4a7 100644 --- a/transport/sql/common.py +++ b/transport/sql/common.py @@ -118,8 +118,12 @@ class BaseWriter (SQLBase): # _table = _args['table'] if 'table' in _args else self._table _mode = {'chunksize':2000000,'if_exists':'append','index':False} - if 'schema' in _args : - _mode['schema'] = _args['schema'] - if 'if_exists' in _args : - _mode['if_exists'] = _args['if_exists'] - _df.to_sql(_table,self._engine,**_args,index=False) \ No newline at end of file + for key in 
['if_exists','index','chunksize'] : + if key in _args : + _mode[key] = _args[key] + # if 'schema' in _args : + # _mode['schema'] = _args['schema'] + # if 'if_exists' in _args : + # _mode['if_exists'] = _args['if_exists'] + + _df.to_sql(_table,self._engine,**_mode) \ No newline at end of file From 677239585c4520fbca494ba25d670815a11f768e Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 2 Apr 2024 16:58:58 -0500 Subject: [PATCH 17/23] bug fix, with bigquery write --- transport/cloud/bigquery.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/transport/cloud/bigquery.py b/transport/cloud/bigquery.py index 479c060..ba720af 100644 --- a/transport/cloud/bigquery.py +++ b/transport/cloud/bigquery.py @@ -104,12 +104,12 @@ class Writer (BigQuery): """ try: if self.parallel or 'lock' in _args : - Write.lock.acquire() + Writer.lock.acquire() _args['table'] = self.table if 'table' not in _args else _args['table'] self._write(_data,**_args) finally: if self.parallel: - Write.lock.release() + Writer.lock.release() def submit(self,_sql): """ Write the output of a massive query to a given table, biquery will handle this as a job @@ -144,13 +144,16 @@ class Writer (BigQuery): # Let us insure that the types are somewhat compatible ... 
# _map = {'INTEGER':np.int64,'DATETIME':'datetime64[ns]','TIMESTAMP':'datetime64[ns]','FLOAT':np.float64,'DOUBLE':np.float64,'STRING':str} # _mode = copy.deepcopy(self.mode) - _mode = self.mode + # _mode = self.mode # _df.to_gbq(**self.mode) #if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000) # # Let us adjust the chunking here + if 'if_exists' in _args : + self.mode['if_exists'] = _args['if_exists'] self._chunks = 10 if _df.shape[0] > MAX_CHUNK and self._chunks == 1 else self._chunks _indexes = np.array_split(np.arange(_df.shape[0]),self._chunks) for i in _indexes : - _df.iloc[i].to_gbq(**self.mode) + # _df.iloc[i].to_gbq(**self.mode) + pd_gbq.to_gbq(_df.iloc[i],**self.mode) time.sleep(1) pass \ No newline at end of file From 715e40407a4ca8d638d0a92a6299cf8a34354484 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 2 Apr 2024 17:00:11 -0500 Subject: [PATCH 18/23] adding notebooks (test/examples --- notebooks/bigquery.ipynb | 169 +++++++++++++++++++++++++++++++++++++ notebooks/mongodb.ipynb | 155 ++++++++++++++++++++++++++++++++++ notebooks/mysql.ipynb | 150 ++++++++++++++++++++++++++++++++ notebooks/postgresql.ipynb | 157 ++++++++++++++++++++++++++++++++++ notebooks/sqlite.ipynb | 139 ++++++++++++++++++++++++++++++ 5 files changed, 770 insertions(+) create mode 100644 notebooks/bigquery.ipynb create mode 100644 notebooks/mongodb.ipynb create mode 100644 notebooks/mysql.ipynb create mode 100644 notebooks/postgresql.ipynb create mode 100644 notebooks/sqlite.ipynb diff --git a/notebooks/bigquery.ipynb b/notebooks/bigquery.ipynb new file mode 100644 index 0000000..750f167 --- /dev/null +++ b/notebooks/bigquery.ipynb @@ -0,0 +1,169 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Writing to Google Bigquery\n", + "\n", + "1. Insure you have a Google Bigquery service account key on disk\n", + "2. The service key location is set as an environment variable **BQ_KEY**\n", + "3. 
The dataset will be automatically created within the project associated with the service key\n", + "\n", + "The cell below creates a dataframe that will be stored within Google Bigquery" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 1/1 [00:00<00:00, 5440.08it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['data transport version ', '2.0.0']\n" + ] + } + ], + "source": [ + "#\n", + "# Writing to Google Bigquery database\n", + "#\n", + "import transport\n", + "from transport import providers\n", + "import pandas as pd\n", + "import os\n", + "\n", + "PRIVATE_KEY = os.environ['BQ_KEY'] #-- location of the service key\n", + "DATASET = 'demo'\n", + "_data = pd.DataFrame({\"name\":['James Bond','Steve Rogers','Steve Nyemba'],'age':[55,150,44]})\n", + "bqw = transport.factory.instance(provider=providers.BIGQUERY,dataset=DATASET,table='friends',context='write',private_key=PRIVATE_KEY)\n", + "bqw.write(_data,if_exists='replace') #-- default is append\n", + "print (['data transport version ', transport.__version__])\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Reading from Google Bigquery\n", + "\n", + "The cell below reads the data that has been written by the cell above and computes the average age within a Google Bigquery (simple query). \n", + "\n", + "- Basic read of the designated table (friends) created above\n", + "- Execute an aggregate SQL against the table\n", + "\n", + "**NOTE**\n", + "\n", + "It is possible to use **transport.factory.instance** or **transport.instance** they are the same. It allows the maintainers to know that we used a factory design pattern." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading: 100%|\u001b[32m██████████\u001b[0m|\n", + "Downloading: 100%|\u001b[32m██████████\u001b[0m|\n", + " name age\n", + "0 James Bond 55\n", + "1 Steve Rogers 150\n", + "2 Steve Nyemba 44\n", + "--------- STATISTICS ------------\n", + " _counts f0_\n", + "0 3 83.0\n" + ] + } + ], + "source": [ + "\n", + "import transport\n", + "from transport import providers\n", + "import os\n", + "PRIVATE_KEY=os.environ['BQ_KEY']\n", + "pgr = transport.instance(provider=providers.BIGQUERY,dataset='demo',table='friends',private_key=PRIVATE_KEY)\n", + "_df = pgr.read()\n", + "_query = 'SELECT COUNT(*) _counts, AVG(age) from demo.friends'\n", + "_sdf = pgr.read(sql=_query)\n", + "print (_df)\n", + "print ('--------- STATISTICS ------------')\n", + "print (_sdf)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The cell below shows the content of an auth_file. If the dataset/table in question is not to be shared, you can use an auth_file that holds the information associated with the parameters.\n", + "\n", + "**NOTE**:\n", + "\n", + "The auth_file is intended to be **JSON** formatted" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'dataset': 'demo', 'table': 'friends'}" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "{\n", + " \n", + " \"dataset\":\"demo\",\"table\":\"friends\"\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + 
"mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/mongodb.ipynb b/notebooks/mongodb.ipynb new file mode 100644 index 0000000..0554669 --- /dev/null +++ b/notebooks/mongodb.ipynb @@ -0,0 +1,155 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Writing to mongodb\n", + "\n", + "Insure mongodb is actually installed on the system, The cell below creates a dataframe that will be stored within mongodb" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2.0.0\n" + ] + } + ], + "source": [ + "#\n", + "# Writing to mongodb database\n", + "#\n", + "import transport\n", + "from transport import providers\n", + "import pandas as pd\n", + "_data = pd.DataFrame({\"name\":['James Bond','Steve Rogers','Steve Nyemba'],'age':[55,150,44]})\n", + "mgw = transport.factory.instance(provider=providers.MONGODB,db='demo',collection='friends',context='write')\n", + "mgw.write(_data)\n", + "print (transport.__version__)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Reading from mongodb\n", + "\n", + "The cell below reads the data that has been written by the cell above and computes the average age within a mongodb pipeline. The code in the background executes an aggregation using **db.runCommand**\n", + "\n", + "- Basic read of the designated collection **find=\\**\n", + "- Executing an aggregate pipeline against a collection **aggreate=\\**\n", + "\n", + "**NOTE**\n", + "\n", + "It is possible to use **transport.factory.instance** or **transport.instance** they are the same. It allows the maintainers to know that we used a factory design pattern." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " name age\n", + "0 James Bond 55\n", + "1 Steve Rogers 150\n", + "--------- STATISTICS ------------\n", + " _id _counts _mean\n", + "0 0 2 102.5\n" + ] + } + ], + "source": [ + "\n", + "import transport\n", + "from transport import providers\n", + "mgr = transport.instance(provider=providers.MONGODB,db='foo',collection='friends')\n", + "_df = mgr.read()\n", + "PIPELINE = [{\"$group\":{\"_id\":0,\"_counts\":{\"$sum\":1}, \"_mean\":{\"$avg\":\"$age\"}}}]\n", + "_sdf = mgr.read(aggregate='friends',pipeline=PIPELINE)\n", + "print (_df)\n", + "print ('--------- STATISTICS ------------')\n", + "print (_sdf)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The cell below shows the content of an auth_file. If the dataset/table in question is not to be shared, you can use an auth_file that holds the information associated with the parameters.\n", + "\n", + "**NOTE**:\n", + "\n", + "The auth_file is intended to be **JSON** formatted" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'host': 'klingon.io',\n", + " 'port': 27017,\n", + " 'username': 'me',\n", + " 'password': 'foobar',\n", + " 'db': 'foo',\n", + " 'collection': 'friends',\n", + " 'authSource': '',\n", + " 'mechamism': ''}" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "{\n", + " \"host\":\"klingon.io\",\"port\":27017,\"username\":\"me\",\"password\":\"foobar\",\"db\":\"foo\",\"collection\":\"friends\",\n", + " \"authSource\":\"\",\"mechamism\":\"\"\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" 
+ }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/mysql.ipynb b/notebooks/mysql.ipynb new file mode 100644 index 0000000..a54d46d --- /dev/null +++ b/notebooks/mysql.ipynb @@ -0,0 +1,150 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Writing to MySQL\n", + "\n", + "1. Ensure MySQL is actually installed on the system, \n", + "2. There is a database called demo created on the said system\n", + "\n", + "The cell below creates a dataframe that will be stored within MySQL" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2.0.0\n" + ] + } + ], + "source": [ + "#\n", + "# Writing to MySQL database\n", + "#\n", + "import transport\n", + "from transport import providers\n", + "import pandas as pd\n", + "_data = pd.DataFrame({\"name\":['James Bond','Steve Rogers','Steve Nyemba'],'age':[55,150,44]})\n", + "myw = transport.factory.instance(provider=providers.MYSQL,database='demo',table='friends',context='write',auth_file=\"/home/steve/auth-mysql.json\")\n", + "myw.write(_data,if_exists='replace') #-- default is append\n", + "print (transport.__version__)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Reading from MySQL\n", + "\n", + "The cell below reads the data that has been written by the cell above and computes the average age within MySQL (simple query). 
\n", + "\n", + "- Basic read of the designated table (friends) created above\n", + "- Execute an aggregate SQL against the table\n", + "\n", + "**NOTE**\n", + "\n", + "It is possible to use **transport.factory.instance** or **transport.instance** they are the same. It allows the maintainers to know that we used a factory design pattern." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " name age\n", + "0 James Bond 55\n", + "1 Steve Rogers 150\n", + "2 Steve Nyemba 44\n", + "--------- STATISTICS ------------\n", + " _counts avg\n", + "0 3 83.0\n" + ] + } + ], + "source": [ + "\n", + "import transport\n", + "from transport import providers\n", + "myr = transport.instance(provider=providers.POSTGRESQL,database='demo',table='friends',auth_file='/home/steve/auth-mysql.json')\n", + "_df = myr.read()\n", + "_query = 'SELECT COUNT(*) _counts, AVG(age) from friends'\n", + "_sdf = myr.read(sql=_query)\n", + "print (_df)\n", + "print ('--------- STATISTICS ------------')\n", + "print (_sdf)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The cell bellow show the content of an auth_file, in this case if the dataset/table in question is not to be shared then you can use auth_file with information associated with the parameters.\n", + "\n", + "**NOTE**:\n", + "\n", + "The auth_file is intended to be **JSON** formatted" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'host': 'klingon.io',\n", + " 'port': 3306,\n", + " 'username': 'me',\n", + " 'password': 'foobar',\n", + " 'database': 'demo',\n", + " 'table': 'friends'}" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "{\n", + " \"host\":\"klingon.io\",\"port\":3306,\"username\":\"me\",\"password\":\"foobar\",\n", + " 
\"database\":\"demo\",\"table\":\"friends\"\n", + "}" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/postgresql.ipynb b/notebooks/postgresql.ipynb new file mode 100644 index 0000000..5046f4d --- /dev/null +++ b/notebooks/postgresql.ipynb @@ -0,0 +1,157 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Writing to PostgreSQL\n", + "\n", + "1. Insure PostgreSQL is actually installed on the system, \n", + "2. There is a database called demo created on the said system\n", + "\n", + "The cell below creates a dataframe that will be stored within postgreSQL" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2.0.0\n" + ] + } + ], + "source": [ + "#\n", + "# Writing to PostgreSQL database\n", + "#\n", + "import transport\n", + "from transport import providers\n", + "import pandas as pd\n", + "_data = pd.DataFrame({\"name\":['James Bond','Steve Rogers','Steve Nyemba'],'age':[55,150,44]})\n", + "pgw = transport.factory.instance(provider=providers.POSTGRESQL,database='demo',table='friends',context='write')\n", + "pgw.write(_data,if_exists='replace') #-- default is append\n", + "print (transport.__version__)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Reading from PostgreSQL\n", + "\n", + "The cell below reads the data that has been written by the cell above and computes the average age within a PostreSQL (simple query). 
\n", + "\n", + "- Basic read of the designated table (friends) created above\n", + "- Execute an aggregate SQL against the table\n", + "\n", + "**NOTE**\n", + "\n", + "It is possible to use **transport.factory.instance** or **transport.instance** they are the same. It allows the maintainers to know that we used a factory design pattern." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " name age\n", + "0 James Bond 55\n", + "1 Steve Rogers 150\n", + "2 Steve Nyemba 44\n", + "--------- STATISTICS ------------\n", + " _counts avg\n", + "0 3 83.0\n" + ] + } + ], + "source": [ + "\n", + "import transport\n", + "from transport import providers\n", + "pgr = transport.instance(provider=providers.POSTGRESQL,database='demo',table='friends')\n", + "_df = pgr.read()\n", + "_query = 'SELECT COUNT(*) _counts, AVG(age) from friends'\n", + "_sdf = pgr.read(sql=_query)\n", + "print (_df)\n", + "print ('--------- STATISTICS ------------')\n", + "print (_sdf)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The cell bellow show the content of an auth_file, in this case if the dataset/table in question is not to be shared then you can use auth_file with information associated with the parameters.\n", + "\n", + "**NOTE**:\n", + "\n", + "The auth_file is intended to be **JSON** formatted" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'host': 'klingon.io',\n", + " 'port': 5432,\n", + " 'username': 'me',\n", + " 'password': 'foobar',\n", + " 'database': 'demo',\n", + " 'table': 'friends'}" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "{\n", + " \"host\":\"klingon.io\",\"port\":5432,\"username\":\"me\",\"password\":\"foobar\",\n", + " \"database\":\"demo\",\"table\":\"friends\"\n", + "}" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/sqlite.ipynb b/notebooks/sqlite.ipynb new file mode 100644 index 0000000..5c249de --- /dev/null +++ b/notebooks/sqlite.ipynb @@ -0,0 +1,139 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Writing to SQLite3+\n", + "\n", + "The requirements to get started are minimal (actually none). The cell below creates a dataframe that will be stored within SQLite 3+" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2.0.0\n" + ] + } + ], + "source": [ + "#\n", + "# Writing to PostgreSQL database\n", + "#\n", + "import transport\n", + "from transport import providers\n", + "import pandas as pd\n", + "_data = pd.DataFrame({\"name\":['James Bond','Steve Rogers','Steve Nyemba'],'age':[55,150,44]})\n", + "sqw = transport.factory.instance(provider=providers.SQLITE,database='/home/steve/demo.db3',table='friends',context='write')\n", + "sqw.write(_data,if_exists='replace') #-- default is append\n", + "print (transport.__version__)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Reading from SQLite3+\n", + "\n", + "The cell below reads the data that has been written by the cell above and computes the average age within a PostreSQL (simple query). 
\n", + "\n", + "- Basic read of the designated table (friends) created above\n", + "- Execute an aggregate SQL against the table\n", + "\n", + "**NOTE**\n", + "\n", + "It is possible to use **transport.factory.instance** or **transport.instance** they are the same. It allows the maintainers to know that we used a factory design pattern." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " name age\n", + "0 James Bond 55\n", + "1 Steve Rogers 150\n", + "2 Steve Nyemba 44\n", + "--------- STATISTICS ------------\n", + " _counts AVG(age)\n", + "0 3 83.0\n" + ] + } + ], + "source": [ + "\n", + "import transport\n", + "from transport import providers\n", + "pgr = transport.instance(provider=providers.SQLITE,database='/home/steve/demo.db3',table='friends')\n", + "_df = pgr.read()\n", + "_query = 'SELECT COUNT(*) _counts, AVG(age) from friends'\n", + "_sdf = pgr.read(sql=_query)\n", + "print (_df)\n", + "print ('--------- STATISTICS ------------')\n", + "print (_sdf)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The cell bellow show the content of an auth_file, in this case if the dataset/table in question is not to be shared then you can use auth_file with information associated with the parameters.\n", + "\n", + "**NOTE**:\n", + "\n", + "The auth_file is intended to be **JSON** formatted. 
This is an overkill for SQLite ;-)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "{\n", + " \"provider\":\"sqlite\",\n", + " \"database\":\"/home/steve/demo.db3\",\"table\":\"friends\"\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From e1763b1b192bc34359a7691b6f695c0a6b319977 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 2 Apr 2024 20:59:01 -0500 Subject: [PATCH 19/23] bug fix: ETL, Mongodb --- bin/transport | 8 +++++++- transport/etl.py | 7 ++++++- transport/nosql/mongodb.py | 2 +- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/bin/transport b/bin/transport index 6d5710d..f483d94 100755 --- a/bin/transport +++ b/bin/transport @@ -62,8 +62,14 @@ def wait(jobs): time.sleep(1) @app.command(name="apply") -def move (path,index=None): +def apply (path,index=None): + """ + This function applies data transport from one source to one or several others + :path path of the configuration file + + :index index of the _item of interest (otherwise everything will be processed) + """ _proxy = lambda _object: _object.write(_object.read()) if os.path.exists(path): file = open(path) diff --git a/transport/etl.py b/transport/etl.py index 162e185..25750de 100644 --- a/transport/etl.py +++ b/transport/etl.py @@ -83,7 +83,12 @@ class Transporter(Process): _reader = transport.factory.instance(**self._source) # # If arguments are provided then a query is to be executed (not just a table dump) - 
return _reader.read() if 'args' not in self._source else _reader.read(**self._source['args']) + if 'cmd' in self._source or 'query' in self._source : + _query = self._source['cmd'] if 'cmd' in self._source else self._source['query'] + return _reader.read(**_query) + else: + return _reader.read() + # return _reader.read() if 'query' not in self._source else _reader.read(**self._source['query']) def _delegate_write(self,_data,**_args): """ diff --git a/transport/nosql/mongodb.py b/transport/nosql/mongodb.py index 2b94311..c498704 100644 --- a/transport/nosql/mongodb.py +++ b/transport/nosql/mongodb.py @@ -218,7 +218,7 @@ class Writer(Mongo): if type(info) == pd.DataFrame : info = info.to_dict(orient='records') # info if type(info) == list else info.to_dict(orient='records') - info = json.loads(json.dumps(info)) + info = json.loads(json.dumps(info,cls=IEncoder)) self.db[_uid].insert_many(info) else: # From f6919ccd9324afe34835fa708c544cca0fcd5513 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 16 Apr 2024 09:42:33 -0500 Subject: [PATCH 20/23] bug fix: set function mongodb used for updates --- transport/nosql/mongodb.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/transport/nosql/mongodb.py b/transport/nosql/mongodb.py index c498704..7c5b8b2 100644 --- a/transport/nosql/mongodb.py +++ b/transport/nosql/mongodb.py @@ -12,7 +12,8 @@ from bson.binary import Binary from datetime import datetime import pandas as pd import numpy as np -import gridfs +# import gridfs +from gridfs import GridFS import sys import json import re @@ -243,13 +244,17 @@ class Writer(Mongo): """ collection = self.db[self.collection] - if collection.count_document() > 0 and '_id' in document: + if collection.count_documents() > 0 and '_id' in document: id = document['_id'] del document['_id'] collection.find_one_and_replace({'_id':id},document) else: - collection.delete_many({}) - self.write(info) + # + # Nothing to be done if we did not find anything + # + 
pass + # collection.delete_many({}) + # self.write(info) def close(self): Mongo.close(self) # collecton.update_one({"_id":self.collection},document,True) From 1eda49b63a93d17d9262b6ecdde7d465c5a617e8 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 17 Apr 2024 23:56:31 -0500 Subject: [PATCH 21/23] documentation --- README.md | 189 +----------------------------------- notebooks/mssqlserver.ipynb | 160 ++++++++++++++++++++++++++++++ 2 files changed, 164 insertions(+), 185 deletions(-) create mode 100644 notebooks/mssqlserver.ipynb diff --git a/README.md b/README.md index eaa176d..ff8bd39 100644 --- a/README.md +++ b/README.md @@ -1,204 +1,23 @@ # Introduction -This project implements an abstraction of objects that can have access to a variety of data stores, implementing read/write with a simple and expressive interface. This abstraction works with **NoSQL** and **SQL** data stores and leverages **pandas**. - -The supported data store providers : - -| Provider | Underlying Drivers | Description | -| :---- | :----: | ----: | -| sqlite| Native SQLite|SQLite3| -| postgresql| psycopg2 | PostgreSQL -| redshift| psycopg2 | Amazon Redshift -| s3| boto3 | Amazon Simple Storage Service -| netezza| nzpsql | IBM Neteeza -| Files: CSV, TSV| pandas| pandas data-frame -| Couchdb| cloudant | Couchbase/Couchdb -| mongodb| pymongo | Mongodb -| mysql| mysql| Mysql -| bigquery| google-bigquery| Google BigQuery -| mariadb| mysql| Mariadb -| rabbitmq|pika| RabbitMQ Publish/Subscribe +This project implements an abstraction of objects that can have access to a variety of data stores, implementing read/write with a simple and expressive interface. This abstraction works with **NoSQL**, **SQL** and **Cloud** data stores and leverages **pandas**. # Why Use Data-Transport ? -Mostly data scientists that don't really care about the underlying database and would like to manipulate data transparently. 
+Mostly data scientists that don't really care about the underlying database and would like a simple and consistent way to read/write data will be well served. Additionally, we implemented a lightweight Extract Transform Load (ETL) API and command line (CLI) tool. 1. Familiarity with **pandas data-frames** 2. Connectivity **drivers** are included 3. Mining data from various sources 4. Useful for data migrations or ETL -# Usage - ## Installation Within the virtual environment perform the following : pip install git+https://github.com/lnyemba/data-transport.git -Once installed **data-transport** can be used as a library in code or a command line interface (CLI), as a CLI it is used for ETL and requires a configuration file. +## Learn More -## Data Transport as a Library (in code) ---- - -The data-transport can be used within code as a library, and offers the following capabilities: - -* Read/Write against [mongodb](https://github.com/lnyemba/data-transport/wiki/mongodb) -* Read/Write against tranditional [RDBMS](https://github.com/lnyemba/data-transport/wiki/rdbms) -* Read/Write against [bigquery](https://github.com/lnyemba/data-transport/wiki/bigquery) -* ETL CLI/Code [ETL](https://github.com/lnyemba/data-transport/wiki/etl) -* Support for pre/post conditions i.e it is possible to specify queries to run before or after a read or write - -The read/write functions make data-transport a great candidate for **data-science**; **data-engineering** or all things pertaining to data. It enables operations across multiple data-stores(relational or not) - -## ETL - -**Embedded in Code** - -It is possible to perform ETL within custom code as follows : - -``` - import transport - import time - - _info = [{source:{'provider':'sqlite','path':'/home/me/foo.csv','table':'me',"pipeline":{"pre":[],"post":[]}},target:{provider:'bigquery',private_key='/home/me/key.json','table':'me','dataset':'mydataset'}}, ...] 
- procs = transport.factory.instance(provider='etl',info=_info) - # - # - while procs: - procs = [pthread for pthread in procs if pthread.is_alive()] - time.sleep(1) -``` - -**Command Line Interface (CLI):** ---- -The CLI program is called **transport** and it requires a configuration file. The program is intended to move data from one location to another. Supported data stores are in the above paragraphs. - -``` -[ - { - "id":"logs", - "source":{ - "provider":"postgresql","context":"read","database":"mydb", - "cmd":{"sql":"SELECT * FROM logs limit 10"} - }, - "target":{ - "provider":"bigquery","private_key":"/bgqdrive/account/bq-service-account-key.json", - "dataset":"mydataset" - } - }, - -] -``` - -Assuming the above content is stored in a file called **etl-config.json**, we would perform the following in a terminal window: - -``` -[steve@data-transport]$ transport --config ./etl-config.json [--index ] -``` - -**Reading/Writing Mongodb** - -For this example we assume here we are tunneling through port 27018 and there is not access control: - -``` -import transport -reader = factory.instance(provider='mongodb',context='read',host='localhost',port='27018',db='example',doc='logs') - -df = reader.read() #-- reads the entire collection -print (df.head()) -# -#-- Applying mongodb command -PIPELINE = [{"$group":{"_id":None,"count":{"$sum":1}}}] -_command_={"cursor":{},"allowDiskUse":True,"aggregate":"logs","pipeline":PIPLINE} -df = reader.read(mongo=_command) -print (df.head()) -reader.close() -``` -**Read/Writing to Mongodb** ---- - -Scenario 1: Mongodb with security in place - -1. Define an authentication file on disk - - The semantics of the attributes are provided by mongodb, please visit [mongodb documentation](https://mongodb.org/docs). In this example the file is located on _/transport/mongo.json_ -
-
-configuration file - -``` -{ - "username":"me","password":"changeme", - "mechanism":"SCRAM-SHA-1", - "authSource":"admin" -} -``` -Connecting to Mongodb - -``` -import transport -PIPELINE = ... #-- do this yourself -MONGO_KEY = '/transport/mongo.json' -mreader = transport.factory.instance(provider=transport.providers.MONGODB,auth_file=MONGO_KEY,context='read',db='mydb',doc='logs') -_aggregateDF = mreader.read(mongo=PIPELINE) #--results of a aggregate pipeline -_collectionDF= mreader.read() - - -``` - -In order to enable write, change **context** attribute to **'read'**. -
-
-- The configuration file is in JSON format -- The commands passed to mongodb are the same as you would if you applied runCommand in mongodb -- The output is a pandas data-frame -- By default the transport reads, to enable write operations use **context='write'** - -|parameters|description | -| --- | --- | -|db| Name of the database| -|port| Port number to connect to -|doc| Name of the collection of documents| -|username|Username | -|password|password| -|authSource|user database that has authentication info| -|mechanism|Mechnism used for authentication| - -**NOTE** - -Arguments like **db** or **doc** can be placed in the authentication file -
-
- -**Limitations** - -Reads and writes aren't encapsulated in the same object, this is to allow the calling code to deliberately perform actions and hopefully minimize accidents associated with data wrangling. - - -``` -import transport -improt pandas as pd -writer = factory.instance(provider=transport.providers.MONGODB,context='write',host='localhost',port='27018',db='example',doc='logs') - -df = pd.DataFrame({"names":["steve","nico"],"age":[40,30]}) -writer.write(df) -writer.close() -``` - - - - # - # reading from postgresql - - pgreader = factory.instance(type='postgresql',database=,table=) - pg.read() #-- will read the table by executing a SELECT - pg.read(sql=) - - # - # Reading a document and executing a view - # - document = dreader.read() - result = couchdb.view(id='',view_name=) - +We have available notebooks with sample code to read/write against mongodb, couchdb, Netezza, PostgreSQL, Google Bigquery, Databricks, Microsoft SQL Server, MySQL ... Visit [data-transport homepage](https://healthcareio.the-phi.com/data-transport) \ No newline at end of file diff --git a/notebooks/mssqlserver.ipynb b/notebooks/mssqlserver.ipynb new file mode 100644 index 0000000..f2bee85 --- /dev/null +++ b/notebooks/mssqlserver.ipynb @@ -0,0 +1,160 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Writing to Microsoft SQLServer\n", + "\n", + "1. Insure the Microsoft SQL Server is installed and you have access i.e account information\n", + "2. The target database must be created before hand.\n", + "3. 
We created an authentication file that will contain user account and location of the database\n", + "\n", + "The cell below creates a dataframe that will be stored in a Microsoft SQL Server database.\n", + "\n", + "**NOTE** This was not tested with a cloud instance" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['data transport version ', '2.0.0']\n" + ] + } + ], + "source": [ + "#\n", + "# Writing to Google Bigquery database\n", + "#\n", + "import transport\n", + "from transport import providers\n", + "import pandas as pd\n", + "import os\n", + "\n", + "AUTH_FOLDER = os.environ['DT_AUTH_FOLDER'] #-- location of the service key\n", + "MSSQL_AUTH_FILE= os.sep.join([AUTH_FOLDER,'mssql.json'])\n", + "\n", + "_data = pd.DataFrame({\"name\":['James Bond','Steve Rogers','Steve Nyemba'],'age':[55,150,44]})\n", + "msw = transport.factory.instance(provider=providers.MSSQL,table='friends',context='write',auth_file=MSSQL_AUTH_FILE)\n", + "msw.write(_data,if_exists='replace') #-- default is append\n", + "print (['data transport version ', transport.__version__])\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Reading from Microsoft SQL Server database\n", + "\n", + "The cell below reads the data that has been written by the cell above and computes the average age within an MS SQL Server (simple query). \n", + "\n", + "- Basic read of the designated table (friends) created above\n", + "- Execute an aggregate SQL against the table\n", + "\n", + "**NOTE**\n", + "\n", + "It is possible to use **transport.factory.instance** or **transport.instance** they are the same. It allows the maintainers to know that we used a factory design pattern." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " name age\n", + "0 James Bond 55\n", + "1 Steve Rogers 150\n", + "2 Steve Nyemba 44\n", + "\n", + "--------- STATISTICS ------------\n", + "\n", + " _counts \n", + "0 3 83\n" + ] + } + ], + "source": [ + "\n", + "import transport\n", + "from transport import providers\n", + "import os\n", + "AUTH_FOLDER = os.environ['DT_AUTH_FOLDER'] #-- location of the service key\n", + "MSSQL_AUTH_FILE= os.sep.join([AUTH_FOLDER,'mssql.json'])\n", + "\n", + "msr = transport.instance(provider=providers.MSSQL,table='friends',auth_file=MSSQL_AUTH_FILE)\n", + "_df = msr.read()\n", + "_query = 'SELECT COUNT(*) _counts, AVG(age) from friends'\n", + "_sdf = msr.read(sql=_query)\n", + "print (_df)\n", + "print ('\\n--------- STATISTICS ------------\\n')\n", + "print (_sdf)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The cell bellow show the content of an auth_file, in this case if the dataset/table in question is not to be shared then you can use auth_file with information associated with the parameters.\n", + "\n", + "**NOTE**:\n", + "\n", + "The auth_file is intended to be **JSON** formatted" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'dataset': 'demo', 'table': 'friends'}" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "{\n", + " \n", + " \"dataset\":\"demo\",\"table\":\"friends\",\"username\":\"\",\"password\":\"\"\n", + "}" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", 
+ "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 67b91b43ab24c47abb987cfe3f03a7f2b64bfba3 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 23 Apr 2024 13:00:14 -0500 Subject: [PATCH 22/23] new: sqlserver and other refactoring --- info/__init__.py | 2 +- setup.py | 8 ++++---- transport/providers/__init__.py | 4 +++- transport/sql/__init__.py | 2 +- transport/sql/sqlserver.py | 24 ++++++++++++++++++++++++ 5 files changed, 33 insertions(+), 7 deletions(-) create mode 100644 transport/sql/sqlserver.py diff --git a/info/__init__.py b/info/__init__.py index 2d27032..0594d12 100644 --- a/info/__init__.py +++ b/info/__init__.py @@ -1,5 +1,5 @@ __author__ = 'The Phi Technology' -__version__= '2.0.0' +__version__= '2.0.2' __license__=""" diff --git a/setup.py b/setup.py index 3df143d..8e9de26 100644 --- a/setup.py +++ b/setup.py @@ -22,10 +22,10 @@ args = { "packages": find_packages(include=['info','transport', 'transport.*'])} args["keywords"]=['mongodb','couchdb','rabbitmq','file','read','write','s3','sqlite'] -args["install_requires"] = ['pyncclient','pymongo','sqlalchemy','pandas','typer','pandas-gbq','numpy','cloudant','pika','nzpy','boto3','boto','pyarrow','google-cloud-bigquery','google-cloud-bigquery-storage','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python','numpy'] +args["install_requires"] = ['pyncclient','pymongo','sqlalchemy','pandas','typer','pandas-gbq','numpy','cloudant','pika','nzpy','boto3','boto','pyarrow','google-cloud-bigquery','google-cloud-bigquery-storage','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python','numpy','pymssql'] args["url"] = "https://healthcareio.the-phi.com/git/code/transport.git" args['scripts'] = ['bin/transport'] -if sys.version_info[0] == 2 : - args['use_2to3'] = True - args['use_2to3_exclude_fixers']=['lib2to3.fixes.fix_import'] +# if sys.version_info[0] == 2 : +# args['use_2to3'] = True +# 
args['use_2to3_exclude_fixers']=['lib2to3.fixes.fix_import'] setup(**args) diff --git a/transport/providers/__init__.py b/transport/providers/__init__.py index fc0f1e7..4a583f7 100644 --- a/transport/providers/__init__.py +++ b/transport/providers/__init__.py @@ -26,7 +26,9 @@ S3 = 's3' CALLBACK = 'callback' CONSOLE = 'console' RABBITMQ = 'rabbitmq' -DATABRICKS= 'databricks' +DATABRICKS = 'databricks' +MSSQL ='sqlserver' +SQLSERVER ='sqlserver' # # synonyms of the above diff --git a/transport/sql/__init__.py b/transport/sql/__init__.py index 557d36d..9d026bf 100644 --- a/transport/sql/__init__.py +++ b/transport/sql/__init__.py @@ -3,7 +3,7 @@ This namespace/package wrap the sql functionalities for a certain data-stores - netezza, postgresql, mysql and sqlite - mariadb, redshift (also included) """ -from . import postgresql, mysql, netezza, sqlite +from . import postgresql, mysql, netezza, sqlite, sqlserver # diff --git a/transport/sql/sqlserver.py b/transport/sql/sqlserver.py new file mode 100644 index 0000000..6a53842 --- /dev/null +++ b/transport/sql/sqlserver.py @@ -0,0 +1,24 @@ +""" +Handling Microsoft SQL Server via pymssql driver/connector +""" +import sqlalchemy +import pandas as pd +from transport.sql.common import Base, BaseReader, BaseWriter + + +class MsSQLServer: + def __init__(self,**_args) : + super().__init__(**_args) + pass + def get_provider(self): + # mssql+pymssql://scott:tiger@hostname:port/dbname" + return "mssql+pymssql" + def get_default_port(self): + return "1433" +class Reader (MsSQLServer,BaseReader): + def __init__(self,**_args): + super().__init__(**_args) + +class Writer (MsSQLServer,BaseWriter): + def __init__(self,**_args): + super().__init__(**_args) \ No newline at end of file From 5adbb5a61e423f32f3e90fb36527bac399d55e3b Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 24 Apr 2024 13:00:03 -0500 Subject: [PATCH 23/23] bug fixes and documentation --- README.md | 5 +++-- transport/iowrapper.py | 14 +++++++------- 2 files 
changed, 10 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index ff8bd39..528176d 100644 --- a/README.md +++ b/README.md @@ -4,12 +4,13 @@ This project implements an abstraction of objects that can have access to a vari # Why Use Data-Transport ? -Mostly data scientists that don't really care about the underlying database and would like a simple and consistent way to read/write data and have will be well served. Additionally we implemented lightweight Extract Transform Loading API and command line (CLI) tool. +Mostly data scientists that don't really care about the underlying database and would like a simple and consistent way to read/write and move data are well served. Additionally we implemented lightweight Extract Transform Loading API and command line (CLI) tool. Finally it is possible to add pre/post processing pipeline functions to read/write 1. Familiarity with **pandas data-frames** 2. Connectivity **drivers** are included 3. Mining data from various sources -4. Useful for data migrations or ETL +4. 
Useful for data migrations or **ETL** + ## Installation diff --git a/transport/iowrapper.py b/transport/iowrapper.py index f113d85..df6b2ec 100644 --- a/transport/iowrapper.py +++ b/transport/iowrapper.py @@ -6,9 +6,9 @@ class IO: """ Base wrapper class for read/write """ - def __init__(self,_agent,loader): + def __init__(self,_agent,plugins): self._agent = _agent - self._loader = loader + self._plugins = plugins def meta (self,**_args): if hasattr(self._agent,'meta') : return self._agent.meta(**_args) @@ -21,7 +21,7 @@ class IO: """ applying pre/post conditions given a pipeline expression """ - for _pointer in self._loader : + for _pointer in self._plugins : _data = _pointer(_data) def apply(self,_query): if hasattr(self._agent,'apply') : @@ -32,8 +32,8 @@ class IReader(IO): super().__init__(_agent,pipeline) def read(self,**_args): _data = self._agent.read(**_args) - if self._loader and self._loader.ratio() > 0 : - _data = self._loader.apply(_data) + if self._plugins and self._plugins.ratio() > 0 : + _data = self._plugins.apply(_data) # # output data return _data @@ -41,7 +41,7 @@ class IWriter(IO): def __init__(self,_agent,pipeline=None): super().__init__(_agent,pipeline) def write(self,_data,**_args): - if self._loader and self._loader.ratio() > 0 : - _data = self._loader.apply(_data) + if self._plugins and self._plugins.ratio() > 0 : + _data = self._plugins.apply(_data) self._agent.write(_data,**_args)