data-transport/transport/mongo.py

"""
Data Transport - 1.0
Steve L. Nyemba, The Phi Technology LLC

This file is a wrapper around mongodb for reading/writing content against a mongodb server and executing views (mapreduce)
"""
from pymongo        import MongoClient
from bson.objectid  import ObjectId
from bson.binary    import Binary
# import nujson as json
from datetime import datetime
import pandas as pd
import numpy as np
import gridfs
# from transport import Reader,Writer
import sys
if sys.version_info[0] > 2 :
	from transport.common import Reader, Writer
else:
	from common import Reader, Writer
import json
import re
from multiprocessing import Lock, RLock
class Mongo :
    """
    Basic mongodb functions are captured here

    This base class resolves the connection settings (keyword arguments, optionally
    overridden by a JSON file on disk), opens a MongoClient and selects the database.
    """
    # Class-level re-entrant lock; MongoWriter.write acquires it when the 'lock' option is set
    lock = RLock()
    # Host used when the caller does not provide one
    _DEFAULT_HOST = 'localhost:27017'
    def __init__(self,**args):
        """
            :db|dbname  database name/identifier (db takes precedence when both are given)
            :host       host and port of the database by default localhost:27017
            :port       stored on the instance; it is not passed to MongoClient by this class
            :username   username for authentication
            :password   password for current user
            :authSource authentication database, defaults to the database name
            :mechanism  authentication mechanism, defaults to SCRAM-SHA-256
            :auth_file  path to a JSON file; its entries override the keyword arguments
            :lock       stored as self._lock (read by MongoWriter.write)
            :table|doc|collection   name of the collection, stored as self.uid

            An AttributeError is raised when neither db nor dbname is provided.
        """

        self.mechanism= 'SCRAM-SHA-256' if 'mechanism' not in args else args['mechanism']
        self._lock = False if 'lock' not in args else args['lock']

        if 'auth_file' in args :
            # the context manager guarantees the file handle is released
            with open(args['auth_file']) as _stream :
                _info = json.load(_stream)
        else:
            _info = {}
        # entries found in the auth file take precedence over keyword arguments
        _args = dict(args,**_info)
        username = _args.get('username')
        password = _args.get('password')
        for key in _args :
            if key in ['username','password'] :
                # credentials are only handed to the client, never stored on the instance
                continue
            self.setattr(key,_args[key])
        #
        # Let us perform aliasing in order to remain backwards compatible

        self.dbname = self.db if hasattr(self,'db') else self.dbname
        if not hasattr(self,'host') :
            self.host = Mongo._DEFAULT_HOST
        if not hasattr(self,'authSource') :
            self.authSource = self.dbname
        # first of table, doc, collection that was provided (None otherwise)
        self.uid = next((_args[_name] for _name in ('table','doc','collection') if _name in _args),None)
        if username and password :
            self.client = MongoClient(self.host,
                      username=username,
                      password=password ,
                      authSource=self.authSource,
                      authMechanism=self.mechanism)

        else:
            self.client = MongoClient(self.host,maxPoolSize=10000)

        # NOTE: from here on self.db is the database handle, the name remains in self.dbname
        self.db = self.client[self.dbname]

    def isready(self):
        """
        Returns True when both the database and the collection (self.uid) are listed by the server
        """
        p = self.dbname in self.client.list_database_names()
        q = self.uid in self.client[self.dbname].list_collection_names()
        return p and q
    def setattr(self,key,value):
        """
        Sets an attribute on the instance provided the key is one of the allowed settings, other keys are ignored
        :param key      name of the setting
        :param value    value of the setting
        """
        _allowed = ['host','port','db','dbname','doc','authSource','mechanism']
        if key in _allowed :
            setattr(self,key,value)
    def close(self):
        """
        Closes the underlying client
        """
        self.client.close()
    def meta(self,**_args):
        """
        Returns an empty list, the keyword arguments are not used
        """
        return []
class MongoReader(Mongo,Reader):
    """
    This class will read from a mongodb data store and return the content of a document (not a collection)
    """
    def __init__(self,**args):
        Mongo.__init__(self,**args)
    def read(self,**args):
        """
        This function reads from the database and returns a pandas DataFrame
        :param mongo|cmd    database command to run (mongo takes precedence over cmd);
                            aggregate commands get allowDiskUse=True and an empty cursor
                            specification unless the caller already set them
        :param filter       query applied to the collection (self.uid) when no command is given, by default {}
        """

        if 'mongo' in args or 'cmd' in args:
            cmd = args['mongo'] if 'mongo' in args else args['cmd']
            if "aggregate" in cmd :
                if isinstance(cmd,dict) :
                    # work on a shallow copy so the caller's command is left untouched
                    cmd = cmd.copy()
                if "allowDiskUse" not in cmd :
                    cmd["allowDiskUse"] = True
                if "cursor" not in cmd :
                    cmd["cursor"] = {}
            r =  []
            out = self.db.command(cmd)
            #@TODO: consider using a yield (generator) works wonders
            while True :
                if 'values' in out :
                    r += out['values']
                if 'cursor' in out :
                    # the first reply carries 'firstBatch', getMore replies are read from 'nextBatch'
                    key = 'firstBatch' if 'firstBatch' in out['cursor'] else 'nextBatch'
                else:
                    key = 'n'
                if 'cursor' in out and out['cursor'][key] :
                    r += list(out['cursor'][key])
                elif key in out and out[key]:
                    r.append (out[key])
                #
                # We stop when the reply has no cursor or when the cursor id is 0
                if key not in ['firstBatch','nextBatch'] or ('cursor' in out and out['cursor']['id']  == 0) :
                    break
                else:
                    # the collection name is the last dot-separated part of the cursor namespace
                    out = self.db.command({"getMore":out['cursor']['id'],"collection":out['cursor']['ns'].split(".")[-1]})


            return pd.DataFrame(r)
        else:
            collection = self.db[self.uid]
            _filter = args['filter'] if 'filter' in args else {}
            _df =  pd.DataFrame(collection.find(_filter))
            # the first column is dropped -- presumably _id, TODO confirm
            columns = _df.columns.tolist()[1:]
            return _df[columns]
    def view(self,**args):
        """
        This function is designed to execute a view (map/reduce) operation
        NOTE: not implemented, the function returns None
        """
        pass
class MongoWriter(Mongo,Writer):
    """
    This class is designed to write to a mongodb collection within a database
    """
    def __init__(self,**args):
        Mongo.__init__(self,**args)
    def upload(self,**args) :
        """
        This function will upload a file to the current database (using GridFS)
        :param  data        binary stream/text to be stored
        :param  filename    filename to be used
        :param  encoding    content_encoding (default utf-8)

        All keyword arguments are forwarded to GridFS.put
        """
        if 'encoding' not in args :
            args['encoding'] = 'utf-8'
        # the module is imported as gridfs, hence the qualified name
        gfs = gridfs.GridFS(self.db)
        gfs.put(**args)

    def archive(self):
        """
        This function will archive the documents of the current collection (self.uid) as a JSON file
        stored with upload, the collection is emptied once the file has been stored

        The file is named <collection>.archive.<year>-<month>-<day>.json
        """
        collection = self.db[self.uid]
        rows  = list(collection.find())
        for row in rows :
            if isinstance(row['_id'],ObjectId) :
                # ObjectId is not JSON serializable
                row['_id'] = str(row['_id'])
        stream = Binary(json.dumps(rows).encode())
        today = datetime.now()
        now = "-".join([str(today.year),str(today.month), str(today.day)])
        name = ".".join([self.uid,'archive',now])+".json"
        description = " ".join([self.uid,'archive',str(len(rows))])
        self.upload(filename=name,data=stream,description=description,content_type='application/json')
        #
        # The documents are removed last, a failed serialization/upload leaves the collection intact
        collection.delete_many({})
    def write(self,info,**_args):
        """
        This function will write to a given collection i.e add a record to a collection (no updates)
        @param info new record in the collection to be added (a document, a list of documents or a pandas DataFrame)
        @param doc  optional name of the collection to write to, by default self.uid

        An empty list or an empty DataFrame is a no-op
        """
        _uid = self.uid if 'doc' not in _args else _args['doc']
        if isinstance(info,pd.DataFrame) :
            info = info.to_dict(orient='records')
        _locked = self._lock
        if _locked :
            # acquired outside of the try block so we never release a lock we do not hold
            Mongo.lock.acquire()
        try:
            if isinstance(info,list) :
                if info :
                    self.db[_uid].insert_many(info)
            else:
                self.db[_uid].insert_one(info)
        finally:
            if _locked :
                Mongo.lock.release()
    def set(self,document):
        """
        if no identifier is provided the function will delete the entire collection and set the new document.
        Please use this function with great care (archive the content first before using it... for safety)
        @param document document to store; when it has an _id and the collection is not empty,
                        the document with that _id is replaced
        """

        collection = self.db[self.uid]
        if collection.count_documents({}) > 0  and '_id' in document:
            _id = document['_id']
            # the replacement is built without _id, the caller's document is left untouched
            _replacement = {_key:document[_key] for _key in document if _key != '_id'}
            collection.find_one_and_replace({'_id':_id},_replacement)
        else:
            collection.delete_many({})
            self.write(document)
    def close(self):
        """
        Closes the underlying client (delegates to Mongo.close)
        """
        Mongo.close(self)
housekeeping/documentation 2019-09-17 17:00:45 +00:00			`"""`
			`Data Transport - 1.0`
			`Steve L. Nyemba, The Phi Technology LLC`

			`This file is a wrapper around mongodb for reading/writing content against a mongodb server and executing views (mapreduce)`
			`"""`
bug fixes for version 1.0.8, streamlining interface 2020-02-01 15:42:45 +00:00			`from pymongo import MongoClient`
			`from bson.objectid import ObjectId`
			`from bson.binary import Binary`
bug fix: removing unused library 2023-01-19 03:13:38 +00:00			`# import nujson as json`
bug fixes for version 1.0.8, streamlining interface 2020-02-01 15:42:45 +00:00			`from datetime import datetime`
bug fix: imports, testing code removed 2021-09-01 07:33:40 +00:00			`import pandas as pd`
bug fix & new feature 2022-09-19 15:01:34 +00:00			`import numpy as np`
bug fixes for version 1.0.8, streamlining interface 2020-02-01 15:42:45 +00:00			`import gridfs`
data transport framework for rabbitmq, mongodb, couchdb, ... 2019-09-17 04:08:43 +00:00			`# from transport import Reader,Writer`
compatibility to python 3.6 ... 2019-11-05 03:51:20 +00:00			`import sys`
			`if sys.version_info[0] > 2 :`
			`from transport.common import Reader, Writer`
			`else:`
			`from common import Reader, Writer`
data transport framework for rabbitmq, mongodb, couchdb, ... 2019-09-17 04:08:43 +00:00			`import json`
bug fix 2020-09-26 21:53:33 +00:00			`import re`
optimizations mongodb 2022-03-19 05:02:53 +00:00			`from multiprocessing import Lock, RLock`
data transport framework for rabbitmq, mongodb, couchdb, ... 2019-09-17 04:08:43 +00:00			`class Mongo :`
optimizations mongodb 2022-03-19 05:02:53 +00:00			`lock = RLock()`
data transport framework for rabbitmq, mongodb, couchdb, ... 2019-09-17 04:08:43 +00:00			`"""`
			`Basic mongodb functions are captured here`
			`"""`
			`def __init__(self,**args):`
			`"""`
			`:dbname database name/identifier`
bug fixes for version 1.0.8, streamlining interface 2020-02-01 15:42:45 +00:00			`:host host and port of the database by default localhost:27017`
data transport framework for rabbitmq, mongodb, couchdb, ... 2019-09-17 04:08:43 +00:00			`:username username for authentication`
			`:password password for current user`
			`"""`
bug fix & new feature 2022-09-19 15:01:34 +00:00
bug fix mongodb,bigquery 2022-11-13 21:45:21 +00:00			`self.mechanism= 'SCRAM-SHA-256' if 'mechanism' not in args else args['mechanism']`
bug fix: environment variable on default database 2022-06-25 18:59:49 +00:00			`# authSource=(args['authSource'] if 'authSource' in args else self.dbname)`
bug fix: authentication mongodb 2022-05-06 19:25:12 +00:00			`self._lock = False if 'lock' not in args else args['lock']`

new: authentication using file on disk, misc bug fixes 2022-05-16 16:27:36 +00:00			`username = password = None`
			`if 'auth_file' in args :`
			`_info = json.loads((open(args['auth_file'])).read())`
bug fix & new feature 2022-09-19 15:01:34 +00:00
bug fix: environment variable on default database 2022-06-25 18:59:49 +00:00
			`else:`
			`_info = {}`
			`_args = dict(args,**_info)`
			`for key in _args :`
			`if key in ['username','password'] :`
			`username = _args['username'] if key=='username' else username`
			`password = _args['password'] if key == 'password' else password`
			`continue`
			`value = _args[key]`

			`self.setattr(key,value)`
			`#`
			`# Let us perform aliasing in order to remain backwards compatible`

			`self.dbname = self.db if hasattr(self,'db')else self.dbname`
			`self.uid = _args['table'] if 'table' in _args else (_args['doc'] if 'doc' in _args else (_args['collection'] if 'collection' in _args else None))`
new: authentication using file on disk, misc bug fixes 2022-05-16 16:27:36 +00:00			`if username and password :`
bug fix: environment variable on default database 2022-06-25 18:59:49 +00:00			`self.client = MongoClient(self.host,`
new: authentication using file on disk, misc bug fixes 2022-05-16 16:27:36 +00:00			`username=username,`
			`password=password ,`
bug fix: environment variable on default database 2022-06-25 18:59:49 +00:00			`authSource=self.authSource,`
bug fix mongodb,bigquery 2022-11-13 21:45:21 +00:00			`authMechanism=self.mechanism)`
bug fix: environment variable on default database 2022-06-25 18:59:49 +00:00
data transport framework for rabbitmq, mongodb, couchdb, ... 2019-09-17 04:08:43 +00:00			`else:`
bug fix: environment variable on default database 2022-06-25 18:59:49 +00:00			`self.client = MongoClient(self.host,maxPoolSize=10000)`
data transport framework for rabbitmq, mongodb, couchdb, ... 2019-09-17 04:08:43 +00:00
			`self.db = self.client[self.dbname]`

			`def isready(self):`
			`p = self.dbname in self.client.list_database_names()`
			`q = self.uid in self.client[self.dbname].list_collection_names()`
			`return p and q`
bug fix: environment variable on default database 2022-06-25 18:59:49 +00:00			`def setattr(self,key,value):`
bug fix mongodb,bigquery 2022-11-13 21:45:21 +00:00			`_allowed = ['host','port','db','doc','authSource','mechanism']`
bug fix: environment variable on default database 2022-06-25 18:59:49 +00:00			`if key in _allowed :`
			`setattr(self,key,value)`
			`pass`
close function for mongodb 2020-10-06 20:26:06 +00:00			`def close(self):`
bug fix with mongodb client 2020-11-16 03:19:40 +00:00			`self.client.close()`
bug fix & new feature 2022-09-19 15:01:34 +00:00			`def meta(self,**_args):`
			`return []`
data transport framework for rabbitmq, mongodb, couchdb, ... 2019-09-17 04:08:43 +00:00			`class MongoReader(Mongo,Reader):`
			`"""`
			`This class will read from a mongodb data store and return the content of a document (not a collection)`
			`"""`
			`def __init__(self,**args):`
			`Mongo.__init__(self,**args)`
support for filters in read 2020-05-18 02:57:18 +00:00			`def read(self,**args):`
bug fix 2022-08-19 20:32:35 +00:00
bug fix: mongodb read 2022-12-14 22:48:40 +00:00			`if 'mongo' in args or 'cmd' in args:`
bug fix 2020-09-26 21:53:33 +00:00			`#`
			`# @TODO:`
bug fix mongodb,bigquery 2022-11-13 21:45:21 +00:00			`cmd = args['mongo'] if 'mongo' in args else args['cmd']`
bug fix & new feature 2022-09-19 15:01:34 +00:00			`if "aggregate" in cmd :`
			`if "allowDiskUse" not in cmd :`
			`cmd["allowDiskUse"] = True`
			`if "cursor" not in cmd :`
			`cmd["cursor"] = {}`
bug fix 2020-09-26 21:53:33 +00:00			`r = []`
			`out = self.db.command(cmd)`
			`#@TODO: consider using a yield (generator) works wonders`
			`while True :`
bug fix: mongodb commands with values 2020-12-26 18:20:40 +00:00			`if 'values' in out :`
			`r += out['values']`
bug fix 2020-09-26 21:53:33 +00:00			`if 'cursor' in out :`
			`key = 'firstBatch' if 'firstBatch' in out['cursor'] else 'nextBatch'`
			`else:`
			`key = 'n'`
			`if 'cursor' in out and out['cursor'][key] :`
			`r += list(out['cursor'][key])`
bug fixes: mongodb and s3 bucket 2020-11-04 20:26:46 +00:00			`elif key in out and out[key]:`
bug fix 2020-09-26 21:53:33 +00:00			`r.append (out[key])`
			`# yield out['cursor'][key]`
			`if key not in ['firstBatch','nextBatch'] or ('cursor' in out and out['cursor']['id'] == 0) :`
			`break`
			`else:`
			`out = self.db.command({"getMore":out['cursor']['id'],"collection":out['cursor']['ns'].split(".")[-1]})`


bug fix: simplyfying factory interface 2021-11-18 21:21:26 +00:00			`return pd.DataFrame(r)`
bug fix 2020-09-26 21:53:33 +00:00			`else:`
			`collection = self.db[self.uid]`
			`_filter = args['filter'] if 'filter' in args else {}`
bug fix: environment variable on default database 2022-06-25 18:59:49 +00:00			`_df = pd.DataFrame(collection.find(_filter))`
			`columns = _df.columns.tolist()[1:]`
			`return _df[columns]`
data transport framework for rabbitmq, mongodb, couchdb, ... 2019-09-17 04:08:43 +00:00			`def view(self,**args):`
			`"""`
			`This function is designed to execute a view (map/reduce) operation`
			`"""`
			`pass`
			`class MongoWriter(Mongo,Writer):`
			`"""`
			`This class is designed to write to a mongodb collection within a database`
			`"""`
			`def __init__(self,**args):`
			`Mongo.__init__(self,**args)`
bug fixes for version 1.0.8, streamlining interface 2020-02-01 15:42:45 +00:00			`def upload(self,**args) :`
			`"""`
			`This function will upload a file to the current database (using GridFS)`
			`:param data binary stream/text to be stored`
			`:param filename filename to be used`
			`:param encoding content_encoding (default utf-8)`

			`"""`
			`if 'encoding' not in args :`
			`args['encoding'] = 'utf-8'`
			`gfs = GridFS(self.db)`
			`gfs.put(**args)`

			`def archive(self):`
			`"""`
			`This function will archive documents to the`
			`"""`
			`collection = self.db[self.uid]`
			`rows = list(collection.find())`
			`for row in rows :`
			`if type(row['_id']) == ObjectId :`
			`row['_id'] = str(row['_id'])`
			`stream = Binary(json.dumps(collection).encode())`
			`collection.delete_many({})`
			`now = "-".join([str(datetime.now().year()),str(datetime.now().month), str(datetime.now().day)])`
			`name = ".".join([self.uid,'archive',now])+".json"`
			`description = " ".join([self.uid,'archive',str(len(rows))])`
			`self.upload(filename=name,data=stream,description=description,content_type='application/json')`
			`# gfs = GridFS(self.db)`
			`# gfs.put(filename=name,description=description,data=stream,encoding='utf-8')`
			`# self.write({{"filename":name,"file":stream,"description":descriptions}})`


			`pass`
enhancement mongodb 2022-04-29 16:15:32 +00:00			`def write(self,info,**_args):`
bug fixes for version 1.0.8, streamlining interface 2020-02-01 15:42:45 +00:00			`"""`
			`This function will write to a given collection i.e add a record to a collection (no updates)`
			`@param info new record in the collection to be added`
			`"""`
data transport framework for rabbitmq, mongodb, couchdb, ... 2019-09-17 04:08:43 +00:00			`# document = self.db[self.uid].find()`
bug fix: environment variable on default database 2022-06-25 18:59:49 +00:00			`#collection = self.db[self.uid]`
bug fixes for version 1.0.8, streamlining interface 2020-02-01 15:42:45 +00:00			`# if type(info) == list :`
			`# self.db[self.uid].insert_many(info)`
			`# else:`
optimizations mongodb 2022-03-19 05:02:53 +00:00			`try:`
enhancement mongodb 2022-04-29 16:15:32 +00:00			`_uid = self.uid if 'doc' not in _args else _args['doc']`
optimizations mongodb 2022-03-19 05:02:53 +00:00			`if self._lock :`
			`Mongo.lock.acquire()`
			`if type(info) == list or type(info) == pd.DataFrame :`
enhancement mongodb 2022-04-29 16:15:32 +00:00			`self.db[_uid].insert_many(info if type(info) == list else info.to_dict(orient='records'))`
optimizations mongodb 2022-03-19 05:02:53 +00:00			`else:`
enhancement mongodb 2022-04-29 16:15:32 +00:00			`self.db[_uid].insert_one(info)`
optimizations mongodb 2022-03-19 05:02:53 +00:00			`finally:`
			`if self._lock :`
			`Mongo.lock.release()`
data transport framework for rabbitmq, mongodb, couchdb, ... 2019-09-17 04:08:43 +00:00			`def set(self,document):`
bug fixes for version 1.0.8, streamlining interface 2020-02-01 15:42:45 +00:00			`"""`
			`if no identifier is provided the function will delete the entire collection and set the new document.`
			`Please use this function with great care (archive the content first before using it... for safety)`
			`"""`

data transport framework for rabbitmq, mongodb, couchdb, ... 2019-09-17 04:08:43 +00:00			`collection = self.db[self.uid]`
bug fixes for version 1.0.8, streamlining interface 2020-02-01 15:42:45 +00:00			`if collection.count_document() > 0 and '_id' in document:`
			`id = document['_id']`
			`del document['_id']`
			`collection.find_one_and_replace({'_id':id},document)`
			`else:`
			`collection.delete_many({})`
			`self.write(info)`
bug fix with mongodb client 2020-11-16 03:19:40 +00:00			`def close(self):`
			`Mongo.close(self)`
bug fixes for version 1.0.8, streamlining interface 2020-02-01 15:42:45 +00:00			`# collecton.update_one({"_id":self.uid},document,True)`
data transport framework for rabbitmq, mongodb, couchdb, ... 2019-09-17 04:08:43 +00:00