2019-09-17 17:00:45 +00:00
|
|
|
"""
|
|
|
|
Data Transport - 1.0
Steve L. Nyemba, The Phi Technology LLC

This file is a wrapper around MongoDB for reading/writing content against a MongoDB server and executing views (mapreduce)
|
|
|
|
"""
|
2020-02-01 15:42:45 +00:00
|
|
|
from pymongo import MongoClient
|
|
|
|
from bson.objectid import ObjectId
|
|
|
|
from bson.binary import Binary
|
2023-01-19 03:13:38 +00:00
|
|
|
# import nujson as json
|
2020-02-01 15:42:45 +00:00
|
|
|
from datetime import datetime
|
2021-09-01 07:33:40 +00:00
|
|
|
import pandas as pd
|
2022-09-19 15:01:34 +00:00
|
|
|
import numpy as np
|
2020-02-01 15:42:45 +00:00
|
|
|
import gridfs
|
2019-09-17 04:08:43 +00:00
|
|
|
# from transport import Reader,Writer
|
2019-11-05 03:51:20 +00:00
|
|
|
import sys
|
|
|
|
if sys.version_info[0] > 2 :
|
|
|
|
from transport.common import Reader, Writer
|
|
|
|
else:
|
|
|
|
from common import Reader, Writer
|
2019-09-17 04:08:43 +00:00
|
|
|
import json
|
2020-09-26 21:53:33 +00:00
|
|
|
import re
|
2022-03-19 05:02:53 +00:00
|
|
|
from multiprocessing import Lock, RLock
|
2019-09-17 04:08:43 +00:00
|
|
|
class Mongo :
    """
    Basic mongodb functions are captured here

    This base class turns keyword arguments (optionally merged with a JSON
    "auth_file") into a pymongo client (self.client), a database handle
    (self.db) and a default collection name (self.uid).
    """
    # Re-entrant lock shared by every instance of the class; it is only used
    # by code that checks the per-instance "_lock" flag.
    lock = RLock()

    def __init__(self,**args):
        """
        :db | dbname    database name/identifier ('db' wins when both are given)
        :host           host and port of the database by default localhost:27017
        :username       username for authentication
        :password       password for current user
        :authSource     database holding the credentials, by default the database name
        :mechanism      authentication mechanism, by default SCRAM-SHA-256
        :auth_file      path to a JSON file whose entries override the keyword arguments
        :lock           when truthy, writers serialize their inserts with Mongo.lock
        :table | doc | collection   name of the default collection (stored as self.uid)

        Raises AttributeError when no database name is provided.
        """
        self.mechanism = args.get('mechanism','SCRAM-SHA-256')
        self._lock = args.get('lock',False)

        if 'auth_file' in args :
            # The context manager closes the handle even when parsing fails
            with open(args['auth_file']) as _stream :
                _info = json.load(_stream)
        else:
            _info = {}
        # Entries of the auth file take precedence over keyword arguments
        _args = dict(args,**_info)
        username = _args.get('username')
        password = _args.get('password')
        for key,value in _args.items() :
            # Credentials are handed to the client only, never stored on the instance
            if key not in ('username','password') :
                self.setattr(key,value)
        #
        # Let us perform aliasing in order to remain backwards compatible
        # NOTE: at this point self.db (if set) is still the database *name*;
        # it is replaced by the database handle at the end of this function.
        if hasattr(self,'db') :
            self.dbname = self.db
        elif not hasattr(self,'dbname') :
            raise AttributeError("A database name is required, provide either 'db' or 'dbname'")
        if not hasattr(self,'host') :
            self.host = 'localhost:27017'
        self.uid = next((_args[_name] for _name in ('table','doc','collection') if _name in _args),None)

        if username and password :
            self.client = MongoClient(self.host,
                                      username=username,
                                      password=password,
                                      authSource=getattr(self,'authSource',self.dbname),
                                      authMechanism=self.mechanism)
        else:
            self.client = MongoClient(self.host,maxPoolSize=10000)

        self.db = self.client[self.dbname]

    def isready(self):
        """
        Return True when both the database and the default collection (self.uid)
        are listed by the server.
        """
        p = self.dbname in self.client.list_database_names()
        q = self.uid in self.client[self.dbname].list_collection_names()
        return p and q

    def setattr(self,key,value):
        """
        Store a configuration entry on the instance, silently ignoring any key
        that is not part of the whitelist below.
        :key    name of the attribute
        :value  value of the attribute
        """
        _allowed = ['host','port','db','dbname','doc','authSource','mechanism']
        if key in _allowed :
            setattr(self,key,value)

    def close(self):
        """
        Close the underlying pymongo client.
        """
        self.client.close()

    def meta(self,**_args):
        """
        Placeholder for metadata about the store, currently always an empty list.
        """
        return []
|
2019-09-17 04:08:43 +00:00
|
|
|
class MongoReader(Mongo,Reader):
    """
    This class will read from a mongodb data store and return the content of a document (not a collection)
    """
    def __init__(self,**args):
        Mongo.__init__(self,**args)

    def read(self,**args):
        """
        Read from the database and return the result as a pandas DataFrame.

        :mongo | cmd    database command (e.g. aggregate, distinct, count) executed
                        with self.db.command; 'mongo' wins when both are given
        :filter         query used against the default collection (self.uid) when
                        no command is provided, by default {} i.e every document

        Without a command the '_id' column is removed from the returned frame.
        """
        if 'mongo' in args or 'cmd' in args :
            cmd = args['mongo'] if 'mongo' in args else args['cmd']
            return pd.DataFrame(self._run(cmd))

        collection = self.db[self.uid]
        _filter = args.get('filter',{})
        _df = pd.DataFrame(list(collection.find(_filter)))
        # Drop the identifier by name rather than by position
        return _df.drop(columns=['_id'],errors='ignore')

    def _run(self,cmd):
        """
        Execute a database command and gather every row of its reply, following
        the server side cursor with getMore until it is exhausted.
        """
        if isinstance(cmd,dict) and 'aggregate' in cmd :
            # Work on a copy so the defaults are not written into the caller's command
            cmd = cmd.copy()
            cmd.setdefault('allowDiskUse',True)
            cmd.setdefault('cursor',{})

        rows = []
        out = self.db.command(cmd)
        #@TODO: consider using a yield (generator) works wonders
        while True :
            if 'values' in out :
                rows += out['values']
            cursor = out.get('cursor')
            if cursor is None :
                # Reply without a cursor: keep the "n" entry when it is set and stop
                if out.get('n') :
                    rows.append(out['n'])
                break
            key = 'firstBatch' if 'firstBatch' in cursor else 'nextBatch'
            rows += list(cursor.get(key) or [])
            if cursor['id'] == 0 :
                # A cursor id of zero is treated as "no more batches"
                break
            out = self.db.command({"getMore":cursor['id'],"collection":self._collection(cursor['ns'])})
        return rows

    @staticmethod
    def _collection(namespace):
        """
        Return the collection part of a "<database>.<collection>" namespace.
        Only the first dot is used as a separator so that collection names
        that contain dots themselves (e.g. fs.files) are preserved.
        """
        return namespace.split(".",1)[-1]

    def view(self,**args):
        """
        This function is designed to execute a view (map/reduce) operation
        NOTE: not implemented, the function does nothing and returns None
        """
        pass
|
|
|
|
class MongoWriter(Mongo,Writer):
    """
    This class is designed to write to a mongodb collection within a database
    """
    def __init__(self,**args):
        Mongo.__init__(self,**args)

    def upload(self,**args) :
        """
        This function will upload a file to the current database (using GridFS)
        :param  data        binary stream/text to be stored
        :param  filename    filename to be used
        :param  encoding    content_encoding (default utf-8)

        Every keyword argument is forwarded as is to GridFS.put
        """
        if 'encoding' not in args :
            args['encoding'] = 'utf-8'
        # Only the gridfs module is imported by this file, hence the qualified name
        gfs = gridfs.GridFS(self.db)
        gfs.put(**args)

    def archive(self):
        """
        This function will archive the documents of the default collection (self.uid):
        they are serialized to JSON, uploaded with GridFS under the name
        <collection>.archive.<year>-<month>-<day>.json and then removed from the collection
        """
        collection = self.db[self.uid]
        rows = list(collection.find())
        for row in rows :
            # ObjectId is not JSON serializable, its string form is stored instead
            if isinstance(row.get('_id'),ObjectId) :
                row['_id'] = str(row['_id'])
        stream = Binary(json.dumps(rows).encode())
        today = datetime.now()
        now = "-".join([str(today.year),str(today.month),str(today.day)])
        name = ".".join([self.uid,'archive',now])+".json"
        description = " ".join([self.uid,'archive',str(len(rows))])
        #
        # The collection is emptied only once the archive has been stored, so a
        # failed serialization/upload does not lose any document
        self.upload(filename=name,data=stream,description=description,content_type='application/json')
        collection.delete_many({})

    def write(self,info,**_args):
        """
        This function will write to a given collection i.e add a record to a collection (no updates)
        @param info new record in the collection to be added (a dictionary),
                    or several records as a list or a pandas DataFrame
        @param doc  optional name of the collection to write to (default self.uid)

        An empty list/DataFrame is a no-op.
        """
        _uid = self.uid if 'doc' not in _args else _args['doc']
        if isinstance(info,pd.DataFrame) :
            info = info.to_dict(orient='records')
        _many = isinstance(info,list)
        if _many and not info :
            # insert_many refuses an empty list of documents, there is nothing to write
            return
        #
        # The lock is taken before entering the try block so that the finally
        # clause never releases a lock that was not acquired
        if self._lock :
            Mongo.lock.acquire()
        try:
            if _many :
                self.db[_uid].insert_many(info)
            else:
                self.db[_uid].insert_one(info)
        finally:
            if self._lock :
                Mongo.lock.release()

    def set(self,document):
        """
        if no identifier is provided the function will delete the entire collection and set the new document.
        Please use this function with great care (archive the content first before using it... for safety)

        @param document document to store; when it has an '_id' and the collection
                        is not empty, the document with that '_id' is replaced
        """
        collection = self.db[self.uid]
        if '_id' in document and collection.count_documents({}) > 0 :
            # The replacement is built without '_id', the caller's document is left untouched
            _replacement = {_key:_value for _key,_value in document.items() if _key != '_id'}
            collection.find_one_and_replace({'_id':document['_id']},_replacement)
        else:
            collection.delete_many({})
            self.write(document)

    def close(self):
        """
        Close the underlying pymongo client.
        """
        Mongo.close(self)
|
2020-02-01 15:42:45 +00:00
|
|
|
# collecton.update_one({"_id":self.uid},document,True)
|
2019-09-17 04:08:43 +00:00
|
|
|
|