data-transport/transport/mongo.py

179 lines
6.5 KiB
Python
Raw Normal View History

2019-09-17 17:00:45 +00:00
"""
Data Transport - 1.0
Steve L. Nyemba, The Phi Technology LLC
This file is a wrapper around mongodb for reading/writing content against a mongodb server and executing views (mapreduce)
"""
from pymongo import MongoClient
from bson.objectid import ObjectId
from bson.binary import Binary
import json
from datetime import datetime
2021-09-01 07:33:40 +00:00
import pandas as pd
import gridfs
# from transport import Reader,Writer
2019-11-05 03:51:20 +00:00
import sys
if sys.version_info[0] > 2 :
from transport.common import Reader, Writer
else:
from common import Reader, Writer
import json
2020-09-26 21:53:33 +00:00
import re
2022-03-19 05:02:53 +00:00
from multiprocessing import Lock, RLock
class Mongo:
    """
    Basic mongodb functions are captured here.

    This base class owns the client connection, the selected database and the
    name of the collection that readers/writers operate on.
    """
    # Re-entrant lock shared by every instance; writers only use it when they
    # were created with lock=True (see the `lock` argument below).
    lock = RLock()

    def __init__(self, **args):
        """
        :dbname     database name/identifier (`db` is accepted as an alias)
        :doc        name of the collection to read from/write to (required)
        :host       host and port of the database by default localhost:27017
        :username   username for authentication (`user` is accepted as an alias)
        :password   password for current user
        :lock       when truthy, writes are serialized with the class-level lock
        """
        host = args.get('host', 'localhost:27017')
        #
        # The original code tested for 'user' but read 'username', which raised
        # a KeyError as soon as only one of the two spellings was supplied.
        # Both spellings are now accepted, 'username' taking precedence.
        username = args['username'] if 'username' in args else args.get('user')
        if username is not None and 'password' in args:
            self.client = MongoClient(host,
                                      username=username,
                                      password=args['password'],
                                      authMechanism='SCRAM-SHA-256')
        else:
            self.client = MongoClient(host, maxPoolSize=10000)

        self.uid = args['doc']  # -- document identifier (used as the collection name)
        self.dbname = args['dbname'] if 'dbname' in args else args['db']
        self.db = self.client[self.dbname]
        self._lock = args.get('lock', False)

    def isready(self):
        """
        :return True when both the database and the collection exist on the server
        """
        if self.dbname not in self.client.list_database_names():
            # No need to ask for the collections of a database that does not exist
            return False
        return self.uid in self.client[self.dbname].list_collection_names()

    def close(self):
        """
        Closes the underlying client connection
        """
        self.client.close()
class MongoReader(Mongo, Reader):
    """
    This class will read from a mongodb data store and return the content of a document (not a collection)
    """
    def __init__(self, **args):
        Mongo.__init__(self, **args)

    def read(self, **args):
        """
        Reads from the database, either by running a raw command or by querying the collection.

        :mongo      raw database command (dict) passed to db.command; the result
                    is accumulated across cursor batches and returned as a pandas DataFrame
        :filter     query document used when no `mongo` command is given (default {})
        :return     pandas.DataFrame when `mongo` is provided, otherwise the
                    cursor returned by collection.find
        """
        if 'mongo' not in args:
            return self.db[self.uid].find(args.get('filter', {}))

        rows = []
        out = self.db.command(args['mongo'])
        # @TODO: consider using a yield (generator) works wonders
        while True:
            if 'values' in out:
                rows += out['values']

            cursor = out.get('cursor')
            if cursor is None:
                # Replies without a cursor: keep the 'n' field when it is set
                # and stop, there is nothing to page through.
                if out.get('n'):
                    rows.append(out['n'])
                break

            key = 'firstBatch' if 'firstBatch' in cursor else 'nextBatch'
            # .get avoids a KeyError on a cursor reply that carries no batch at all
            rows += list(cursor.get(key) or [])

            if cursor['id'] == 0:
                # A cursor id of 0 means the server has no more batches
                break
            #
            # The namespace is "<database>.<collection>" and the collection part
            # may itself contain dots (e.g. fs.files), so only the first dot
            # separates the two; taking the last segment truncated such names.
            namespace = cursor['ns']
            name = namespace.partition(".")[2] or namespace
            out = self.db.command({"getMore": cursor['id'], "collection": name})
        return pd.DataFrame(rows)

    def view(self, **args):
        """
        This function is designed to execute a view (map/reduce) operation
        NOTE: not implemented, it currently does nothing and returns None
        """
        pass
class MongoWriter(Mongo, Writer):
    """
    This class is designed to write to a mongodb collection within a database
    """
    def __init__(self, **args):
        Mongo.__init__(self, **args)

    def upload(self, **args):
        """
        This function will upload a file to the current database (using GridFS)
        :param  data        binary stream/text to be stored
        :param  filename    filename to be used
        :param  encoding    content_encoding (default utf-8)

        Any other keyword argument is forwarded as-is to GridFS.put
        """
        args.setdefault('encoding', 'utf-8')
        # Only the gridfs module is imported by this file, the bare name GridFS is undefined
        gfs = gridfs.GridFS(self.db)
        gfs.put(**args)

    def archive(self):
        """
        This function will archive the documents of the current collection to
        GridFS as a single JSON file and then empty the collection.
        The file is named <collection>.archive.<year>-<month>-<day>.json
        """
        collection = self.db[self.uid]
        rows = list(collection.find())
        for row in rows:
            # ObjectId can not be serialized by json, it is stored as a string
            if isinstance(row.get('_id'), ObjectId):
                row['_id'] = str(row['_id'])
        # Serialize the rows (not the collection handle, which json can not encode)
        stream = Binary(json.dumps(rows).encode())

        now = datetime.now()
        stamp = "-".join([str(now.year), str(now.month), str(now.day)])
        name = ".".join([self.uid, 'archive', stamp]) + ".json"
        description = " ".join([self.uid, 'archive', str(len(rows))])
        #
        # The collection is emptied only once the archive has been stored, so a
        # failed upload does not lose the data.
        self.upload(filename=name, data=stream, description=description, content_type='application/json')
        collection.delete_many({})

    def write(self, info):
        """
        This function will write to a given collection i.e add a record to a collection (no updates)
        @param info new record in the collection to be added, it can be a
                    single document, a list of documents or a pandas DataFrame
        """
        # Acquire outside of the try block: if acquiring fails there is nothing to release
        if self._lock:
            Mongo.lock.acquire()
        try:
            collection = self.db[self.uid]
            if isinstance(info, pd.DataFrame):
                records = info.to_dict(orient='records')
            elif isinstance(info, list):
                records = info
            else:
                collection.insert_one(info)
                return
            # insert_many rejects an empty batch, there is simply nothing to write
            if records:
                collection.insert_many(records)
        finally:
            if self._lock:
                Mongo.lock.release()

    def set(self, document):
        """
        if no identifier is provided the function will delete the entire collection and set the new document.
        Please use this function with great care (archive the content first before using it... for safety)

        @param document document to store; when it has an _id and the collection
                        is not empty, the document with that _id is replaced
        """
        collection = self.db[self.uid]
        if '_id' in document and collection.count_documents({}) > 0:
            # Build the replacement without _id instead of deleting the key
            # from the caller's dictionary.
            replacement = {key: value for key, value in document.items() if key != '_id'}
            collection.find_one_and_replace({'_id': document['_id']}, replacement)
        else:
            collection.delete_many({})
            self.write(document)

    def close(self):
        """
        Closes the underlying client connection
        """
        Mongo.close(self)