bug fix: random shuffle improvements

This commit is contained in:
Steve Nyemba 2023-07-25 14:30:20 -05:00
parent ef43f20e9c
commit b9596edd8e
4 changed files with 47 additions and 18 deletions

View File

@ -22,7 +22,7 @@ import nujson as json
from multiprocessing import Process, RLock
from datetime import datetime, timedelta
from multiprocessing import Queue
from version import __version__
import time
@ -179,6 +179,7 @@ class Learner(Process):
for name in columns :
#
# randomly sampling 5 elements to make sense of data-types
if self._df[name].size < 5 :
continue
_index = np.random.choice(np.arange(self._df[name].size),5,False)
@ -552,27 +553,53 @@ class Shuffle(Generator):
"""
def __init__(self,**_args):
super().__init__(**_args)
if 'data' not in _args :
reader = transport.factory.instance(**self.store['source'])
self._df = reader.read(sql=self.info['sql'])
def run(self):
np.random.seed(1)
self.initalize()
_index = np.arange(self._df.shape[0])
np.random.shuffle(_index)
np.random.shuffle(_index)
_iocolumns = self.info['columns']
_ocolumns = list(set(self._df.columns) - set(_iocolumns) )
# _iodf = pd.DataFrame(self._df[_ocolumns],self._df.loc[_index][_iocolumns],index=np.arange(_index.size))
_iodf = pd.DataFrame(self._df[_iocolumns].copy(),index = np.arange(_index.size))
# self._df = self._df.loc[_index][_ocolumns].join(_iodf)
self._df = self._df.loc[_index][_ocolumns]
self._df.index = np.arange(self._df.shape[0])
self._df = self._df.join(_iodf)
#
# The following is a full shuffle
self._df = self._df.loc[_index]
self._df.index = np.arange(self._df.shape[0])
# If we are given lists of columns instead of a list-of-list
# unpack the list
_invColumns = []
_colNames = []
_ucolNames= []
for _item in self.info['columns'] :
if type(_item) == list :
_invColumns.append(_item)
elif _item in self._df.columns.tolist():
_colNames.append(_item)
#
# At this point we build the matrix of elements we are interested in considering the any unspecified column
#
if _colNames :
_invColumns.append(_colNames)
_ucolNames = list(set(self._df.columns) - set(_colNames))
if _ucolNames :
_invColumns += [ [_name] for _name in _ucolNames]
_xdf = pd.DataFrame()
_xdf = pd.DataFrame()
_index = np.arange(self._df.shape[0])
for _columns in _invColumns :
_tmpdf = self._df[_columns].copy()[_columns]
np.random.shuffle(_index)
_tmpdf = _tmpdf.iloc[_index]
if _xdf.shape[0] == 0 :
_xdf = _tmpdf
else:
_xdf = _xdf.join(_tmpdf)
_xdf = _xdf[self._df.columns]
self._df = _xdf
_log = {'action':'io-data','input':{'candidates':1,'rows':int(self._df.shape[0])}}
self.log(**_log)
try:

1
data/maker/version.py Normal file
View File

@ -0,0 +1 @@
__version__='1.7.0'

View File

@ -1,10 +1,10 @@
from setuptools import setup, find_packages
import os
import sys
import version
def read(fname):
return open(os.path.join(os.path.dirname(__file__), fname)).read()
args = {"name":"data-maker","version":"1.6.8",
args = {"name":"data-maker","version":version.__version__,
"author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vumc.org","license":"MIT",
"packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]}
args["install_requires"] = ['data-transport@git+https://github.com/lnyemba/data-transport.git','tensorflow']

1
version.py Symbolic link
View File

@ -0,0 +1 @@
data/maker/version.py