bug fix: random shuffle improvements

This commit is contained in:
Steve Nyemba 2023-07-25 14:30:20 -05:00
parent ef43f20e9c
commit b9596edd8e
4 changed files with 47 additions and 18 deletions

View File

@ -22,7 +22,7 @@ import nujson as json
from multiprocessing import Process, RLock from multiprocessing import Process, RLock
from datetime import datetime, timedelta from datetime import datetime, timedelta
from multiprocessing import Queue from multiprocessing import Queue
from version import __version__
import time import time
@ -179,6 +179,7 @@ class Learner(Process):
for name in columns : for name in columns :
# #
# randomly sampling 5 elements to make sense of data-types # randomly sampling 5 elements to make sense of data-types
if self._df[name].size < 5 : if self._df[name].size < 5 :
continue continue
_index = np.random.choice(np.arange(self._df[name].size),5,False) _index = np.random.choice(np.arange(self._df[name].size),5,False)
@ -552,27 +553,53 @@ class Shuffle(Generator):
""" """
def __init__(self,**_args): def __init__(self,**_args):
super().__init__(**_args) super().__init__(**_args)
if 'data' not in _args :
reader = transport.factory.instance(**self.store['source'])
self._df = reader.read(sql=self.info['sql'])
def run(self): def run(self):
np.random.seed(1) np.random.seed(1)
self.initalize() self.initalize()
_index = np.arange(self._df.shape[0])
np.random.shuffle(_index)
np.random.shuffle(_index)
_iocolumns = self.info['columns']
_ocolumns = list(set(self._df.columns) - set(_iocolumns) )
# _iodf = pd.DataFrame(self._df[_ocolumns],self._df.loc[_index][_iocolumns],index=np.arange(_index.size))
_iodf = pd.DataFrame(self._df[_iocolumns].copy(),index = np.arange(_index.size))
# self._df = self._df.loc[_index][_ocolumns].join(_iodf)
self._df = self._df.loc[_index][_ocolumns]
self._df.index = np.arange(self._df.shape[0])
self._df = self._df.join(_iodf)
# #
# The following is a full shuffle # If we are given lists of columns instead of a list-of-list
self._df = self._df.loc[_index] # unpack the list
self._df.index = np.arange(self._df.shape[0]) _invColumns = []
_colNames = []
_ucolNames= []
for _item in self.info['columns'] :
if type(_item) == list :
_invColumns.append(_item)
elif _item in self._df.columns.tolist():
_colNames.append(_item)
#
# At this point we build the matrix of elements we are interested in, taking any unspecified columns into account
#
if _colNames :
_invColumns.append(_colNames)
_ucolNames = list(set(self._df.columns) - set(_colNames))
if _ucolNames :
_invColumns += [ [_name] for _name in _ucolNames]
_xdf = pd.DataFrame()
_xdf = pd.DataFrame()
_index = np.arange(self._df.shape[0])
for _columns in _invColumns :
_tmpdf = self._df[_columns].copy()[_columns]
np.random.shuffle(_index)
_tmpdf = _tmpdf.iloc[_index]
if _xdf.shape[0] == 0 :
_xdf = _tmpdf
else:
_xdf = _xdf.join(_tmpdf)
_xdf = _xdf[self._df.columns]
self._df = _xdf
_log = {'action':'io-data','input':{'candidates':1,'rows':int(self._df.shape[0])}} _log = {'action':'io-data','input':{'candidates':1,'rows':int(self._df.shape[0])}}
self.log(**_log) self.log(**_log)
try: try:

1
data/maker/version.py Normal file
View File

@ -0,0 +1 @@
__version__='1.7.0'

View File

@ -1,10 +1,10 @@
from setuptools import setup, find_packages from setuptools import setup, find_packages
import os import os
import sys import sys
import version
def read(fname): def read(fname):
return open(os.path.join(os.path.dirname(__file__), fname)).read() return open(os.path.join(os.path.dirname(__file__), fname)).read()
args = {"name":"data-maker","version":"1.6.8", args = {"name":"data-maker","version":version.__version__,
"author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vumc.org","license":"MIT", "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vumc.org","license":"MIT",
"packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]}
args["install_requires"] = ['data-transport@git+https://github.com/lnyemba/data-transport.git','tensorflow'] args["install_requires"] = ['data-transport@git+https://github.com/lnyemba/data-transport.git','tensorflow']

1
version.py Symbolic link
View File

@ -0,0 +1 @@
data/maker/version.py