bug fix: random shuffle improvements
This commit is contained in:
parent
ef43f20e9c
commit
b9596edd8e
|
@ -22,7 +22,7 @@ import nujson as json
|
|||
from multiprocessing import Process, RLock
|
||||
from datetime import datetime, timedelta
|
||||
from multiprocessing import Queue
|
||||
|
||||
from version import __version__
|
||||
import time
|
||||
|
||||
|
||||
|
@ -179,6 +179,7 @@ class Learner(Process):
|
|||
for name in columns :
|
||||
#
|
||||
# randomly sampling 5 elements to make sense of data-types
|
||||
|
||||
if self._df[name].size < 5 :
|
||||
continue
|
||||
_index = np.random.choice(np.arange(self._df[name].size),5,False)
|
||||
|
@ -552,27 +553,53 @@ class Shuffle(Generator):
|
|||
"""
|
||||
def __init__(self, **_args):
    """
    Set up the shuffler; when no 'data' argument is supplied, load the
    data-frame from the configured source store.

    :param _args: keyword arguments forwarded unchanged to the parent
                  constructor. If the key 'data' is absent, the frame is
                  read using self.store['source'] and self.info['sql'].
    """
    super().__init__(**_args)
    if 'data' in _args:
        # The caller supplied the data; nothing needs to be read here.
        return
    #
    # No data was handed over, so we pull it from the source store
    source = transport.factory.instance(**self.store['source'])
    self._df = source.read(sql=self.info['sql'])
|
||||
|
||||
def run(self):
|
||||
|
||||
np.random.seed(1)
|
||||
|
||||
self.initalize()
|
||||
_index = np.arange(self._df.shape[0])
|
||||
np.random.shuffle(_index)
|
||||
np.random.shuffle(_index)
|
||||
_iocolumns = self.info['columns']
|
||||
_ocolumns = list(set(self._df.columns) - set(_iocolumns) )
|
||||
# _iodf = pd.DataFrame(self._df[_ocolumns],self._df.loc[_index][_iocolumns],index=np.arange(_index.size))
|
||||
_iodf = pd.DataFrame(self._df[_iocolumns].copy(),index = np.arange(_index.size))
|
||||
# self._df = self._df.loc[_index][_ocolumns].join(_iodf)
|
||||
self._df = self._df.loc[_index][_ocolumns]
|
||||
self._df.index = np.arange(self._df.shape[0])
|
||||
self._df = self._df.join(_iodf)
|
||||
#
|
||||
# The following is a full shuffle
|
||||
self._df = self._df.loc[_index]
|
||||
self._df.index = np.arange(self._df.shape[0])
|
||||
|
||||
# If we are given lists of columns instead of a list-of-list
|
||||
# unpack the list
|
||||
_invColumns = []
|
||||
_colNames = []
|
||||
_ucolNames= []
|
||||
for _item in self.info['columns'] :
|
||||
if type(_item) == list :
|
||||
_invColumns.append(_item)
|
||||
elif _item in self._df.columns.tolist():
|
||||
_colNames.append(_item)
|
||||
#
|
||||
# At this point we build the matrix of elements we are interested in, taking into account any unspecified columns
|
||||
#
|
||||
if _colNames :
|
||||
_invColumns.append(_colNames)
|
||||
_ucolNames = list(set(self._df.columns) - set(_colNames))
|
||||
if _ucolNames :
|
||||
_invColumns += [ [_name] for _name in _ucolNames]
|
||||
|
||||
_xdf = pd.DataFrame()
|
||||
_xdf = pd.DataFrame()
|
||||
_index = np.arange(self._df.shape[0])
|
||||
|
||||
for _columns in _invColumns :
|
||||
|
||||
_tmpdf = self._df[_columns].copy()[_columns]
|
||||
np.random.shuffle(_index)
|
||||
|
||||
_tmpdf = _tmpdf.iloc[_index]
|
||||
|
||||
if _xdf.shape[0] == 0 :
|
||||
_xdf = _tmpdf
|
||||
else:
|
||||
_xdf = _xdf.join(_tmpdf)
|
||||
|
||||
_xdf = _xdf[self._df.columns]
|
||||
self._df = _xdf
|
||||
_log = {'action':'io-data','input':{'candidates':1,'rows':int(self._df.shape[0])}}
|
||||
self.log(**_log)
|
||||
try:
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
__version__='1.7.0'
|
4
setup.py
4
setup.py
|
@ -1,10 +1,10 @@
|
|||
from setuptools import setup, find_packages
|
||||
import os
|
||||
import sys
|
||||
|
||||
import version
|
||||
def read(fname):
    """
    Return the full text of a file located next to this script.

    :param fname: file name, joined onto the directory of this file
                  (an absolute path is used as-is by os.path.join)
    :return: the file content as a string
    :raises OSError: if the file cannot be opened or read
    """
    path = os.path.join(os.path.dirname(__file__), fname)
    # The context manager closes the handle as soon as the content is read,
    # rather than leaving it to the garbage collector.
    with open(path) as handle:
        return handle.read()
|
||||
args = {"name":"data-maker","version":"1.6.8",
|
||||
args = {"name":"data-maker","version":version.__version__,
|
||||
"author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vumc.org","license":"MIT",
|
||||
"packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]}
|
||||
args["install_requires"] = ['data-transport@git+https://github.com/lnyemba/data-transport.git','tensorflow']
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
data/maker/version.py
|
Loading…
Reference in New Issue