diff --git a/Dockerfile b/Dockerfile index f308336..d489f50 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,7 +2,7 @@ from ubuntu RUN ["apt-get","update"] RUN ["apt-get","upgrade","-y"] RUN ["apt-get","install","-y","git", "python3-dev","tmux","locales","python3-pip","python3-numpy","python3-pandas","locales"] -RUN ["pip3","install","pandas-gbq","tensorflow"] +RUN ["pip3","install","pandas-gbq","tensorflow","git+https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git@release"] RUN ["mkdir","-p","/usr/apps"] WORKDIR /usr/apps -RUN ["git","clone","https://hiplab.mc.vanderbilt.edu/git/aou/gan.git@release","aou-gan"] +RUN ["git","clone","--branch","release","https://hiplab.mc.vanderbilt.edu/git/aou/bridge.git","aou-gan"] diff --git a/README.md b/README.md index 8eb92d1..48b01aa 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,49 @@ -# bridge +## Introduction +--- + +This package is designed to generate synthetic data from an original dataset using deep learning techniques + + - Generative Adversarial Networks + - With "Earth mover's distance" + +## Installation +--- + + pip install git+https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git@release + +## Usage +--- + +After installing, the easiest way to get started is with pandas. The process is as follows: +1. Train the GAN on the original/raw dataset + + + import pandas as pd + import data.maker + + df = pd.read_csv('myfile.csv') + cols= ['f1','f2','f3'] + data.maker.train(data=df,cols=cols,logs='logs') + +2. Generate a candidate dataset from the learnt features + + + import pandas as pd + import data.maker + + df = data.maker.generate(logs='logs') + df.head() + + +## Limitations +--- + +GANs generate data under the assumption that the original data already covers the whole value space needed: + +- No new data will be created + + Assuming we have a dataset with a gender attribute with values [M,F]. 
The synthetic data will not contain gender values outside [M,F] +- Not advised for continuous values + + GANs work well on discrete values and are therefore not advised for synthesizing continuous measurements (height, blood pressure, ...)