diff --git a/Impala/DataImport/OMOP_CDM_synpuf_load_Impala.sql b/Impala/DataImport/OMOP_CDM_synpuf_load_Impala.sql new file mode 100644 index 0000000..59c6fa4 --- /dev/null +++ b/Impala/DataImport/OMOP_CDM_synpuf_load_Impala.sql @@ -0,0 +1,15 @@ +LOAD DATA INPATH '${VAR:OMOP_SYNPUF_PATH}/CDM_CARE_SITE.csv' OVERWRITE INTO TABLE care_site; +LOAD DATA INPATH '${VAR:OMOP_SYNPUF_PATH}/CDM_CONDITION_OCCURRENCE.csv' OVERWRITE INTO TABLE condition_occurrence; +LOAD DATA INPATH '${VAR:OMOP_SYNPUF_PATH}/CDM_DEATH.csv' OVERWRITE INTO TABLE death; +LOAD DATA INPATH '${VAR:OMOP_SYNPUF_PATH}/CDM_DRUG_EXPOSURE.csv' OVERWRITE INTO TABLE drug_exposure; +LOAD DATA INPATH '${VAR:OMOP_SYNPUF_PATH}/CDM_DEVICE_EXPOSURE.csv' OVERWRITE INTO TABLE device_exposure; +LOAD DATA INPATH '${VAR:OMOP_SYNPUF_PATH}/CDM_LOCATION.csv' OVERWRITE INTO TABLE `location`; +LOAD DATA INPATH '${VAR:OMOP_SYNPUF_PATH}/CDM_MEASUREMENT.csv' OVERWRITE INTO TABLE measurement; +LOAD DATA INPATH '${VAR:OMOP_SYNPUF_PATH}/CDM_OBSERVATION.csv' OVERWRITE INTO TABLE observation; +LOAD DATA INPATH '${VAR:OMOP_SYNPUF_PATH}/CDM_OBSERVATION_PERIOD.csv' OVERWRITE INTO TABLE observation_period; +LOAD DATA INPATH '${VAR:OMOP_SYNPUF_PATH}/CDM_PERSON.csv' OVERWRITE INTO TABLE person; +LOAD DATA INPATH '${VAR:OMOP_SYNPUF_PATH}/CDM_PROCEDURE_OCCURRENCE.csv' OVERWRITE INTO TABLE procedure_occurrence; +LOAD DATA INPATH '${VAR:OMOP_SYNPUF_PATH}/CDM_PROVIDER.csv' OVERWRITE INTO TABLE provider; +LOAD DATA INPATH '${VAR:OMOP_SYNPUF_PATH}/CDM_VISIT_OCCURRENCE.csv' OVERWRITE INTO TABLE visit_occurrence; +LOAD DATA INPATH '${VAR:OMOP_SYNPUF_PATH}/CDM_DRUG_ERA.csv' OVERWRITE INTO TABLE drug_era; +LOAD DATA INPATH '${VAR:OMOP_SYNPUF_PATH}/CDM_CONDITION_ERA.csv' OVERWRITE INTO TABLE condition_era; diff --git a/Impala/OMOP_CDM_ddl_Impala.sql b/Impala/OMOP_CDM_ddl_Impala.sql new file mode 100644 index 0000000..ab1bdf5 --- /dev/null +++ b/Impala/OMOP_CDM_ddl_Impala.sql @@ -0,0 +1,763 @@ +/********************************************************************************* +# Copyright 2014-6 Observational Health Data Sciences and Informatics +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +********************************************************************************/ + +/************************ + + ####### # # ####### ###### ##### ###### # # ####### + # # ## ## # # # # # # # # ## ## # # # + # # # # # # # # # # # # # # # # # # # # + # # # # # # # ###### # # # # # # # # ###### + # # # # # # # # # # # # # # # + # # # # # # # # # # # # # # # # # + ####### # # ####### # ##### ###### # # ## ##### + + +script to create OMOP common data model, version 5.0 for Hadoop (Hive/Impala) database + +Based on the PostgreSQL version, with the following changes: +* NULL/NOT NULL is not used. +* Dates are stored as VARCHAR(8). See http://stackoverflow.com/questions/33024309/convert-yyyymmdd-string-to-date-in-impala for how to treat date columns as dates. +* PostgreSQL NUMERIC is stored as DOUBLE for amounts, and DECIMAL(19,4) for prices. +* PostgreSQL TEXT is stored as STRING. + +*************************/ + + +/************************ + +Standardized vocabulary + +************************/ + + +CREATE TABLE concept ( + concept_id INTEGER, + concept_name VARCHAR(255), + domain_id VARCHAR(20), + vocabulary_id VARCHAR(20), + concept_class_id VARCHAR(20), + standard_concept VARCHAR(1), + concept_code VARCHAR(50), + valid_start_date VARCHAR(8), -- DATE + valid_end_date VARCHAR(8), -- DATE + invalid_reason VARCHAR(1) +) +ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' +TBLPROPERTIES ("skip.header.line.count"="1") +; + + + + +CREATE TABLE vocabulary ( + vocabulary_id VARCHAR(20), + vocabulary_name VARCHAR(255), + vocabulary_reference VARCHAR(255), + vocabulary_version VARCHAR(255), + vocabulary_concept_id INTEGER +) +ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' +TBLPROPERTIES ("skip.header.line.count"="1") +; + + + + +CREATE TABLE domain ( + domain_id VARCHAR(20), + domain_name VARCHAR(255), + domain_concept_id INTEGER +) +ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' +TBLPROPERTIES ("skip.header.line.count"="1") +; + + + +CREATE TABLE concept_class ( + concept_class_id VARCHAR(20), + concept_class_name VARCHAR(255), + concept_class_concept_id INTEGER +) +ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' +TBLPROPERTIES ("skip.header.line.count"="1") +; + + + + +CREATE TABLE concept_relationship ( + concept_id_1 INTEGER, + concept_id_2 INTEGER, + relationship_id VARCHAR(20), + valid_start_date VARCHAR(8), -- DATE + valid_end_date VARCHAR(8), -- DATE + invalid_reason VARCHAR(1) +) +ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' +TBLPROPERTIES ("skip.header.line.count"="1") +; + + + +CREATE TABLE relationship ( + relationship_id VARCHAR(20), + relationship_name VARCHAR(255), + is_hierarchical VARCHAR(1), + defines_ancestry VARCHAR(1), + reverse_relationship_id VARCHAR(20), + relationship_concept_id INTEGER +) +ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' +TBLPROPERTIES ("skip.header.line.count"="1") +; + + +CREATE TABLE concept_synonym ( + concept_id INTEGER, + concept_synonym_name VARCHAR(1000), + language_concept_id INTEGER +) +ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' +TBLPROPERTIES ("skip.header.line.count"="1") +; + + +CREATE TABLE concept_ancestor ( + ancestor_concept_id INTEGER, + descendant_concept_id INTEGER, + min_levels_of_separation INTEGER, + max_levels_of_separation INTEGER +) +ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' +TBLPROPERTIES ("skip.header.line.count"="1") +; + + + +CREATE TABLE source_to_concept_map ( + source_code VARCHAR(50), + source_concept_id INTEGER, + source_vocabulary_id VARCHAR(20), + source_code_description VARCHAR(255), + target_concept_id INTEGER, + target_vocabulary_id VARCHAR(20), + valid_start_date VARCHAR(8), -- DATE + valid_end_date VARCHAR(8), -- DATE + invalid_reason VARCHAR(1) +) +ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' +TBLPROPERTIES ("skip.header.line.count"="1") +; + + + + +CREATE TABLE drug_strength ( + drug_concept_id INTEGER, + ingredient_concept_id INTEGER, + amount_value DOUBLE, -- NUMERIC + amount_unit_concept_id INTEGER, + numerator_value DOUBLE, -- NUMERIC + numerator_unit_concept_id INTEGER, + denominator_value DOUBLE, -- NUMERIC + denominator_unit_concept_id INTEGER, + box_size INTEGER, + valid_start_date VARCHAR(8), -- DATE + valid_end_date VARCHAR(8), -- DATE + invalid_reason VARCHAR(1) +) +ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' +TBLPROPERTIES ("skip.header.line.count"="1") +; + + + +CREATE TABLE cohort_definition ( + cohort_definition_id INTEGER, + cohort_definition_name VARCHAR(255), + cohort_definition_description STRING, -- TEXT + definition_type_concept_id INTEGER, + cohort_definition_syntax STRING, -- TEXT + subject_concept_id INTEGER, + cohort_initiation_date VARCHAR(8) -- DATE +) +; + + +CREATE TABLE attribute_definition ( + attribute_definition_id INTEGER, + attribute_name VARCHAR(255), + attribute_description STRING, -- TEXT + attribute_type_concept_id INTEGER, + attribute_syntax STRING -- TEXT +) +; + +/************************** + +Standardized meta-data + +***************************/ + + +CREATE TABLE cdm_source + ( + cdm_source_name VARCHAR(255), + cdm_source_abbreviation VARCHAR(25), + cdm_holder VARCHAR(255), + source_description STRING, -- TEXT + source_documentation_reference VARCHAR(255), + cdm_etl_reference VARCHAR(255), + source_release_date VARCHAR(8), -- DATE + cdm_release_date VARCHAR(8), -- DATE + cdm_version VARCHAR(10), + vocabulary_version VARCHAR(20) + ) +; + + + + + + +/************************ + +Standardized clinical data + +************************/ + + +CREATE TABLE person ( + person_id INTEGER, + gender_concept_id INTEGER, + year_of_birth INTEGER, + month_of_birth INTEGER, + day_of_birth INTEGER, + time_of_birth VARCHAR(10), + race_concept_id INTEGER, + ethnicity_concept_id INTEGER, + location_id INTEGER, + provider_id INTEGER, + care_site_id INTEGER, + person_source_value VARCHAR(50), + gender_source_value VARCHAR(50), + gender_source_concept_id INTEGER, + race_source_value VARCHAR(50), + race_source_concept_id INTEGER, + ethnicity_source_value VARCHAR(50), + ethnicity_source_concept_id INTEGER +) +ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' +TBLPROPERTIES ("skip.header.line.count"="1") +; + + + + + +CREATE TABLE observation_period ( + observation_period_id INTEGER, + person_id INTEGER, + observation_period_start_date VARCHAR(8), -- DATE + observation_period_end_date VARCHAR(8), -- DATE + period_type_concept_id INTEGER +) +ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' +TBLPROPERTIES ("skip.header.line.count"="1") +; + + + +CREATE TABLE specimen ( + specimen_id INTEGER, + person_id INTEGER, + specimen_concept_id INTEGER, + specimen_type_concept_id INTEGER, + specimen_date VARCHAR(8), -- DATE + specimen_time VARCHAR(10), + quantity DOUBLE, -- NUMERIC + unit_concept_id INTEGER, + anatomic_site_concept_id INTEGER, + disease_status_concept_id INTEGER, + specimen_source_id VARCHAR(50), + specimen_source_value VARCHAR(50), + unit_source_value VARCHAR(50), + anatomic_site_source_value VARCHAR(50), + disease_status_source_value VARCHAR(50) +) +; + + + +CREATE TABLE death ( + person_id INTEGER, + death_date VARCHAR(8), -- DATE + death_type_concept_id INTEGER, + cause_concept_id INTEGER, + cause_source_value VARCHAR(50), + cause_source_concept_id INTEGER +) +ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' +TBLPROPERTIES ("skip.header.line.count"="1") +; + + + +CREATE TABLE visit_occurrence ( + visit_occurrence_id INTEGER, + person_id INTEGER, + visit_concept_id INTEGER, + visit_start_date VARCHAR(8), -- DATE + visit_start_time VARCHAR(10), + visit_end_date VARCHAR(8), -- DATE + visit_end_time VARCHAR(10), + visit_type_concept_id INTEGER, + provider_id INTEGER, + care_site_id INTEGER, + visit_source_value VARCHAR(50), + visit_source_concept_id INTEGER +) +ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' +TBLPROPERTIES ("skip.header.line.count"="1") +; + + + +CREATE TABLE procedure_occurrence ( + procedure_occurrence_id INTEGER, + person_id INTEGER, + procedure_concept_id INTEGER, + procedure_date VARCHAR(8), -- DATE + procedure_type_concept_id INTEGER, + modifier_concept_id INTEGER, + quantity INTEGER, + provider_id INTEGER, + visit_occurrence_id INTEGER, + procedure_source_value VARCHAR(50), + procedure_source_concept_id INTEGER, + qualifier_source_value VARCHAR(50) +) +ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' +TBLPROPERTIES ("skip.header.line.count"="1") +; + + + +CREATE TABLE drug_exposure ( + drug_exposure_id INTEGER, + person_id INTEGER, + drug_concept_id INTEGER, + drug_exposure_start_date VARCHAR(8), -- DATE + drug_exposure_end_date VARCHAR(8), -- DATE + drug_type_concept_id INTEGER, + stop_reason VARCHAR(20), + refills INTEGER, + quantity DOUBLE, -- NUMERIC + days_supply INTEGER, + sig STRING, -- TEXT + route_concept_id INTEGER, + effective_drug_dose DOUBLE, -- NUMERIC + dose_unit_concept_id INTEGER, + lot_number VARCHAR(50), + provider_id INTEGER, + visit_occurrence_id INTEGER, + drug_source_value VARCHAR(50), + drug_source_concept_id INTEGER, + route_source_value VARCHAR(50), + dose_unit_source_value VARCHAR(50) +) +ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' +TBLPROPERTIES ("skip.header.line.count"="1") +; + + +CREATE TABLE device_exposure ( + device_exposure_id INTEGER, + person_id INTEGER, + device_concept_id INTEGER, + device_exposure_start_date VARCHAR(8), -- DATE + device_exposure_end_date VARCHAR(8), -- DATE + device_type_concept_id INTEGER, + unique_device_id VARCHAR(50), + quantity INTEGER, + provider_id INTEGER, + visit_occurrence_id INTEGER, + device_source_value VARCHAR(100), + device_source_concept_id INTEGER +) +ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' +TBLPROPERTIES ("skip.header.line.count"="1") +; + + +CREATE TABLE condition_occurrence ( + condition_occurrence_id INTEGER, + person_id INTEGER, + condition_concept_id INTEGER, + condition_start_date VARCHAR(8), -- DATE + condition_end_date VARCHAR(8), -- DATE + condition_type_concept_id INTEGER, + stop_reason VARCHAR(20), + provider_id INTEGER, + visit_occurrence_id INTEGER, + condition_source_value VARCHAR(50), + condition_source_concept_id INTEGER +) +ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' +TBLPROPERTIES ("skip.header.line.count"="1") +; + + + +CREATE TABLE measurement ( + measurement_id INTEGER, + person_id INTEGER, + measurement_concept_id INTEGER, + measurement_date VARCHAR(8), -- DATE + measurement_time VARCHAR(10), + measurement_type_concept_id INTEGER, + operator_concept_id INTEGER, + value_as_number DOUBLE, -- NUMERIC + value_as_concept_id INTEGER, + unit_concept_id INTEGER, + range_low DOUBLE, -- NUMERIC + range_high DOUBLE, -- NUMERIC + provider_id INTEGER, + visit_occurrence_id INTEGER, + measurement_source_value VARCHAR(50), + measurement_source_concept_id INTEGER, + unit_source_value VARCHAR(50), + value_source_value VARCHAR(50) +) +ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' +TBLPROPERTIES ("skip.header.line.count"="1") +; + + + +CREATE TABLE note ( + note_id INTEGER, + person_id INTEGER, + note_date VARCHAR(8), -- DATE + note_time VARCHAR(10), + note_type_concept_id INTEGER, + note_text STRING, -- TEXT + provider_id INTEGER, + visit_occurrence_id INTEGER, + note_source_value VARCHAR(50) +) +; + + + +CREATE TABLE observation ( + observation_id INTEGER, + person_id INTEGER, + observation_concept_id INTEGER, + observation_date VARCHAR(8), -- DATE + observation_time VARCHAR(10), + observation_type_concept_id INTEGER, + value_as_number DOUBLE, -- NUMERIC + value_as_string VARCHAR(60), + value_as_concept_id INTEGER, + qualifier_concept_id INTEGER, + unit_concept_id INTEGER, + provider_id INTEGER, + visit_occurrence_id INTEGER, + observation_source_value VARCHAR(50), + observation_source_concept_id INTEGER, + unit_source_value VARCHAR(50), + qualifier_source_value VARCHAR(50) +) +ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' +TBLPROPERTIES ("skip.header.line.count"="1") +; + + + +CREATE TABLE fact_relationship ( + domain_concept_id_1 INTEGER, + fact_id_1 INTEGER, + domain_concept_id_2 INTEGER, + fact_id_2 INTEGER, + relationship_concept_id INTEGER +) +; + + + + +/************************ + +Standardized health system data + +************************/ + + + +CREATE TABLE `location` ( + location_id INTEGER, + address_1 VARCHAR(50), + address_2 VARCHAR(50), + city VARCHAR(50), + state VARCHAR(2), + zip VARCHAR(9), + county VARCHAR(20), + location_source_value VARCHAR(50) +) +ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' +TBLPROPERTIES ("skip.header.line.count"="1") +; + + + +CREATE TABLE care_site ( + care_site_id INTEGER, + care_site_name VARCHAR(255), + place_of_service_concept_id INTEGER, + location_id INTEGER, + care_site_source_value VARCHAR(50), + place_of_service_source_value VARCHAR(50) +) +ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' +TBLPROPERTIES ("skip.header.line.count"="1") +; + + + +CREATE TABLE provider ( + provider_id INTEGER, + provider_name VARCHAR(255), + NPI VARCHAR(20), + DEA VARCHAR(20), + specialty_concept_id INTEGER, + care_site_id INTEGER, + year_of_birth INTEGER, + gender_concept_id INTEGER, + provider_source_value VARCHAR(50), + specialty_source_value VARCHAR(50), + specialty_source_concept_id INTEGER, + gender_source_value VARCHAR(50), + gender_source_concept_id INTEGER +) +ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' +TBLPROPERTIES ("skip.header.line.count"="1") +; + + + + +/************************ + +Standardized health economics + +************************/ + + +CREATE TABLE payer_plan_period ( + payer_plan_period_id INTEGER, + person_id INTEGER, + payer_plan_period_start_date VARCHAR(8), -- DATE + payer_plan_period_end_date VARCHAR(8), -- DATE + payer_source_value VARCHAR (50), + plan_source_value VARCHAR (50), + family_source_value VARCHAR (50) +) +; + + +/* The individual cost tables are being phased out and will disappear soon + +CREATE TABLE visit_cost ( + visit_cost_id INTEGER, + visit_occurrence_id INTEGER, + currency_concept_id INTEGER, + paid_copay DECIMAL(19,4), -- NUMERIC + paid_coinsurance DECIMAL(19,4), -- NUMERIC + paid_toward_deductible DECIMAL(19,4), -- NUMERIC + paid_by_payer DECIMAL(19,4), -- NUMERIC + paid_by_coordination_benefits DECIMAL(19,4), -- NUMERIC + total_out_of_pocket DECIMAL(19,4), -- NUMERIC + total_paid DECIMAL(19,4), -- NUMERIC + payer_plan_period_id INTEGER +) +; + + + +CREATE TABLE procedure_cost ( + procedure_cost_id INTEGER, + procedure_occurrence_id INTEGER, + currency_concept_id INTEGER, + paid_copay DECIMAL(19,4), -- NUMERIC + paid_coinsurance DECIMAL(19,4), -- NUMERIC + paid_toward_deductible DECIMAL(19,4), -- NUMERIC + paid_by_payer DECIMAL(19,4), -- NUMERIC + paid_by_coordination_benefits DECIMAL(19,4), -- NUMERIC + total_out_of_pocket DECIMAL(19,4), -- NUMERIC + total_paid DECIMAL(19,4), -- NUMERIC + revenue_code_concept_id INTEGER, + payer_plan_period_id INTEGER, + revenue_code_source_value VARCHAR(50) +) +; + + + +CREATE TABLE drug_cost ( + drug_cost_id INTEGER, + drug_exposure_id INTEGER, + currency_concept_id INTEGER, + paid_copay DECIMAL(19,4), -- NUMERIC + paid_coinsurance DECIMAL(19,4), -- NUMERIC + paid_toward_deductible DECIMAL(19,4), -- NUMERIC + paid_by_payer DECIMAL(19,4), -- NUMERIC + paid_by_coordination_benefits DECIMAL(19,4), -- NUMERIC + total_out_of_pocket DECIMAL(19,4), -- NUMERIC + total_paid DECIMAL(19,4), -- NUMERIC + ingredient_cost DECIMAL(19,4), -- NUMERIC + dispensing_fee DECIMAL(19,4), -- NUMERIC + average_wholesale_price DECIMAL(19,4), -- NUMERIC + payer_plan_period_id INTEGER +) +; + + + + + +CREATE TABLE device_cost ( + device_cost_id INTEGER, + device_exposure_id INTEGER, + currency_concept_id INTEGER, + paid_copay DECIMAL(19,4), -- NUMERIC + paid_coinsurance DECIMAL(19,4), -- NUMERIC + paid_toward_deductible DECIMAL(19,4), -- NUMERIC + paid_by_payer DECIMAL(19,4), -- NUMERIC + paid_by_coordination_benefits DECIMAL(19,4), -- NUMERIC + total_out_of_pocket DECIMAL(19,4), -- NUMERIC + total_paid DECIMAL(19,4), -- NUMERIC + payer_plan_period_id INTEGER +) +; +*/ + + +CREATE TABLE cost ( + cost_id INTEGER, + cost_event_id INTEGER, + cost_domain_id VARCHAR(20), + cost_type_concept_id INTEGER, + currency_concept_id INTEGER, + total_charge DECIMAL(19,4), -- NUMERIC + total_cost DECIMAL(19,4), -- NUMERIC + total_paid DECIMAL(19,4), -- NUMERIC + paid_by_payer DECIMAL(19,4), -- NUMERIC + paid_by_patient DECIMAL(19,4), -- NUMERIC + paid_patient_copay DECIMAL(19,4), -- NUMERIC + paid_patient_coinsurance DECIMAL(19,4), -- NUMERIC + paid_patient_deductible DECIMAL(19,4), -- NUMERIC + paid_by_primary DECIMAL(19,4), -- NUMERIC + paid_ingredient_cost DECIMAL(19,4), -- NUMERIC + paid_dispensing_fee DECIMAL(19,4), -- NUMERIC + payer_plan_period_id INTEGER, + amount_allowed DECIMAL(19,4), -- NUMERIC + revenue_code_concept_id INTEGER, + reveue_code_source_value VARCHAR(50) +) +; + + + + + +/************************ + +Standardized derived elements + +************************/ + +CREATE TABLE cohort ( + cohort_definition_id INTEGER, + subject_id INTEGER, + cohort_start_date VARCHAR(8), -- DATE + cohort_end_date VARCHAR(8) -- DATE +) +; + + +CREATE TABLE cohort_attribute ( + cohort_definition_id INTEGER, + cohort_start_date VARCHAR(8), -- DATE + cohort_end_date VARCHAR(8), -- DATE + subject_id INTEGER, + attribute_definition_id INTEGER, + value_as_number DOUBLE, -- NUMERIC + value_as_concept_id INTEGER +) +; + + + + +CREATE TABLE drug_era ( + drug_era_id INTEGER, + person_id INTEGER, + drug_concept_id INTEGER, + drug_era_start_date VARCHAR(8), -- DATE + drug_era_end_date VARCHAR(8), -- DATE + drug_exposure_count INTEGER, + gap_days INTEGER +) +ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' +TBLPROPERTIES ("skip.header.line.count"="1") +; + + +CREATE TABLE dose_era ( + dose_era_id INTEGER, + person_id INTEGER, + drug_concept_id INTEGER, + unit_concept_id INTEGER, + dose_value DOUBLE, -- NUMERIC + dose_era_start_date VARCHAR(8), -- DATE + dose_era_end_date VARCHAR(8) -- DATE +) +; + + + + +CREATE TABLE condition_era ( + condition_era_id INTEGER, + person_id INTEGER, + condition_concept_id INTEGER, + condition_era_start_date VARCHAR(8), -- DATE + condition_era_end_date VARCHAR(8), -- DATE + condition_occurrence_count INTEGER +) +ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' +TBLPROPERTIES ("skip.header.line.count"="1") +; + + + + + + + diff --git a/Impala/README.md b/Impala/README.md new file mode 100644 index 0000000..2d0fb5e --- /dev/null +++ b/Impala/README.md @@ -0,0 +1,51 @@ +Common-Data-Model / Impala +================= + +This folder contains the SQL scripts for Impala. + +In order to create your instantiation of the Common Data Model, we recommend following these steps: + +1. Create an empty schema. + +```bash +impala-shell -q 'CREATE DATABASE omop_cdm' +``` + +2. Execute the script `OMOP_CDM_ddl_Impala.sql` to create the tables and fields. + +```bash +impala-shell -d omop_cdm -f OMOP_CDM_ddl_Impala.sql +``` + +3. Load your data into the schema. + +a. Load the vocabulary tables. + +First, download the data from +[http://www.ohdsi.org/web/athena/](http://www.ohdsi.org/web/athena/) +and unzip into a _cdmv5vocab_ directory, then run + +```bash +hadoop fs -put cdmv5vocab cdmv5vocab +hadoop fs -chmod +w cdmv5vocab +impala-shell -d omop_cdm -f VocabImport/OMOP_CDM_vocabulary_load_Impala.sql --var=OMOP_VOCAB_PATH=/user/$USER/cdmv5vocab +``` + +b. Load the patient data. + +For example, download the 1000 person sample of simulated CMS SynPUF patient data from +[http://www.ltscomputingllc.com/downloads/](http://www.ltscomputingllc.com/downloads/) +and unzip into a _synpuf_ directory, then run + +```bash +hadoop fs -put synpuf synpuf +hadoop fs -chmod +w synpuf +impala-shell -d omop_cdm -f DataImport/OMOP_CDM_synpuf_load_Impala.sql --var=OMOP_SYNPUF_PATH=/user/$USER/synpuf +``` + +4. Run simple queries to sanity check. + +```bash +impala-shell -d omop_cdm -q 'SELECT COUNT(1) FROM concept' +impala-shell -d omop_cdm -q 'SELECT COUNT(1) FROM person' +``` \ No newline at end of file diff --git a/Impala/VocabImport/OMOP_CDM_vocabulary_load_Impala.sql b/Impala/VocabImport/OMOP_CDM_vocabulary_load_Impala.sql new file mode 100644 index 0000000..57c5093 --- /dev/null +++ b/Impala/VocabImport/OMOP_CDM_vocabulary_load_Impala.sql @@ -0,0 +1,9 @@ +LOAD DATA INPATH '${VAR:OMOP_VOCAB_PATH}/CONCEPT.csv' OVERWRITE INTO TABLE concept; +LOAD DATA INPATH '${VAR:OMOP_VOCAB_PATH}/VOCABULARY.csv' OVERWRITE INTO TABLE vocabulary; +LOAD DATA INPATH '${VAR:OMOP_VOCAB_PATH}/DOMAIN.csv' OVERWRITE INTO TABLE domain; +LOAD DATA INPATH '${VAR:OMOP_VOCAB_PATH}/CONCEPT_CLASS.csv' OVERWRITE INTO TABLE concept_class; +LOAD DATA INPATH '${VAR:OMOP_VOCAB_PATH}/CONCEPT_RELATIONSHIP.csv' OVERWRITE INTO TABLE concept_relationship; +LOAD DATA INPATH '${VAR:OMOP_VOCAB_PATH}/RELATIONSHIP.csv' OVERWRITE INTO TABLE relationship; +LOAD DATA INPATH '${VAR:OMOP_VOCAB_PATH}/CONCEPT_SYNONYM.csv' OVERWRITE INTO TABLE concept_synonym; +LOAD DATA INPATH '${VAR:OMOP_VOCAB_PATH}/CONCEPT_ANCESTOR.csv' OVERWRITE INTO TABLE concept_ancestor; +LOAD DATA INPATH '${VAR:OMOP_VOCAB_PATH}/DRUG_STRENGTH.csv' OVERWRITE INTO TABLE drug_strength;