Add a step to transform data into Parquet, and use TIMESTAMP type for dates.
This commit is contained in:
parent
a8d6557a56
commit
7d2c9f3081
|
@ -0,0 +1,461 @@
|
|||
-- Use the search/replace regex in an editor to fix DATE columns:
|
||||
-- ([^ ]+) VARCHAR\(8\), \-\- DATE
|
||||
-- TO_UTC_TIMESTAMP(CONCAT_WS('-', SUBSTR(CAST($1 AS STRING), 1, 4), SUBSTR(CAST($1 AS STRING), 5, 2), SUBSTR(CAST($1 AS STRING), 7, 2)), 'UTC') AS $1,
|
||||
|
||||
CREATE TABLE omop_cdm_parquet.concept
|
||||
STORED AS PARQUET
|
||||
AS
|
||||
SELECT
|
||||
concept_id,
|
||||
concept_name,
|
||||
domain_id,
|
||||
vocabulary_id,
|
||||
concept_class_id,
|
||||
standard_concept,
|
||||
concept_code,
|
||||
TO_UTC_TIMESTAMP(CONCAT_WS('-', SUBSTR(CAST(valid_start_date AS STRING), 1, 4), SUBSTR(CAST(valid_start_date AS STRING), 5, 2), SUBSTR(CAST(valid_start_date AS STRING), 7, 2)), 'UTC') AS valid_start_date,
|
||||
TO_UTC_TIMESTAMP(CONCAT_WS('-', SUBSTR(CAST(valid_end_date AS STRING), 1, 4), SUBSTR(CAST(valid_end_date AS STRING), 5, 2), SUBSTR(CAST(valid_end_date AS STRING), 7, 2)), 'UTC') AS valid_end_date,
|
||||
invalid_reason
|
||||
FROM omop_cdm.concept;
|
||||
|
||||
CREATE TABLE omop_cdm_parquet.vocabulary
|
||||
STORED AS PARQUET
|
||||
AS
|
||||
SELECT * from omop_cdm.vocabulary;
|
||||
|
||||
CREATE TABLE omop_cdm_parquet.domain
|
||||
STORED AS PARQUET
|
||||
AS
|
||||
SELECT * from omop_cdm.domain;
|
||||
|
||||
CREATE TABLE omop_cdm_parquet.concept_class
|
||||
STORED AS PARQUET
|
||||
AS
|
||||
SELECT * from omop_cdm.concept_class;
|
||||
|
||||
CREATE TABLE omop_cdm_parquet.concept_relationship
|
||||
STORED AS PARQUET
|
||||
AS
|
||||
SELECT
|
||||
concept_id_1,
|
||||
concept_id_2,
|
||||
relationship_id,
|
||||
TO_UTC_TIMESTAMP(CONCAT_WS('-', SUBSTR(CAST(valid_start_date AS STRING), 1, 4), SUBSTR(CAST(valid_start_date AS STRING), 5, 2), SUBSTR(CAST(valid_start_date AS STRING), 7, 2)), 'UTC') AS valid_start_date,
|
||||
TO_UTC_TIMESTAMP(CONCAT_WS('-', SUBSTR(CAST(valid_end_date AS STRING), 1, 4), SUBSTR(CAST(valid_end_date AS STRING), 5, 2), SUBSTR(CAST(valid_end_date AS STRING), 7, 2)), 'UTC') AS valid_end_date,
|
||||
invalid_reason
|
||||
FROM omop_cdm.concept_relationship;
|
||||
|
||||
CREATE TABLE omop_cdm_parquet.relationship
|
||||
STORED AS PARQUET
|
||||
AS
|
||||
SELECT * from omop_cdm.relationship;
|
||||
|
||||
CREATE TABLE omop_cdm_parquet.concept_synonym
|
||||
STORED AS PARQUET
|
||||
AS
|
||||
SELECT * from omop_cdm.concept_synonym;
|
||||
|
||||
CREATE TABLE omop_cdm_parquet.concept_ancestor
|
||||
STORED AS PARQUET
|
||||
AS
|
||||
SELECT * from omop_cdm.concept_ancestor;
|
||||
|
||||
CREATE TABLE omop_cdm_parquet.source_to_concept_map
|
||||
STORED AS PARQUET
|
||||
AS
|
||||
SELECT
|
||||
source_code,
|
||||
source_concept_id,
|
||||
source_vocabulary_id,
|
||||
source_code_description,
|
||||
target_concept_id,
|
||||
target_vocabulary_id,
|
||||
TO_UTC_TIMESTAMP(CONCAT_WS('-', SUBSTR(CAST(valid_start_date AS STRING), 1, 4), SUBSTR(CAST(valid_start_date AS STRING), 5, 2), SUBSTR(CAST(valid_start_date AS STRING), 7, 2)), 'UTC') AS valid_start_date,
|
||||
TO_UTC_TIMESTAMP(CONCAT_WS('-', SUBSTR(CAST(valid_end_date AS STRING), 1, 4), SUBSTR(CAST(valid_end_date AS STRING), 5, 2), SUBSTR(CAST(valid_end_date AS STRING), 7, 2)), 'UTC') AS valid_end_date,
|
||||
invalid_reason
|
||||
FROM omop_cdm.source_to_concept_map;
|
||||
|
||||
CREATE TABLE omop_cdm_parquet.drug_strength
|
||||
STORED AS PARQUET
|
||||
AS
|
||||
SELECT
|
||||
drug_concept_id,
|
||||
ingredient_concept_id,
|
||||
amount_value, -- NUMERIC
|
||||
amount_unit_concept_id,
|
||||
numerator_value, -- NUMERIC
|
||||
numerator_unit_concept_id,
|
||||
denominator_value, -- NUMERIC
|
||||
denominator_unit_concept_id,
|
||||
box_size,
|
||||
TO_UTC_TIMESTAMP(CONCAT_WS('-', SUBSTR(CAST(valid_start_date AS STRING), 1, 4), SUBSTR(CAST(valid_start_date AS STRING), 5, 2), SUBSTR(CAST(valid_start_date AS STRING), 7, 2)), 'UTC') AS valid_start_date,
|
||||
TO_UTC_TIMESTAMP(CONCAT_WS('-', SUBSTR(CAST(valid_end_date AS STRING), 1, 4), SUBSTR(CAST(valid_end_date AS STRING), 5, 2), SUBSTR(CAST(valid_end_date AS STRING), 7, 2)), 'UTC') AS valid_end_date,
|
||||
invalid_reason
|
||||
FROM omop_cdm.drug_strength;
|
||||
|
||||
CREATE TABLE omop_cdm_parquet.cohort_definition
|
||||
STORED AS PARQUET
|
||||
AS
|
||||
SELECT
|
||||
cohort_definition_id,
|
||||
cohort_definition_name,
|
||||
cohort_definition_description, -- TEXT
|
||||
definition_type_concept_id,
|
||||
cohort_definition_syntax, -- TEXT
|
||||
subject_concept_id,
|
||||
TO_UTC_TIMESTAMP(CONCAT_WS('-', SUBSTR(CAST(cohort_initiation_date AS STRING), 1, 4), SUBSTR(CAST(cohort_initiation_date AS STRING), 5, 2), SUBSTR(CAST(cohort_initiation_date AS STRING), 7, 2)), 'UTC') AS cohort_initiation_date
|
||||
FROM omop_cdm.cohort_definition;
|
||||
|
||||
CREATE TABLE omop_cdm_parquet.attribute_definition
|
||||
STORED AS PARQUET
|
||||
AS
|
||||
SELECT * from omop_cdm.attribute_definition;
|
||||
|
||||
CREATE TABLE omop_cdm_parquet.cdm_source
|
||||
STORED AS PARQUET
|
||||
AS
|
||||
SELECT
|
||||
cdm_source_name,
|
||||
cdm_source_abbreviation,
|
||||
cdm_holder,
|
||||
source_description, -- TEXT
|
||||
source_documentation_reference,
|
||||
cdm_etl_reference,
|
||||
TO_UTC_TIMESTAMP(CONCAT_WS('-', SUBSTR(CAST(source_release_date AS STRING), 1, 4), SUBSTR(CAST(source_release_date AS STRING), 5, 2), SUBSTR(CAST(source_release_date AS STRING), 7, 2)), 'UTC') AS source_release_date,
|
||||
TO_UTC_TIMESTAMP(CONCAT_WS('-', SUBSTR(CAST(cdm_release_date AS STRING), 1, 4), SUBSTR(CAST(cdm_release_date AS STRING), 5, 2), SUBSTR(CAST(cdm_release_date AS STRING), 7, 2)), 'UTC') AS cdm_release_date,
|
||||
cdm_version,
|
||||
vocabulary_version
|
||||
FROM omop_cdm.cdm_source;
|
||||
|
||||
CREATE TABLE omop_cdm_parquet.person
|
||||
STORED AS PARQUET
|
||||
AS
|
||||
SELECT * from omop_cdm.person;
|
||||
|
||||
CREATE TABLE omop_cdm_parquet.observation_period
|
||||
STORED AS PARQUET
|
||||
AS
|
||||
SELECT
|
||||
observation_period_id,
|
||||
person_id,
|
||||
TO_UTC_TIMESTAMP(CONCAT_WS('-', SUBSTR(CAST(observation_period_start_date AS STRING), 1, 4), SUBSTR(CAST(observation_period_start_date AS STRING), 5, 2), SUBSTR(CAST(observation_period_start_date AS STRING), 7, 2)), 'UTC') AS observation_period_start_date,
|
||||
TO_UTC_TIMESTAMP(CONCAT_WS('-', SUBSTR(CAST(observation_period_end_date AS STRING), 1, 4), SUBSTR(CAST(observation_period_end_date AS STRING), 5, 2), SUBSTR(CAST(observation_period_end_date AS STRING), 7, 2)), 'UTC') AS observation_period_end_date,
|
||||
period_type_concept_id
|
||||
FROM omop_cdm.observation_period;
|
||||
|
||||
CREATE TABLE omop_cdm_parquet.specimen
|
||||
STORED AS PARQUET
|
||||
AS
|
||||
SELECT
|
||||
specimen_id,
|
||||
person_id,
|
||||
specimen_concept_id,
|
||||
specimen_type_concept_id,
|
||||
TO_UTC_TIMESTAMP(CONCAT_WS('-', SUBSTR(CAST(specimen_date AS STRING), 1, 4), SUBSTR(CAST(specimen_date AS STRING), 5, 2), SUBSTR(CAST(specimen_date AS STRING), 7, 2)), 'UTC') AS specimen_date,
|
||||
specimen_time,
|
||||
quantity, -- NUMERIC
|
||||
unit_concept_id,
|
||||
anatomic_site_concept_id,
|
||||
disease_status_concept_id,
|
||||
specimen_source_id,
|
||||
specimen_source_value,
|
||||
unit_source_value,
|
||||
anatomic_site_source_value,
|
||||
disease_status_source_value
|
||||
FROM omop_cdm.specimen;
|
||||
|
||||
CREATE TABLE omop_cdm_parquet.death
|
||||
STORED AS PARQUET
|
||||
AS
|
||||
SELECT
|
||||
person_id,
|
||||
TO_UTC_TIMESTAMP(CONCAT_WS('-', SUBSTR(CAST(death_date AS STRING), 1, 4), SUBSTR(CAST(death_date AS STRING), 5, 2), SUBSTR(CAST(death_date AS STRING), 7, 2)), 'UTC') AS death_date,
|
||||
death_type_concept_id,
|
||||
cause_concept_id,
|
||||
cause_source_value,
|
||||
cause_source_concept_id
|
||||
FROM omop_cdm.death;
|
||||
|
||||
CREATE TABLE omop_cdm_parquet.visit_occurrence
|
||||
STORED AS PARQUET
|
||||
AS
|
||||
SELECT
|
||||
visit_occurrence_id,
|
||||
person_id,
|
||||
visit_concept_id,
|
||||
TO_UTC_TIMESTAMP(CONCAT_WS('-', SUBSTR(CAST(visit_start_date AS STRING), 1, 4), SUBSTR(CAST(visit_start_date AS STRING), 5, 2), SUBSTR(CAST(visit_start_date AS STRING), 7, 2)), 'UTC') AS visit_start_date,
|
||||
visit_start_time,
|
||||
TO_UTC_TIMESTAMP(CONCAT_WS('-', SUBSTR(CAST(visit_end_date AS STRING), 1, 4), SUBSTR(CAST(visit_end_date AS STRING), 5, 2), SUBSTR(CAST(visit_end_date AS STRING), 7, 2)), 'UTC') AS visit_end_date,
|
||||
visit_end_time,
|
||||
visit_type_concept_id,
|
||||
provider_id,
|
||||
care_site_id,
|
||||
visit_source_value,
|
||||
visit_source_concept_id
|
||||
FROM omop_cdm.visit_occurrence;
|
||||
|
||||
CREATE TABLE omop_cdm_parquet.procedure_occurrence
|
||||
STORED AS PARQUET
|
||||
AS
|
||||
SELECT
|
||||
procedure_occurrence_id,
|
||||
person_id,
|
||||
procedure_concept_id,
|
||||
TO_UTC_TIMESTAMP(CONCAT_WS('-', SUBSTR(CAST(procedure_date AS STRING), 1, 4), SUBSTR(CAST(procedure_date AS STRING), 5, 2), SUBSTR(CAST(procedure_date AS STRING), 7, 2)), 'UTC') AS procedure_date,
|
||||
procedure_type_concept_id,
|
||||
modifier_concept_id,
|
||||
quantity,
|
||||
provider_id,
|
||||
visit_occurrence_id,
|
||||
procedure_source_value,
|
||||
procedure_source_concept_id,
|
||||
qualifier_source_value
|
||||
FROM omop_cdm.procedure_occurrence;
|
||||
|
||||
CREATE TABLE omop_cdm_parquet.drug_exposure
|
||||
STORED AS PARQUET
|
||||
AS
|
||||
SELECT
|
||||
drug_exposure_id,
|
||||
person_id,
|
||||
drug_concept_id,
|
||||
TO_UTC_TIMESTAMP(CONCAT_WS('-', SUBSTR(CAST(drug_exposure_start_date AS STRING), 1, 4), SUBSTR(CAST(drug_exposure_start_date AS STRING), 5, 2), SUBSTR(CAST(drug_exposure_start_date AS STRING), 7, 2)), 'UTC') AS drug_exposure_start_date,
|
||||
TO_UTC_TIMESTAMP(CONCAT_WS('-', SUBSTR(CAST(drug_exposure_end_date AS STRING), 1, 4), SUBSTR(CAST(drug_exposure_end_date AS STRING), 5, 2), SUBSTR(CAST(drug_exposure_end_date AS STRING), 7, 2)), 'UTC') AS drug_exposure_end_date,
|
||||
drug_type_concept_id,
|
||||
stop_reason,
|
||||
refills,
|
||||
quantity, -- NUMERIC
|
||||
days_supply,
|
||||
sig, -- TEXT
|
||||
route_concept_id,
|
||||
effective_drug_dose, -- NUMERIC
|
||||
dose_unit_concept_id,
|
||||
lot_number,
|
||||
provider_id,
|
||||
visit_occurrence_id,
|
||||
drug_source_value,
|
||||
drug_source_concept_id,
|
||||
route_source_value,
|
||||
dose_unit_source_value
|
||||
FROM omop_cdm.drug_exposure;
|
||||
|
||||
CREATE TABLE omop_cdm_parquet.device_exposure
|
||||
STORED AS PARQUET
|
||||
AS
|
||||
SELECT
|
||||
device_exposure_id,
|
||||
person_id,
|
||||
device_concept_id,
|
||||
TO_UTC_TIMESTAMP(CONCAT_WS('-', SUBSTR(CAST(device_exposure_start_date AS STRING), 1, 4), SUBSTR(CAST(device_exposure_start_date AS STRING), 5, 2), SUBSTR(CAST(device_exposure_start_date AS STRING), 7, 2)), 'UTC') AS device_exposure_start_date,
|
||||
TO_UTC_TIMESTAMP(CONCAT_WS('-', SUBSTR(CAST(device_exposure_end_date AS STRING), 1, 4), SUBSTR(CAST(device_exposure_end_date AS STRING), 5, 2), SUBSTR(CAST(device_exposure_end_date AS STRING), 7, 2)), 'UTC') AS device_exposure_end_date,
|
||||
device_type_concept_id,
|
||||
unique_device_id,
|
||||
quantity,
|
||||
provider_id,
|
||||
visit_occurrence_id,
|
||||
device_source_value,
|
||||
device_source_concept_id
|
||||
FROM omop_cdm.device_exposure;
|
||||
|
||||
CREATE TABLE omop_cdm_parquet.condition_occurrence
|
||||
STORED AS PARQUET
|
||||
AS
|
||||
SELECT
|
||||
condition_occurrence_id,
|
||||
person_id,
|
||||
condition_concept_id,
|
||||
TO_UTC_TIMESTAMP(CONCAT_WS('-', SUBSTR(CAST(condition_start_date AS STRING), 1, 4), SUBSTR(CAST(condition_start_date AS STRING), 5, 2), SUBSTR(CAST(condition_start_date AS STRING), 7, 2)), 'UTC') AS condition_start_date,
|
||||
TO_UTC_TIMESTAMP(CONCAT_WS('-', SUBSTR(CAST(condition_end_date AS STRING), 1, 4), SUBSTR(CAST(condition_end_date AS STRING), 5, 2), SUBSTR(CAST(condition_end_date AS STRING), 7, 2)), 'UTC') AS condition_end_date,
|
||||
condition_type_concept_id,
|
||||
stop_reason,
|
||||
provider_id,
|
||||
visit_occurrence_id,
|
||||
condition_source_value,
|
||||
condition_source_concept_id
|
||||
FROM omop_cdm.condition_occurrence;
|
||||
|
||||
CREATE TABLE omop_cdm_parquet.measurement
|
||||
STORED AS PARQUET
|
||||
AS
|
||||
SELECT
|
||||
measurement_id,
|
||||
person_id,
|
||||
measurement_concept_id,
|
||||
TO_UTC_TIMESTAMP(CONCAT_WS('-', SUBSTR(CAST(measurement_date AS STRING), 1, 4), SUBSTR(CAST(measurement_date AS STRING), 5, 2), SUBSTR(CAST(measurement_date AS STRING), 7, 2)), 'UTC') AS measurement_date,
|
||||
measurement_time,
|
||||
measurement_type_concept_id,
|
||||
operator_concept_id,
|
||||
value_as_number, -- NUMERIC
|
||||
value_as_concept_id,
|
||||
unit_concept_id,
|
||||
range_low, -- NUMERIC
|
||||
range_high, -- NUMERIC
|
||||
provider_id,
|
||||
visit_occurrence_id,
|
||||
measurement_source_value,
|
||||
measurement_source_concept_id,
|
||||
unit_source_value,
|
||||
value_source_value
|
||||
FROM omop_cdm.measurement;
|
||||
|
||||
CREATE TABLE omop_cdm_parquet.note
|
||||
STORED AS PARQUET
|
||||
AS
|
||||
SELECT
|
||||
note_id,
|
||||
person_id,
|
||||
TO_UTC_TIMESTAMP(CONCAT_WS('-', SUBSTR(CAST(note_date AS STRING), 1, 4), SUBSTR(CAST(note_date AS STRING), 5, 2), SUBSTR(CAST(note_date AS STRING), 7, 2)), 'UTC') AS note_date,
|
||||
note_time,
|
||||
note_type_concept_id,
|
||||
note_text, -- TEXT
|
||||
provider_id,
|
||||
visit_occurrence_id,
|
||||
note_source_value
|
||||
FROM omop_cdm.note;
|
||||
|
||||
CREATE TABLE omop_cdm_parquet.observation
|
||||
STORED AS PARQUET
|
||||
AS
|
||||
SELECT
|
||||
observation_id,
|
||||
person_id,
|
||||
observation_concept_id,
|
||||
TO_UTC_TIMESTAMP(CONCAT_WS('-', SUBSTR(CAST(observation_date AS STRING), 1, 4), SUBSTR(CAST(observation_date AS STRING), 5, 2), SUBSTR(CAST(observation_date AS STRING), 7, 2)), 'UTC') AS observation_date,
|
||||
observation_time,
|
||||
observation_type_concept_id,
|
||||
value_as_number, -- NUMERIC
|
||||
value_as_string,
|
||||
value_as_concept_id,
|
||||
qualifier_concept_id,
|
||||
unit_concept_id,
|
||||
provider_id,
|
||||
visit_occurrence_id,
|
||||
observation_source_value,
|
||||
observation_source_concept_id ,
|
||||
unit_source_value,
|
||||
qualifier_source_value
|
||||
FROM omop_cdm.observation;
|
||||
|
||||
CREATE TABLE omop_cdm_parquet.fact_relationship
|
||||
STORED AS PARQUET
|
||||
AS
|
||||
SELECT * from omop_cdm.fact_relationship;
|
||||
|
||||
CREATE TABLE omop_cdm_parquet.`location`
|
||||
STORED AS PARQUET
|
||||
AS
|
||||
SELECT * from omop_cdm.`location`;
|
||||
|
||||
CREATE TABLE omop_cdm_parquet.care_site
|
||||
STORED AS PARQUET
|
||||
AS
|
||||
SELECT * from omop_cdm.care_site;
|
||||
|
||||
CREATE TABLE omop_cdm_parquet.provider
|
||||
STORED AS PARQUET
|
||||
AS
|
||||
SELECT * from omop_cdm.provider;
|
||||
|
||||
CREATE TABLE omop_cdm_parquet.payer_plan_period
|
||||
STORED AS PARQUET
|
||||
AS
|
||||
SELECT
|
||||
payer_plan_period_id,
|
||||
person_id,
|
||||
TO_UTC_TIMESTAMP(CONCAT_WS('-', SUBSTR(CAST(payer_plan_period_start_date AS STRING), 1, 4), SUBSTR(CAST(payer_plan_period_start_date AS STRING), 5, 2), SUBSTR(CAST(payer_plan_period_start_date AS STRING), 7, 2)), 'UTC') AS payer_plan_period_start_date,
|
||||
TO_UTC_TIMESTAMP(CONCAT_WS('-', SUBSTR(CAST(payer_plan_period_end_date AS STRING), 1, 4), SUBSTR(CAST(payer_plan_period_end_date AS STRING), 5, 2), SUBSTR(CAST(payer_plan_period_end_date AS STRING), 7, 2)), 'UTC') AS payer_plan_period_end_date,
|
||||
payer_source_value,
|
||||
plan_source_value,
|
||||
family_source_value
|
||||
FROM omop_cdm.payer_plan_period;
|
||||
|
||||
|
||||
/* The individual cost tables are being phased out and will disappear soon
|
||||
|
||||
CREATE TABLE omop_cdm_parquet.visit_cost
|
||||
STORED AS PARQUET
|
||||
AS
|
||||
SELECT * from omop_cdm.visit_cost;
|
||||
|
||||
CREATE TABLE omop_cdm_parquet.procedure_cost
|
||||
STORED AS PARQUET
|
||||
AS
|
||||
SELECT * from omop_cdm.procedure_cost;
|
||||
|
||||
CREATE TABLE omop_cdm_parquet.drug_cost
|
||||
STORED AS PARQUET
|
||||
AS
|
||||
SELECT * from omop_cdm.drug_cost;
|
||||
|
||||
CREATE TABLE omop_cdm_parquet.device_cost
|
||||
STORED AS PARQUET
|
||||
AS
|
||||
SELECT * from omop_cdm.device_cost;
|
||||
*/
|
||||
|
||||
CREATE TABLE omop_cdm_parquet.cost
|
||||
STORED AS PARQUET
|
||||
AS
|
||||
SELECT * from omop_cdm.cost;
|
||||
|
||||
CREATE TABLE omop_cdm_parquet.cohort
|
||||
STORED AS PARQUET
|
||||
AS
|
||||
SELECT
|
||||
cohort_definition_id,
|
||||
subject_id,
|
||||
TO_UTC_TIMESTAMP(CONCAT_WS('-', SUBSTR(CAST(cohort_start_date AS STRING), 1, 4), SUBSTR(CAST(cohort_start_date AS STRING), 5, 2), SUBSTR(CAST(cohort_start_date AS STRING), 7, 2)), 'UTC') AS cohort_start_date,
|
||||
TO_UTC_TIMESTAMP(CONCAT_WS('-', SUBSTR(CAST(cohort_end_date AS STRING), 1, 4), SUBSTR(CAST(cohort_end_date AS STRING), 5, 2), SUBSTR(CAST(cohort_end_date AS STRING), 7, 2)), 'UTC') AS cohort_end_date
|
||||
FROM omop_cdm.cohort;
|
||||
|
||||
CREATE TABLE omop_cdm_parquet.cohort_attribute
|
||||
STORED AS PARQUET
|
||||
AS
|
||||
SELECT
|
||||
cohort_definition_id,
|
||||
TO_UTC_TIMESTAMP(CONCAT_WS('-', SUBSTR(CAST(cohort_start_date AS STRING), 1, 4), SUBSTR(CAST(cohort_start_date AS STRING), 5, 2), SUBSTR(CAST(cohort_start_date AS STRING), 7, 2)), 'UTC') AS cohort_start_date,
|
||||
TO_UTC_TIMESTAMP(CONCAT_WS('-', SUBSTR(CAST(cohort_end_date AS STRING), 1, 4), SUBSTR(CAST(cohort_end_date AS STRING), 5, 2), SUBSTR(CAST(cohort_end_date AS STRING), 7, 2)), 'UTC') AS cohort_end_date,
|
||||
subject_id,
|
||||
attribute_definition_id,
|
||||
value_as_number, -- NUMERIC
|
||||
value_as_concept_id
|
||||
FROM omop_cdm.cohort_attribute;
|
||||
|
||||
CREATE TABLE omop_cdm_parquet.drug_era
|
||||
STORED AS PARQUET
|
||||
AS
|
||||
SELECT
|
||||
drug_era_id,
|
||||
person_id,
|
||||
drug_concept_id,
|
||||
TO_UTC_TIMESTAMP(CONCAT_WS('-', SUBSTR(CAST(drug_era_start_date AS STRING), 1, 4), SUBSTR(CAST(drug_era_start_date AS STRING), 5, 2), SUBSTR(CAST(drug_era_start_date AS STRING), 7, 2)), 'UTC') AS drug_era_start_date,
|
||||
TO_UTC_TIMESTAMP(CONCAT_WS('-', SUBSTR(CAST(drug_era_end_date AS STRING), 1, 4), SUBSTR(CAST(drug_era_end_date AS STRING), 5, 2), SUBSTR(CAST(drug_era_end_date AS STRING), 7, 2)), 'UTC') AS drug_era_end_date,
|
||||
drug_exposure_count,
|
||||
gap_days
|
||||
FROM omop_cdm.drug_era;
|
||||
|
||||
CREATE TABLE omop_cdm_parquet.dose_era
|
||||
STORED AS PARQUET
|
||||
AS
|
||||
SELECT
|
||||
dose_era_id,
|
||||
person_id,
|
||||
drug_concept_id,
|
||||
unit_concept_id,
|
||||
dose_value, -- NUMERIC
|
||||
TO_UTC_TIMESTAMP(CONCAT_WS('-', SUBSTR(CAST(dose_era_start_date AS STRING), 1, 4), SUBSTR(CAST(dose_era_start_date AS STRING), 5, 2), SUBSTR(CAST(dose_era_start_date AS STRING), 7, 2)), 'UTC') AS dose_era_start_date,
|
||||
TO_UTC_TIMESTAMP(CONCAT_WS('-', SUBSTR(CAST(dose_era_end_date AS STRING), 1, 4), SUBSTR(CAST(dose_era_end_date AS STRING), 5, 2), SUBSTR(CAST(dose_era_end_date AS STRING), 7, 2)), 'UTC') AS dose_era_end_date
|
||||
FROM omop_cdm.dose_era;
|
||||
|
||||
CREATE TABLE omop_cdm_parquet.condition_era
|
||||
STORED AS PARQUET
|
||||
AS
|
||||
SELECT
|
||||
condition_era_id,
|
||||
person_id,
|
||||
condition_concept_id,
|
||||
TO_UTC_TIMESTAMP(CONCAT_WS('-', SUBSTR(CAST(condition_era_start_date AS STRING), 1, 4), SUBSTR(CAST(condition_era_start_date AS STRING), 5, 2), SUBSTR(CAST(condition_era_start_date AS STRING), 7, 2)), 'UTC') AS condition_era_start_date,
|
||||
TO_UTC_TIMESTAMP(CONCAT_WS('-', SUBSTR(CAST(condition_era_end_date AS STRING), 1, 4), SUBSTR(CAST(condition_era_end_date AS STRING), 5, 2), SUBSTR(CAST(condition_era_end_date AS STRING), 7, 2)), 'UTC') AS condition_era_end_date,
|
||||
condition_occurrence_count
|
||||
FROM omop_cdm.condition_era;
|
||||
|
|
@ -43,9 +43,16 @@ hadoop fs -chmod +w synpuf
|
|||
impala-shell -d omop_cdm -f DataImport/OMOP_CDM_synpuf_load_Impala.sql --var=OMOP_SYNPUF_PATH=/user/$USER/synpuf
|
||||
```
|
||||
|
||||
4. Run simple queries to sanity check.
|
||||
4. Convert to Parquet format.
|
||||
|
||||
```bash
|
||||
impala-shell -d omop_cdm -q 'SELECT COUNT(1) FROM concept'
|
||||
impala-shell -d omop_cdm -q 'SELECT COUNT(1) FROM person'
|
||||
impala-shell -q 'CREATE DATABASE omop_cdm_parquet'
|
||||
impala-shell -f OMOP_Parquet.sql
|
||||
```
|
||||
|
||||
5. Run simple queries to sanity check.
|
||||
|
||||
```bash
|
||||
impala-shell -d omop_cdm_parquet -q 'SELECT COUNT(1) FROM concept'
|
||||
impala-shell -d omop_cdm_parquet -q 'SELECT COUNT(1) FROM person'
|
||||
```
|
Loading…
Reference in New Issue