Skip to contents

Set up

Let’s again load required packages and connect to our Eunomia dataset in duckdb.

library(CDMConnector)
library(dplyr)

# In this example the CDM tables are read from, and new tables are written
# to, the same "main" schema.
cdm_schema <- "main"
write_schema <- "main"

# Open a duckdb connection to the Eunomia example database, then wrap the
# connection in a cdm reference.
con <- DBI::dbConnect(
  drv = duckdb::duckdb(),
  dbdir = eunomia_dir()
)
cdm <- cdm_from_con(
  con = con,
  cdm_schema = cdm_schema,
  write_schema = write_schema
)

CDM reference attributes

Our cdm reference has various attributes associated with it. These can be useful both when programming and when developing analytic packages on top of CDMConnector.

CDM name

It’s a requirement that every cdm reference has a name associated with it. This is particularly useful for network studies so that we can associate results with a particular cdm. We can access this attribute like so:

attr(cdm, "cdm_name")
#> [1] "Synthea synthetic health database"

Because it is so regularly used, to make getting the cdm name even easier, we can also use cdmName (or its snake case equivalent cdm_name).

cdmName(cdm)
#> [1] "Synthea synthetic health database"
cdm_name(cdm)
#> [1] "Synthea synthetic health database"

CDM version

The OMOP CDM has various versions. We also have an attribute giving the version of the cdm we have connected to.

attr(cdm, "cdm_version")
#> [1] "5.3"

Database connection

We also have an attribute identifying the database connection underlying the cdm reference.

attr(cdm, "dbcon")
#> <duckdb_connection c9c40 driver=<duckdb_driver 14b90 dbdir='/var/folders/xx/01v98b6546ldnm1rg1_bvk000000gn/T//Rtmpeo0Nlk/file9b286d2dfb88.duckdb' read_only=FALSE bigint=numeric>>

This can be useful, for example, if we want to make use of DBI functions to work with the database. For example, we could use dbListTables to list the names of remote tables accessible through the connection, dbListFields to list the field names of a specific remote table, and dbGetQuery to return the result of a query.

DBI::dbListTables(attr(cdm, "dbcon"))
#>  [1] "care_site"             "cdm_source"            "concept"              
#>  [4] "concept_ancestor"      "concept_class"         "concept_relationship" 
#>  [7] "concept_synonym"       "condition_era"         "condition_occurrence" 
#> [10] "cost"                  "death"                 "device_exposure"      
#> [13] "domain"                "dose_era"              "drug_era"             
#> [16] "drug_exposure"         "drug_strength"         "fact_relationship"    
#> [19] "location"              "measurement"           "metadata"             
#> [22] "note"                  "note_nlp"              "observation"          
#> [25] "observation_period"    "payer_plan_period"     "person"               
#> [28] "procedure_occurrence"  "provider"              "relationship"         
#> [31] "source_to_concept_map" "specimen"              "visit_detail"         
#> [34] "visit_occurrence"      "vocabulary"
DBI::dbListFields(attr(cdm, "dbcon"), "person")
#>  [1] "person_id"                   "gender_concept_id"          
#>  [3] "year_of_birth"               "month_of_birth"             
#>  [5] "day_of_birth"                "birth_datetime"             
#>  [7] "race_concept_id"             "ethnicity_concept_id"       
#>  [9] "location_id"                 "provider_id"                
#> [11] "care_site_id"                "person_source_value"        
#> [13] "gender_source_value"         "gender_source_concept_id"   
#> [15] "race_source_value"           "race_source_concept_id"     
#> [17] "ethnicity_source_value"      "ethnicity_source_concept_id"
DBI::dbGetQuery(attr(cdm, "dbcon"), "SELECT * FROM person LIMIT 5")
#>   person_id gender_concept_id year_of_birth month_of_birth day_of_birth
#> 1         6              8532          1963             12           31
#> 2       123              8507          1950              4           12
#> 3       129              8507          1974             10            7
#> 4        16              8532          1971             10           13
#> 5        65              8532          1967              3           31
#>   birth_datetime race_concept_id ethnicity_concept_id location_id provider_id
#> 1     1963-12-31            8516                    0          NA          NA
#> 2     1950-04-12            8527                    0          NA          NA
#> 3     1974-10-07            8527                    0          NA          NA
#> 4     1971-10-13            8527                    0          NA          NA
#> 5     1967-03-31            8516                    0          NA          NA
#>   care_site_id                  person_source_value gender_source_value
#> 1           NA 001f4a87-70d0-435c-a4b9-1425f6928d33                   F
#> 2           NA 052d9254-80e8-428f-b8b6-69518b0ef3f3                   M
#> 3           NA 054d32d5-904f-4df4-846b-8c08d165b4e9                   M
#> 4           NA 00444703-f2c9-45c9-a247-f6317a43a929                   F
#> 5           NA 02a3dad9-f9d5-42fb-8074-c16d45b4f5c8                   F
#>   gender_source_concept_id race_source_value race_source_concept_id
#> 1                        0             black                      0
#> 2                        0             white                      0
#> 3                        0             white                      0
#> 4                        0             white                      0
#> 5                        0             black                      0
#>   ethnicity_source_value ethnicity_source_concept_id
#> 1            west_indian                           0
#> 2                italian                           0
#> 3                 polish                           0
#> 4               american                           0
#> 5              dominican                           0

Cohort attributes

Generated cohort set

When we generate a cohort, in addition to the cohort table itself, we also have various attributes that can be useful for subsequent analysis.

Here we create a cohort table containing two cohorts, one for each concept set (gi_bleed and celecoxib).


# Generate one cohort per named concept set (gi_bleed and celecoxib). The
# resulting cohort table is added to the cdm reference under the name given
# in `name`, so it is available as cdm$study_cohorts.
cdm <- generateConceptCohortSet(
  cdm = cdm,
  conceptSet = list(
    "gi_bleed" = 192671,
    "celecoxib" = 1118084
  ),
  name = "study_cohorts",
  overwrite = TRUE
)

cdm$study_cohorts %>% 
  glimpse()
#> Rows: ??
#> Columns: 4
#> Database: DuckDB v0.9.2 [root@Darwin 23.0.0:R 4.3.1//var/folders/xx/01v98b6546ldnm1rg1_bvk000000gn/T//Rtmpeo0Nlk/file9b286d2dfb88.duckdb]
#> $ cohort_definition_id <int> 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2…
#> $ subject_id           <int> 2709, 4380, 5229, 260, 334, 384, 842, 1051, 1116,…
#> $ cohort_start_date    <date> 1984-04-03, 2001-10-25, 2004-05-03, 2010-03-19, …
#> $ cohort_end_date      <date> 2019-03-26, 2019-05-20, 2019-01-16, 2017-08-05, …

We have a cohort set attribute that gives details on the settings associated with the cohorts (along with utility functions to make it easier to access this attribute).

attr(cdm$study_cohorts, "cohort_set")
#> # Source:   table<study_cohorts_set> [2 x 6]
#> # Database: DuckDB v0.9.2 [root@Darwin 23.0.0:R 4.3.1//var/folders/xx/01v98b6546ldnm1rg1_bvk000000gn/T//Rtmpeo0Nlk/file9b286d2dfb88.duckdb]
#>   cohort_definition_id cohort_name limit prior_observation future_observation
#>                  <int> <chr>       <chr>             <dbl>              <dbl>
#> 1                    1 gi_bleed    first                 0                  0
#> 2                    2 celecoxib   first                 0                  0
#> # ℹ 1 more variable: end <chr>
cohortSet(cdm$study_cohorts)
cohort_set(cdm$study_cohorts) 

We have a cohort_count attribute with counts for each of the cohorts.

attr(cdm$study_cohorts, "cohort_count")
#> # Source:   table<study_cohorts_count> [2 x 3]
#> # Database: DuckDB v0.9.2 [root@Darwin 23.0.0:R 4.3.1//var/folders/xx/01v98b6546ldnm1rg1_bvk000000gn/T//Rtmpeo0Nlk/file9b286d2dfb88.duckdb]
#>   cohort_definition_id number_records number_subjects
#>                  <int>          <dbl>           <dbl>
#> 1                    2           1800            1800
#> 2                    1            479             479
cohortCount(cdm$study_cohorts)
cohort_count(cdm$study_cohorts)

And we also have an attribute, cohort attrition, with a summary of attrition when creating the cohorts.

attr(cdm$study_cohorts, "cohort_attrition")
cohortAttrition(cdm$study_cohorts)
cohort_attrition(cdm$study_cohorts)

In addition, we also have the cdm reference itself as an attribute of the cohorts. This is particularly useful when developing analytic packages on top of CDMConnector.

attr(cdm$study_cohorts, "cdm_reference")
#> # OMOP CDM reference (tbl_duckdb_connection)
#> 
#> Tables: person, observation_period, visit_occurrence, visit_detail, condition_occurrence, drug_exposure, procedure_occurrence, device_exposure, measurement, observation, death, note, note_nlp, specimen, fact_relationship, location, care_site, provider, payer_plan_period, cost, drug_era, dose_era, condition_era, metadata, cdm_source, concept, vocabulary, domain, concept_class, concept_relationship, relationship, concept_synonym, concept_ancestor, source_to_concept_map, drug_strength, study_cohorts

Creating a bespoke cohort

Say we create a custom GI bleed cohort with the standard cohort structure

# Build a custom cohort table from the condition occurrence records for
# concept 192671: every record is assigned cohort_definition_id 1 and the
# person/date columns are renamed to the standard cohort column names while
# being selected. The result is written to a permanent table in the write
# schema.
cdm$GI_bleed <- cdm$condition_occurrence %>%
  filter(condition_concept_id == 192671) %>%
  mutate(cohort_definition_id = 1) %>%
  select(
    cohort_definition_id,
    subject_id = person_id,
    cohort_start_date = condition_start_date,
    cohort_end_date = condition_end_date
  ) %>%
  compute_query(
    temporary = FALSE,
    schema = write_schema,
    overwrite = TRUE
  )

cdm$GI_bleed %>% 
  glimpse()
#> Rows: ??
#> Columns: 4
#> Database: DuckDB v0.9.2 [root@Darwin 23.0.0:R 4.3.1//var/folders/xx/01v98b6546ldnm1rg1_bvk000000gn/T//Rtmpeo0Nlk/file9b286d2dfb88.duckdb]
#> $ cohort_definition_id <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
#> $ subject_id           <int> 273, 61, 351, 579, 549, 116, 163, 304, 326, 285, …
#> $ cohort_start_date    <date> 2011-10-10, 2005-09-15, 2018-06-28, 1999-11-06, …
#> $ cohort_end_date      <date> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …

We can add the required attributes using the newGeneratedCohortSet function. The minimum requirement for this is that we also define the cohort set to associate with our set of custom cohorts.

# Cohort set for the custom cohort: a single row that gives cohort
# definition id 1 its name.
GI_bleed_cohort_ref <- data.frame(
  cohort_definition_id = 1,
  cohort_name = "custom_gi_bleed"
)

# Turn the plain cohort table into a generated cohort set by associating it
# with the cohort set defined above.
cdm$GI_bleed <- newGeneratedCohortSet(
  cohortRef = cdm$GI_bleed,
  cohortSetRef = GI_bleed_cohort_ref,
  overwrite = TRUE
)

Now our custom cohort GI_bleed has the same attributes associated with it as if it had been created by generateConceptCohortSet. This will mean that it can be used by analytic packages designed to work with cdm cohorts.

cohort_set(cdm$GI_bleed)
#> # A tibble: 1 × 2
#>   cohort_definition_id cohort_name    
#>                  <dbl> <chr>          
#> 1                    1 custom_gi_bleed
cohort_count(cdm$GI_bleed)
#> # A tibble: 1 × 3
#>   cohort_definition_id number_records number_subjects
#>                  <dbl>          <dbl>           <dbl>
#> 1                    1            479             479
cohort_attrition(cdm$GI_bleed)
#> # A tibble: 1 × 7
#>   cohort_definition_id number_records number_subjects reason_id reason          
#>                  <dbl>          <dbl>           <dbl>     <dbl> <chr>           
#> 1                    1            479             479         1 Qualifying init…
#> # ℹ 2 more variables: excluded_records <dbl>, excluded_subjects <dbl>
attr(cdm$GI_bleed, "cdm_reference")
#> # OMOP CDM reference (tbl_duckdb_connection)
#> 
#> Tables: person, observation_period, visit_occurrence, visit_detail, condition_occurrence, drug_exposure, procedure_occurrence, device_exposure, measurement, observation, death, note, note_nlp, specimen, fact_relationship, location, care_site, provider, payer_plan_period, cost, drug_era, dose_era, condition_era, metadata, cdm_source, concept, vocabulary, domain, concept_class, concept_relationship, relationship, concept_synonym, concept_ancestor, source_to_concept_map, drug_strength, study_cohorts, GI_bleed