Article Case Study Generation
Source:vignettes/a13_article_case_studies.Rmd
a13_article_case_studies.RmdGoal
This vignette documents the code used to generate the article-style
CohortContrast studies shown at http://omop-apps.cloud.ut.ee/CohortContrast/.
The workflow uses cohort JSON definitions exported from ATLAS and runs against the OPTIMA OMOP Common Data Model database.
The workflow is designed to produce:
- one final summary-mode study folder per case
Expected JSON layout
Each cohort JSON definition should live in its own directory, because
CohortContrast::cohortFromJSON() expects a directory
containing the JSON cohort definition files.
Recommended package layout:
inst/extdata/article_cases/lung/cohort.jsoninst/extdata/article_cases/prostate/cohort.json
When the package is installed, these paths can be resolved with
system.file().
Notes
- This vignette is shown as code only and is not executed during package checks, because it depends on a real OMOP Common Data Model database.
- The workflow below uses one clean
CohortContrast()analysis followed by a single export of the final filtered and mapped study object to a temporary patient-mode folder. That temporary folder is used only to create the summary-mode study and is then removed. The retained study folders therefore contain only the final concepts that passed the statistical selection steps.
Required environment variables
# Database connection settings expected by the helper below.
Sys.getenv("DB_NAME")
Sys.getenv("DB_HOST")
Sys.getenv("DB_PORT")
Sys.getenv("DB_USERNAME")
Sys.getenv("DB_PASSWORD")
# OMOP schema settings.
Sys.getenv("OHDSI_CDM")
Sys.getenv("OHDSI_RESULTS")
Sys.getenv("OHDSI_WRITE")Connect to the OMOP database
library(CohortContrast)
library(CDMConnector)
library(DBI)
library(RPostgres)
library(dplyr)
library(purrr)
library(tibble)
connectArticleCdm <- function() {
# Open the database connection using environment variables instead of
# hard-coded credentials.
db <- DBI::dbConnect(
RPostgres::Postgres(),
dbname = Sys.getenv("DB_NAME"),
host = Sys.getenv("DB_HOST"),
user = Sys.getenv("DB_USERNAME"),
password = Sys.getenv("DB_PASSWORD"),
port = as.integer(Sys.getenv("DB_PORT"))
)
# Create the CDMConnector object used by CohortContrast.
cdm <- CDMConnector::cdmFromCon(
con = db,
cdmSchema = Sys.getenv("OHDSI_CDM"),
achillesSchema = Sys.getenv("OHDSI_RESULTS"),
writeSchema = c(
schema = Sys.getenv("OHDSI_WRITE"),
prefix = "cc_"
)
)
list(db = db, cdm = cdm)
}Define study cases
# Each case points to a directory containing one ATLAS cohort JSON definition.
# Add future studies by inserting a new row here.
cases <- tibble::tribble(
~case_name, ~json_dir, ~study_name,
"lung",
system.file("extdata", "article_cases", "lung", package = "CohortContrast"),
"malignant_neoplasm_of_lung_1year_after",
"prostate",
system.file("extdata", "article_cases", "prostate", package = "CohortContrast"),
"malignant_neoplasm_of_prostate_1year_after"
)Build the target cohort
buildTargetTable <- function(cdm, jsonDir) {
# Generate the target cohort from the JSON definition.
targetTable <- CohortContrast::cohortFromJSON(
pathToJSON = jsonDir,
cdm = cdm
) %>%
# Shift the cohort start slightly earlier and define a 1-year follow-up.
dplyr::mutate(
cohort_start_date = cohort_start_date - 14L,
cohort_end_date = cohort_start_date + 365L
)
# Resolve any overlaps before continuing with the analysis.
CohortContrast::resolveCohortTableOverlaps(
cohortTable = targetTable,
cdm = cdm
)
}Build the visit-based control cohort
buildVisitBasedControl <- function(cdm, targetTable, lookbackDays = 1095L, followupDays = 365L) {
# Shift the target windows backwards to define the control-search anchor.
controlSeed <- targetTable %>%
dplyr::mutate(
cohort_start_date = cohort_start_date - as.integer(lookbackDays),
cohort_end_date = cohort_start_date + as.integer(followupDays)
)
# Insert the shifted target windows as a temporary table used to retrieve the
# latest eligible visit before the control anchor date.
cdm <- CDMConnector::insertTable(
cdm = cdm,
name = "article_control_seed",
table = controlSeed,
overwrite = TRUE,
temporary = TRUE
)
controlTable <- cdm$visit_occurrence %>%
dplyr::inner_join(cdm$article_control_seed, by = c("person_id" = "subject_id")) %>%
dplyr::filter(visit_start_date <= cohort_start_date) %>%
dplyr::group_by(person_id) %>%
# To reduce bias from differential encounter density, align each patient's
# control window to the clinical visit start closest to the baseline anchor
# date, operationalized here as the latest eligible visit on or before that
# anchor date.
dplyr::slice_max(order_by = visit_start_date, n = 1, with_ties = FALSE) %>%
dplyr::ungroup() %>%
dplyr::transmute(
subject_id = person_id,
cohort_start_date = visit_start_date
) %>%
dplyr::collect() %>%
dplyr::mutate(
cohort_definition_id = "control",
cohort_end_date = .data$cohort_start_date + as.integer(followupDays)
) %>%
dplyr::select(
cohort_definition_id,
subject_id,
cohort_start_date,
cohort_end_date
)
CohortContrast::resolveCohortTableOverlaps(
cohortTable = controlTable,
cdm = cdm
)
}Keep only the final selected concepts
trimFinalStudy <- function(data) {
# selectedFeatureData is the final selected concept set after statistical
# filtering and optional post-processing.
finalIds <- unique(data$selectedFeatureData$selectedFeatureIds)
# Keep only the final selected concepts in the exported feature table.
data$data_features <- data$data_features %>%
dplyr::filter(.data$CONCEPT_ID %in% finalIds)
# Keep only patient-level rows belonging to the final selected concepts.
data$data_patients <- data$data_patients %>%
dplyr::filter(.data$CONCEPT_ID %in% finalIds)
# Keep only mapping rows connected to the final selected concepts.
data$complementaryMappingTable <- data$complementaryMappingTable %>%
dplyr::filter(
.data$CONCEPT_ID %in% finalIds |
.data$NEW_CONCEPT_ID %in% finalIds
)
# Keep the selected-feature payload aligned with the final study subset.
data$selectedFeatureData$selectedFeatures <- data$selectedFeatureData$selectedFeatures %>%
dplyr::filter(.data$CONCEPT_ID %in% finalIds)
data$selectedFeatureData$selectedFeatureIds <- finalIds
data$selectedFeatureData$selectedFeatureNames <- unique(
data$selectedFeatureData$selectedFeatures$CONCEPT_NAME
)
data$trajectoryDataList <- data$selectedFeatureData
data
}Run one final mapped study
runArticleCase <- function(cdm, jsonDir, studyName, summaryRoot, scratchRoot) {
# Build the target and control cohorts for the selected case.
targetTable <- buildTargetTable(cdm = cdm, jsonDir = jsonDir)
controlTable <- buildVisitBasedControl(cdm = cdm, targetTable = targetTable)
# Run the full CohortContrast workflow in memory first.
# Output writing is delayed until the final selected/mapped study object has
# been trimmed to the published concept subset.
result <- CohortContrast::CohortContrast(
cdm = cdm,
targetTable = targetTable,
controlTable = controlTable,
pathToResults = scratchRoot,
prevalenceCutOff = 1,
getSourceData = FALSE,
runChi2YTests = TRUE,
runLogitTests = TRUE,
createOutputFiles = FALSE,
complName = studyName,
runRemoveTemporalBias = TRUE,
removeTemporalBiasArgs = list(
removeIdentified = TRUE
),
runAutomaticHierarchyCombineConcepts = TRUE,
automaticHierarchyCombineConceptsArgs = list(
abstractionLevel = -1,
minDepthAllowed = 0,
allowOnlyMinors = TRUE
),
runAutomaticCorrelationCombineConcepts = TRUE,
automaticCorrelationCombineConceptsArgs = list(
abstractionLevel = -1,
minCorrelation = 0.7,
maxDaysInBetween = 1,
heritageDriftAllowed = FALSE
)
)
# Keep only the concepts that survived the statistical and post-processing
# pipeline, then write one temporary patient-mode study folder.
result <- trimFinalStudy(result)
CohortContrast:::saveResult(result, scratchRoot)
# Create the summary-mode study from the temporary patient-mode study.
CohortContrast::precomputeSummary(
studyPath = file.path(scratchRoot, studyName),
outputPath = file.path(summaryRoot, paste0(studyName, "_summary")),
clusterKValues = c(2, 3, 4, 5)
)
# Remove the temporary patient-mode folder and keep only the summary.
unlink(file.path(scratchRoot, studyName), recursive = TRUE, force = TRUE)
invisible(result)
}Run all configured cases
# Directory that will keep the published summary-mode study folders.
summaryRoot <- "article_studies"
# Temporary directory used only while precomputeSummary() is running.
scratchRoot <- file.path(tempdir(), "cohortcontrast_article_scratch")
dir.create(summaryRoot, recursive = TRUE, showWarnings = FALSE)
dir.create(scratchRoot, recursive = TRUE, showWarnings = FALSE)
conn <- connectArticleCdm()
db <- conn$db
cdm <- conn$cdm
purrr::pwalk(
cases,
\(case_name, json_dir, study_name) {
# Run one complete article workflow per case.
runArticleCase(
cdm = cdm,
jsonDir = json_dir,
studyName = study_name,
summaryRoot = summaryRoot,
scratchRoot = scratchRoot
)
}
)
# Close the database connection when finished.
DBI::dbDisconnect(db)Resulting study folders
After the workflow completes, the output directory will contain:
-
<study_name>_summary/for the corresponding summary-mode study
Temporary patient-mode study folders are created only while
precomputeSummary() is running and are removed
afterward.
The retained summary-mode study folders therefore contain the final published outputs.
For the two configured cases, that means:
malignant_neoplasm_of_lung_1year_after_summary/malignant_neoplasm_of_prostate_1year_after_summary/
These summary folders can then be opened directly in the Viewer.