ICD10 Atlas Generation • CohortContrast

Goal

This vignette documents the code used to generate an ICD10 three-digit CohortContrast atlas across multiple observation windows.

The resulting atlas based on EstHealth30 data can be explored at http://omop-apps.cloud.ut.ee/CohortContrastAtlas/.

The workflow runs the same analysis for all eligible three-digit ICD10 codes and produces summary-mode study folders for three observation periods:

diagnosis day 0 to 365
diagnosis day 0 to 30
diagnosis day -90 to 0

This pipeline keeps only the final summary folders.

Package versions used for the wiki page

The ICD10 atlas outputs were generated in an R package environment equivalent to the article case-study workflow.

Package	Version
CohortContrast	1.0.0
backports	1.5.0
bit	4.6.0
bit64	4.6.0-1
blob	1.2.4
CDMConnector	2.2.0
checkmate	2.3.3
cli	3.6.5
clipr	0.8.0
clock	0.7.3
cluster	2.1.8.1
codetools	0.2-20
CohortConstructor	0.5.0
compiler	4.5.1
cpp11	0.5.2
crayon	1.5.3
data.table	1.17.8
DBI	1.2.3
dbplyr	2.5.1
doParallel	1.0.17
dplyr	1.1.4
foreach	1.5.2
generics	0.1.4
glue	1.8.0
graphics	4.5.1
grDevices	4.5.1
grid	4.5.1
here	1.0.2
hms	1.1.3
igraph	2.1.4
iterators	1.0.14
jsonlite	2.0.0
lattice	0.22-7
lifecycle	1.0.4
lubridate	1.9.4
magrittr	2.0.4
MASS	7.3-65
Matrix	1.7-3
methods	4.5.1
mgcv	1.9-3
nanoparquet	0.4.3
nlme	3.1-168
omopgenerics	1.3.0
parallel	4.5.1
PatientProfiles	1.4.2
permute	0.9-8
pillar	1.11.0
pkgconfig	2.0.3
plyr	1.8.9
png	0.1-8
prettyunits	1.2.0
progress	1.2.3
purrr	1.1.0
R6	2.6.1
rappdirs	0.3.3
Rcpp	1.1.0
RcppTOML	0.2.3
readr	2.1.5
reshape2	1.4.4
reticulate	1.44.1
rlang	1.1.6
rprojroot	2.1.1
snakecase	0.11.1
splines	4.5.1
stats	4.5.1
stringi	1.8.7
stringr	1.5.2
tibble	3.3.0
tidyr	1.3.1
tidyselect	1.2.1
timechange	0.3.0
tools	4.5.1
tzdb	0.5.0
utf8	1.2.6
utils	4.5.1
vctrs	0.6.5
vegan	2.7-1
vroom	1.6.5
withr	3.0.2

Required environment variables

# Database connection settings expected by the helper below.
Sys.getenv("DB_NAME")
Sys.getenv("DB_HOST")
Sys.getenv("DB_PORT")
Sys.getenv("DB_USERNAME")
Sys.getenv("DB_PASSWORD")

# OMOP schema settings.
Sys.getenv("OHDSI_CDM")
Sys.getenv("OHDSI_RESULTS")
Sys.getenv("OHDSI_WRITE")

Connect to the OMOP database

library(CohortContrast)
library(CDMConnector)
library(DBI)
library(RPostgres)
library(dplyr)
library(purrr)
library(tibble)
library(icd.data)

connectAtlasCdm <- function() {
  # Open the database connection from environment variables instead of
  # hard-coded credentials.
  db <- DBI::dbConnect(
    RPostgres::Postgres(),
    dbname = Sys.getenv("DB_NAME"),
    host = Sys.getenv("DB_HOST"),
    user = Sys.getenv("DB_USERNAME"),
    password = Sys.getenv("DB_PASSWORD"),
    port = as.integer(Sys.getenv("DB_PORT"))
  )

  # Create the CDMConnector object used by CohortContrast.
  cdm <- CDMConnector::cdmFromCon(
    con = db,
    cdmSchema = Sys.getenv("OHDSI_CDM"),
    achillesSchema = Sys.getenv("OHDSI_RESULTS"),
    writeSchema = c(
      schema = Sys.getenv("OHDSI_WRITE"),
      prefix = "cc_"
    )
  )

  list(db = db, cdm = cdm)
}

Define the observation windows

# Each row defines one observation window relative to the first qualifying
# diagnosis date and one matching control-window lag.
windows <- tibble::tribble(
  ~window_name,   ~target_start_offset, ~target_end_offset, ~control_lag_days, ~minimum_observation_days,
  "after_1year",                   0L,                365L,             1095L,                     1095L,
  "after_30days",                  0L,                 30L,              1095L,                      1095L,
  "prior_90days",                -90L,                  0L,              1095L,                      1095L
)

Identify eligible ICD10 three-digit codes

getEligibleIcd10Codes <- function(cdm, minPatients = 100L) {
  # Keep only three-digit ICD10 codes from the icd.data lookup table.
  icd10Lookup <- icd.data::icd10cm2016 %>%
    dplyr::filter(nchar(.data$code) < 4) %>%
    dplyr::distinct(.data$code, .data$long_desc)

  # Count patients per three-digit source code in the OMOP condition table.
  counts <- cdm$condition_occurrence %>%
    dplyr::mutate(icd10code = substr(.data$condition_source_value, 1, 3)) %>%
    dplyr::group_by(.data$icd10code) %>%
    dplyr::summarise(n = dplyr::n_distinct(.data$person_id), .groups = "drop") %>%
    dplyr::collect() %>%
    dplyr::filter(.data$n >= minPatients)

  dplyr::inner_join(
    counts,
    icd10Lookup,
    by = c("icd10code" = "code")
  )
}

Helper for clean study names

makeSafeStudyName <- function(code, label, windowName) {
  raw <- paste(code, label, windowName, sep = "_")
  raw <- tolower(raw)
  raw <- gsub("[^a-z0-9]+", "_", raw)
  raw <- gsub("^_+|_+$", "", raw)
  raw
}

Build the target cohort

buildIcd10TargetTable <- function(cdm, icd10code, targetStartOffset, targetEndOffset,
                                  minimumObservationDays) {
  targetStartOffset <- as.integer(targetStartOffset)
  targetEndOffset <- as.integer(targetEndOffset)
  minimumObservationDays <- as.integer(minimumObservationDays)

  # Restrict to the selected ICD10 three-digit source code.
  conditionData <- cdm$condition_occurrence %>%
    dplyr::filter(substr(.data$condition_source_value, 1, 3) == icd10code) %>%
    dplyr::collect()

  # Keep the first qualifying diagnosis event per person as the baseline anchor.
  firstDiagnosis <- conditionData %>%
    dplyr::group_by(.data$person_id) %>%
    dplyr::slice_min(order_by = .data$condition_start_date, n = 1, with_ties = FALSE) %>%
    dplyr::ungroup() %>%
    dplyr::transmute(
      person_id = .data$person_id,
      anchor_date = .data$condition_start_date
    )

  # Ensure adequate observation time before the anchor date.
  observationPeriod <- cdm$observation_period %>%
    dplyr::collect() %>%
    dplyr::select(
      person_id,
      observation_period_start_date,
      observation_period_end_date
    )

  eligible <- firstDiagnosis %>%
    dplyr::left_join(observationPeriod, by = "person_id") %>%
    dplyr::filter(
      as.numeric(.data$anchor_date - .data$observation_period_start_date) > minimumObservationDays
    )

  targetTable <- eligible %>%
    dplyr::transmute(
      cohort_definition_id = "target",
      subject_id = .data$person_id,
      cohort_start_date = .data$anchor_date + targetStartOffset,
      cohort_end_date = .data$anchor_date + targetEndOffset
    ) %>%
    dplyr::filter(.data$cohort_start_date < .data$cohort_end_date)

  CohortContrast::resolveCohortTableOverlaps(
    cohortTable = targetTable,
    cdm = cdm
  )
}

Build the visit-based control cohort

buildIcd10VisitControl <- function(cdm, targetTable, controlLagDays) {
  controlLagDays <- as.integer(controlLagDays)
  followupDays <- as.integer(
    unique(targetTable$cohort_end_date - targetTable$cohort_start_date)
  )

  # Shift the target windows backwards to define the control-search anchor.
  controlSeed <- targetTable %>%
    dplyr::mutate(
      cohort_start_date = cohort_start_date - controlLagDays,
      cohort_end_date = cohort_end_date - controlLagDays
    )

  cdm <- CDMConnector::insertTable(
    cdm = cdm,
    name = "icd10_control_seed",
    table = controlSeed,
    overwrite = TRUE,
    temporary = TRUE
  )

  controlTable <- cdm$visit_occurrence %>%
    dplyr::inner_join(cdm$icd10_control_seed, by = c("person_id" = "subject_id")) %>%
    dplyr::filter(visit_start_date <= cohort_start_date) %>%
    dplyr::group_by(person_id) %>%
    # To reduce bias from differential encounter density, align each patient's
    # control window to begin at the start of the clinical visit closest to the
    # baseline anchor date, operationalized here as the latest eligible visit
    # on or before the shifted anchor date.
    dplyr::slice_max(order_by = visit_start_date, n = 1, with_ties = FALSE) %>%
    dplyr::ungroup() %>%
    dplyr::transmute(
      subject_id = .data$person_id,
      cohort_start_date = .data$visit_start_date
    ) %>%
    dplyr::collect() %>%
    dplyr::mutate(
      cohort_definition_id = "control",
      cohort_end_date = .data$cohort_start_date + followupDays
    ) %>%
    dplyr::select(
      cohort_definition_id,
      subject_id,
      cohort_start_date,
      cohort_end_date
    ) %>%
    dplyr::filter(.data$cohort_start_date < .data$cohort_end_date)

  CohortContrast::resolveCohortTableOverlaps(
    cohortTable = controlTable,
    cdm = cdm
  )
}

Keep only the final selected concepts

trimFinalStudy <- function(data) {
  # selectedFeatureData stores the concepts retained after statistical
  # filtering and optional post-processing.
  finalIds <- unique(data$selectedFeatureData$selectedFeatureIds)

  data$data_features <- data$data_features %>%
    dplyr::filter(.data$CONCEPT_ID %in% finalIds)

  data$data_patients <- data$data_patients %>%
    dplyr::filter(.data$CONCEPT_ID %in% finalIds)

  data$complementaryMappingTable <- data$complementaryMappingTable %>%
    dplyr::filter(
      .data$CONCEPT_ID %in% finalIds |
        .data$NEW_CONCEPT_ID %in% finalIds
    )

  data$selectedFeatureData$selectedFeatures <- data$selectedFeatureData$selectedFeatures %>%
    dplyr::filter(.data$CONCEPT_ID %in% finalIds)
  data$selectedFeatureData$selectedFeatureIds <- finalIds
  data$selectedFeatureData$selectedFeatureNames <- unique(
    data$selectedFeatureData$selectedFeatures$CONCEPT_NAME
  )
  data$trajectoryDataList <- data$selectedFeatureData

  data
}

Run one ICD10 code and one observation window

runIcd10AtlasCase <- function(cdm, icd10code, longDesc, windowConfig,
                              summaryRoot, scratchRoot) {
  studyName <- makeSafeStudyName(
    code = icd10code,
    label = longDesc,
    windowName = windowConfig$window_name
  )

  # Build the target and control cohorts for the selected ICD10 code and
  # observation window.
  targetTable <- buildIcd10TargetTable(
    cdm = cdm,
    icd10code = icd10code,
    targetStartOffset = windowConfig$target_start_offset,
    targetEndOffset = windowConfig$target_end_offset,
    minimumObservationDays = windowConfig$minimum_observation_days
  )

  controlTable <- buildIcd10VisitControl(
    cdm = cdm,
    targetTable = targetTable,
    controlLagDays = windowConfig$control_lag_days
  )

  # Skip extremely small analyses.
  if (nrow(targetTable) < 100 || nrow(controlTable) < 5) {
    return(invisible(NULL))
  }

  # Run CohortContrast in memory first.
  result <- CohortContrast::CohortContrast(
    cdm = cdm,
    targetTable = targetTable,
    controlTable = controlTable,
    pathToResults = scratchRoot,
    domainsIncluded = c(
      "Drug",
      "Condition",
      "Measurement",
      "Observation",
      "Procedure",
      "Visit",
      "Visit detail",
      "Death"
    ),
    prevalenceCutOff = 1,
    topK = FALSE,
    getSourceData = FALSE,
    runChi2YTests = TRUE,
    runLogitTests = TRUE,
    createOutputFiles = FALSE,
    complName = studyName,
    runRemoveTemporalBias = TRUE,
    removeTemporalBiasArgs = list(
      removeIdentified = TRUE,
      ratio = 1
    ),
    runAutomaticHierarchyCombineConcepts = TRUE,
    automaticHierarchyCombineConceptsArgs = list(
      abstractionLevel = -1,
      minDepthAllowed = 0,
      allowOnlyMinors = TRUE
    ),
    runAutomaticCorrelationCombineConcepts = TRUE,
    automaticCorrelationCombineConceptsArgs = list(
      abstractionLevel = -1,
      minCorrelation = 0.7,
      maxDaysInBetween = 1,
      heritageDriftAllowed = FALSE
    )
  )

  # Keep only the final selected concepts before exporting.
  result <- trimFinalStudy(result)

  # Save a temporary patient-mode study only so that precomputeSummary() can
  # generate the final summary-mode folder.
  CohortContrast:::saveResult(result, scratchRoot)

  CohortContrast::precomputeSummary(
    studyPath = file.path(scratchRoot, studyName),
    outputPath = file.path(summaryRoot, studyName),
    clusterKValues = c(2, 3, 4, 5)
  )

  # Remove the temporary patient-mode study folder and keep only the summary.
  unlink(file.path(scratchRoot, studyName), recursive = TRUE, force = TRUE)

  invisible(NULL)
}

Run the full atlas generation loop

# Directory that will keep only the summary-mode outputs.
summaryRoot <- "icd10_summary_atlas"

# Temporary directory used only while precomputeSummary() is running.
scratchRoot <- file.path(tempdir(), "cohortcontrast_icd10_scratch")

dir.create(summaryRoot, recursive = TRUE, showWarnings = FALSE)
dir.create(scratchRoot, recursive = TRUE, showWarnings = FALSE)

conn <- connectAtlasCdm()
db <- conn$db
cdm <- conn$cdm

eligibleCodes <- getEligibleIcd10Codes(cdm = cdm, minPatients = 100L)

for (i in seq_len(nrow(eligibleCodes))) {
  icd10code <- eligibleCodes$icd10code[i]
  longDesc <- eligibleCodes$long_desc[i]

  for (j in seq_len(nrow(windows))) {
    runIcd10AtlasCase(
      cdm = cdm,
      icd10code = icd10code,
      longDesc = longDesc,
      windowConfig = windows[j, ],
      summaryRoot = summaryRoot,
      scratchRoot = scratchRoot
    )
  }
}

DBI::dbDisconnect(db)

Resulting output folders

After the workflow completes, the output directory contains one summary-mode study folder per ICD10 code and per observation window.

The final retained studies:

respect the three configured observation periods
use visit-aligned control windows to reduce encounter-density bias
include only the concepts that survived the statistical and post-processing pipeline
keep only the summary outputs intended for atlas-style browsing