
Populations, variable specifications, and reporting
Source:vignettes/populations-and-reporting.Rmd
populations-and-reporting.RmdAfter tagging, filtering, and deriving, two tasks remain before the provenance record is complete:
-
Documenting populations — the definitions and
criteria for each analysis set or cohort
(
lg_population()) -
Documenting variable derivations — linking each
output variable back to its source (
lg_spec())
Both feed directly into lg_report(), which compiles
everything into a structured HTML provenance report.
Setup: a complete pipeline
We build a session with tagging, derivation, filtering, and multiple analysis populations to demonstrate the full reporting workflow.
lg_start(study_id = "TRIAL-001", analysis_id = "primary-efficacy")
#> lineager: session started [study: TRIAL-001] [analysis: primary-efficacy]
# Source data — patients and lab measurements
patients <- data.frame(
USUBJID = sprintf("PT-%03d", 1:15),
age = c(22L, 45L, 38L, 61L, 29L, 55L, 43L, 17L, 52L,
34L, 48L, 27L, 66L, 39L, 51L),
sex = rep(c("M", "F", "M"), 5L),
arm = c("TRT","TRT","CTL","TRT","CTL","CTL","TRT","TRT",
"CTL","TRT","CTL","TRT","CTL","CTL","TRT"),
enrolled = c(rep(TRUE, 13), FALSE, FALSE),
dosed = c(rep(TRUE, 10), FALSE, TRUE, TRUE, FALSE, FALSE),
stringsAsFactors = FALSE
)
labs <- data.frame(
USUBJID = sprintf("PT-%03d", c(1:10, 12:13)),
baseline = round(c(24.1,31.8,28.4,22.9,35.2,26.7,
29.1,33.4,27.8,25.5,30.2,28.9), 1),
endpoint = round(c(18.4,27.1,24.6,19.8,NA,22.3,
25.4,28.1,NA,21.7,26.4,24.1), 1),
stringsAsFactors = FALSE
)
# Tag
pts <- lg_tag(patients, dataset_id = "PATIENTS", label = "Patient registry")
#> lineager: tagged 'PATIENTS' — 15 rows, 6 cols
labs_tagged <- lg_tag(labs, dataset_id = "LABS", label = "Laboratory measurements")
#> lineager: tagged 'LABS' — 12 rows, 3 cols
cat("Patients:", nrow(pts), "\n")
#> Patients: 15
cat("Labs: ", nrow(labs_tagged), "\n")
#> Labs: 12Derive analysis variables
# Adult flag
pts <- lg_derive(pts,
adult = age >= 18L,
description = "adult: TRUE if age >= 18 years"
)
#> lineager: [PATIENTS] derive — adult: TRUE if age >= 18 years
# Derive population flags
pts <- lg_derive(pts,
ENRLFL = ifelse(enrolled, "Y", "N"),
DOSEFL = ifelse(enrolled & dosed, "Y", "N"),
description = "ENRLFL: enrolled; DOSEFL: enrolled AND received study treatment"
)
#> lineager: [PATIENTS] derive — ENRLFL: enrolled; DOSEFL: enrolled AND received study treatment
# Join lab measurements
pts_labs <- lg_join(pts, labs_tagged,
by = "USUBJID",
type = "left",
description = "Merge baseline and endpoint lab measurements from LABS"
)
#> lineager: [PATIENTS + LABS] left join — 15 rows out
# Derive analysis variables
analysis_ds <- lg_derive(pts_labs,
CHG = endpoint - baseline,
PCHG = round((endpoint - baseline) / baseline * 100, 2),
description = paste(
"CHG: absolute change from baseline (endpoint - baseline);",
"PCHG: percent change from baseline"
)
)
#> lineager: [PATIENTS] derive — CHG: absolute change from baseline (endpoint - baseline); PCHG: percent change from baseline
# Filter to enrolled, adult, dosed patients with complete endpoint
final <- analysis_ds |>
lg_filter(enrolled == TRUE, adult == TRUE,
reason = "Not enrolled or under 18",
reason_code = "NOT_ENROLLED") |>
lg_filter(dosed == TRUE,
reason = "Did not receive study treatment",
reason_code = "NOT_DOSED",
population = "SAFETY_SET") |>
lg_filter(!is.na(endpoint),
reason = "Missing primary endpoint measurement",
reason_code = "MISSING_EP",
population = "ANALYSIS_SET")
#> lineager: [PATIENTS] filter 'Not enrolled or under 18' — 15 in, 12 out, 3 excluded
#> lineager: [PATIENTS] filter 'Did not receive study treatment' — 12 in, 11 out, 1 excluded
#> lineager: [PATIENTS] filter 'Missing primary endpoint measurement' — 11 in, 9 out, 2 excluded
cat("Final analysis set:", nrow(final), "patients\n")
#> Final analysis set: 9 patients1. lg_population(): documenting analysis sets
lg_population() registers the formal definition of a
population flag. It requires that the flag variable already exists in
the dataset — compute it first with lg_derive(), then
register the definition.
lg_population(pts,
flag_var = "ENRLFL",
label = "Enrolled Set",
definition = "All patients who met eligibility criteria and were enrolled in the study",
incl_criteria = "enrolled == TRUE AND age >= 18"
)
#> lineager: population 'ENRLFL' (Enrolled Set) — 13 included, 2 excludedMultiple populations
lg_population(pts,
flag_var = "DOSEFL",
label = "Safety Set",
definition = "All enrolled patients who received at least one dose of study treatment",
incl_criteria = c("enrolled == TRUE", "dosed == TRUE"),
excl_criteria = "Enrolled but did not receive study treatment (dosed == FALSE)"
)
#> lineager: population 'DOSEFL' (Safety Set) — 12 included, 3 excludedWhat lg_population() records
env <- getFromNamespace(".lg", "lineager")
pop <- env$populations[["DOSEFL"]]
cat("Flag: ", pop$flag_var, "\n")
#> Flag: DOSEFL
cat("Label: ", pop$label, "\n")
#> Label: Safety Set
cat("N included: ", pop$n_included, "\n")
#> N included: 12
cat("N excluded: ", pop$n_excluded, "\n")
#> N excluded: 3
cat("N total: ", pop$n_total, "\n")
#> N total: 15The n_included and n_excluded counts come
from the flag_var column in the dataset at the time
lg_population() is called. Pass the most complete dataset
(before any population-specific filtering) so the counts reflect the
full enrolled cohort.
print method
print(pop)
#> <lg_population> DOSEFL — Safety Set
#> Definition : All enrolled patients who received at least one dose of study treatment
#> N included : 12
#> N excluded : 3
#> Inclusion : enrolled == TRUE; dosed == TRUE
#> Exclusion : Enrolled but did not receive study treatment (dosed == FALSE)2. lg_spec(): documenting variable derivations
lg_spec() registers a structured derivation
specification linking an output variable back to its source. Think of it
as the “Variable Derivations” section of an analysis plan, expressed as
structured R objects rather than prose.
lg_spec(
adam_dataset = "ANALYSIS",
adam_var = "CHG",
label = "Change from Baseline",
source_domain = "LABS",
source_var = "endpoint / baseline",
derivation = "endpoint - baseline; NA when endpoint is missing"
)
lg_spec(
adam_dataset = "ANALYSIS",
adam_var = "PCHG",
label = "Percent Change from Baseline",
source_domain = "LABS",
source_var = "endpoint / baseline",
derivation = "(endpoint - baseline) / baseline * 100, rounded to 2 decimal places",
conditions = "Only computed when baseline is non-missing and non-zero"
)Documenting flag derivations
lg_spec(
adam_dataset = "PATIENTS",
adam_var = "ENRLFL",
label = "Enrolled Flag",
source_domain = "PATIENTS",
source_var = "enrolled",
derivation = "Y if enrolled == TRUE; N otherwise"
)
lg_spec(
adam_dataset = "PATIENTS",
adam_var = "DOSEFL",
label = "Safety Flag",
source_domain = "PATIENTS",
source_var = "enrolled / dosed",
derivation = "Y if enrolled == TRUE AND dosed == TRUE; N otherwise"
)Overwriting a spec
If you call lg_spec() with the same
adam_dataset and adam_var, the prior spec is
replaced — useful during iterative analysis:
# Refine the CHG derivation description
lg_spec(
adam_dataset = "ANALYSIS",
adam_var = "CHG",
label = "Change from Baseline",
source_domain = "LABS",
source_var = "endpoint / baseline",
derivation = "endpoint - baseline. LOCF applied when endpoint missing at Week 12 only.",
conditions = "LOCF from Week 8 visit applied per SAP section 7.2"
)3. lg_report(): generating the provenance report
lg_report() compiles the full session into a
self-contained HTML document. It draws from everything accumulated
during the session:
| Report section | Source |
|---|---|
| Dataset Inventory | All lg_tag() calls |
| Subject Disposition | All lg_filter() exclusions |
| Population Definitions | All lg_population() registrations |
| Variable Derivations | All lg_spec() registrations |
| Operation Log | All lg_derive(), lg_join(),
lg_filter() calls |
| Exclusion Listing | Full exclusion registry |
In-memory report (inspect without writing to disk)
html <- lg_report(
output = NULL,
title = "Data Provenance Report",
study_id = "TRIAL-001",
author = "Ndoh Penn"
)
cat("Report size:", nchar(html), "characters\n")
#> Report size: 7109 characters
cat("Sections found:\n")
#> Sections found:
sections <- c("Dataset Inventory", "Subject Disposition",
"Population Flag", "Variable Derivation",
"Operation Log", "Exclusion Listing")
for (s in sections) {
cat(" ", if (grepl(s, html, ignore.case = TRUE)) "[YES]" else "[NO]",
s, "\n")
}
#> [YES] Dataset Inventory
#> [YES] Subject Disposition
#> [YES] Population Flag
#> [YES] Variable Derivation
#> [YES] Operation Log
#> [YES] Exclusion ListingWrite to file
lg_report(
output = "outputs/provenance_TRIAL001_primary.html",
title = "Data Provenance Report — TRIAL-001 Primary Analysis",
study_id = "TRIAL-001",
sponsor = "Example Pharma Ltd",
author = "Ndoh Penn, Biostatistician",
date = as.Date("2026-06-23")
)The report is entirely self-contained HTML — no external CSS, no JavaScript, no internet connection required at render time. It opens in any browser and can be attached to a regulatory submission package.
Report arguments
| Argument | Default | Purpose |
|---|---|---|
format |
"html" |
Output format (currently only HTML) |
output |
NULL |
File path; NULL returns the HTML string |
title |
"Data Provenance Report" |
Report title |
study_id |
Session study_id
|
Study identifier for header |
sponsor |
NULL |
Sponsor name |
author |
NULL |
Analyst name |
date |
Sys.Date() |
Report date |
4. A note on CDISC-specific usage
All functions in this vignette work identically for CDISC clinical data. For pharmaceutical users:
- Pass
domain = "DM"etc. inlg_tag()— domain appears in dataset inventory and distinguishes SDTM domains from ADaM datasets -
USUBJIDcolumn is detected automatically and embedded in.__lid__values for human readability -
lg_population()naturally documents SAFFL, ITTFL, PPROTFL — the labels and definitions become the Reviewer’s Guide population section -
lg_spec()documents SDTM-to-ADaM variable mappings — the report output aligns with the “Variable Derivations” section of the Reviewer’s Guide
For non-CDISC users, these arguments and features are entirely optional. The core workflow — tag, filter, trace, report — works without any CDISC-specific configuration.
5. Visualise the pipeline lineage
lg_lineage() builds a graph of the complete pipeline —
source datasets, all derive/join/filter operations, and exclusion
branches — as a Graphviz DOT string. lg_plot() renders it
inline or writes it to a file.
lin <- lg_lineage()
print(lin)
#> <lg_lineage> 2 source dataset(s), 7 operation(s), 3 exclusion branch(es)
#> Use lg_plot(lin) to render. DOT source:
#>
#> digraph lineage {
#> rankdir = TB;
#> graph [fontname="Helvetica", splines=ortho, nodesep=0.4, ranksep=0.6];
#> node [fontname="Helvetica", fontsize=10, margin="0.15,0.08"];
#> edge [fontname="Helvetica", fontsize=9, color="#6b6f80"];
#>
#> SRC_PATIENTS [label="PATIENTS\nn = 15", shape=box, style="filled,rounded", fillcolor="#e8effe", color="#1a56db", fontcolor="#0f1117"];
#> SRC_LABS [label="LABS\nn = 12", shape=box, style="filled,rounded", fillcolor="#e8effe", color="#1a56db", fontcolor="#0f1117"];
#> OP_op_0001 [label="DERIVE\nadult: TRUE if age >= 18 years", shape=ellipse, style="filled,rounded", fillcolor="#fff8e1", color="#f59e0b", fontcolor="#0f1117"];
#> DS_PATIENTS_op_0001 [label="PATIENTS\nn = 15", shape=box, style="filled,rounded", fillcolor="#ffffff", color="#6b6f80", fontcolor="#0f1117"];
#> OP_op_0002 [label="DERIVE\nENRLFL: enrolled; DOSEFL: enroll...", shape=ellipse, style="filled,rounded", fillcolor="#fff8e1", color="#f59e0b", fontcolor="#0f1117"];
#> DS_PATIENTS_op_0002 [label="PATIENTS\nn = 15", shape=box, style="filled,rounded", fillcolor="#ffffff", color="#6b6f80", fontcolor="#0f1117"];
#> OP_op_0003 [label="JOIN (left)\nby: USUBJID", shape=diamond, style="filled,rounded", fillcolor="#e8f5e9", color="#0e7a4f", fontcolor="#0f1117"];
#> DS_PATIENTS_op_0003 [label="PATIENTS\nn = 15", shape=box, style="filled,rounded", fillcolor="#ffffff", color="#6b6f80", fontcolor="#0f1117"];
#> OP_op_0004 [label="DERIVE\nCHG: absolute change from baseli...", shape=ellipse, style="filled,rounded", fillcolor="#fff8e1", color="#f59e0b", fontcolor="#0f1117"];
#> DS_PATIENTS_op_0004 [label="PATIENTS\nn = 15", shape=box, style="filled,rounded", fillcolor="#ffffff", color="#6b6f80", fontcolor="#0f1117"];
#> OP_op_0005 [label="FILTER\nNot enrolled or under 18\n−3 rows", shape=ellipse, style="filled,rounded", fillcolor="#fff3e0", color="#ea8c00", fontcolor="#0f1117"];
#> DS_PATIENTS_op_0005 [label="PATIENTS\nn = 12", shape=box, style="filled,rounded", fillcolor="#ffffff", color="#6b6f80", fontcolor="#0f1117"];
#> EXCL_op_0005 [label="excluded\nn = 3", shape=plaintext, fontcolor="#dc2626", fontsize=9];
#> OP_op_0006 [label="FILTER\nDid not receive study treatment\n−1 rows", shape=ellipse, style="filled,rounded", fillcolor="#fff3e0", color="#ea8c00", fontcolor="#0f1117"];
#> DS_PATIENTS_op_0006 [label="PATIENTS\nn = 11", shape=box, style="filled,rounded", fillcolor="#ffffff", color="#6b6f80", fontcolor="#0f1117"];
#> EXCL_op_0006 [label="excluded\nn = 1", shape=plaintext, fontcolor="#dc2626", fontsize=9];
#> OP_op_0007 [label="FILTER\nMissing primary endpoint measure...\n−2 rows", shape=ellipse, style="filled,rounded", fillcolor="#fff3e0", color="#ea8c00", fontcolor="#0f1117"];
#> DS_PATIENTS_op_0007 [label="PATIENTS\nn = 9", shape=box, style="filled,rounded", fillcolor="#ffffff", color="#6b6f80", fontcolor="#0f1117"];
#> EXCL_op_0007 [label="excluded\nn = 2", shape=plaintext, fontcolor="#dc2626", fontsize=9];
#>
#> SRC_PATIENTS -> OP_op_0001;
#> OP_op_0001 -> DS_PATIENTS_op_0001;
#> DS_PATIENTS_op_0001 -> OP_op_0002;
#> OP_op_0002 -> DS_PATIENTS_op_0002;
#> DS_PATIENTS_op_0002 -> OP_op_0003 [label=" x "];
#> SRC_LABS -> OP_op_0003 [label=" y "];
#> OP_op_0003 -> DS_PATIENTS_op_0003;
#> DS_PATIENTS_op_0003 -> OP_op_0004;
#> OP_op_0004 -> DS_PATIENTS_op_0004;
#> DS_PATIENTS_op_0004 -> OP_op_0005 [label=" n=15 "];
#> OP_op_0005 -> DS_PATIENTS_op_0005;
#> OP_op_0005 -> EXCL_op_0005;
#> DS_PATIENTS_op_0005 -> OP_op_0006 [label=" n=12 "];
#> OP_op_0006 -> DS_PATIENTS_op_0006;
#> OP_op_0006 -> EXCL_op_0006;
#> DS_PATIENTS_op_0006 -> OP_op_0007 [label=" n=11 "];
#> OP_op_0007 -> DS_PATIENTS_op_0007;
#> OP_op_0007 -> EXCL_op_0007;
#> }
# Render inline (requires DiagrammeR)
lg_plot(lin)
# Export DOT for Graphviz or https://dreampuf.github.io/GraphvizOnline/
lg_plot(lin, output = "outputs/pipeline.dot")The lineage graph for this pipeline shows the two source datasets (PATIENTS and LABS) flowing through four derive steps, a left join, and three filter steps with exclusion branches at each stage.
6. End the session
lg_end()
#> lineager: session ended — 7 operation(s), 6 exclusion(s), 2 population(s), 4 var spec(s)