## Why splitGraph exists

Leakage in biomedical evaluation workflows often comes from dataset structure rather than from an obvious coding mistake. Two samples may look independent in a model matrix while still sharing the same subject, batch, study, timepoint, or feature provenance. If those relationships stay implicit, train/test separation can look correct while violating the scientific separation you actually intended.
splitGraph exists to make those relationships explicit
before evaluation. It turns metadata into a typed dependency graph that
can be:
- converted into a canonical handoff object via the split_spec class and the as_split_spec() / validate_split_spec() API

The package is intentionally narrow. It does not fit models, run preprocessing pipelines, or generate resamples by itself. Its job is to represent dependency structure clearly enough that downstream evaluation can be trustworthy.
The example below includes exactly the kinds of relationships that usually matter for leakage-aware evaluation:
- repeated samples per subject (P1 and P2)
- batch reuse (B1)
- a subject (P2) appearing across studies

meta <- data.frame(
sample_id = c("S1", "S2", "S3", "S4", "S5", "S6"),
subject_id = c("P1", "P1", "P2", "P3", "P4", "P2"),
batch_id = c("B1", "B2", "B1", "B3", NA, "B1"),
study_id = c("ST1", "ST1", "ST1", "ST2", "ST3", "ST2"),
timepoint_id = c("T0", "T1", "T0", "T2", NA, "T1"),
assay_id = c("RNAseq", "RNAseq", "RNAseq", "RNAseq", "Proteomics", "RNAseq"),
featureset_id = c("FS_GLOBAL", "FS_GLOBAL", "FS_GLOBAL", "FS_GLOBAL", "FS_PROT", "FS_GLOBAL"),
outcome_id = c("O_case", "O_case", "O_ctrl", "O_case", "O_ctrl", "O_ctrl"),
stringsAsFactors = FALSE
)
meta
#> sample_id subject_id batch_id study_id timepoint_id assay_id featureset_id
#> 1 S1 P1 B1 ST1 T0 RNAseq FS_GLOBAL
#> 2 S2 P1 B2 ST1 T1 RNAseq FS_GLOBAL
#> 3 S3 P2 B1 ST1 T0 RNAseq FS_GLOBAL
#> 4 S4 P3 B3 ST2 T2 RNAseq FS_GLOBAL
#> 5 S5 P4 <NA> ST3 <NA> Proteomics FS_PROT
#> 6 S6 P2 B1 ST2 T1 RNAseq FS_GLOBAL
#> outcome_id
#> 1 O_case
#> 2 O_case
#> 3 O_ctrl
#> 4 O_case
#> 5 O_ctrl
#> 6 O_ctrl

This is still a small example, but it already contains enough structure to make naive random splitting risky.
## graph_from_metadata()

When your metadata already uses the canonical column names
(sample_id, subject_id, batch_id,
study_id, timepoint_id,
time_index, assay_id,
featureset_id, outcome_id /
outcome_value), graph_from_metadata() does
ingestion, typed node construction, canonical edge construction, and
optional timepoint_precedes derivation in a single
call:
# One-call construction: when the metadata already uses the canonical column
# names, graph_from_metadata() does ingestion, typed node construction,
# canonical edge construction, and timepoint_precedes derivation together.
quick_graph <- graph_from_metadata(
data.frame(
sample_id = c("S1", "S2", "S3", "S4", "S5", "S6"),
subject_id = c("P1", "P1", "P2", "P2", "P3", "P3"),
batch_id = c("B1", "B2", "B1", "B2", "B1", "B2"),
timepoint_id = c("T0", "T1", "T0", "T1", "T0", "T1"),
time_index = c(0, 1, 0, 1, 0, 1),
outcome_value = c(0, 1, 0, 1, 1, 0)
),
graph_name = "quick_demo"
)
# Print the resulting <dependency_graph>; node/edge counts are shown below.
quick_graph
#> <dependency_graph> quick_demo
#> Nodes: 15
#> Edges: 25

The rest of this vignette uses the explicit constructor path because
it lets us show node attributes (time_index,
visit_label, platform,
derivation_scope) and non-canonical edges
(featureset_generated_from_*,
subject_has_outcome) that
graph_from_metadata() does not build for you. Use
graph_from_metadata() when the canonical columns are
enough; use the explicit path when you need custom attributes or extra
relations.
The first step is to standardize metadata and then turn each entity type into canonical graph nodes. Sample-level relations become typed edges.
# Standardize the raw metadata and tag it with a dataset name (read back
# later via attr(meta, "dataset_name") when building the graph).
meta <- ingest_metadata(meta, dataset_name = "VignetteDemo")
# One typed node set per entity type; id_col supplies the node key.
sample_nodes <- create_nodes(meta, type = "Sample", id_col = "sample_id")
subject_nodes <- create_nodes(meta, type = "Subject", id_col = "subject_id")
batch_nodes <- create_nodes(meta, type = "Batch", id_col = "batch_id")
study_nodes <- create_nodes(meta, type = "Study", id_col = "study_id")
# Timepoint nodes carry ordering attributes (time_index, visit_label).
time_nodes <- create_nodes(
data.frame(
timepoint_id = c("T0", "T1", "T2"),
time_index = c(0L, 1L, 2L),
visit_label = c("baseline", "follow_up", "late_follow_up"),
stringsAsFactors = FALSE
),
type = "Timepoint",
id_col = "timepoint_id",
attr_cols = c("time_index", "visit_label")
)
# Assay nodes with modality/platform attributes.
assay_nodes <- create_nodes(
data.frame(
assay_id = c("RNAseq", "Proteomics"),
modality = c("transcriptomics", "proteomics"),
platform = c("NovaSeq", "Orbitrap"),
stringsAsFactors = FALSE
),
type = "Assay",
id_col = "assay_id",
attr_cols = c("modality", "platform")
)
# FeatureSet nodes; derivation_scope ("per_dataset" vs "external") feeds
# the per_dataset_featureset advisory shown later in validation.
featureset_nodes <- create_nodes(
data.frame(
featureset_id = c("FS_GLOBAL", "FS_PROT"),
featureset_name = c("global_rna_signature", "proteomics_panel"),
derivation_scope = c("per_dataset", "external"),
feature_count = c(500L, 80L),
stringsAsFactors = FALSE
),
type = "FeatureSet",
id_col = "featureset_id",
attr_cols = c("featureset_name", "derivation_scope", "feature_count")
)
# Outcome nodes: a binary response observed at the subject level.
outcome_nodes <- create_nodes(
data.frame(
outcome_id = c("O_case", "O_ctrl"),
outcome_name = c("response", "response"),
outcome_type = c("binary", "binary"),
observation_level = c("subject", "subject"),
stringsAsFactors = FALSE
),
type = "Outcome",
id_col = "outcome_id",
attr_cols = c("outcome_name", "outcome_type", "observation_level")
)
# Sample-level relations become typed edges. allow_missing = TRUE lets
# rows with NA in the target column (S5 has NA batch and NA timepoint)
# be skipped rather than erroring.
subject_edges <- create_edges(
meta, "sample_id", "subject_id",
"Sample", "Subject", "sample_belongs_to_subject"
)
batch_edges <- create_edges(
meta, "sample_id", "batch_id",
"Sample", "Batch", "sample_processed_in_batch",
allow_missing = TRUE
)
study_edges <- create_edges(
meta, "sample_id", "study_id",
"Sample", "Study", "sample_from_study"
)
time_edges <- create_edges(
meta, "sample_id", "timepoint_id",
"Sample", "Timepoint", "sample_collected_at_timepoint",
allow_missing = TRUE
)
assay_edges <- create_edges(
meta, "sample_id", "assay_id",
"Sample", "Assay", "sample_measured_by_assay"
)
featureset_edges <- create_edges(
meta, "sample_id", "featureset_id",
"Sample", "FeatureSet", "sample_uses_featureset"
)
# Subject-level outcomes: one row per subject, not per sample.
outcome_edges <- create_edges(
data.frame(
subject_id = c("P1", "P2", "P3", "P4"),
outcome_id = c("O_case", "O_ctrl", "O_case", "O_ctrl"),
stringsAsFactors = FALSE
),
"subject_id", "outcome_id",
"Subject", "Outcome", "subject_has_outcome"
)
# Explicit time ordering: T0 -> T1 -> T2.
precedence_edges <- create_edges(
data.frame(
from_timepoint = c("T0", "T1"),
to_timepoint = c("T1", "T2"),
stringsAsFactors = FALSE
),
"from_timepoint", "to_timepoint",
"Timepoint", "Timepoint", "timepoint_precedes"
)
# Provenance: FS_GLOBAL was derived from study ST1 (a non-canonical
# relation that graph_from_metadata() would not build automatically).
featureset_from_study <- create_edges(
data.frame(
featureset_id = "FS_GLOBAL",
study_id = "ST1",
stringsAsFactors = FALSE
),
"featureset_id", "study_id",
"FeatureSet", "Study", "featureset_generated_from_study"
)
# Provenance: FS_GLOBAL was also derived using batch B1.
featureset_from_batch <- create_edges(
data.frame(
featureset_id = "FS_GLOBAL",
batch_id = "B1",
stringsAsFactors = FALSE
),
"featureset_id", "batch_id",
"FeatureSet", "Batch", "featureset_generated_from_batch"
)

The node and edge tables are canonical and typed. The package assigns
globally unique node IDs such as sample:S1 and
subject:P1, so different entity types cannot collide
accidentally.
sample_nodes
#> <graph_node_set> 6 nodes across 1 types (schema 0.1.0 )
as.data.frame(sample_nodes)[, c("node_id", "node_type", "node_key", "label")]
#> node_id node_type node_key label
#> 1 sample:S1 Sample S1 S1
#> 2 sample:S2 Sample S2 S2
#> 3 sample:S3 Sample S3 S3
#> 4 sample:S4 Sample S4 S4
#> 5 sample:S5 Sample S5 S5
#> 6 sample:S6 Sample S6 S6
# Stack all ten edge sets into one data.frame for inspection.
edge_preview <- do.call(rbind, lapply(
list(
subject_edges, batch_edges, study_edges, time_edges,
assay_edges, featureset_edges, outcome_edges,
precedence_edges, featureset_from_study, featureset_from_batch
),
as.data.frame
))
# Show the typed from/to/edge_type triples.
edge_preview[, c("from", "to", "edge_type")]
#> from to edge_type
#> 1 sample:S1 subject:P1 sample_belongs_to_subject
#> 2 sample:S2 subject:P1 sample_belongs_to_subject
#> 3 sample:S3 subject:P2 sample_belongs_to_subject
#> 4 sample:S4 subject:P3 sample_belongs_to_subject
#> 5 sample:S5 subject:P4 sample_belongs_to_subject
#> 6 sample:S6 subject:P2 sample_belongs_to_subject
#> 7 sample:S1 batch:B1 sample_processed_in_batch
#> 8 sample:S2 batch:B2 sample_processed_in_batch
#> 9 sample:S3 batch:B1 sample_processed_in_batch
#> 10 sample:S4 batch:B3 sample_processed_in_batch
#> 11 sample:S6 batch:B1 sample_processed_in_batch
#> 12 sample:S1 study:ST1 sample_from_study
#> 13 sample:S2 study:ST1 sample_from_study
#> 14 sample:S3 study:ST1 sample_from_study
#> 15 sample:S4 study:ST2 sample_from_study
#> 16 sample:S5 study:ST3 sample_from_study
#> 17 sample:S6 study:ST2 sample_from_study
#> 18 sample:S1 timepoint:T0 sample_collected_at_timepoint
#> 19 sample:S2 timepoint:T1 sample_collected_at_timepoint
#> 20 sample:S3 timepoint:T0 sample_collected_at_timepoint
#> 21 sample:S4 timepoint:T2 sample_collected_at_timepoint
#> 22 sample:S6 timepoint:T1 sample_collected_at_timepoint
#> 23 sample:S1 assay:RNAseq sample_measured_by_assay
#> 24 sample:S2 assay:RNAseq sample_measured_by_assay
#> 25 sample:S3 assay:RNAseq sample_measured_by_assay
#> 26 sample:S4 assay:RNAseq sample_measured_by_assay
#> 27 sample:S5 assay:Proteomics sample_measured_by_assay
#> 28 sample:S6 assay:RNAseq sample_measured_by_assay
#> 29 sample:S1 featureset:FS_GLOBAL sample_uses_featureset
#> 30 sample:S2 featureset:FS_GLOBAL sample_uses_featureset
#> 31 sample:S3 featureset:FS_GLOBAL sample_uses_featureset
#> 32 sample:S4 featureset:FS_GLOBAL sample_uses_featureset
#> 33 sample:S5 featureset:FS_PROT sample_uses_featureset
#> 34 sample:S6 featureset:FS_GLOBAL sample_uses_featureset
#> 35 subject:P1 outcome:O_case subject_has_outcome
#> 36 subject:P2 outcome:O_ctrl subject_has_outcome
#> 37 subject:P3 outcome:O_case subject_has_outcome
#> 38 subject:P4 outcome:O_ctrl subject_has_outcome
#> 39 timepoint:T0 timepoint:T1 timepoint_precedes
#> 40 timepoint:T1 timepoint:T2 timepoint_precedes
#> 41 featureset:FS_GLOBAL study:ST1 featureset_generated_from_study
#> 42 featureset:FS_GLOBAL batch:B1 featureset_generated_from_batch

The node table shows the canonical sample IDs that everything else refers to. The edge table shows the package’s central design choice: dependency structure is explicit, typed, and inspectable.
# Assemble all node and edge sets into a single dependency_graph object
# (backed by both tabular and igraph representations).
graph <- build_dependency_graph(
nodes = list(
sample_nodes, subject_nodes, batch_nodes, study_nodes,
time_nodes, assay_nodes, featureset_nodes, outcome_nodes
),
edges = list(
subject_edges, batch_edges, study_edges, time_edges,
assay_edges, featureset_edges, outcome_edges,
precedence_edges, featureset_from_study, featureset_from_batch
),
graph_name = "vignette_graph",
# dataset_name was stored on `meta` by ingest_metadata() above.
dataset_name = attr(meta, "dataset_name")
)
graph
#> <dependency_graph> vignette_graph
#> Nodes: 25
#> Edges: 42
summary(graph)
#> $graph_name
#> [1] "vignette_graph"
#>
#> $dataset_name
#> [1] "VignetteDemo"
#>
#> $schema_version
#> [1] "0.1.0"
#>
#> $n_nodes
#> [1] 25
#>
#> $n_edges
#> [1] 42
#>
#> $node_types
#> value n
#> 1 Sample 6
#> 2 Subject 4
#> 3 Batch 3
#> 4 Study 3
#> 5 Timepoint 3
#> 6 Assay 2
#> 7 FeatureSet 2
#> 8 Outcome 2
#>
#> $edge_types
#> value n
#> 1 sample_belongs_to_subject 6
#> 2 sample_from_study 6
#> 3 sample_measured_by_assay 6
#> 4 sample_uses_featureset 6
#> 5 sample_collected_at_timepoint 5
#> 6 sample_processed_in_batch 5
#> 7 subject_has_outcome 4
#> 8 timepoint_precedes 2
#> 9 featureset_generated_from_batch 1
#> 10 featureset_generated_from_study 1

At this point the package has a single dependency_graph
object with both tabular and igraph representations behind
it. The summary is useful because it tells you exactly which entity
types and relation types are present before you derive any split
rules.
plot() renders the graph with a typed, layered layout:
Sample on top, peer dependencies (Subject,
Batch, Study, Timepoint) in the
middle band, Assay/FeatureSet below that, and
Outcome at the bottom. Node colors are keyed to type and an
auto-generated legend is drawn by default.
Useful options:
Validation is where splitGraph starts paying off. The
graph below is structurally valid, but it still carries leakage-relevant
warnings and advisories.
# Structural + leakage validation. Valid: TRUE means no errors; warnings
# and advisories about leakage risk can still be present.
validation <- validate_graph(graph)
validation
#> <depgraph_validation_report> vignette_graph
#> Valid: TRUE
#> Issues: 6
#> By severity:
#> - advisory : 5
#> - warning : 1
# Inspect the individual issues as a data.frame.
as.data.frame(validation)[, c("level", "severity", "code", "message")]
#> level severity code
#> 1 leakage advisory repeated_subject_samples
#> 2 leakage advisory repeated_subject_samples
#> 3 leakage warning subject_cross_study_overlap
#> 4 leakage advisory per_dataset_featureset
#> 5 leakage advisory shared_featureset_provenance
#> 6 leakage advisory heavy_batch_reuse
#> message
#> 1 Subject `subject:P1` is linked to multiple samples.
#> 2 Subject `subject:P2` is linked to multiple samples.
#> 3 Subject `subject:P2` appears across multiple studies.
#> 4 FeatureSet `featureset:FS_GLOBAL` was derived at the full-dataset scope.
#> 5 FeatureSet `featureset:FS_GLOBAL` is shared across multiple samples.
#> 6 Batch `batch:B1` is reused across many samples.

That output is the core value proposition of the package in one place:
valid = TRUE here means the graph has no errors. It does
not mean the dataset is free of leakage risk. Warnings and advisories
still matter.
The package is also intentionally strict about silent failure. If you ask for a subset of samples and some of them do not resolve, it errors instead of dropping them.
# Unknown sample IDs error rather than being silently dropped; capture
# the error message to show the behavior.
tryCatch(
derive_split_constraints(graph, mode = "subject", samples = c("S1", "BAD")),
error = function(e) e$message
)
#> [1] "Unknown sample IDs: BAD"

That behavior is important in practice because quietly omitting samples would change the truth of the split problem.
splitGraph can derive direct constraints for subject,
batch, study, and time as well as composite constraints that combine
multiple dependency sources.
# Direct constraints: one dependency source each.
subject_constraint <- derive_split_constraints(graph, mode = "subject")
batch_constraint <- derive_split_constraints(graph, mode = "batch")
study_constraint <- derive_split_constraints(graph, mode = "study")
time_constraint <- derive_split_constraints(graph, mode = "time")
# Composite, strict: transitive closure over Subject + Batch links, so
# connected samples collapse into one dependency component.
strict_constraint <- derive_split_constraints(
graph,
mode = "composite",
strategy = "strict",
via = c("Subject", "Batch")
)
# Composite, rule-based: highest-priority available dependency per sample.
rule_based_constraint <- derive_split_constraints(
graph,
mode = "composite",
strategy = "rule_based",
priority = c("batch", "study", "subject", "time")
)
# Summarize each constraint: strategy, number of groups, warning count.
constraint_overview <- do.call(rbind, lapply(
list(
subject = subject_constraint,
batch = batch_constraint,
study = study_constraint,
time = time_constraint,
composite_strict = strict_constraint,
composite_rule = rule_based_constraint
),
function(x) {
data.frame(
strategy = x$strategy,
groups = length(unique(x$sample_map$group_id)),
# metadata$warnings may be NULL when a constraint raised none.
warnings = if (is.null(x$metadata$warnings)) 0L else length(x$metadata$warnings),
stringsAsFactors = FALSE
)
}
))
# Move the list names from row names into a regular column.
constraint_overview <- cbind(constraint = row.names(constraint_overview), constraint_overview)
row.names(constraint_overview) <- NULL
constraint_overview
#> constraint strategy groups warnings
#> 1 subject subject 4 0
#> 2 batch batch 4 1
#> 3 study study 3 0
#> 4 time time 4 1
#> 5 composite_strict strict 3 0
#> 6 composite_rule rule_based 4 0

That summary already shows why the package is useful: different notions of dependency produce different splitting units.
batch_constraint
#> <split_constraint> batch
#> Samples: 6
#> Groups: 4
#> Warnings: 1
as.data.frame(batch_constraint)[, c("sample_id", "group_id", "group_label", "explanation")]
#> sample_id group_id group_label
#> 1 S1 batch:B1 B1
#> 2 S2 batch:B2 B2
#> 3 S3 batch:B1 B1
#> 4 S4 batch:B3 B3
#> 5 S5 batch:unlinked:S5 unlinked_S5
#> 6 S6 batch:B1 B1
#> explanation
#> 1 Grouped by batch through sample_processed_in_batch -> B1.
#> 2 Grouped by batch through sample_processed_in_batch -> B2.
#> 3 Grouped by batch through sample_processed_in_batch -> B1.
#> 4 Grouped by batch through sample_processed_in_batch -> B3.
#> 5 No batch assignment was available; sample retained as an unlinked singleton group.
#> 6 Grouped by batch through sample_processed_in_batch -> B1.

Batch grouping keeps all B1 samples together and
preserves S5 as an explicit singleton because it has no
batch assignment. Missing structure is not hidden.
time_constraint
#> <split_constraint> time
#> Samples: 6
#> Groups: 4
#> Warnings: 1
as.data.frame(time_constraint)[, c("sample_id", "group_id", "timepoint_id", "order_rank")]
#> sample_id group_id timepoint_id order_rank
#> 1 S1 time:T0 T0 1
#> 2 S2 time:T1 T1 2
#> 3 S3 time:T0 T0 1
#> 4 S4 time:T2 T2 3
#> 5 S5 time:unlinked:S5 <NA> NA
#> 6 S6 time:T1 T1 2

Time grouping adds order_rank, which is the field
downstream tooling actually needs for ordered evaluation. The missing
timepoint on S5 stays visible as NA, so
ordering is partial rather than pretended.
strict_constraint
#> <split_constraint> strict
#> Samples: 6
#> Groups: 3
as.data.frame(strict_constraint)[, c("sample_id", "group_id", "constraint_type")]
#> sample_id group_id constraint_type
#> 1 S1 component_1 composite_strict
#> 2 S2 component_1 composite_strict
#> 3 S3 component_1 composite_strict
#> 4 S4 component_2 composite_strict
#> 5 S5 component_3 composite_strict
#> 6 S6 component_1 composite_strict
rule_based_constraint
#> <split_constraint> rule_based
#> Samples: 6
#> Groups: 4
as.data.frame(rule_based_constraint)[, c("sample_id", "group_id", "constraint_type", "group_label")]
#> sample_id group_id constraint_type group_label
#> 1 S1 composite_batch:B1 batch B1
#> 2 S2 composite_batch:B2 batch B2
#> 3 S3 composite_batch:B1 batch B1
#> 4 S4 composite_batch:B3 batch B3
#> 5 S5 composite_study:ST3 study ST3
#> 6 S6 composite_batch:B1 batch B1

The strict composite constraint uses transitive closure:
S1, S2, S3, and S6
end up in the same group because subject and batch links connect them
into one dependency component. The rule-based composite constraint is
different: it uses the highest-priority available dependency per sample,
so S5 falls back to study-level grouping instead of
becoming a composite component.
If explicit time_index metadata are unavailable,
splitGraph can still infer time order from
timepoint_precedes edges.
# Minimal metadata with no time_index column; time ordering must come
# from timepoint_precedes edges instead.
precedence_meta <- data.frame(
sample_id = c("S1", "S2", "S3"),
subject_id = c("P1", "P1", "P2"),
study_id = c("ST1", "ST1", "ST2"),
timepoint_id = c("T0", "T1", "T2"),
stringsAsFactors = FALSE
)
# Timepoint nodes here carry no time_index attribute; the only ordering
# information is the pair of timepoint_precedes edges below.
precedence_graph <- build_dependency_graph(
nodes = list(
create_nodes(precedence_meta, type = "Sample", id_col = "sample_id"),
create_nodes(precedence_meta, type = "Subject", id_col = "subject_id"),
create_nodes(precedence_meta, type = "Study", id_col = "study_id"),
create_nodes(
data.frame(timepoint_id = c("T0", "T1", "T2"), stringsAsFactors = FALSE),
type = "Timepoint",
id_col = "timepoint_id"
)
),
edges = list(
create_edges(
precedence_meta, "sample_id", "subject_id",
"Sample", "Subject", "sample_belongs_to_subject"
),
create_edges(
precedence_meta, "sample_id", "study_id",
"Sample", "Study", "sample_from_study"
),
create_edges(
precedence_meta, "sample_id", "timepoint_id",
"Sample", "Timepoint", "sample_collected_at_timepoint"
),
# T0 -> T1 -> T2 precedence chain.
create_edges(
data.frame(
from_timepoint = c("T0", "T1"),
to_timepoint = c("T1", "T2"),
stringsAsFactors = FALSE
),
"from_timepoint", "to_timepoint",
"Timepoint", "Timepoint", "timepoint_precedes"
)
),
graph_name = "precedence_only_graph"
)
# Time constraint derived without time_index metadata; the recorded
# ordering source is the timepoint_precedes edges.
precedence_time_constraint <- derive_split_constraints(precedence_graph, mode = "time")
precedence_time_constraint$metadata$time_order_source
#> [1] "timepoint_precedes"
as.data.frame(precedence_time_constraint)[, c("sample_id", "timepoint_id", "time_index", "order_rank")]
#> sample_id timepoint_id time_index order_rank
#> 1 S1 T0 NA 1
#> 2 S2 T1 NA 2
#> 3 S3 T2 NA 3

The important detail is that ordering is still derived, but the
source is timepoint_precedes rather than
time_index.
The graph-derived constraint is not the end of the workflow. The main
handoff target is a canonical sample-level split specification — the
split_spec class. Downstream tools consume it through their
own adapters, so split_spec stays tool-agnostic.
# Convert the strict composite constraint into the tool-agnostic
# split_spec handoff object.
split_spec <- as_split_spec(strict_constraint, graph = graph)
split_spec
#> <split_spec> composite
#> Samples: 6
#> Groups: 3
#> Recommended resampling: custom_grouped_cv
# Sample-level view: split unit, blocking columns, and ordering.
as.data.frame(split_spec)[, c(
"sample_id", "group_id", "batch_group", "study_group", "timepoint_id", "order_rank"
)]
#> sample_id group_id batch_group study_group timepoint_id order_rank
#> 1 S1 component_1 B1 ST1 T0 1
#> 2 S2 component_1 B2 ST1 T1 2
#> 3 S3 component_1 B1 ST1 T0 1
#> 4 S4 component_2 B3 ST2 T2 3
#> 5 S5 component_3 <NA> ST3 <NA> NA
#> 6 S6 component_1 B1 ST2 T1 2
# Preflight validation of the split spec before handing it downstream.
split_spec_validation <- validate_split_spec(split_spec)
split_spec_validation
#> <split_spec_validation>
#> Valid: TRUE
#> Issues: 0
as.data.frame(split_spec_validation)
#> [1] issue_id severity code message n_affected details
#> <0 rows> (or 0-length row.names)

This translation step is where the package becomes operational for downstream evaluation workflows:

- group_id carries the split unit
- batch_group and study_group are available for blocking
- order_rank is available for ordered evaluation

The final helper combines graph validation, constraint diagnostics, and split-spec readiness into one summary object.
# Combine graph validation, constraint diagnostics, and split-spec
# readiness into a single reviewable summary object.
risk_summary <- summarize_leakage_risks(
graph,
constraint = strict_constraint,
split_spec = split_spec
)
risk_summary
#> <leakage_risk_summary>
#> Overview: Detected 12 structural leakage diagnostics across validation, constraint, and split-spec readiness.
#> Diagnostics: 12
as.data.frame(risk_summary)[, c("source", "severity", "category", "message")]
#> source severity category
#> 1 validation advisory repeated_subject_samples
#> 2 validation advisory repeated_subject_samples
#> 3 validation warning subject_cross_study_overlap
#> 4 validation advisory per_dataset_featureset
#> 5 validation advisory shared_featureset_provenance
#> 6 validation advisory heavy_batch_reuse
#> 7 constraint advisory singleton_heavy_constraint
#> 8 split_spec advisory split_spec_ready
#> 9 split_spec advisory ordering_available
#> 10 split_spec advisory blocking_available
#> 11 split_spec advisory blocking_available
#> 12 split_spec advisory split_spec_singleton_heavy
#> message
#> 1 Subject `subject:P1` is linked to multiple samples.
#> 2 Subject `subject:P2` is linked to multiple samples.
#> 3 Subject `subject:P2` appears across multiple studies.
#> 4 FeatureSet `featureset:FS_GLOBAL` was derived at the full-dataset scope.
#> 5 FeatureSet `featureset:FS_GLOBAL` is shared across multiple samples.
#> 6 Batch `batch:B1` is reused across many samples.
#> 7 The derived split constraint is dominated by singleton groups.
#> 8 Split spec passed preflight validation.
#> 9 Split spec provides ordering through `order_rank` for 5 of 6 samples.
#> 10 Split spec provides blocking variable `batch_group` for 5 of 6 samples.
#> 11 Split spec provides blocking variable `study_group` for 6 of 6 samples.
#> 12 Split spec grouping is dominated by singleton groups.

This is a useful stopping point before model training. It gives you one place to review whether the graph is structurally sound, whether the chosen constraint is overly singleton-heavy, and whether the downstream split spec is ready to use.
split_spec is the tool-agnostic handoff artifact.
splitGraph does not know about any particular resampling
package — downstream consumers provide their own adapters so
splitGraph stays neutral. The typical end-to-end flow
is:
1. graph_from_metadata() (or the explicit constructor path) → typed dependency_graph
2. derive_split_constraints(g, mode = ...) → split_constraint
3. as_split_spec(constraint, graph = g) → split_spec

The sample_data frame carried by split_spec
exposes exactly the columns downstream adapters consume:
sample_id for joining against the observation frame,
group_id for grouped resampling, batch_group /
study_group for blocking, and order_rank for
ordered evaluation. Adapters can be built by any package that wants to
consume a split_spec — for example, on top of
rsample::group_vfold_cv() (grouped CV keyed to
group_id) or rsample::rolling_origin()
(ordered evaluation keyed to order_rank).
The end-to-end workflow above shows the package surface. The case studies below show how the same graph leads to different evaluation decisions depending on the scientific question.
Suppose the real question is whether future observations from the same subject should be held out from training. In this setting, subject reuse and time ordering both matter, but they solve different problems.
# Subject grouping as a named vector; time grouping with order_rank kept
# for ordered evaluation.
subject_groups <- grouping_vector(subject_constraint)
time_groups <- time_constraint$sample_map[, c("sample_id", "group_id", "timepoint_id", "order_rank")]
subject_groups
#> S1 S2 S3 S4 S5 S6
#> "subject:P1" "subject:P1" "subject:P2" "subject:P3" "subject:P4" "subject:P2"
time_groups
#> sample_id group_id timepoint_id order_rank
#> 1 S1 time:T0 T0 1
#> 2 S2 time:T1 T1 2
#> 3 S3 time:T0 T0 1
#> 4 S4 time:T2 T2 3
#> 5 S5 time:unlinked:S5 <NA> NA
#> 6 S6 time:T1 T1 2

Interpretation:

- S1 and S2 share subject P1, so subject-grouped evaluation keeps them together.
- S3 and S6 share subject P2, so they also stay together under a subject-based split.
- T0, T1, and T2 become ordered units with explicit order_rank.

If the leakage concern is repeated measurements from the same individual, use the subject constraint. If the evaluation question is prospective prediction, the time constraint adds the ordering information you need.
The graph intentionally includes subject P2 in both
ST1 and ST2. A study-only split would treat
those studies as separate units, but the graph shows that subject
overlap breaks the intended independence.
# Pull the cross-study warning rows from the validation report.
cross_study_issues <- as.data.frame(validation)[
as.data.frame(validation)$code == "subject_cross_study_overlap",
c("severity", "code", "message")
]
# Query the dependency shared between S3 and S6 via Subject nodes.
p2_shared <- detect_shared_dependencies(
graph,
via = "Subject",
samples = c("S3", "S6")
)
# Contrast the study-only grouping with the strict composite grouping.
study_only_map <- study_constraint$sample_map[, c("sample_id", "group_id", "group_label")]
strict_map <- strict_constraint$sample_map[, c("sample_id", "group_id", "constraint_type")]
cross_study_issues
#> severity code
#> 3 warning subject_cross_study_overlap
#> message
#> 3 Subject `subject:P2` appears across multiple studies.
as.data.frame(p2_shared)
#> sample_id_1 sample_id_2 sample_node_id_1 sample_node_id_2 shared_node_id
#> 1 S3 S6 sample:S3 sample:S6 subject:P2
#> shared_node_type edge_type
#> 1 Subject sample_belongs_to_subject
study_only_map[study_only_map$sample_id %in% c("S3", "S6"), ]
#> sample_id group_id group_label
#> 3 S3 study:ST1 ST1
#> 6 S6 study:ST2 ST2
strict_map[strict_map$sample_id %in% c("S3", "S6"), ]
#> sample_id group_id constraint_type
#> 3 S3 component_1 composite_strict
#> 6 S6 component_1 composite_strict

Interpretation:

- the shared-dependency query shows that S3 and S6 are linked through the same subject
- the study-only map nevertheless places them in different groups (ST1 versus ST2)

This is exactly the kind of failure mode splitGraph is
designed to expose: metadata columns suggest a legitimate study split,
but graph structure shows that the split would still leak subject
information.
Real metadata are rarely complete. Here, S5 has no batch
assignment and no timepoint assignment. The package does not pretend
those fields exist. It keeps the sample visible and tells you how the
split logic handled it.
# How each strategy handled S5, the sample with no batch or timepoint.
batch_missing <- batch_constraint$sample_map[
batch_constraint$sample_map$sample_id == "S5",
c("sample_id", "group_id", "group_label", "explanation")
]
rule_based_missing <- rule_based_constraint$sample_map[
rule_based_constraint$sample_map$sample_id == "S5",
c("sample_id", "group_id", "constraint_type", "group_label", "explanation")
]
split_spec_missing <- as.data.frame(split_spec)[
as.data.frame(split_spec)$sample_id == "S5",
c("sample_id", "group_id", "batch_group", "study_group", "timepoint_id", "order_rank")
]
batch_missing
#> sample_id group_id group_label
#> 5 S5 batch:unlinked:S5 unlinked_S5
#> explanation
#> 5 No batch assignment was available; sample retained as an unlinked singleton group.
rule_based_missing
#> sample_id group_id constraint_type group_label
#> 5 S5 composite_study:ST3 study ST3
#> explanation
#> 5 Composite rule-based grouping selected study based on priority order batch > study > subject > time -> ST3. Additional available dependencies: subject=P4.
split_spec_missing
#> sample_id group_id batch_group study_group timepoint_id order_rank
#> 5 S5 component_3 <NA> ST3 <NA> NA

Interpretation:

- the batch constraint keeps S5 as an explicit singleton because batch metadata are missing
- the rule-based composite constraint falls back to study-level grouping for S5
- the split spec reports NA rather than silently inventing values

That behavior matters because incomplete metadata are common.
splitGraph stays strict about what is known, but still
produces a usable, inspectable split object.
A typical practical question is not “what can the package compute?” but “which constraint should I actually use?” The answer depends on which dependency source is scientifically unacceptable to leak across train and test.
# Side-by-side comparison of all six constraints: group counts, warning
# counts, and the recommended resampling scheme from as_split_spec().
strategy_summary <- data.frame(
constraint = c("subject", "batch", "study", "time", "composite_strict", "composite_rule"),
groups = c(
length(unique(subject_constraint$sample_map$group_id)),
length(unique(batch_constraint$sample_map$group_id)),
length(unique(study_constraint$sample_map$group_id)),
length(unique(time_constraint$sample_map$group_id)),
length(unique(strict_constraint$sample_map$group_id)),
length(unique(rule_based_constraint$sample_map$group_id))
),
# or_empty() presumably maps NULL warnings to a zero-length vector so
# length() counts them safely -- helper not shown in this vignette.
warnings = c(
length(or_empty(subject_constraint$metadata$warnings)),
length(or_empty(batch_constraint$metadata$warnings)),
length(or_empty(study_constraint$metadata$warnings)),
length(or_empty(time_constraint$metadata$warnings)),
length(or_empty(strict_constraint$metadata$warnings)),
length(or_empty(rule_based_constraint$metadata$warnings))
),
recommended_resampling = c(
as_split_spec(subject_constraint, graph = graph)$recommended_resampling,
as_split_spec(batch_constraint, graph = graph)$recommended_resampling,
as_split_spec(study_constraint, graph = graph)$recommended_resampling,
as_split_spec(time_constraint, graph = graph)$recommended_resampling,
as_split_spec(strict_constraint, graph = graph)$recommended_resampling,
as_split_spec(rule_based_constraint, graph = graph)$recommended_resampling
),
stringsAsFactors = FALSE
)
strategy_summary
#> constraint groups warnings recommended_resampling
#> 1 subject 4 0 grouped_cv
#> 2 batch 4 1 blocked_cv
#> 3 study 3 0 leave_one_group_out
#> 4 time 4 1 ordered_split
#> 5 composite_strict 3 0 custom_grouped_cv
#> 6 composite_rule 4 0 grouped_cv

Interpretation:
The package does not choose the scientific objective for you. It makes the trade-off visible and auditable.
## When splitGraph is useful

splitGraph is a good fit when:
## What splitGraph is not for

splitGraph is not:
Its value is earlier in the workflow: it makes dependency structure explicit so that the split design itself can be justified.
If you already know your data have repeated subjects, reused batches,
temporal ordering, or shared feature provenance, then you already have a
graph problem whether you model it explicitly or not.
splitGraph is useful because it turns that hidden graph
into an object you can validate, query, and convert into a split design
that downstream tooling can trust.