R/registration_stats_anova.R
registration_stats_anova.Rd
This function computes the gene ANOVA F-statistics (at least one group is
different from the rest). These F-statistics can be used for spatial
registration with layer_stat_cor()
and related functions. Although, they
are more typically used for identifying ANOVA-marker genes.
registration_stats_anova(
sce_pseudo,
block_cor,
covars = NULL,
var_registration = "registration_variable",
var_sample_id = "registration_sample_id",
gene_ensembl = NULL,
gene_name = NULL,
suffix = ""
)
The output of registration_pseudobulk()
.
A numeric(1)
computed with registration_block_cor()
.
A character()
with names of sample-level covariates.
A character(1)
specifying the colData(sce_pseudo)
variable of interest against which will be used for computing the relevant
statistics.
A character(1)
specifying the colData(sce_pseudo)
variable with the sample ID.
A character(1)
specifying the rowData(sce_pseudo)
column with the ENSEMBL gene IDs. This will be used by layer_stat_cor()
.
A character(1)
specifying the rowData(sce_pseudo)
column with the gene names (symbols).
A character(1)
specifying the suffix to use for the
F-statistics column. This is particularly useful if you will run this
function more than once and want to be able to merge the results.
A data.frame()
with the ANOVA statistical results. This is
similar to fetch_data("modeling_results")$anova
.
Other spatial registration and statistical modeling functions:
registration_block_cor()
,
registration_model()
,
registration_pseudobulk()
,
registration_stats_enrichment()
,
registration_stats_pairwise()
,
registration_wrapper()
example("registration_block_cor", package = "spatialLIBD")
#>
#> rgst__> example("registration_model", package = "spatialLIBD")
#>
#> rgstr_> example("registration_pseudobulk", package = "spatialLIBD")
#>
#> rgstr_> ## Ensure reproducibility of example data
#> rgstr_> set.seed(20220907)
#>
#> rgstr_> ## Generate example data
#> rgstr_> sce <- scuttle::mockSCE()
#>
#> rgstr_> ## Add some sample IDs
#> rgstr_> sce$sample_id <- sample(LETTERS[1:5], ncol(sce), replace = TRUE)
#>
#> rgstr_> ## Add a sample-level covariate: age
#> rgstr_> ages <- rnorm(5, mean = 20, sd = 4)
#>
#> rgstr_> names(ages) <- LETTERS[1:5]
#>
#> rgstr_> sce$age <- ages[sce$sample_id]
#>
#> rgstr_> ## Add gene-level information
#> rgstr_> rowData(sce)$ensembl <- paste0("ENSG", seq_len(nrow(sce)))
#>
#> rgstr_> rowData(sce)$gene_name <- paste0("gene", seq_len(nrow(sce)))
#>
#> rgstr_> ## Pseudo-bulk
#> rgstr_> sce_pseudo <- registration_pseudobulk(sce, "Cell_Cycle", "sample_id", c("age"), min_ncells = NULL)
#> 2024-10-31 20:27:43.330647 make pseudobulk object
#> 2024-10-31 20:27:43.48633 drop lowly expressed genes
#> 2024-10-31 20:27:43.542063 normalize expression
#>
#> rgstr_> colData(sce_pseudo)
#> DataFrame with 20 rows and 8 columns
#> Mutation_Status Cell_Cycle Treatment sample_id age
#> <character> <character> <character> <character> <numeric>
#> A_G0 NA G0 NA A 19.1872
#> B_G0 NA G0 NA B 25.3496
#> C_G0 NA G0 NA C 24.1802
#> D_G0 NA G0 NA D 15.5211
#> E_G0 NA G0 NA E 20.9701
#> ... ... ... ... ... ...
#> A_S NA S NA A 19.1872
#> B_S NA S NA B 25.3496
#> C_S NA S NA C 24.1802
#> D_S NA S NA D 15.5211
#> E_S NA S NA E 20.9701
#> registration_variable registration_sample_id ncells
#> <character> <character> <integer>
#> A_G0 G0 A 8
#> B_G0 G0 B 13
#> C_G0 G0 C 9
#> D_G0 G0 D 7
#> E_G0 G0 E 10
#> ... ... ... ...
#> A_S S A 12
#> B_S S B 8
#> C_S S C 7
#> D_S S D 14
#> E_S S E 11
#>
#> rgstr_> registration_mod <- registration_model(sce_pseudo, "age")
#> 2024-10-31 20:27:43.618136 create model matrix
#>
#> rgstr_> head(registration_mod)
#> registration_variableG0 registration_variableG1 registration_variableG2M
#> A_G0 1 0 0
#> B_G0 1 0 0
#> C_G0 1 0 0
#> D_G0 1 0 0
#> E_G0 1 0 0
#> A_G1 0 1 0
#> registration_variableS age
#> A_G0 0 19.18719
#> B_G0 0 25.34965
#> C_G0 0 24.18019
#> D_G0 0 15.52107
#> E_G0 0 20.97006
#> A_G1 0 19.18719
#>
#> rgst__> block_cor <- registration_block_cor(sce_pseudo, registration_mod)
#> 2024-10-31 20:27:43.63019 run duplicateCorrelation()
#> 2024-10-31 20:27:44.946026 The estimated correlation is: -0.0187869166526901
results_anova <- registration_stats_anova(sce_pseudo,
block_cor, "age",
gene_ensembl = "ensembl", gene_name = "gene_name", suffix = "example"
)
#> 2024-10-31 20:27:44.95863 computing F-statistics
head(results_anova)
#> f_stat_example p_value_example fdr_example AveExpr_example ensembl gene
#> 1 0.1328075 0.93918553 0.9951876 5.499488 ENSG1 gene1
#> 2 0.5848283 0.63309609 0.9945781 3.727026 ENSG2 gene2
#> 3 0.5225232 0.67256636 0.9945781 10.723772 ENSG3 gene3
#> 4 0.1817125 0.90734075 0.9945781 5.569130 ENSG4 gene4
#> 5 2.8119984 0.07061058 0.9945781 2.971182 ENSG5 gene5
#> 6 2.3501978 0.10859055 0.9945781 5.258797 ENSG6 gene6
## Specifying `block_cor = NaN` then ignores the correlation structure
results_anova_nan <- registration_stats_anova(sce_pseudo,
block_cor = NaN, "age",
gene_ensembl = "ensembl", gene_name = "gene_name", suffix = "example"
)
#> 2024-10-31 20:27:45.027015 computing F-statistics
head(results_anova_nan)
#> f_stat_example p_value_example fdr_example AveExpr_example ensembl gene
#> 1 0.1357180 0.93736053 0.9943765 5.499488 ENSG1 gene1
#> 2 0.5981849 0.62486365 0.9929266 3.727026 ENSG2 gene2
#> 3 0.5333663 0.66557385 0.9929266 10.723772 ENSG3 gene3
#> 4 0.1883530 0.90286049 0.9929266 5.569130 ENSG4 gene4
#> 5 2.8288401 0.06953318 0.9876267 2.971182 ENSG5 gene5
#> 6 2.3888417 0.10467962 0.9896288 5.258797 ENSG6 gene6
## Note that you can merge multiple of these data.frames if you run this
## function for different sets. For example, maybe you drop one group
## before pseudo-bulking if you know that there are many differences between
## that group and others. For example, we have dropped the white matter (WM)
## prior to computing ANOVA F-statistics.
## no covariates
results_anova_nocovar <- registration_stats_anova(sce_pseudo,
block_cor,
covars = NULL,
gene_ensembl = "ensembl", gene_name = "gene_name", suffix = "nocovar"
)
#> 2024-10-31 20:27:45.077416 computing F-statistics
head(results_anova_nocovar)
#> f_stat_nocovar p_value_nocovar fdr_nocovar AveExpr_nocovar ensembl gene
#> 1 0.1393641 0.9351345 0.9962097 5.499488 ENSG1 gene1
#> 2 0.5831947 0.6336729 0.9940431 3.727026 ENSG2 gene2
#> 3 0.5291261 0.6679753 0.9940431 10.723772 ENSG3 gene3
#> 4 0.1795410 0.9088789 0.9940431 5.569130 ENSG4 gene4
#> 5 2.1543529 0.1288912 0.9940431 2.971182 ENSG5 gene5
#> 6 2.1109373 0.1345061 0.9940431 5.258797 ENSG6 gene6
## Merge both results into a single data.frame, thanks to having different
## 'suffix' values.
results_anova_merged <- merge(results_anova, results_anova_nocovar)
head(results_anova_merged)
#> ensembl gene f_stat_example p_value_example fdr_example AveExpr_example
#> 1 ENSG1 gene1 0.1328075 0.93918553 0.9951876 5.499488
#> 2 ENSG10 gene10 1.4570886 0.26147730 0.9945781 2.627385
#> 3 ENSG100 gene100 3.2217706 0.04889378 0.9945781 2.232185
#> 4 ENSG1000 gene1000 0.6758112 0.57868578 0.9945781 10.308443
#> 5 ENSG1001 gene1001 0.7945746 0.51364699 0.9945781 6.472580
#> 6 ENSG1002 gene1002 0.3648583 0.77922852 0.9945781 10.207492
#> f_stat_nocovar p_value_nocovar fdr_nocovar AveExpr_nocovar
#> 1 0.1393641 0.93513454 0.9962097 5.499488
#> 2 1.4062441 0.27343407 0.9940431 2.627385
#> 3 3.3026378 0.04397329 0.9940431 2.232185
#> 4 0.5350167 0.66417556 0.9940431 10.308443
#> 5 0.8090300 0.50527512 0.9940431 6.472580
#> 6 0.3768459 0.77078373 0.9940431 10.207492