Spatial registration: compute ANOVA statistics

This function computes the gene ANOVA F-statistics (at least one group is different from the rest). These F-statistics can be used for spatial registration with layer_stat_cor() and related functions. Although, they are more typically used for identifying ANOVA-marker genes.

registration_stats_anova(
  sce_pseudo,
  block_cor,
  covars = NULL,
  var_registration = "registration_variable",
  var_sample_id = "registration_sample_id",
  gene_ensembl = NULL,
  gene_name = NULL,
  suffix = ""
)

Arguments

sce_pseudo: The output of registration_pseudobulk().
block_cor: A numeric(1) computed with registration_block_cor().
covars: A character() with names of sample-level covariates.
var_registration: A character(1) specifying the colData(sce_pseudo) variable of interest against which will be used for computing the relevant statistics.
var_sample_id: A character(1) specifying the colData(sce_pseudo) variable with the sample ID.
gene_ensembl: A character(1) specifying the rowData(sce_pseudo) column with the ENSEMBL gene IDs. This will be used by layer_stat_cor().
gene_name: A character(1) specifying the rowData(sce_pseudo) column with the gene names (symbols).
suffix: A character(1) specifying the suffix to use for the F-statistics column. This is particularly useful if you will run this function more than once and want to be able to merge the results.

Value

A data.frame() with the ANOVA statistical results. This is similar to fetch_data("modeling_results")$anova.

Examples

example("registration_block_cor", package = "spatialLIBD")
#> 
#> rgst__> example("registration_model", package = "spatialLIBD")
#> 
#> rgstr_> example("registration_pseudobulk", package = "spatialLIBD")
#> 
#> rgstr_> ## Ensure reproducibility of example data
#> rgstr_> set.seed(20220907)
#> 
#> rgstr_> ## Generate example data
#> rgstr_> sce <- scuttle::mockSCE()
#> 
#> rgstr_> ## Add some sample IDs
#> rgstr_> sce$sample_id <- sample(LETTERS[1:5], ncol(sce), replace = TRUE)
#> 
#> rgstr_> ## Add a sample-level covariate: age
#> rgstr_> ages <- rnorm(5, mean = 20, sd = 4)
#> 
#> rgstr_> names(ages) <- LETTERS[1:5]
#> 
#> rgstr_> sce$age <- ages[sce$sample_id]
#> 
#> rgstr_> ## Add gene-level information
#> rgstr_> rowData(sce)$ensembl <- paste0("ENSG", seq_len(nrow(sce)))
#> 
#> rgstr_> rowData(sce)$gene_name <- paste0("gene", seq_len(nrow(sce)))
#> 
#> rgstr_> ## Pseudo-bulk by Cell Cycle
#> rgstr_> sce_pseudo <- registration_pseudobulk(
#> rgstr_+     sce,
#> rgstr_+     var_registration = "Cell_Cycle",
#> rgstr_+     var_sample_id = "sample_id",
#> rgstr_+     covars = c("age"),
#> rgstr_+     min_ncells = NULL
#> rgstr_+ )
#> 2025-05-09 16:31:13.687468 make pseudobulk object
#> 2025-05-09 16:31:13.802635 drop lowly expressed genes
#> 2025-05-09 16:31:13.851534 normalize expression
#> 
#> rgstr_> colData(sce_pseudo)
#> DataFrame with 20 rows and 9 columns
#>      Mutation_Status  Cell_Cycle   Treatment   sample_id       age
#>          <character> <character> <character> <character> <numeric>
#> A_G0              NA          G0          NA           A   19.1872
#> B_G0              NA          G0          NA           B   25.3496
#> C_G0              NA          G0          NA           C   24.1802
#> D_G0              NA          G0          NA           D   15.5211
#> E_G0              NA          G0          NA           E   20.9701
#> ...              ...         ...         ...         ...       ...
#> A_S               NA           S          NA           A   19.1872
#> B_S               NA           S          NA           B   25.3496
#> C_S               NA           S          NA           C   24.1802
#> D_S               NA           S          NA           D   15.5211
#> E_S               NA           S          NA           E   20.9701
#>      registration_variable registration_sample_id    ncells pseudo_sum_umi
#>                <character>            <character> <integer>      <numeric>
#> A_G0                    G0                      A         8        2946915
#> B_G0                    G0                      B        13        4922867
#> C_G0                    G0                      C         9        3398888
#> D_G0                    G0                      D         7        2630651
#> E_G0                    G0                      E        10        3761710
#> ...                    ...                    ...       ...            ...
#> A_S                      S                      A        12        4516334
#> B_S                      S                      B         8        2960685
#> C_S                      S                      C         7        2595774
#> D_S                      S                      D        14        5233560
#> E_S                      S                      E        11        4151818
#> 
#> rgstr_> registration_mod <- registration_model(sce_pseudo, "age")
#> 2025-05-09 16:31:13.931109 create model matrix
#> 
#> rgstr_> head(registration_mod)
#>      registration_variableG0 registration_variableG1 registration_variableG2M
#> A_G0                       1                       0                        0
#> B_G0                       1                       0                        0
#> C_G0                       1                       0                        0
#> D_G0                       1                       0                        0
#> E_G0                       1                       0                        0
#> A_G1                       0                       1                        0
#>      registration_variableS      age
#> A_G0                      0 19.18719
#> B_G0                      0 25.34965
#> C_G0                      0 24.18019
#> D_G0                      0 15.52107
#> E_G0                      0 20.97006
#> A_G1                      0 19.18719
#> 
#> rgst__> block_cor <- registration_block_cor(sce_pseudo, registration_mod)
#> 2025-05-09 16:31:13.944285 run duplicateCorrelation()
#> 2025-05-09 16:31:15.279168 The estimated correlation is: -0.0187869166526901
results_anova <- registration_stats_anova(sce_pseudo,
    block_cor, "age",
    gene_ensembl = "ensembl", gene_name = "gene_name", suffix = "example"
)
#> 2025-05-09 16:31:15.292863 computing F-statistics
head(results_anova)
#>   f_stat_example p_value_example fdr_example AveExpr_example ensembl  gene
#> 1      0.1328075      0.93918553   0.9951876        5.499488   ENSG1 gene1
#> 2      0.5848283      0.63309609   0.9945781        3.727026   ENSG2 gene2
#> 3      0.5225232      0.67256636   0.9945781       10.723772   ENSG3 gene3
#> 4      0.1817125      0.90734075   0.9945781        5.569130   ENSG4 gene4
#> 5      2.8119984      0.07061058   0.9945781        2.971182   ENSG5 gene5
#> 6      2.3501978      0.10859055   0.9945781        5.258797   ENSG6 gene6

## Specifying `block_cor = NaN` then ignores the correlation structure
results_anova_nan <- registration_stats_anova(sce_pseudo,
    block_cor = NaN, "age",
    gene_ensembl = "ensembl", gene_name = "gene_name", suffix = "example"
)
#> 2025-05-09 16:31:15.360393 computing F-statistics
head(results_anova_nan)
#>   f_stat_example p_value_example fdr_example AveExpr_example ensembl  gene
#> 1      0.1357180      0.93736053   0.9943765        5.499488   ENSG1 gene1
#> 2      0.5981849      0.62486365   0.9929266        3.727026   ENSG2 gene2
#> 3      0.5333663      0.66557385   0.9929266       10.723772   ENSG3 gene3
#> 4      0.1883530      0.90286049   0.9929266        5.569130   ENSG4 gene4
#> 5      2.8288401      0.06953318   0.9876267        2.971182   ENSG5 gene5
#> 6      2.3888417      0.10467962   0.9896288        5.258797   ENSG6 gene6

## Note that you can merge multiple of these data.frames if you run this
## function for different sets. For example, maybe you drop one group
## before pseudo-bulking if you know that there are many differences between
## that group and others. For example, we have dropped the white matter (WM)
## prior to computing ANOVA F-statistics.

## no covariates
results_anova_nocovar <- registration_stats_anova(sce_pseudo,
    block_cor,
    covars = NULL,
    gene_ensembl = "ensembl", gene_name = "gene_name", suffix = "nocovar"
)
#> 2025-05-09 16:31:15.413353 computing F-statistics
head(results_anova_nocovar)
#>   f_stat_nocovar p_value_nocovar fdr_nocovar AveExpr_nocovar ensembl  gene
#> 1      0.1393641       0.9351345   0.9962097        5.499488   ENSG1 gene1
#> 2      0.5831947       0.6336729   0.9940431        3.727026   ENSG2 gene2
#> 3      0.5291261       0.6679753   0.9940431       10.723772   ENSG3 gene3
#> 4      0.1795410       0.9088789   0.9940431        5.569130   ENSG4 gene4
#> 5      2.1543529       0.1288912   0.9940431        2.971182   ENSG5 gene5
#> 6      2.1109373       0.1345061   0.9940431        5.258797   ENSG6 gene6

## Merge both results into a single data.frame, thanks to having different
## 'suffix' values.
results_anova_merged <- merge(results_anova, results_anova_nocovar)
head(results_anova_merged)
#>    ensembl     gene f_stat_example p_value_example fdr_example AveExpr_example
#> 1    ENSG1    gene1      0.1328075      0.93918553   0.9951876        5.499488
#> 2   ENSG10   gene10      1.4570886      0.26147730   0.9945781        2.627385
#> 3  ENSG100  gene100      3.2217706      0.04889378   0.9945781        2.232185
#> 4 ENSG1000 gene1000      0.6758112      0.57868578   0.9945781       10.308443
#> 5 ENSG1001 gene1001      0.7945746      0.51364699   0.9945781        6.472580
#> 6 ENSG1002 gene1002      0.3648583      0.77922852   0.9945781       10.207492
#>   f_stat_nocovar p_value_nocovar fdr_nocovar AveExpr_nocovar
#> 1      0.1393641      0.93513454   0.9962097        5.499488
#> 2      1.4062441      0.27343407   0.9940431        2.627385
#> 3      3.3026378      0.04397329   0.9940431        2.232185
#> 4      0.5350167      0.66417556   0.9940431       10.308443
#> 5      0.8090300      0.50527512   0.9940431        6.472580
#> 6      0.3768459      0.77078373   0.9940431       10.207492

Arguments

Value

See also

Examples