This function reads in a recount3 gene or exon counts file into R. You can first locate the file using locate_url(), then download it to your computer using file_retrieve().

read_counts(counts_file, samples = NULL)

Arguments

counts_file

A character(1) with the local path to a recount3 counts file.

samples

A character() with external_id sample IDs to read in. When NULL (default), all samples will be read in. This argument is used by create_rse_manual().

Value

A data.frame() with sample IDs as the column names.

References

https://doi.org/10.12688/f1000research.12223.1 for details on the base-pair coverage counts used in recount2 and recount3.

See also

Other internal functions for accessing the recount3 data: annotation_ext(), create_rse_manual(), file_retrieve(), locate_url_ann(), locate_url(), project_homes(), read_metadata()

Examples


## Download the gene counts file for project SRP009615
url_SRP009615_gene <- locate_url(
    "SRP009615",
    "data_sources/sra",
    type = "gene"
)
local_SRP009615_gene <- file_retrieve(url = url_SRP009615_gene)
#> 2023-05-07 00:12:32.972618 caching file sra.gene_sums.SRP009615.G026.gz.

## Read the gene counts, takes about 3 seconds
system.time(SRP009615_gene_counts <- read_counts(local_SRP009615_gene))
#>    user  system elapsed 
#>   0.103   0.004   0.106 
dim(SRP009615_gene_counts)
#> [1] 63856    12

## Explore the top left corner
SRP009615_gene_counts[seq_len(6), seq_len(6)]
#>                   SRR389077 SRR387777 SRR387778 SRR389078 SRR387779 SRR389079
#> ENSG00000278704.1         0         0         0         0         0         0
#> ENSG00000277400.1         0         0         0         0         0         0
#> ENSG00000274847.1         0         0         0         0         0         0
#> ENSG00000277428.1         0         0         0         0         0         0
#> ENSG00000276256.1         0         0         0         0         0         0
#> ENSG00000278198.1         0         0         0         0         0         0

## Explore the first 6 samples.
summary(SRP009615_gene_counts[, seq_len(6)])
#>    SRR389077          SRR387777          SRR387778          SRR389078       
#>  Min.   :       0   Min.   :       0   Min.   :       0   Min.   :       0  
#>  1st Qu.:       0   1st Qu.:       0   1st Qu.:       0   1st Qu.:       0  
#>  Median :      36   Median :      68   Median :      72   Median :      60  
#>  Mean   :    7746   Mean   :   11878   Mean   :   13190   Mean   :    8459  
#>  3rd Qu.:    1474   3rd Qu.:    2485   3rd Qu.:    2816   3rd Qu.:    1500  
#>  Max.   :19403569   Max.   :10234801   Max.   :10622529   Max.   :20949547  
#>    SRR387779          SRR389079       
#>  Min.   :       0   Min.   :       0  
#>  1st Qu.:       0   1st Qu.:       0  
#>  Median :      59   Median :     144  
#>  Mean   :   18214   Mean   :   23123  
#>  3rd Qu.:    3472   3rd Qu.:    5627  
#>  Max.   :14667496   Max.   :14832424  

## Note that the count units are in
## base-pair coverage counts just like in the recount2 project.
## See https://doi.org/10.12688/f1000research.12223.1 for more details
## about this type of counts.
## They can be converted to reads per 40 million reads, RPKM and other
## counts. This is more easily done once assembled into a
## RangedSummarizedExperiment object.

## Locate and retrieve an exon counts file
local_SRP009615_exon <- file_retrieve(
    locate_url(
        "SRP009615",
        "data_sources/sra",
        type = "exon"
    )
)
#> 2023-05-07 00:12:34.394238 caching file sra.exon_sums.SRP009615.G026.gz.
local_SRP009615_exon
#>                                              sra.exon_sums.SRP009615.G026.gz 
#> "/github/home/.cache/R/recount3/2248fa92647_sra.exon_sums.SRP009615.G026.gz" 

## Read the exon counts, takes about 50-60 seconds
system.time(
    SRP009615_exon_counts <- read_counts(
        local_SRP009615_exon
    )
)
#>    user  system elapsed 
#>   0.929   0.104   1.033 
dim(SRP009615_exon_counts)
#> [1] 1299686      12
pryr::object_size(SRP009615_exon_counts)
#> 124.77 MB

## Explore the top left corner
SRP009615_exon_counts[seq_len(6), seq_len(6)]
#>                            SRR389077 SRR387777 SRR387778 SRR389078 SRR387779
#> GL000009.2|56140|58376|-           0         0         0         0         0
#> GL000194.1|53594|54832|-           0         0         0         0         0
#> GL000194.1|55446|55676|-           0         0         0         0         0
#> GL000194.1|53590|55676|-           0         0         0         0         0
#> GL000194.1|112792|112850|-         0         0         0         0         0
#> GL000194.1|112792|112850|-         0         0         0         0         0
#>                            SRR389079
#> GL000009.2|56140|58376|-           0
#> GL000194.1|53594|54832|-           0
#> GL000194.1|55446|55676|-           0
#> GL000194.1|53590|55676|-           0
#> GL000194.1|112792|112850|-         0
#> GL000194.1|112792|112850|-         0

## Explore the first 6 samples.
summary(SRP009615_exon_counts[, seq_len(6)])
#>    SRR389077          SRR387777          SRR387778          SRR389078       
#>  Min.   :       0   Min.   :       0   Min.   :       0   Min.   :       0  
#>  1st Qu.:       0   1st Qu.:       0   1st Qu.:       0   1st Qu.:       0  
#>  Median :     105   Median :     186   Median :     213   Median :      93  
#>  Mean   :    1401   Mean   :    2334   Mean   :    2574   Mean   :    1491  
#>  3rd Qu.:     762   3rd Qu.:    1391   3rd Qu.:    1512   3rd Qu.:     741  
#>  Max.   :19403569   Max.   :10234801   Max.   :10622529   Max.   :20949547  
#>    SRR387779          SRR389079       
#>  Min.   :       0   Min.   :       0  
#>  1st Qu.:       0   1st Qu.:       0  
#>  Median :     225   Median :     397  
#>  Mean   :    3500   Mean   :    4457  
#>  3rd Qu.:    1768   3rd Qu.:    2425  
#>  Max.   :14667496   Max.   :14832424