This function reads in a recount3
gene or exon counts file into R. You can
first locate the file using locate_url(), and
then download it to your
computer using file_retrieve()
.
read_counts(counts_file, samples = NULL)
A character(1)
with the local path to a recount3
counts file.
A character()
with external_id
sample IDs to read in. When
NULL
(default), all samples will be read in. This argument is used by
create_rse_manual()
.
A data.frame()
with sample IDs as the column names.
See https://doi.org/10.12688/f1000research.12223.1 for details on the base-pair coverage counts used in recount2 and recount3.
Other internal functions for accessing the recount3 data:
annotation_ext()
,
create_rse_manual()
,
file_retrieve()
,
locate_url_ann()
,
locate_url()
,
project_homes()
,
read_metadata()
## Download the gene counts file for project SRP009615
url_SRP009615_gene <- locate_url(
"SRP009615",
"data_sources/sra",
type = "gene"
)
local_SRP009615_gene <- file_retrieve(url = url_SRP009615_gene)
#> 2023-05-07 00:12:32.972618 caching file sra.gene_sums.SRP009615.G026.gz.
## Read the gene counts, takes about 3 seconds
system.time(SRP009615_gene_counts <- read_counts(local_SRP009615_gene))
#> user system elapsed
#> 0.103 0.004 0.106
dim(SRP009615_gene_counts)
#> [1] 63856 12
## Explore the top left corner
SRP009615_gene_counts[seq_len(6), seq_len(6)]
#> SRR389077 SRR387777 SRR387778 SRR389078 SRR387779 SRR389079
#> ENSG00000278704.1 0 0 0 0 0 0
#> ENSG00000277400.1 0 0 0 0 0 0
#> ENSG00000274847.1 0 0 0 0 0 0
#> ENSG00000277428.1 0 0 0 0 0 0
#> ENSG00000276256.1 0 0 0 0 0 0
#> ENSG00000278198.1 0 0 0 0 0 0
## Explore the first 6 samples.
summary(SRP009615_gene_counts[, seq_len(6)])
#> SRR389077 SRR387777 SRR387778 SRR389078
#> Min. : 0 Min. : 0 Min. : 0 Min. : 0
#> 1st Qu.: 0 1st Qu.: 0 1st Qu.: 0 1st Qu.: 0
#> Median : 36 Median : 68 Median : 72 Median : 60
#> Mean : 7746 Mean : 11878 Mean : 13190 Mean : 8459
#> 3rd Qu.: 1474 3rd Qu.: 2485 3rd Qu.: 2816 3rd Qu.: 1500
#> Max. :19403569 Max. :10234801 Max. :10622529 Max. :20949547
#> SRR387779 SRR389079
#> Min. : 0 Min. : 0
#> 1st Qu.: 0 1st Qu.: 0
#> Median : 59 Median : 144
#> Mean : 18214 Mean : 23123
#> 3rd Qu.: 3472 3rd Qu.: 5627
#> Max. :14667496 Max. :14832424
## Note that the count units are in
## base-pair coverage counts just like in the recount2 project.
## See https://doi.org/10.12688/f1000research.12223.1 for more details
## about this type of counts.
## They can be converted to reads per 40 million reads, RPKM and other
## counts. This is more easily done once assembled into a
## RangedSummarizedExperiment object.
## Locate and retrieve an exon counts file
local_SRP009615_exon <- file_retrieve(
locate_url(
"SRP009615",
"data_sources/sra",
type = "exon"
)
)
#> 2023-05-07 00:12:34.394238 caching file sra.exon_sums.SRP009615.G026.gz.
local_SRP009615_exon
#> sra.exon_sums.SRP009615.G026.gz
#> "/github/home/.cache/R/recount3/2248fa92647_sra.exon_sums.SRP009615.G026.gz"
## Read the exon counts, takes about 50-60 seconds
system.time(
SRP009615_exon_counts <- read_counts(
local_SRP009615_exon
)
)
#> user system elapsed
#> 0.929 0.104 1.033
dim(SRP009615_exon_counts)
#> [1] 1299686 12
pryr::object_size(SRP009615_exon_counts)
#> 124.77 MB
## Explore the top left corner
SRP009615_exon_counts[seq_len(6), seq_len(6)]
#> SRR389077 SRR387777 SRR387778 SRR389078 SRR387779
#> GL000009.2|56140|58376|- 0 0 0 0 0
#> GL000194.1|53594|54832|- 0 0 0 0 0
#> GL000194.1|55446|55676|- 0 0 0 0 0
#> GL000194.1|53590|55676|- 0 0 0 0 0
#> GL000194.1|112792|112850|- 0 0 0 0 0
#> GL000194.1|112792|112850|- 0 0 0 0 0
#> SRR389079
#> GL000009.2|56140|58376|- 0
#> GL000194.1|53594|54832|- 0
#> GL000194.1|55446|55676|- 0
#> GL000194.1|53590|55676|- 0
#> GL000194.1|112792|112850|- 0
#> GL000194.1|112792|112850|- 0
## Explore the first 6 samples.
summary(SRP009615_exon_counts[, seq_len(6)])
#> SRR389077 SRR387777 SRR387778 SRR389078
#> Min. : 0 Min. : 0 Min. : 0 Min. : 0
#> 1st Qu.: 0 1st Qu.: 0 1st Qu.: 0 1st Qu.: 0
#> Median : 105 Median : 186 Median : 213 Median : 93
#> Mean : 1401 Mean : 2334 Mean : 2574 Mean : 1491
#> 3rd Qu.: 762 3rd Qu.: 1391 3rd Qu.: 1512 3rd Qu.: 741
#> Max. :19403569 Max. :10234801 Max. :10622529 Max. :20949547
#> SRR387779 SRR389079
#> Min. : 0 Min. : 0
#> 1st Qu.: 0 1st Qu.: 0
#> Median : 225 Median : 397
#> Mean : 3500 Mean : 4457
#> 3rd Qu.: 1768 3rd Qu.: 2425
#> Max. :14667496 Max. :14832424