As described in the recount workflow, the counts provided by the recount2
project are base-pair counts. You can scale them using transform_counts()
or compute the read counts using the area under coverage information (AUC).
compute_read_counts(
rse,
round = TRUE,
avg_mapped_read_length = "recount_qc.star.average_mapped_length"
)
A
RangedSummarizedExperiment-class
created by create_rse()
.
A logical(1)
specifying whether to round the transformed
counts or not.
A character(1)
specifying the metdata column
name that contains the average fragment length after aligning. This is
typically twice the average read length for paired-end reads.
A matrix()
with the read counts. By default this function uses
the average read length to the QC annotation.
This function is similar to
recount::read_counts(use_paired_end = TRUE, round = TRUE)
but more general
and with a different name to avoid NAMESPACE conflicts. Note that the default
value of round
is different than in recount::read_counts()
. This
was done to match the default value of round
in transform_counts()
.
Collado-Torres L, Nellore A and Jaffe AE. recount workflow: Accessing over 70,000 human RNA-seq samples with Bioconductor version 1; referees: 1 approved, 2 approved with reservations. F1000Research 2017, 6:1558 doi: 10.12688/f1000research.12223.1.
Other count transformation functions:
compute_scale_factors()
,
is_paired_end()
,
transform_counts()
## Create a RSE object at the gene level
rse_gene_SRP009615 <- create_rse_manual("SRP009615")
#> 2023-05-07 00:10:22.027918 downloading and reading the metadata.
#> 2023-05-07 00:10:22.332659 caching file sra.sra.SRP009615.MD.gz.
#> 2023-05-07 00:10:22.640917 caching file sra.recount_project.SRP009615.MD.gz.
#> 2023-05-07 00:10:22.966372 caching file sra.recount_qc.SRP009615.MD.gz.
#> 2023-05-07 00:10:23.315081 caching file sra.recount_seq_qc.SRP009615.MD.gz.
#> 2023-05-07 00:10:23.628464 caching file sra.recount_pred.SRP009615.MD.gz.
#> 2023-05-07 00:10:23.704635 downloading and reading the feature information.
#> 2023-05-07 00:10:23.983416 caching file human.gene_sums.G026.gtf.gz.
#> 2023-05-07 00:10:24.519525 downloading and reading the counts: 12 samples across 63856 features.
#> 2023-05-07 00:10:24.830053 caching file sra.gene_sums.SRP009615.G026.gz.
#> 2023-05-07 00:10:25.069605 constructing the RangedSummarizedExperiment (rse) object.
colSums(compute_read_counts(rse_gene_SRP009615)) / 1e6
#> SRR387777 SRR387778 SRR387779 SRR387780 SRR389079 SRR389080 SRR389081 SRR389082
#> 21.29962 23.66484 32.62525 25.46085 41.48717 28.05485 17.78092 15.46669
#> SRR389083 SRR389084 SRR389077 SRR389078
#> 18.84982 18.18674 13.88166 15.14785
## Create a RSE object at the gene level
rse_gene_DRP000499 <- create_rse_manual("DRP000499")
#> 2023-05-07 00:10:25.147308 downloading and reading the metadata.
#> 2023-05-07 00:10:25.417019 caching file sra.sra.DRP000499.MD.gz.
#> 2023-05-07 00:10:25.734304 caching file sra.recount_project.DRP000499.MD.gz.
#> 2023-05-07 00:10:26.051436 caching file sra.recount_qc.DRP000499.MD.gz.
#> 2023-05-07 00:10:26.365882 caching file sra.recount_seq_qc.DRP000499.MD.gz.
#> 2023-05-07 00:10:26.72776 caching file sra.recount_pred.DRP000499.MD.gz.
#> 2023-05-07 00:10:26.803488 downloading and reading the feature information.
#> 2023-05-07 00:10:27.079844 caching file human.gene_sums.G026.gtf.gz.
#> 2023-05-07 00:10:27.552718 downloading and reading the counts: 21 samples across 63856 features.
#> 2023-05-07 00:10:27.855395 caching file sra.gene_sums.DRP000499.G026.gz.
#> 2023-05-07 00:10:28.09871 constructing the RangedSummarizedExperiment (rse) object.
colSums(compute_read_counts(rse_gene_DRP000499)) / 1e6
#> DRR001622 DRR001623 DRR001624 DRR001625 DRR001626 DRR001627 DRR001628 DRR001629
#> 10.952140 7.995718 11.874082 NaN 7.518984 11.272031 53.412769 27.539984
#> DRR001630 DRR001631 DRR001632 DRR001633 DRR001634 DRR001635 DRR001636 DRR001637
#> 27.721930 35.301338 48.673389 20.585057 35.074474 31.622335 34.844632 46.270835
#> DRR001638 DRR001639 DRR001640 DRR001641 DRR001642
#> 31.505395 26.991040 18.916860 17.788569 25.389886
## You can compare the read counts against those from recount::read_counts()
## from the recount2 project which used a different RNA-seq aligner
## If needed, install recount, the R/Bioconductor package for recount2:
# BiocManager::install("recount")
recount2_readsums <- colSums(assay(recount::read_counts(
recount::rse_gene_SRP009615
), "counts")) / 1e6
#> Setting options('download.file.method.GEOquery'='auto')
#> Setting options('GEOquery.inmemory.gpl'=FALSE)
recount3_readsums <- colSums(compute_read_counts(rse_gene_SRP009615)) / 1e6
recount_readsums <- data.frame(
recount2 = recount2_readsums[order(names(recount2_readsums))],
recount3 = recount3_readsums[order(names(recount3_readsums))]
)
plot(recount2 ~ recount3, data = recount_readsums)
abline(a = 0, b = 1, col = "purple", lwd = 2, lty = 2)
## Repeat for DRP000499, a paired-end study
recount::download_study("DRP000499", outdir = tempdir())
#> 2023-05-07 00:10:31.92353 downloading file rse_gene.Rdata to /tmp/RtmpAxQcvh
load(file.path(tempdir(), "rse_gene.Rdata"), verbose = TRUE)
#> Loading objects:
#> rse_gene
recount2_readsums <- colSums(assay(recount::read_counts(
rse_gene
), "counts")) / 1e6
recount3_readsums <- colSums(compute_read_counts(rse_gene_DRP000499)) / 1e6
recount_readsums <- data.frame(
recount2 = recount2_readsums[order(names(recount2_readsums))],
recount3 = recount3_readsums[order(names(recount3_readsums))]
)
plot(recount2 ~ recount3, data = recount_readsums)
abline(a = 0, b = 1, col = "purple", lwd = 2, lty = 2)