List available projects in recount3
available_projects(
organism = c("human", "mouse"),
recount3_url = getOption("recount3_url", "http://duffel.rail.bio/recount3"),
bfc = recount3_cache(),
available_homes = project_homes(organism = organism, recount3_url = recount3_url)
)
organism: A character(1) specifying which organism you want to download
data from. Supported options are "human" or "mouse".
recount3_url: A character(1) specifying the home URL for recount3 or a local
directory where you have mirrored recount3. Defaults to the
load balancer http://duffel.rail.bio/recount3, but can also be
https://recount-opendata.s3.amazonaws.com/recount3/release from
https://registry.opendata.aws/recount/ or SciServer datascope from
IDIES at JHU https://sciserver.org/public-data/recount3/data. You can
set the R option recount3_url (for example in your .Rprofile) if
you have a favorite mirror.
bfc: A BiocFileCache-class object where the files will be cached to,
typically created by recount3_cache().
available_homes: A character() vector with the available project homes
for the given recount3_url. If you use a non-standard recount3_url, you
will likely need to manually specify the valid values for available_homes.
Returns a data.frame() with the project ID (project), the organism, the
file_source from where the data was accessed, the recount3 project home
location (project_home), the project type (project_type) that differentiates
between data_sources and compilations, and the number of samples in the
given project (n_samples).
## Find all the human projects
human_projects <- available_projects()
#> 2023-05-07 00:10:05.710375 caching file sra.recount_project.MD.gz.
#> 2023-05-07 00:10:06.059883 caching file gtex.recount_project.MD.gz.
#> 2023-05-07 00:10:06.403144 caching file tcga.recount_project.MD.gz.
## Explore the results
dim(human_projects)
#> [1] 8742 6
head(human_projects)
#> project organism file_source project_home project_type n_samples
#> 1 SRP107565 human sra data_sources/sra data_sources 216
#> 2 SRP149665 human sra data_sources/sra data_sources 4
#> 3 SRP017465 human sra data_sources/sra data_sources 23
#> 4 SRP119165 human sra data_sources/sra data_sources 6
#> 5 SRP133965 human sra data_sources/sra data_sources 12
#> 6 SRP096765 human sra data_sources/sra data_sources 7
## How many are from a data source vs a compilation?
table(human_projects$project_type, useNA = "ifany")
#>
#> data_sources
#> 8742
## What are the unique file sources?
table(
human_projects$file_source[human_projects$project_type == "data_sources"]
)
#>
#> gtex sra tcga
#> 32 8677 33
## Note that big projects are broken up to make them easier to access
## For example, GTEx and TCGA are broken up by tissue
head(subset(human_projects, file_source == "gtex"))
#> project organism file_source project_home project_type
#> 8678 ADIPOSE_TISSUE human gtex data_sources/gtex data_sources
#> 8679 MUSCLE human gtex data_sources/gtex data_sources
#> 8680 BLOOD_VESSEL human gtex data_sources/gtex data_sources
#> 8681 HEART human gtex data_sources/gtex data_sources
#> 8682 OVARY human gtex data_sources/gtex data_sources
#> 8683 UTERUS human gtex data_sources/gtex data_sources
#> n_samples
#> 8678 1293
#> 8679 881
#> 8680 1398
#> 8681 942
#> 8682 195
#> 8683 159
head(subset(human_projects, file_source == "tcga"))
#> project organism file_source project_home project_type n_samples
#> 8710 ACC human tcga data_sources/tcga data_sources 79
#> 8711 BLCA human tcga data_sources/tcga data_sources 433
#> 8712 BRCA human tcga data_sources/tcga data_sources 1256
#> 8713 CESC human tcga data_sources/tcga data_sources 309
#> 8714 CHOL human tcga data_sources/tcga data_sources 45
#> 8715 COAD human tcga data_sources/tcga data_sources 546
## Find all the mouse projects
mouse_projects <- available_projects(organism = "mouse")
#> 2023-05-07 00:10:12.289013 caching file sra.recount_project.MD.gz.
## Explore the results
dim(mouse_projects)
#> [1] 10088 6
head(mouse_projects)
#> project organism file_source project_home project_type n_samples
#> 1 SRP170963 mouse sra data_sources/sra data_sources 368
#> 2 SRP045763 mouse sra data_sources/sra data_sources 22
#> 3 DRP005463 mouse sra data_sources/sra data_sources 12
#> 4 SRP172863 mouse sra data_sources/sra data_sources 8
#> 5 SRP186363 mouse sra data_sources/sra data_sources 13
#> 6 SRP101363 mouse sra data_sources/sra data_sources 22
## How many are from a data source vs a compilation?
table(mouse_projects$project_type, useNA = "ifany")
#>
#> data_sources
#> 10088
## What are the unique file sources?
table(
mouse_projects$file_source[mouse_projects$project_type == "data_sources"]
)
#>
#> sra
#> 10088
if (FALSE) {
## Use with a custom recount3_url:
available_projects(
recount3_url = "http://snaptron.cs.jhu.edu/data/temp/recount3test",
available_homes = "data_sources/sra"
)
## You can also rely on project_homes() if the custom URL has a text file
## that can be read with readLines() at:
## <recount3_url>/<organism>/homes_index
available_projects(
recount3_url = "http://snaptron.cs.jhu.edu/data/temp/recount3test"
)
}