List available projects in recount3

available_projects(
  organism = c("human", "mouse"),
  recount3_url = getOption("recount3_url", "http://duffel.rail.bio/recount3"),
  bfc = recount3_cache(),
  available_homes = project_homes(organism = organism, recount3_url = recount3_url)
)

Arguments

organism

A character(1) specifying which organism you want to download data from. Supported options are "human" or "mouse".

recount3_url

A character(1) specifying the home URL for recount3 or a local directory where you have mirrored recount3. Defaults to the load balancer http://duffel.rail.bio/recount3, but can also be https://recount-opendata.s3.amazonaws.com/recount3/release from https://registry.opendata.aws/recount/ or SciServer datascope from IDIES at JHU https://sciserver.org/public-data/recount3/data. You can set the R option recount3_url (for example in your .Rprofile) if you have a favorite mirror.

bfc

A BiocFileCache-class object in which the files will be cached, typically created by recount3_cache().

available_homes

A character() vector with the available project homes for the given recount3_url. If you use a non-standard recount3_url, you will likely need to manually specify the valid values for available_homes.

Value

A data.frame() with the project ID (project), the organism, the source from which the data was accessed (file_source), the recount3 project home location (project_home), the project type (project_type) that differentiates between data_sources and compilations, and the number of samples in the given project (n_samples).

Examples


## Find all the human projects
human_projects <- available_projects()
#> 2023-05-07 00:10:05.710375 caching file sra.recount_project.MD.gz.
#> 2023-05-07 00:10:06.059883 caching file gtex.recount_project.MD.gz.
#> 2023-05-07 00:10:06.403144 caching file tcga.recount_project.MD.gz.

## Explore the results
dim(human_projects)
#> [1] 8742    6
head(human_projects)
#>     project organism file_source     project_home project_type n_samples
#> 1 SRP107565    human         sra data_sources/sra data_sources       216
#> 2 SRP149665    human         sra data_sources/sra data_sources         4
#> 3 SRP017465    human         sra data_sources/sra data_sources        23
#> 4 SRP119165    human         sra data_sources/sra data_sources         6
#> 5 SRP133965    human         sra data_sources/sra data_sources        12
#> 6 SRP096765    human         sra data_sources/sra data_sources         7

## How many are from a data source vs a compilation?
table(human_projects$project_type, useNA = "ifany")
#> 
#> data_sources 
#>         8742 

## What are the unique file sources?
table(
    human_projects$file_source[human_projects$project_type == "data_sources"]
)
#> 
#> gtex  sra tcga 
#>   32 8677   33 

## Note that big projects are broken up to make them easier to access
## For example, GTEx and TCGA are broken up by tissue
head(subset(human_projects, file_source == "gtex"))
#>             project organism file_source      project_home project_type
#> 8678 ADIPOSE_TISSUE    human        gtex data_sources/gtex data_sources
#> 8679         MUSCLE    human        gtex data_sources/gtex data_sources
#> 8680   BLOOD_VESSEL    human        gtex data_sources/gtex data_sources
#> 8681          HEART    human        gtex data_sources/gtex data_sources
#> 8682          OVARY    human        gtex data_sources/gtex data_sources
#> 8683         UTERUS    human        gtex data_sources/gtex data_sources
#>      n_samples
#> 8678      1293
#> 8679       881
#> 8680      1398
#> 8681       942
#> 8682       195
#> 8683       159
head(subset(human_projects, file_source == "tcga"))
#>      project organism file_source      project_home project_type n_samples
#> 8710     ACC    human        tcga data_sources/tcga data_sources        79
#> 8711    BLCA    human        tcga data_sources/tcga data_sources       433
#> 8712    BRCA    human        tcga data_sources/tcga data_sources      1256
#> 8713    CESC    human        tcga data_sources/tcga data_sources       309
#> 8714    CHOL    human        tcga data_sources/tcga data_sources        45
#> 8715    COAD    human        tcga data_sources/tcga data_sources       546

## Find all the mouse projects
mouse_projects <- available_projects(organism = "mouse")
#> 2023-05-07 00:10:12.289013 caching file sra.recount_project.MD.gz.

## Explore the results
dim(mouse_projects)
#> [1] 10088     6
head(mouse_projects)
#>     project organism file_source     project_home project_type n_samples
#> 1 SRP170963    mouse         sra data_sources/sra data_sources       368
#> 2 SRP045763    mouse         sra data_sources/sra data_sources        22
#> 3 DRP005463    mouse         sra data_sources/sra data_sources        12
#> 4 SRP172863    mouse         sra data_sources/sra data_sources         8
#> 5 SRP186363    mouse         sra data_sources/sra data_sources        13
#> 6 SRP101363    mouse         sra data_sources/sra data_sources        22

## How many are from a data source vs a compilation?
table(mouse_projects$project_type, useNA = "ifany")
#> 
#> data_sources 
#>        10088 

## What are the unique file sources?
table(
    mouse_projects$file_source[mouse_projects$project_type == "data_sources"]
)
#> 
#>   sra 
#> 10088 

if (FALSE) {
## Use with a custom recount3_url:
available_projects(
    recount3_url = "http://snaptron.cs.jhu.edu/data/temp/recount3test",
    available_homes = "data_sources/sra"
)

## You can also rely on project_homes() if the custom URL has a text file
## that can be read with readLines() at:
## <recount3_url>/<organism>/homes_index
available_projects(
    recount3_url = "http://snaptron.cs.jhu.edu/data/temp/recount3test"
)
}