This function merges the sample metadata from a RangedSummarizedExperiment object for some of the variables that we (at Jaffelab) typically create with our RNA-seq processing pipeline for samples that are sequenced in more than one lane.
merge_rse_metrics(rse)
A RangedSummarizedExperiment-class object with 'concordMapRate', 'overallMapRate', 'mitoRate', 'rRNA_rate', 'totalAssignedGene', 'numMapped', 'numReads', 'numUnmapped', 'mitoMapped', 'totalMapped', 'ERCCsumLogErr' as either NumericList() or IntegerList() objects in the colData() columns.
A RangedSummarizedExperiment-class with merged columns for 'concordMapRate', 'overallMapRate', 'mitoRate', 'rRNA_rate', 'totalAssignedGene', 'numMapped', 'numReads', 'numUnmapped', 'mitoMapped', 'totalMapped', 'ERCCsumLogErr'
## Taken from ?SummarizedExperiment
library("SummarizedExperiment")
#> Loading required package: MatrixGenerics
#> Loading required package: matrixStats
#>
#> Attaching package: ‘MatrixGenerics’
#> The following objects are masked from ‘package:matrixStats’:
#>
#> colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
#> colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
#> colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
#> colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
#> colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
#> colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
#> colWeightedMeans, colWeightedMedians, colWeightedSds,
#> colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
#> rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
#> rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
#> rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
#> rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
#> rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
#> rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
#> rowWeightedSds, rowWeightedVars
#> Loading required package: GenomicRanges
#> Loading required package: stats4
#> Loading required package: BiocGenerics
#>
#> Attaching package: ‘BiocGenerics’
#> The following object is masked from ‘package:limma’:
#>
#> plotMA
#> The following objects are masked from ‘package:stats’:
#>
#> IQR, mad, sd, var, xtabs
#> The following objects are masked from ‘package:base’:
#>
#> Filter, Find, Map, Position, Reduce, anyDuplicated, aperm, append,
#> as.data.frame, basename, cbind, colnames, dirname, do.call,
#> duplicated, eval, evalq, get, grep, grepl, intersect, is.unsorted,
#> lapply, mapply, match, mget, order, paste, pmax, pmax.int, pmin,
#> pmin.int, rank, rbind, rownames, sapply, saveRDS, setdiff, table,
#> tapply, union, unique, unsplit, which.max, which.min
#> Loading required package: S4Vectors
#>
#> Attaching package: ‘S4Vectors’
#> The following object is masked from ‘package:utils’:
#>
#> findMatches
#> The following objects are masked from ‘package:base’:
#>
#> I, expand.grid, unname
#> Loading required package: IRanges
#> Loading required package: GenomeInfoDb
#> Loading required package: Biobase
#> Welcome to Bioconductor
#>
#> Vignettes contain introductory material; view with
#> 'browseVignettes()'. To cite Bioconductor, see
#> 'citation("Biobase")', and for packages 'citation("pkgname")'.
#>
#> Attaching package: ‘Biobase’
#> The following object is masked from ‘package:MatrixGenerics’:
#>
#> rowMedians
#> The following objects are masked from ‘package:matrixStats’:
#>
#> anyMissing, rowMedians
nrows <- 200
ncols <- 6
set.seed(20181116)
counts <- matrix(runif(nrows * ncols, 1, 1e4), nrows)
rowRanges <- GRanges(rep(c("chr1", "chr2"), c(50, 150)),
IRanges(floor(runif(200, 1e5, 1e6)), width = 100),
strand = sample(c("+", "-"), 200, TRUE),
feature_id = sprintf("ID%03d", seq_len(200))
)
## Function for generating some data
rand_gen <- function(mu = 1e6, sd = 1e3, digits = 0) {
IRanges::NumericList(lapply(seq_len(6), function(x) {
round(rnorm(2, mu, sd), digits)
}))
}
colData <- DataFrame(
Treatment = rep(c("ChIP", "Input"), 3),
row.names = LETTERS[seq_len(6)],
concordMapRate = rand_gen(0.5, 0.05, 10),
overallMapRate = rand_gen(0.7, 0.05, 10),
mitoRate = rand_gen(0.2, 0.05, 10),
rRNA_rate = rand_gen(0.1, 0.01, 10),
totalAssignedGene = rand_gen(0.5, 0.05, 10),
numMapped = rand_gen(),
numReads = rand_gen(1e6 + 6e5),
numUnmapped = rand_gen(5e5),
mitoMapped = rand_gen(1e5),
totalMapped = rand_gen(1e6 + 1e5),
ERCCsumLogErr = rand_gen(-28.5, 4, 6)
)
rse <- SummarizedExperiment(
assays = SimpleList(counts = counts),
rowRanges = rowRanges, colData = colData
)
colData(rse)
#> DataFrame with 6 rows and 12 columns
#> Treatment concordMapRate overallMapRate mitoRate
#> <character> <NumericList> <NumericList> <NumericList>
#> A ChIP 0.541645,0.581919 0.638841,0.744259 0.196154,0.176830
#> B Input 0.636704,0.417308 0.720357,0.808207 0.206167,0.307936
#> C ChIP 0.480932,0.563030 0.766605,0.731026 0.263311,0.191516
#> D Input 0.514338,0.404294 0.680493,0.706884 0.199656,0.201472
#> E ChIP 0.575868,0.580396 0.675552,0.721660 0.283546,0.200031
#> F Input 0.487973,0.549464 0.758194,0.680816 0.254013,0.122518
#> rRNA_rate totalAssignedGene numMapped numReads
#> <NumericList> <NumericList> <NumericList> <NumericList>
#> A 0.0982325,0.1004535 0.534906,0.495279 999489,1001045 1600514,1599856
#> B 0.108117,0.105019 0.512784,0.585948 999904,999645 1600861,1600902
#> C 0.0935756,0.1128849 0.500233,0.543947 1000994, 999994 1599045,1599113
#> D 0.0983786,0.0961797 0.454453,0.394858 1000080,1001643 1600388,1601137
#> E 0.0786774,0.0902039 0.511399,0.444904 999405,998558 1598587,1600106
#> F 0.116688,0.101786 0.512436,0.581784 1000598, 999127 1598227,1598982
#> numUnmapped mitoMapped totalMapped ERCCsumLogErr
#> <NumericList> <NumericList> <NumericList> <NumericList>
#> A 498799,500663 100075, 99746 1100621,1098107 -30.5656,-40.7510
#> B 499160,501936 100454,100220 1099005,1100717 -26.1549,-25.9601
#> C 499887,499965 100077,100616 1099423,1101197 -29.4516,-27.9824
#> D 499315,499551 98586,97819 1098706,1098371 -22.6042,-28.8658
#> E 499205,500651 98690,100203 1099373,1101728 -22.6664,-31.5804
#> F 500096,501025 99278,98614 1101246,1098881 -28.1533,-28.5806
rse_merged <- merge_rse_metrics(rse)
#> 2024-12-12 22:07:51.349541 attempting to approximate ERCCsumLogErr.
colData(rse_merged)
#> DataFrame with 6 rows and 12 columns
#> Treatment concordMapRate overallMapRate mitoRate rRNA_rate
#> <character> <numeric> <numeric> <numeric> <numeric>
#> A ChIP 0.561778 0.691539 0.186484 0.0993439
#> B Input 0.527004 0.764283 0.257045 0.1065683
#> C ChIP 0.521982 0.748815 0.227432 0.1032254
#> D Input 0.459303 0.693692 0.200565 0.0972783
#> E ChIP 0.578133 0.698617 0.241806 0.0844382
#> F Input 0.518726 0.719496 0.188314 0.1092428
#> totalAssignedGene numMapped numReads numUnmapped mitoMapped totalMapped
#> <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
#> A 0.515077 2000534 3200370 999462 199821 2198728
#> B 0.549361 1999549 3201763 1001096 200674 2199722
#> C 0.522079 2000988 3198158 999852 200693 2200620
#> D 0.424632 2001723 3201525 998866 196405 2197077
#> E 0.478166 1997963 3198693 999856 198893 2201101
#> F 0.547085 1999725 3197209 1001121 197892 2200127
#> ERCCsumLogErr
#> <numeric>
#> A -35.5606
#> B -26.0575
#> C -28.7150
#> D -25.6981
#> E -27.0485
#> F -28.3668
## Use the BSP2 real data
local_bsp2 <- tempfile("BSP2_rse_gene.Rdata")
download.file(
"https://s3.us-east-2.amazonaws.com/libd-brainseq2/rse_gene_unfiltered.Rdata",
destfile = local_bsp2,
mode = "wb"
)
load(local_bsp2, verbose = TRUE)
#> Loading objects:
#> rse_gene
# lobstr::obj_size(rse_gene)
# 872.12 MB
bsp2_rse_merged <- merge_rse_metrics(rse_gene)
#> 2024-12-12 22:08:04.881405 attempting to approximate ERCCsumLogErr.
colData(bsp2_rse_merged)
#> DataFrame with 900 rows and 54 columns
#> SAMPLE_ID FQCbasicStats
#> <CharacterList> <CharacterList>
#> R10424 R10424_C4KC9ACXX_TAA.. PASS
#> R12195 R12195_C4E9TACXX_CGG.. PASS
#> R12198 R12198_C4KC9ACXX_TCC.. PASS
#> R12199 R12199_C4KC9ACXX_TCT.. PASS
#> R12200 R12200_C4KC9ACXX_AGC.. PASS
#> ... ... ...
#> R5760 R5760_C3ULGACXX_ATTA.. PASS
#> R5764 R5764_C3ULGACXX_TCCG.. PASS
#> R5765 R5765_C3KUEACXX_TCTC.. PASS
#> R5766 R5766_C41CPACXX_CTGA..,R5766_C4KY0ACXX_CTGA.. PASS
#> R5768 R5768_C3DE5ACXX_GAAT.. PASS
#> perBaseQual perTileQual perSeqQual perBaseContent
#> <CharacterList> <CharacterList> <CharacterList> <CharacterList>
#> R10424 PASS PASS PASS FAIL/WARN
#> R12195 PASS WARN PASS FAIL/WARN
#> R12198 PASS PASS PASS FAIL/WARN
#> R12199 PASS PASS PASS FAIL/WARN
#> R12200 PASS PASS PASS FAIL/WARN
#> ... ... ... ... ...
#> R5760 PASS PASS PASS FAIL/WARN
#> R5764 PASS PASS PASS FAIL/WARN
#> R5765 PASS PASS/WARN PASS FAIL/WARN
#> R5766 PASS,FAIL WARN/PASS,WARN PASS FAIL
#> R5768 PASS PASS PASS FAIL/WARN
#> GCcontent Ncontent SeqLengthDist SeqDuplication
#> <CharacterList> <CharacterList> <CharacterList> <CharacterList>
#> R10424 WARN/FAIL PASS PASS FAIL
#> R12195 WARN/FAIL PASS PASS WARN
#> R12198 WARN PASS PASS FAIL/WARN
#> R12199 WARN PASS PASS WARN
#> R12200 WARN/FAIL PASS PASS WARN/PASS
#> ... ... ... ... ...
#> R5760 WARN PASS PASS FAIL/WARN
#> R5764 WARN PASS PASS FAIL/WARN
#> R5765 WARN PASS PASS FAIL/WARN
#> R5766 WARN PASS PASS WARN
#> R5768 WARN PASS PASS FAIL/WARN
#> OverrepSeqs AdapterContent KmerContent percentGC_R1
#> <CharacterList> <CharacterList> <CharacterList> <IntegerList>
#> R10424 WARN PASS FAIL 43
#> R12195 WARN PASS FAIL 44
#> R12198 WARN PASS FAIL 45
#> R12199 WARN PASS FAIL 44
#> R12200 WARN PASS FAIL 45
#> ... ... ... ... ...
#> R5760 WARN FAIL FAIL 45
#> R5764 WARN WARN FAIL 45
#> R5765 WARN PASS FAIL 44
#> R5766 WARN WARN FAIL 47,46
#> R5768 WARN PASS FAIL 45
#> phred100_R1 phredGT30_R1 phredGT35_R1 Adapter88_R1
#> <NumericList> <NumericList> <NumericList> <NumericList>
#> R10424 33 0.909088 0.70897 2.74234
#> R12195 34 0.906698 0.706421 1.52881
#> R12198 33 0.904668 0.695954 1.79942
#> R12199 33 0.907879 0.704134 1.48579
#> R12200 33 0.908431 0.705177 1.8948
#> ... ... ... ... ...
#> R5760 34 0.901271 0.705441 11.8898
#> R5764 34 0.904807 0.705915 6.36412
#> R5765 34 0.932119 0.769267 2.87781
#> R5766 32,29 0.889595,0.769699 0.673802,0.438134 6.99110,6.57718
#> R5768 34 0.93067 0.785657 2.23658
#> percentGC_R2 phred100_R2 phredGT30_R2 phredGT35_R2
#> <IntegerList> <NumericList> <NumericList> <NumericList>
#> R10424 43 32 0.861428 0.663019
#> R12195 44 34 0.853311 0.65908
#> R12198 45 32 0.832541 0.619686
#> R12199 44 32 0.839551 0.635018
#> R12200 45 32 0.849138 0.647683
#> ... ... ... ... ...
#> R5760 45 33 0.855376 0.692943
#> R5764 45 33 0.850278 0.682974
#> R5765 44 32 0.8122 0.617688
#> R5766 47,46 31,30 0.855985,0.739332 0.687230,0.479329
#> R5768 45 34 0.876481 0.731134
#> Adapter88_R2 ERCCsumLogErr
#> <NumericList> <numeric>
#> R10424 2.7528 -110.3658
#> R12195 1.54912 -18.3144
#> R12198 1.80651 -23.1269
#> R12199 1.49456 -30.3037
#> R12200 1.90594 -24.5632
#> ... ... ...
#> R5760 12.2012 -623.141
#> R5764 6.53021 -623.141
#> R5765 2.72979 -623.141
#> R5766 7.13411,6.80373 -623.141
#> R5768 2.27037 -623.141
#> bamFile trimmed numReads
#> <CharacterList> <LogicalList> <integer>
#> R10424 /dcl01/lieber/ajaffe.. FALSE 137207346
#> R12195 /dcl01/lieber/ajaffe.. FALSE 186256540
#> R12198 /dcl01/lieber/ajaffe.. FALSE 211587890
#> R12199 /dcl01/lieber/ajaffe.. FALSE 127053710
#> R12200 /dcl01/lieber/ajaffe.. FALSE 80463970
#> ... ... ... ...
#> R5760 /dcl01/lieber/ajaffe.. TRUE 141075744
#> R5764 /dcl01/lieber/ajaffe.. FALSE 119501410
#> R5765 /dcl01/lieber/ajaffe.. FALSE 172994940
#> R5766 /dcl01/lieber/ajaffe..,/dcl01/lieber/ajaffe.. FALSE 111397580
#> R5768 /dcl01/lieber/ajaffe.. FALSE 118383286
#> numMapped numUnmapped overallMapRate concordMapRate totalMapped
#> <integer> <integer> <numeric> <numeric> <integer>
#> R10424 125018261 12189085 0.9112 0.8586 127414655
#> R12195 173098203 13158337 0.9294 0.8862 179828658
#> R12198 190675169 20912721 0.9012 0.8541 197766175
#> R12199 118502044 8551666 0.9327 0.8868 122760702
#> R12200 73183420 7280550 0.9095 0.8596 75668574
#> ... ... ... ... ... ...
#> R5760 129117038 11958706 0.9152 0.914900 107353997
#> R5764 102010413 17490997 0.8536 0.767300 80847720
#> R5765 157607321 15387619 0.9111 0.847400 120937201
#> R5766 94485287 16912293 0.8482 0.760247 79219995
#> R5768 110231568 8151718 0.9311 0.855700 85972502
#> mitoMapped mitoRate totalAssignedGene rRNA_rate RNum
#> <integer> <numeric> <numeric> <numeric> <character>
#> R10424 2250800 0.0173585 0.360012 5.26368e-05 R10424
#> R12195 2929274 0.0160282 0.400558 7.92850e-05 R12195
#> R12198 4838084 0.0238795 0.438474 8.75740e-05 R12198
#> R12199 1915632 0.0153648 0.438864 6.99381e-05 R12199
#> R12200 826483 0.0108044 0.362791 8.24223e-05 R12200
#> ... ... ... ... ... ...
#> R5760 29972076 0.218255 0.419577 1.86703e-05 R5760
#> R5764 26910188 0.249728 0.454518 1.91898e-05 R5764
#> R5765 44435124 0.268697 0.479815 1.57517e-05 R5765
#> R5766 21543948 0.213824 0.497928 2.62909e-05 R5766
#> R5768 29687209 0.256677 0.555499 8.52578e-06 R5768
#> BrNum Region RIN Age Sex Race
#> <character> <character> <NumericList> <numeric> <character> <character>
#> R10424 Br5168 DLPFC 6.7 64.08 M CAUC
#> R12195 Br5073 DLPFC 8.4 62.61 M AA
#> R12198 Br5217 DLPFC 8.5 29.98 M CAUC
#> R12199 Br5234 DLPFC 8.8 65.26 M CAUC
#> R12200 Br5372 DLPFC 7 32.35 M AA
#> ... ... ... ... ... ... ...
#> R5760 Br1627 HIPPO 7.1 0.5147 F CAUC
#> R5764 Br1832 HIPPO 5.7 54.8600 M AA
#> R5765 Br1840 HIPPO 5.6 55.3400 M AA
#> R5766 Br1884 HIPPO 8 38.4800 M AA
#> R5768 Br1898 HIPPO 7.8 34.7600 M AA
#> Dx snpPC1 snpPC2 snpPC3 snpPC4
#> <character> <numeric> <numeric> <numeric> <numeric>
#> R10424 Control 0.0541862 0.00141503 -0.02229830 6.71690e-06
#> R12195 Schizo -0.0255144 -0.00401754 0.00553881 -5.78574e-04
#> R12198 Schizo 0.0545858 -0.00788385 0.00479833 4.66608e-04
#> R12199 Schizo 0.0543872 -0.00683429 0.00556194 -8.37099e-05
#> R12200 Schizo -0.0434981 -0.00112584 0.00438767 -3.02685e-03
#> ... ... ... ... ... ...
#> R5760 Control 0.05343660 -5.27473e-03 0.00452529 -0.000197550
#> R5764 Control -0.07612950 -7.49554e-05 0.00128218 -0.000715601
#> R5765 Schizo -0.04487910 1.43625e-04 0.00581480 -0.002767710
#> R5766 Control 0.00203288 -5.36031e-03 0.00636164 0.003517570
#> R5768 Control -0.05808860 -1.22494e-03 0.00182823 0.000191893
#> snpPC5 snpPC6 snpPC7 snpPC8 snpPC9
#> <numeric> <numeric> <numeric> <numeric> <numeric>
#> R10424 8.66316e-05 -0.00505508 -0.001056890 0.00511064 -0.001438760
#> R12195 -1.47552e-03 0.00816687 -0.005289970 0.00590834 -0.006252970
#> R12198 1.82373e-04 0.00346950 -0.001874620 -0.00283671 -0.000653331
#> R12199 -6.26193e-04 -0.00131079 0.000433349 -0.00301263 0.001626550
#> R12200 1.78597e-03 -0.00347009 0.000658279 -0.00062054 0.002938670
#> ... ... ... ... ... ...
#> R5760 0.000762996 -0.00125775 -0.001663130 -0.00240119 2.14050e-03
#> R5764 0.001890170 0.00334776 0.003100850 -0.00130183 -5.45976e-05
#> R5765 -0.001926500 -0.00123727 -0.000452834 -0.00148963 2.95421e-03
#> R5766 0.002043590 -0.00792677 0.005129850 0.00190378 4.51872e-03
#> R5768 -0.001656520 0.00538843 -0.001508440 -0.00038706 -1.06840e-03
#> snpPC10
#> <numeric>
#> R10424 0.000946483
#> R12195 0.008017290
#> R12198 -0.002142300
#> R12199 -0.003569450
#> R12200 -0.000186203
#> ... ...
#> R5760 -0.005488660
#> R5764 -0.000805835
#> R5765 0.004231900
#> R5766 -0.002314040
#> R5768 0.000110817
## Compare the ERCCsumLogErr approximation against the mean of the
## error values. Note that you would actually have to load the actual
## ERCC quantification output files in order to compute the correct
## ERCCsumLogErr values.
bsp2_rse_merged$mean_ERCCsumLogErr <-
sapply(rse_gene$ERCCsumLogErr, mean)
with(
colData(bsp2_rse_merged)[lengths(rse_gene$ERCCsumLogErr) > 1, ],
plot(
mean_ERCCsumLogErr,
ERCCsumLogErr,
ylab = "Approximate ERCCsumLogErr"
)
)
abline(a = 0, b = 1, col = "red")
with(
colData(bsp2_rse_merged)[lengths(rse_gene$ERCCsumLogErr) > 1, ],
summary(mean_ERCCsumLogErr - ERCCsumLogErr)
)
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
#> -46.91332 -0.47002 -0.25959 -1.02531 -0.12544 -0.00001