library(MungeSumstats)
MungeSumstats now offers high-throughput query and import functionality for data from the MRC IEU Open GWAS Project.
#### Search for datasets ####
# Look up Open GWAS metadata for traits matching "parkinson" or "alzheimer".
# NOTE(review): min_sample_size presumably excludes studies with fewer than
# 1000 samples -- confirm against the find_sumstats() documentation.
metagwas <- MungeSumstats::find_sumstats(
  traits = c("parkinson", "alzheimer"),
  min_sample_size = 1000
)
# Preview the first three rows of the metadata table.
head(metagwas, 3)
# Dataset IDs, ordered by the nsnp column (ascending).
ids <- dplyr::arrange(metagwas, nsnp)[["id"]]
## id trait group_name year author
## 1 ieu-a-298 Alzheimer's disease public 2013 Lambert
## 2 ieu-b-2 Alzheimer's disease public 2019 Kunkle BW
## 3 ieu-a-297 Alzheimer's disease public 2013 Lambert
## consortium
## 1 IGAP
## 2 Alzheimer Disease Genetics Consortium (ADGC), European Alzheimer's Disease Initiative (EADI), Cohorts for Heart and Aging Research in Genomic Epidemiology Consortium (CHARGE), Genetic and Environmental Risk in AD/Defining Genetic, Polygenic and Environmental Risk for Alzheimer's Disease Consortium (GERAD/PERADES),
## 3 IGAP
## sex population unit nsnp sample_size build
## 1 Males and Females European log odds 11633 74046 HG19/GRCh37
## 2 Males and Females European NA 10528610 63926 HG19/GRCh37
## 3 Males and Females European log odds 7055882 54162 HG19/GRCh37
## category subcategory ontology mr priority pmid sd
## 1 Disease Psychiatric / neurological NA 1 1 24162737 NA
## 2 Binary Psychiatric / neurological NA 1 0 30820047 NA
## 3 Disease Psychiatric / neurological NA 1 2 24162737 NA
## note ncase
## 1 Exposure only; Effect allele frequencies are missing; forward(+) strand 25580
## 2 NA 21982
## 3 Effect allele frequencies are missing; forward(+) strand 17008
## ncontrol N
## 1 48466 74046
## 2 41944 63926
## 3 37154 54162
You can supply import_sumstats()
with a list of as many OpenGWAS IDs as you
want, but we’ll just give one to save time.
# Import a single dataset by its Open GWAS ID; the result is stored in
# `datasets` and printed further below.
datasets <- MungeSumstats::import_sumstats(
  ids = "ieu-a-298",
  ref_genome = "GRCH37"
)
By default, import_sumstats
returns a named list where the names are the Open
GWAS dataset IDs and the items are the respective paths to the formatted summary
statistics.
# Show the import result: a list keyed by dataset ID, each holding a file path.
print(datasets)
## $`ieu-a-298`
## [1] "/tmp/Rtmpmmld9v/ieu-a-298.tsv.gz"
You can easily turn this into a data.frame as well.
# Flatten the named list into a two-column table: one row per dataset, with
# the dataset ID alongside the path of its formatted file.
results_df <- data.frame(
  id = names(datasets),
  path = unlist(datasets)
)
print(results_df)
## id path
## ieu-a-298 ieu-a-298 /tmp/Rtmpmmld9v/ieu-a-298.tsv.gz
Optional: Speed up with multi-threaded download via axel.
# Import every dataset in `ids`, downloading with axel.
# nThread: leave two cores free, but never request fewer than two threads.
datasets <- MungeSumstats::import_sumstats(
  ids = ids,
  vcf_download = TRUE,
  download_method = "axel",
  nThread = max(2, future::availableCores() - 2)
)
See the Getting started vignette for more information on how to use MungeSumstats and its functionality.
# Report the R version, platform and package versions used for this session.
utils::sessionInfo()
## R Under development (unstable) (2024-01-16 r85808)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 22.04.3 LTS
##
## Matrix products: default
## BLAS: /home/biocbuild/bbs-3.19-bioc/R/lib/libRblas.so
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB LC_COLLATE=C
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: America/New_York
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] MungeSumstats_1.11.8 BiocStyle_2.31.0
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.2.0
## [2] dplyr_1.1.4
## [3] blob_1.2.4
## [4] filelock_1.0.3
## [5] R.utils_2.12.3
## [6] Biostrings_2.71.2
## [7] bitops_1.0-7
## [8] fastmap_1.1.1
## [9] RCurl_1.98-1.14
## [10] BiocFileCache_2.11.1
## [11] VariantAnnotation_1.49.6
## [12] GenomicAlignments_1.39.4
## [13] XML_3.99-0.16.1
## [14] digest_0.6.34
## [15] lifecycle_1.0.4
## [16] KEGGREST_1.43.0
## [17] RSQLite_2.3.5
## [18] googleAuthR_2.0.1
## [19] magrittr_2.0.3
## [20] compiler_4.4.0
## [21] rlang_1.1.3
## [22] sass_0.4.8
## [23] progress_1.2.3
## [24] tools_4.4.0
## [25] utf8_1.2.4
## [26] yaml_2.3.8
## [27] data.table_1.15.0
## [28] rtracklayer_1.63.0
## [29] knitr_1.45
## [30] prettyunits_1.2.0
## [31] S4Arrays_1.3.3
## [32] curl_5.2.0
## [33] bit_4.0.5
## [34] DelayedArray_0.29.5
## [35] xml2_1.3.6
## [36] abind_1.4-5
## [37] BiocParallel_1.37.0
## [38] BiocGenerics_0.49.1
## [39] R.oo_1.26.0
## [40] grid_4.4.0
## [41] stats4_4.4.0
## [42] fansi_1.0.6
## [43] biomaRt_2.59.1
## [44] SummarizedExperiment_1.33.3
## [45] cli_3.6.2
## [46] rmarkdown_2.25
## [47] crayon_1.5.2
## [48] generics_0.1.3
## [49] BSgenome.Hsapiens.1000genomes.hs37d5_0.99.1
## [50] httr_1.4.7
## [51] rjson_0.2.21
## [52] DBI_1.2.2
## [53] cachem_1.0.8
## [54] stringr_1.5.1
## [55] zlibbioc_1.49.0
## [56] assertthat_0.2.1
## [57] parallel_4.4.0
## [58] AnnotationDbi_1.65.2
## [59] BiocManager_1.30.22
## [60] XVector_0.43.1
## [61] restfulr_0.0.15
## [62] matrixStats_1.2.0
## [63] vctrs_0.6.5
## [64] Matrix_1.6-5
## [65] jsonlite_1.8.8
## [66] bookdown_0.37
## [67] IRanges_2.37.1
## [68] hms_1.1.3
## [69] S4Vectors_0.41.3
## [70] bit64_4.0.5
## [71] GenomicFiles_1.39.0
## [72] GenomicFeatures_1.55.3
## [73] jquerylib_0.1.4
## [74] glue_1.7.0
## [75] codetools_0.2-19
## [76] stringi_1.8.3
## [77] GenomeInfoDb_1.39.6
## [78] BiocIO_1.13.0
## [79] GenomicRanges_1.55.3
## [80] tibble_3.2.1
## [81] pillar_1.9.0
## [82] SNPlocs.Hsapiens.dbSNP155.GRCh37_0.99.24
## [83] rappdirs_0.3.3
## [84] htmltools_0.5.7
## [85] GenomeInfoDbData_1.2.11
## [86] BSgenome_1.71.2
## [87] R6_2.5.1
## [88] dbplyr_2.4.0
## [89] httr2_1.0.0
## [90] evaluate_0.23
## [91] lattice_0.22-5
## [92] Biobase_2.63.0
## [93] R.methodsS3_1.8.2
## [94] png_0.1-8
## [95] Rsamtools_2.19.3
## [96] gargle_1.5.2
## [97] memoise_2.0.1
## [98] bslib_0.6.1
## [99] SparseArray_1.3.4
## [100] xfun_0.42
## [101] fs_1.6.3
## [102] MatrixGenerics_1.15.0
## [103] pkgconfig_2.0.3