超大宏基因组数据集CuratedMetagenomicData

简介

curatedMetagenomicData(https://waldronlab.github.io/curatedMetagenomicData/)的目标是用标准化的流程(MetaPhlAn2、HUMAnN2)分析已发表的宏基因组数据,并建立一个统一的数据集合。目前已经收录6000余个样本,并在持续扩建当中。样本涵盖糖尿病、肥胖症、IBD等多种疾病,取样部位包括皮肤、口腔、粪便等多个身体部位。

完整帮助信息

https://bioconductor.org/packages/devel/data/experiment/vignettes/curatedMetagenomicData/inst/doc/curatedMetagenomicData.html

安装

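通过Bioconductor安装最新版本(注意:下面的biocLite()是旧版安装方式,自Bioconductor 3.8起已被弃用;新版本请改用 install.packages("BiocManager") 和 BiocManager::install("curatedMetagenomicData") 进行安装)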

## try http:// if https:// URLs are not supported
source("https://bioconductor.org/biocLite.R")
useDevel()
biocLite("curatedMetagenomicData")

基本使用

加载R包

suppressPackageStartupMessages(library(curatedMetagenomicData))

查看metadata

combined_metadata
## # A tibble: 6,058 x 80
##     dataset_name      sampleID subjectID body_site antibiotics_current_use
##            <chr>         <chr>     <chr>     <chr>                   <chr>
##  1 AsnicarF_2017 MV_FEI1_t1Q14   MV_FEI1     stool                    <NA>
##  2 AsnicarF_2017 MV_FEI2_t1Q14   MV_FEI2     stool                    <NA>
##  3 AsnicarF_2017 MV_FEI3_t1Q14   MV_FEI3     stool                    <NA>
##  4 AsnicarF_2017 MV_FEI4_t1Q14   MV_FEI4     stool                    <NA>
##  5 AsnicarF_2017 MV_FEI4_t2Q15   MV_FEI4     stool                    <NA>
##  6 AsnicarF_2017 MV_FEI5_t1Q14   MV_FEI5     stool                    <NA>
##  7 AsnicarF_2017 MV_FEI5_t2Q14   MV_FEI5     stool                    <NA>
##  8 AsnicarF_2017 MV_FEI5_t3Q15   MV_FEI5     stool                    <NA>
##  9 AsnicarF_2017 MV_FEM1_t1Q14   MV_FEM1     stool                    <NA>
## 10 AsnicarF_2017 MV_FEM2_t1Q14   MV_FEM2     stool                    <NA>
## # ... with 6,048 more rows, and 75 more variables: study_condition <chr>,
## #   disease <chr>, age <int>, infant_age <int>, age_category <chr>,
## #   gender <chr>, country <chr>, non_westernized <chr>,
## #   sequencing_platform <chr>, DNA_extraction_kit <chr>, PMID <chr>,
## #   number_reads <int>, number_bases <dbl>, minimum_read_length <int>,
## #   median_read_length <dbl>, pregnant <chr>, lactating <chr>,
## #   NCBI_accession <chr>, BMI <dbl>, antibiotics_family <chr>,
## #   momeducat <int>, alcohol <chr>, `flg-genotype` <chr>,
## #   disease_subtype <chr>, hdl <dbl>, triglycerides <dbl>, hba1c <dbl>,
## #   ldl <dbl>, tnm <chr>, body_subsite <chr>, visit_number <int>,
## #   days_from_first_collection <int>, `c-peptide` <dbl>, family <int>,
## #   cholesterol <dbl>, glucose <dbl>, mumps <chr>, adiponectin <dbl>,
## #   `insulin(cat)` <chr>, `fgf-19` <dbl>, hscrp <dbl>, leptin <dbl>,
## #   glutamate_decarboxylase_2_antibody <dbl>, creatinine <dbl>,
## #   `il-1` <dbl>, cd163 <dbl>, `glp-1` <dbl>, hitchip_probe_class <chr>,
## #   hitchip_probe_number <int>, protein_intake <dbl>,
## #   days_after_onset <int>, stec_count <chr>, shigatoxin_2_elisa <chr>,
## #   stool_texture <chr>, ferm_milk_prod_consumer <chr>,
## #   mgs_richness <dbl>, location <chr>, dyastolic_p <dbl>,
## #   systolic_p <dbl>, prothrombin_time <int>, creatine <dbl>, inr <dbl>,
## #   ctp <int>, albumine <dbl>, bilubirin <dbl>, smoker <chr>,
## #   ever_smoker <chr>, birth_control_pil <chr>, hla_drb12 <int>,
## #   hla_dqa12 <int>, hla_dqa11 <int>, hla_drb11 <int>,
## #   start_solidfood <int>, ajcc <chr>, fobt <chr>

查看metadata的列名(变量名)

colnames(combined_metadata)
##  [1] "dataset_name"                      
##  [2] "sampleID"                          
##  [3] "subjectID"                         
##  [4] "body_site"                         
##  [5] "antibiotics_current_use"           
##  [6] "study_condition"                   
##  [7] "disease"                           
##  [8] "age"                               
##  [9] "infant_age"                        
## [10] "age_category"                      
## [11] "gender"                            
## [12] "country"                           
## [13] "non_westernized"                   
## [14] "sequencing_platform"               
## [15] "DNA_extraction_kit"                
## [16] "PMID"                              
## [17] "number_reads"                      
## [18] "number_bases"                      
## [19] "minimum_read_length"               
## [20] "median_read_length"                
## [21] "pregnant"                          
## [22] "lactating"                         
## [23] "NCBI_accession"                    
## [24] "BMI"                               
## [25] "antibiotics_family"                
## [26] "momeducat"                         
## [27] "alcohol"                           
## [28] "flg-genotype"                      
## [29] "disease_subtype"                   
## [30] "hdl"                               
## [31] "triglycerides"                     
## [32] "hba1c"                             
## [33] "ldl"                               
## [34] "tnm"                               
## [35] "body_subsite"                      
## [36] "visit_number"                      
## [37] "days_from_first_collection"        
## [38] "c-peptide"                         
## [39] "family"                            
## [40] "cholesterol"                       
## [41] "glucose"                           
## [42] "mumps"                             
## [43] "adiponectin"                       
## [44] "insulin(cat)"                      
## [45] "fgf-19"                            
## [46] "hscrp"                             
## [47] "leptin"                            
## [48] "glutamate_decarboxylase_2_antibody"
## [49] "creatinine"                        
## [50] "il-1"                              
## [51] "cd163"                             
## [52] "glp-1"                             
## [53] "hitchip_probe_class"               
## [54] "hitchip_probe_number"              
## [55] "protein_intake"                    
## [56] "days_after_onset"                  
## [57] "stec_count"                        
## [58] "shigatoxin_2_elisa"                
## [59] "stool_texture"                     
## [60] "ferm_milk_prod_consumer"           
## [61] "mgs_richness"                      
## [62] "location"                          
## [63] "dyastolic_p"                       
## [64] "systolic_p"                        
## [65] "prothrombin_time"                  
## [66] "creatine"                          
## [67] "inr"                               
## [68] "ctp"                               
## [69] "albumine"                          
## [70] "bilubirin"                         
## [71] "smoker"                            
## [72] "ever_smoker"                       
## [73] "birth_control_pil"                 
## [74] "hla_drb12"                         
## [75] "hla_dqa12"                         
## [76] "hla_dqa11"                         
## [77] "hla_drb11"                         
## [78] "start_solidfood"                   
## [79] "ajcc"                              
## [80] "fobt"

统计人体各部位的样本数量

table(combined_metadata$body_site)
## 
##        milk nasalcavity  oralcavity        skin       stool      vagina 
##           8          91         678         466        4810           5

统计各数据集包含的样本数量

table(combined_metadata$dataset_name)
## 
##        AsnicarF_2017         BritoIL_2016  Castro-NallarE_2015 
##                   24                  312                   32 
##          ChngKR_2016           FengQ_2015      HanniganGD_2017 
##                   78                  154                   82 
## Heitz-BuschartA_2016             HMP_2012      KarlssonFH_2013 
##                   53                  749                  145 
##    LeChatelierE_2013             LiJ_2014            LiuW_2016 
##                  292                  260                  110 
##         LomanNJ_2013       NielsenHB_2014  Obregon-TitoAJ_2015 
##                   43                  396                   58 
##             OhJ_2014            QinJ_2012            QinN_2014 
##                  291                  363                  237 
##       RampelliS_2015        RaymondF_2016       SchirmerM_2016 
##                   38                   72                  471 
##          TettAJ_2016        VatanenT_2016        VincentC_2016 
##                   97                  785                  229 
##       VogtmannE_2016            XieH_2016             YuJ_2015 
##                  110                  250                  128 
##         ZellerG_2014 
##                  199

下载HMP 2012(HMP I)的MetaPhlAn2分析结果

dat <- curatedMetagenomicData("HMP_2012.metaphlan_bugs_list.*", dryrun=FALSE)
## Working on HMP_2012.metaphlan_bugs_list.nasalcavity
## Warning in strptime(x, fmt, tz = "GMT"): unknown timezone 'zone/tz/2017c.
## 1.0/zoneinfo/Asia/Singapore'
## snapshotDate(): 2017-10-30
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## loading from cache '/Users/mayuan//.ExperimentHub/424'
## Working on HMP_2012.metaphlan_bugs_list.oralcavity
## snapshotDate(): 2017-10-30
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## loading from cache '/Users/mayuan//.ExperimentHub/425'
## Working on HMP_2012.metaphlan_bugs_list.stool
## snapshotDate(): 2017-10-30
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## loading from cache '/Users/mayuan//.ExperimentHub/426'
## Working on HMP_2012.metaphlan_bugs_list.vagina
## snapshotDate(): 2017-10-30
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## loading from cache '/Users/mayuan//.ExperimentHub/427'

查看HMP I粪便数据的元数据

head( pData(dat[[3]]) )
##                    subjectID body_site body_subsite
## SRS056519 HMP_2012_765094712     stool        stool
## SRS016335 HMP_2012_765074482     stool        stool
## SRS011061 HMP_2012_158458797     stool        stool
## SRS016267 HMP_2012_764669880     stool        stool
## SRS053214 HMP_2012_159753524     stool        stool
## SRS013521 HMP_2012_159227541     stool        stool
##           antibiotics_current_use study_condition disease age age_category
## SRS056519                    <NA>         control healthy  19    schoolage
## SRS016335                    <NA>         control healthy  20        adult
## SRS011061                    <NA>         control healthy  30        adult
## SRS016267                    <NA>         control healthy  29        adult
## SRS053214                    <NA>         control healthy  23        adult
## SRS013521                    <NA>         control healthy  22        adult
##           gender visit_number BMI country non_westernized
## SRS056519   male           NA  NA     USA              no
## SRS016335   male           NA  NA     USA              no
## SRS011061 female           NA  NA     USA              no
## SRS016267   male           NA  NA     USA              no
## SRS053214 female           NA  NA     USA              no
## SRS013521 female           NA  NA     USA              no
##           DNA_extraction_kit number_reads number_bases minimum_read_length
## SRS056519             Qiagen    125076707  12506449555                  60
## SRS016335             Qiagen    125043127  11938455408                  60
## SRS011061             Qiagen     90085554   8028157196                  60
## SRS016267             Qiagen    103068446   9820433068                  60
## SRS053214             Qiagen    103639606   9687833906                  60
## SRS013521             Qiagen    115013916  10925100631                  60
##           median_read_length NCBI_accession
## SRS056519                100      SRS056519
## SRS016335                100      SRS016335
## SRS011061                 90      SRS011061
## SRS016267                100      SRS016267
## SRS053214                 95      SRS053214
## SRS013521                 95      SRS013521

查看MetaPhlAn2相对丰度表

hmp_stool_metaphlan <- exprs(dat[[3]])
hmp_stool_metaphlan[1:10, 1:10]
##                                                 SRS056519 SRS016335
## k__Viruses                                        0.00090   0.00179
## k__Bacteria                                      99.99910  99.96451
## k__Viruses|p__Viruses_noname                      0.00090   0.00179
## k__Bacteria|p__Actinobacteria                     0.25980   0.13760
## k__Bacteria|p__Firmicutes                         9.52434  18.67835
## k__Bacteria|p__Proteobacteria                     2.27579   2.32088
## k__Bacteria|p__Bacteroidetes                     87.93779  77.40923
## k__Viruses|p__Viruses_noname|c__Viruses_noname    0.00090   0.00179
## k__Bacteria|p__Actinobacteria|c__Actinobacteria   0.25980   0.13760
## k__Bacteria|p__Firmicutes|c__Bacilli              0.01382   0.02164
##                                                 SRS011061 SRS016267
## k__Viruses                                        0.00000   0.00000
## k__Bacteria                                     100.00000 100.00000
## k__Viruses|p__Viruses_noname                      0.00000   0.00000
## k__Bacteria|p__Actinobacteria                     0.04200   0.11105
## k__Bacteria|p__Firmicutes                         5.59309   3.98890
## k__Bacteria|p__Proteobacteria                     2.98345   0.55210
## k__Bacteria|p__Bacteroidetes                     91.07961  95.28808
## k__Viruses|p__Viruses_noname|c__Viruses_noname    0.00000   0.00000
## k__Bacteria|p__Actinobacteria|c__Actinobacteria   0.04200   0.11105
## k__Bacteria|p__Firmicutes|c__Bacilli              0.04196   0.00000
##                                                 SRS053214 SRS013521
## k__Viruses                                        0.00000   0.00882
## k__Bacteria                                     100.00000  99.99118
## k__Viruses|p__Viruses_noname                      0.00000   0.00882
## k__Bacteria|p__Actinobacteria                     0.46158   0.08515
## k__Bacteria|p__Firmicutes                        16.03493   1.73242
## k__Bacteria|p__Proteobacteria                     0.23830   0.49311
## k__Bacteria|p__Bacteroidetes                     80.04580  97.68049
## k__Viruses|p__Viruses_noname|c__Viruses_noname    0.00000   0.00882
## k__Bacteria|p__Actinobacteria|c__Actinobacteria   0.46158   0.08515
## k__Bacteria|p__Firmicutes|c__Bacilli              0.01578   0.02924
##                                                 SRS018817 SRS017247
## k__Viruses                                        0.00000   0.00000
## k__Bacteria                                     100.00000 100.00000
## k__Viruses|p__Viruses_noname                      0.00000   0.00000
## k__Bacteria|p__Actinobacteria                     0.08518   0.15577
## k__Bacteria|p__Firmicutes                        11.52659   4.35400
## k__Bacteria|p__Proteobacteria                     0.22363   0.21245
## k__Bacteria|p__Bacteroidetes                     87.99725  95.08394
## k__Viruses|p__Viruses_noname|c__Viruses_noname    0.00000   0.00000
## k__Bacteria|p__Actinobacteria|c__Actinobacteria   0.08518   0.15577
## k__Bacteria|p__Firmicutes|c__Bacilli              0.00443   0.01766
##                                                 SRS019968 SRS064557
## k__Viruses                                        0.00077   0.00194
## k__Bacteria                                      99.99923  99.99806
## k__Viruses|p__Viruses_noname                      0.00077   0.00194
## k__Bacteria|p__Actinobacteria                     0.16129   0.03814
## k__Bacteria|p__Firmicutes                        16.11446   2.69009
## k__Bacteria|p__Proteobacteria                     0.17522   1.50628
## k__Bacteria|p__Bacteroidetes                     82.54661  95.76355
## k__Viruses|p__Viruses_noname|c__Viruses_noname    0.00077   0.00194
## k__Bacteria|p__Actinobacteria|c__Actinobacteria   0.16129   0.03814
## k__Bacteria|p__Firmicutes|c__Bacilli              0.01099   0.01147

Related

comments powered by Disqus