In this session, we are focus on variant annotation. You will learn how to annotate variants with rsID of dbSNP and consequences of amino acid changes using the Bioconductor package VariantAnnotation. The VCF file is the same as session 1. You can find it in this file path “data/SAMN01882168_filt.vcf.gz”.
NOTE: The annotation process is highly memory intensive. We will just focus on the variants in Chromosome 1.
Required resources - dbSNP: SNPlocs.Hsapiens.dbSNP144.GRCh37 - Annotation database: TxDb.Hsapiens.UCSC.hg19.knownGene
Subset variants in chromosome 1
Annotate rsID from dbSNP
Predict amino acid changes
Integrate information into single table
Unlike other Bioconductor objects the VCF object is not easily subset by chromosome. We can use grepl to get a logical vector of all the vcf names that contain the pattern “chr1:”. This logical vecotr can then subset our VCF.
## [1] "chr1:815337_G/T" "chr1:815341_G/A"
## [1] TRUE TRUE
Retrieve dbSNP
Extract SNPs by Chromosome memory intensive
Merge SNPs and variant information by positions
## # SNPlocs object for Homo sapiens (dbSNP Human BUILD 144)
## # reference genome: GRCh37.p13
## # nb of SNPs: 135276726
## [1] "chr1" "chr2" "chr3" "chr4" "chr5" "chr6" "chr7" "chr8" "chr9"
## [10] "chr10" "chr11" "chr12" "chr13" "chr14" "chr15" "chr16" "chr17" "chr18"
## [19] "chr19" "chr20" "chr21" "chr22" "chrX" "chrY" "chrM"
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14" "15"
## [16] "16" "17" "18" "19" "20" "21" "22" "X" "Y" "MT"
seqname in rd_chr1 is chr1 but in all_snps is 1
## [1] "UCSC"
## [1] "NCBI"
seqlevelsSytle in rd_chr1 is UCSC, but in all_snps is NCBI Ensembl
## chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11
## "hg19" "hg19" "hg19" "hg19" "hg19" "hg19" "hg19" "hg19" "hg19" "hg19" "hg19"
## chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22
## "hg19" "hg19" "hg19" "hg19" "hg19" "hg19" "hg19" "hg19" "hg19" "hg19" "hg19"
## chrX chrY chrM
## "hg19" "hg19" "hg19"
## 1 2 3 4 5 6
## "GRCh37.p13" "GRCh37.p13" "GRCh37.p13" "GRCh37.p13" "GRCh37.p13" "GRCh37.p13"
## 7 8 9 10 11 12
## "GRCh37.p13" "GRCh37.p13" "GRCh37.p13" "GRCh37.p13" "GRCh37.p13" "GRCh37.p13"
## 13 14 15 16 17 18
## "GRCh37.p13" "GRCh37.p13" "GRCh37.p13" "GRCh37.p13" "GRCh37.p13" "GRCh37.p13"
## 19 20 21 22 X Y
## "GRCh37.p13" "GRCh37.p13" "GRCh37.p13" "GRCh37.p13" "GRCh37.p13" "GRCh37.p13"
## MT
## "GRCh37.p13"
genome in rd_chr1 is hg19, but in all_snps is GRCh37.p13
Lets first grab all SNPs that are on Chr1. Though there are simpler ways to do this, this code will all still work if you are looking at all chromosomes.
tar_chr <- as.vector(seqnames(rd_chr1)@values)
tar_chr <- gsub("chr", "", tar_chr)
tar_chr[grepl(tar_chr, pattern = "M")] <- "MT"
my_snps <- snpsBySeqname(all_snps, c(tar_chr))
my_snps[1:2]
## UnstitchedGPos object with 2 positions and 2 metadata columns:
## seqnames pos strand | RefSNP_id alleles_as_ambig
## <Rle> <integer> <Rle> | <character> <character>
## [1] 1 10108 * | rs62651026 Y
## [2] 1 10109 * | rs376007522 W
## -------
## seqinfo: 25 sequences (1 circular) from GRCh37.p13 genome
We can update the seqlevelsStyle to “UCSC”. This automatically updates all the chromosome names to be UCSC style.We will then just have to update the genome to “hg19”.
We can extract out information about the SNPs to generate a reference data.frame that contains their position and ID. - posIDX: [chromosome]:[position] ~ the index column for merging data - rsID: rsID from dbSNP
snp_ID <- data.frame(posIDX = paste0(seqnames(my_snps), ":", pos(my_snps)), rsID = my_snps$RefSNP_id,
stringsAsFactors = FALSE)
head(snp_ID)
## posIDX rsID
## 1 chr1:10108 rs62651026
## 2 chr1:10109 rs376007522
## 3 chr1:10139 rs368469931
## 4 chr1:10150 rs371194064
## 5 chr1:10177 rs201752861
## 6 chr1:10180 rs201694901
Next we will reformat our variant information into a data.frame with our fields of interest. To do this we will have to break up the Variant strings.
## [1] "chr1:815337_G/T" "chr1:815341_G/A"
matV1$chromosome <- gsub("(.*):(.*)_(.*)/(.*)", "\\1", matV1$Variant)
matV1$start <- gsub("(.*):(.*)_(.*)/(.*)", "\\2", matV1$Variant)
matV1$end <- gsub("(.*):(.*)_(.*)/(.*)", "\\2", matV1$Variant)
matV1$ref_allele <- gsub("(.*):(.*)_(.*)/(.*)", "\\3", matV1$Variant)
matV1$alt_allele <- gsub("(.*):(.*)_(.*)/(.*)", "\\4", matV1$Variant)
matV1$posIDX <- gsub("(.*)_(.*)", "\\1", matV1$Variant)
matV1[1:2, ]
## Variant chromosome start end ref_allele alt_allele posIDX
## 1 chr1:815337_G/T chr1 815337 815337 G T chr1:815337
## 2 chr1:815341_G/A chr1 815341 815341 G A chr1:815341
Now that we have the variant and rsID data.frames we can merge them together to tie together our observed variants with their reference IDs. We can drop the posIDX after this as we do not need it.
## Variant chromosome start end ref_allele alt_allele
## 1 chr1:100309252_T/C chr1 100309252 100309252 T C
## 2 chr1:100523688_G/A chr1 100523688 100523688 G A
## rsID
## 1 rs520644
## 2 <NA>
dplyr::select is used to select or remove specific columns in a data frame.
##
## FALSE TRUE
## 730 4627
## TxDb object:
## # Db type: TxDb
## # Supporting package: GenomicFeatures
## # Data source: UCSC
## # Genome: hg19
## # Organism: Homo sapiens
## # Taxonomy ID: 9606
## # UCSC Table: knownGene
## # Resource URL: http://genome.ucsc.edu/
## # Type of Gene ID: Entrez Gene ID
## # Full dataset: yes
## # miRBase build ID: GRCh37
## # transcript_nrow: 82960
## # exon_nrow: 289969
## # cds_nrow: 237533
## # Db created by: GenomicFeatures package from Bioconductor
## # Creation time: 2015-10-07 18:11:28 +0000 (Wed, 07 Oct 2015)
## # GenomicFeatures version at creation time: 1.21.30
## # RSQLite version at creation time: 1.0.0
## # DBSCHEMAVERSION: 1.1
We can run the predictcoding() function by providing our VCF object. It will use the Ranges contained within it, to look within a TxDb object.
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 56 out-of-bound ranges located on sequences
## 2149, 2159, 6098, 6103, 2198, 2199, 2200, 2201, and 2204. Note that
## ranges located on a sequence whose length is unknown (NA) or on a
## circular sequence are not considered out-of-bound (use seqlengths() and
## isCircular() to get the lengths and circularity flags of the underlying
## sequences). You can use trim() to trim these ranges. See
## ?`trim,GenomicRanges-method` for more information.
## GRanges object with 1 range and 17 metadata columns:
## seqnames ranges strand | paramRangeID REF
## <Rle> <IRanges> <Rle> | <factor> <DNAStringSet>
## chr1:12907379_T/C chr1 12907379 - | NA T
## ALT QUAL FILTER varAllele
## <DNAStringSetList> <numeric> <character> <DNAStringSet>
## chr1:12907379_T/C C 76.77 . G
## CDSLOC PROTEINLOC QUERYID TXID CDSID
## <IRanges> <IntegerList> <integer> <character> <IntegerList>
## chr1:12907379_T/C 764 255 544 4428 13139
## GENEID CONSEQUENCE REFCODON VARCODON
## <character> <factor> <DNAStringSet> <DNAStringSet>
## chr1:12907379_T/C 649330 nonsynonymous GAT GGT
## REFAA VARAA
## <AAStringSet> <AAStringSet>
## chr1:12907379_T/C D G
## -------
## seqinfo: 25 sequences from hg19 genome
matA <- data.frame(Variant=names(coding),
chromosome=seqnames(coding),
start=start(coding),end=end(coding),
ref_allele=as.character(coding$REF),
alt_allele=unlist(lapply(lapply(
coding$ALT,`[[`,1),as.character)),
GeneID=coding$GENEID,
TxID=coding$TXID,
Protein_posi=unlist(lapply(lapply(
coding$PROTEINLOC,`[[`,1),as.integer)),
ref_AA=as.character(coding$REFAA),
alt_AA=as.character(coding$VARAA),
Type=coding$CONSEQUENCE)
matA$aaChange <- paste0("p.",matA$ref_AA,matA$Protein_posi,matA$alt_AA)
matA <- dplyr::select(matA,-Protein_posi,-ref_AA,-alt_AA)
## Variant chromosome start end ref_allele alt_allele GeneID
## 1 chr1:12907379_T/C chr1 12907379 12907379 T C 649330
## 2 chr1:12907379_T/C chr1 12907379 12907379 T C 343069
## TxID Type aaChange
## 1 4428 nonsynonymous p.D255G
## 2 4429 nonsynonymous p.D255G
var_in_coding <- data.frame(varName = names(vcf_chr1), in_coding = names(vcf_chr1) %in%
matA$Variant, stringsAsFactors = FALSE)
table(var_in_coding$in_coding)
##
## FALSE TRUE
## 5097 30
##
## nonsense nonsynonymous synonymous
## 1 49 20
matS$GeneID <- matA$GeneID[match(matS$Variant, matA$Variant)]
matS$AAChange <- matA$GeneID[match(matS$Variant, matA$Variant)]
matS[1:2, ]
## Variant chromosome start end ref_allele alt_allele
## 1 chr1:100309252_T/C chr1 100309252 100309252 T C
## 2 chr1:100523688_G/A chr1 100523688 100523688 G A
## rsID GeneID AAChange
## 1 rs520644 <NA> <NA>
## 2 <NA> <NA> <NA>