In this exercise, we will practice how to annotate variants. Please find this VCF file “data/SAMN01882168_filt.vcf.gz” and subset to variants in chromosome 21.
library(VariantAnnotation)
vcf_file <- "data/SAMN01882168_filt.vcf.gz"
vcf <- readVcf(vcf_file,"hg19")
vcf_sub <- vcf[grepl(names(vcf),pattern = "chr21")]
rd_sub <- rowRanges(vcf_sub)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
coding <- predictCoding(vcf_sub, txdb, seqSource=Hsapiens)
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 32 out-of-bound ranges located on sequences
## 72996, 72997, 72998, 72999, and 73001. Note that ranges located on a
## sequence whose length is unknown (NA) or on a circular sequence are not
## considered out-of-bound (use seqlengths() and isCircular() to get the
## lengths and circularity flags of the underlying sequences). You can use
## trim() to trim these ranges. See ?`trim,GenomicRanges-method` for more
## information.
#
matA <- data.frame(Variant=names(coding),
chromosome=seqnames(coding),
start=start(coding),end=end(coding),
ref_allele=as.character(coding$REF),
alt_allele=unlist(lapply(lapply(
coding$ALT,`[[`,1),as.character)),
GeneID=coding$GENEID,
TxID=coding$TXID,
Protein_posi=unlist(lapply(lapply(
coding$PROTEINLOC,`[[`,1),as.integer)),
ref_AA=as.character(coding$REFAA),
alt_AA=as.character(coding$VARAA),
Type=coding$CONSEQUENCE,
stringsAsFactors = FALSE)
matA$aaChange <- paste0("p.",matA$ref_AA,matA$Protein_posi,matA$alt_AA)
matA <- dplyr::select(matA,-Protein_posi,-ref_AA,-alt_AA)
#
coding
## GRanges object with 18 ranges and 17 metadata columns:
## seqnames ranges strand | paramRangeID
## <Rle> <IRanges> <Rle> | <factor>
## chr21:10942924_CCTT/C chr21 10942924-10942927 - | NA
## chr21:10942924_CCTT/C chr21 10942924-10942927 - | NA
## chr21:10942924_CCTT/C chr21 10942924-10942927 - | NA
## chr21:10942924_CCTT/C chr21 10942924-10942927 - | NA
## chr21:10943003_C/T chr21 10943003 - | NA
## ... ... ... ... . ...
## chr21:11098723_T/C chr21 11098723 - | NA
## chr21:16337201_T/C chr21 16337201 - | NA
## chr21:16337201_T/C chr21 16337201 - | NA
## chr21:16337201_T/C chr21 16337201 - | NA
## chr21:16337201_T/C chr21 16337201 - | NA
## REF ALT QUAL FILTER
## <DNAStringSet> <DNAStringSetList> <numeric> <character>
## chr21:10942924_CCTT/C CCTT C 58.73 .
## chr21:10942924_CCTT/C CCTT C 58.73 .
## chr21:10942924_CCTT/C CCTT C 58.73 .
## chr21:10942924_CCTT/C CCTT C 58.73 .
## chr21:10943003_C/T C T 52.77 .
## ... ... ... ... ...
## chr21:11098723_T/C T C 121.77 .
## chr21:16337201_T/C T C 96.77 .
## chr21:16337201_T/C T C 96.77 .
## chr21:16337201_T/C T C 96.77 .
## chr21:16337201_T/C T C 96.77 .
## varAllele CDSLOC PROTEINLOC QUERYID
## <DNAStringSet> <IRanges> <IntegerList> <integer>
## chr21:10942924_CCTT/C G 660-663 220,221 7028
## chr21:10942924_CCTT/C G 606-609 202,203 7028
## chr21:10942924_CCTT/C G 546-549 182,183 7028
## chr21:10942924_CCTT/C G 246-249 82,83 7028
## chr21:10943003_C/T A 584 195 7029
## ... ... ... ... ...
## chr21:11098723_T/C G 15 5 8590
## chr21:16337201_T/C G 3313 1105 10467
## chr21:16337201_T/C G 3313 1105 10467
## chr21:16337201_T/C G 3313 1105 10467
## chr21:16337201_T/C G 3313 1105 10467
## TXID CDSID GENEID CONSEQUENCE
## <character> <IntegerList> <character> <factor>
## chr21:10942924_CCTT/C 72996 212929 7179 nonsynonymous
## chr21:10942924_CCTT/C 72997 212929 7179 nonsynonymous
## chr21:10942924_CCTT/C 72998 212929 7179 nonsynonymous
## chr21:10942924_CCTT/C 72999 212929 7179 nonsynonymous
## chr21:10943003_C/T 72996 212929 7179 nonsynonymous
## ... ... ... ... ...
## chr21:11098723_T/C 73004 212945 574 synonymous
## chr21:16337201_T/C 73028 212982 8204 nonsynonymous
## chr21:16337201_T/C 73029 212982 8204 nonsynonymous
## chr21:16337201_T/C 73030 212982 8204 nonsynonymous
## chr21:16337201_T/C 73031 212982 <NA> nonsynonymous
## REFCODON VARCODON REFAA
## <DNAStringSet> <DNAStringSet> <AAStringSet>
## chr21:10942924_CCTT/C AGAAGG AGG RR
## chr21:10942924_CCTT/C AGAAGG AGG RR
## chr21:10942924_CCTT/C AGAAGG AGG RR
## chr21:10942924_CCTT/C AGAAGG AGG RR
## chr21:10943003_C/T CGA CAA R
## ... ... ... ...
## chr21:11098723_T/C GCA GCG A
## chr21:16337201_T/C AAA GAA K
## chr21:16337201_T/C AAA GAA K
## chr21:16337201_T/C AAA GAA K
## chr21:16337201_T/C AAA GAA K
## VARAA
## <AAStringSet>
## chr21:10942924_CCTT/C R
## chr21:10942924_CCTT/C R
## chr21:10942924_CCTT/C R
## chr21:10942924_CCTT/C R
## chr21:10943003_C/T Q
## ... ...
## chr21:11098723_T/C A
## chr21:16337201_T/C E
## chr21:16337201_T/C E
## chr21:16337201_T/C E
## chr21:16337201_T/C E
## -------
## seqinfo: 25 sequences from hg19 genome
## Variant chromosome start end ref_allele alt_allele
## 1 chr21:10942924_CCTT/C chr21 10942924 10942927 CCTT C
## 2 chr21:10942924_CCTT/C chr21 10942924 10942927 CCTT C
## 3 chr21:10942924_CCTT/C chr21 10942924 10942927 CCTT C
## 4 chr21:10942924_CCTT/C chr21 10942924 10942927 CCTT C
## 5 chr21:10943003_C/T chr21 10943003 10943003 C T
## 6 chr21:10943003_C/T chr21 10943003 10943003 C T
## GeneID TxID Type aaChange
## 1 7179 72996 nonsynonymous p.RR220R
## 2 7179 72997 nonsynonymous p.RR202R
## 3 7179 72998 nonsynonymous p.RR182R
## 4 7179 72999 nonsynonymous p.RR82R
## 5 7179 72996 nonsynonymous p.R195Q
## 6 7179 72997 nonsynonymous p.R177Q
##
## nonsynonymous synonymous
## 13 5
#
ggplot(taC_dat,aes(x=Var1,y=Freq,fill=Var1))+
geom_bar(stat='Identity')+
labs(x="",y="Counts",fill="")+
theme(legend.position = "none")
library(SNPlocs.Hsapiens.dbSNP144.GRCh37)
all_snps <- SNPlocs.Hsapiens.dbSNP144.GRCh37
#
# Extract SNPs of chr21
tar_chr <- as.vector(seqnames(rd_sub)@values)
my_snps <- snpsBySeqname(all_snps,gsub("chr","",tar_chr))
#
seqlevels(my_snps)
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14" "15"
## [16] "16" "17" "18" "19" "20" "21" "22" "X" "Y" "MT"
## 1 2 3 4 5 6 7 8
## 249250621 243199373 198022430 191154276 180915260 171115067 159138663 146364022
## 9 10 11 12 13 14 15 16
## 141213431 135534747 135006516 133851895 115169878 107349540 102531392 90354753
## 17 18 19 20 21 22 X Y
## 81195210 78077248 59128983 63025520 48129895 51304566 155270560 59373566
## MT
## 16569
## [1] "NCBI"
## 1 2 3 4 5 6
## "GRCh37.p13" "GRCh37.p13" "GRCh37.p13" "GRCh37.p13" "GRCh37.p13" "GRCh37.p13"
## 7 8 9 10 11 12
## "GRCh37.p13" "GRCh37.p13" "GRCh37.p13" "GRCh37.p13" "GRCh37.p13" "GRCh37.p13"
## 13 14 15 16 17 18
## "GRCh37.p13" "GRCh37.p13" "GRCh37.p13" "GRCh37.p13" "GRCh37.p13" "GRCh37.p13"
## 19 20 21 22 X Y
## "GRCh37.p13" "GRCh37.p13" "GRCh37.p13" "GRCh37.p13" "GRCh37.p13" "GRCh37.p13"
## MT
## "GRCh37.p13"
#
seqlevels(my_snps) <- paste0("chr",seqlevels(my_snps))
seqlevels(my_snps) <- gsub("MT","M",seqlevels(my_snps))
# change seqlevelsStyle
seqlevelsStyle(my_snps) <- "UCSC"
# change genome
genome(my_snps) <- "hg19"
#
snp_ID <- data.frame(
posIDX=paste0(seqnames(my_snps),":",pos(my_snps)),
rsID=my_snps$RefSNP_id,stringsAsFactors = FALSE)
#
matV1 <- data.frame(Variant=names(rd_sub),stringsAsFactors = FALSE)
matV1$chromosome <- gsub("(.*):(.*)_(.*)/(.*)","\\1",matV1$Variant)
matV1$start <- gsub("(.*):(.*)_(.*)/(.*)","\\2",matV1$Variant)
matV1$end <- gsub("(.*):(.*)_(.*)/(.*)","\\2",matV1$Variant)
matV1$ref_allele <- gsub("(.*):(.*)_(.*)/(.*)","\\3",matV1$Variant)
matV1$alt_allele <- gsub("(.*):(.*)_(.*)/(.*)","\\4",matV1$Variant)
matV1$posIDX <- gsub("(.*)_(.*)","\\1",matV1$Variant)
#
matV1$rsID <- snp_ID$rsID[match(matV1$posIDX,snp_ID$posIDX)]
##
## FALSE TRUE
## 1906 8724
#
ggplot(taC2_dat,aes(x=Var1,y=Freq,fill=Var1))+
geom_bar(stat='Identity')+
labs(x="",y="Counts",fill="in_dbSNP")+
theme(legend.position = "none")