Variant annotation

In this exercise, we will practice how to annotate variants. Please find this VCF file “data/SAMN01882168_filt.vcf.gz” and subset to variants in chromosome 21.

library(VariantAnnotation)
vcf_file <- "data/SAMN01882168_filt.vcf.gz"
vcf <- readVcf(vcf_file,"hg19")
vcf_sub <- vcf[grepl(names(vcf),pattern = "chr21")]
rd_sub <- rowRanges(vcf_sub)
  1. Please predict amino acid changes by using TxDb
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
coding <- predictCoding(vcf_sub, txdb, seqSource=Hsapiens)
## Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 32 out-of-bound ranges located on sequences
##   72996, 72997, 72998, 72999, and 73001. Note that ranges located on a
##   sequence whose length is unknown (NA) or on a circular sequence are not
##   considered out-of-bound (use seqlengths() and isCircular() to get the
##   lengths and circularity flags of the underlying sequences). You can use
##   trim() to trim these ranges. See ?`trim,GenomicRanges-method` for more
##   information.
#
matA <- data.frame(Variant=names(coding),
                   chromosome=seqnames(coding),
                   start=start(coding),end=end(coding),
                   ref_allele=as.character(coding$REF),
                   alt_allele=unlist(lapply(lapply(
                     coding$ALT,`[[`,1),as.character)),
                   GeneID=coding$GENEID,
                   TxID=coding$TXID,
                   Protein_posi=unlist(lapply(lapply(
                     coding$PROTEINLOC,`[[`,1),as.integer)),
                   ref_AA=as.character(coding$REFAA),
                   alt_AA=as.character(coding$VARAA),
                   Type=coding$CONSEQUENCE,
                   stringsAsFactors = FALSE)
matA$aaChange <- paste0("p.",matA$ref_AA,matA$Protein_posi,matA$alt_AA)
matA <- dplyr::select(matA,-Protein_posi,-ref_AA,-alt_AA)
#
coding
## GRanges object with 18 ranges and 17 metadata columns:
##                         seqnames            ranges strand | paramRangeID
##                            <Rle>         <IRanges>  <Rle> |     <factor>
##   chr21:10942924_CCTT/C    chr21 10942924-10942927      - |           NA
##   chr21:10942924_CCTT/C    chr21 10942924-10942927      - |           NA
##   chr21:10942924_CCTT/C    chr21 10942924-10942927      - |           NA
##   chr21:10942924_CCTT/C    chr21 10942924-10942927      - |           NA
##      chr21:10943003_C/T    chr21          10943003      - |           NA
##                     ...      ...               ...    ... .          ...
##      chr21:11098723_T/C    chr21          11098723      - |           NA
##      chr21:16337201_T/C    chr21          16337201      - |           NA
##      chr21:16337201_T/C    chr21          16337201      - |           NA
##      chr21:16337201_T/C    chr21          16337201      - |           NA
##      chr21:16337201_T/C    chr21          16337201      - |           NA
##                                    REF                ALT      QUAL      FILTER
##                         <DNAStringSet> <DNAStringSetList> <numeric> <character>
##   chr21:10942924_CCTT/C           CCTT                  C     58.73           .
##   chr21:10942924_CCTT/C           CCTT                  C     58.73           .
##   chr21:10942924_CCTT/C           CCTT                  C     58.73           .
##   chr21:10942924_CCTT/C           CCTT                  C     58.73           .
##      chr21:10943003_C/T              C                  T     52.77           .
##                     ...            ...                ...       ...         ...
##      chr21:11098723_T/C              T                  C    121.77           .
##      chr21:16337201_T/C              T                  C     96.77           .
##      chr21:16337201_T/C              T                  C     96.77           .
##      chr21:16337201_T/C              T                  C     96.77           .
##      chr21:16337201_T/C              T                  C     96.77           .
##                              varAllele    CDSLOC    PROTEINLOC   QUERYID
##                         <DNAStringSet> <IRanges> <IntegerList> <integer>
##   chr21:10942924_CCTT/C              G   660-663       220,221      7028
##   chr21:10942924_CCTT/C              G   606-609       202,203      7028
##   chr21:10942924_CCTT/C              G   546-549       182,183      7028
##   chr21:10942924_CCTT/C              G   246-249         82,83      7028
##      chr21:10943003_C/T              A       584           195      7029
##                     ...            ...       ...           ...       ...
##      chr21:11098723_T/C              G        15             5      8590
##      chr21:16337201_T/C              G      3313          1105     10467
##      chr21:16337201_T/C              G      3313          1105     10467
##      chr21:16337201_T/C              G      3313          1105     10467
##      chr21:16337201_T/C              G      3313          1105     10467
##                                TXID         CDSID      GENEID   CONSEQUENCE
##                         <character> <IntegerList> <character>      <factor>
##   chr21:10942924_CCTT/C       72996        212929        7179 nonsynonymous
##   chr21:10942924_CCTT/C       72997        212929        7179 nonsynonymous
##   chr21:10942924_CCTT/C       72998        212929        7179 nonsynonymous
##   chr21:10942924_CCTT/C       72999        212929        7179 nonsynonymous
##      chr21:10943003_C/T       72996        212929        7179 nonsynonymous
##                     ...         ...           ...         ...           ...
##      chr21:11098723_T/C       73004        212945         574 synonymous   
##      chr21:16337201_T/C       73028        212982        8204 nonsynonymous
##      chr21:16337201_T/C       73029        212982        8204 nonsynonymous
##      chr21:16337201_T/C       73030        212982        8204 nonsynonymous
##      chr21:16337201_T/C       73031        212982        <NA> nonsynonymous
##                               REFCODON       VARCODON         REFAA
##                         <DNAStringSet> <DNAStringSet> <AAStringSet>
##   chr21:10942924_CCTT/C         AGAAGG            AGG            RR
##   chr21:10942924_CCTT/C         AGAAGG            AGG            RR
##   chr21:10942924_CCTT/C         AGAAGG            AGG            RR
##   chr21:10942924_CCTT/C         AGAAGG            AGG            RR
##      chr21:10943003_C/T            CGA            CAA             R
##                     ...            ...            ...           ...
##      chr21:11098723_T/C            GCA            GCG             A
##      chr21:16337201_T/C            AAA            GAA             K
##      chr21:16337201_T/C            AAA            GAA             K
##      chr21:16337201_T/C            AAA            GAA             K
##      chr21:16337201_T/C            AAA            GAA             K
##                                 VARAA
##                         <AAStringSet>
##   chr21:10942924_CCTT/C             R
##   chr21:10942924_CCTT/C             R
##   chr21:10942924_CCTT/C             R
##   chr21:10942924_CCTT/C             R
##      chr21:10943003_C/T             Q
##                     ...           ...
##      chr21:11098723_T/C             A
##      chr21:16337201_T/C             E
##      chr21:16337201_T/C             E
##      chr21:16337201_T/C             E
##      chr21:16337201_T/C             E
##   -------
##   seqinfo: 25 sequences from hg19 genome
head(matA)
##                 Variant chromosome    start      end ref_allele alt_allele
## 1 chr21:10942924_CCTT/C      chr21 10942924 10942927       CCTT          C
## 2 chr21:10942924_CCTT/C      chr21 10942924 10942927       CCTT          C
## 3 chr21:10942924_CCTT/C      chr21 10942924 10942927       CCTT          C
## 4 chr21:10942924_CCTT/C      chr21 10942924 10942927       CCTT          C
## 5    chr21:10943003_C/T      chr21 10943003 10943003          C          T
## 6    chr21:10943003_C/T      chr21 10943003 10943003          C          T
##   GeneID  TxID          Type aaChange
## 1   7179 72996 nonsynonymous p.RR220R
## 2   7179 72997 nonsynonymous p.RR202R
## 3   7179 72998 nonsynonymous p.RR182R
## 4   7179 72999 nonsynonymous  p.RR82R
## 5   7179 72996 nonsynonymous  p.R195Q
## 6   7179 72997 nonsynonymous  p.R177Q
  1. How many variant types in the variants of coding regions?
taC <- table(matA$Type)
taC_dat <- as.data.frame(taC)
taC
## 
## nonsynonymous    synonymous 
##            13             5
#
ggplot(taC_dat,aes(x=Var1,y=Freq,fill=Var1))+
  geom_bar(stat='Identity')+
  labs(x="",y="Counts",fill="")+
  theme(legend.position = "none")

  1. Please annotate variants by using dbSNP
library(SNPlocs.Hsapiens.dbSNP144.GRCh37)
all_snps <- SNPlocs.Hsapiens.dbSNP144.GRCh37
#
# Extract SNPs of chr21
tar_chr <- as.vector(seqnames(rd_sub)@values)
my_snps <- snpsBySeqname(all_snps,gsub("chr","",tar_chr))
#
seqlevels(my_snps)
##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13" "14" "15"
## [16] "16" "17" "18" "19" "20" "21" "22" "X"  "Y"  "MT"
seqlengths(my_snps)
##         1         2         3         4         5         6         7         8 
## 249250621 243199373 198022430 191154276 180915260 171115067 159138663 146364022 
##         9        10        11        12        13        14        15        16 
## 141213431 135534747 135006516 133851895 115169878 107349540 102531392  90354753 
##        17        18        19        20        21        22         X         Y 
##  81195210  78077248  59128983  63025520  48129895  51304566 155270560  59373566 
##        MT 
##     16569
seqlevelsStyle(my_snps)
## [1] "NCBI"
genome(my_snps)
##            1            2            3            4            5            6 
## "GRCh37.p13" "GRCh37.p13" "GRCh37.p13" "GRCh37.p13" "GRCh37.p13" "GRCh37.p13" 
##            7            8            9           10           11           12 
## "GRCh37.p13" "GRCh37.p13" "GRCh37.p13" "GRCh37.p13" "GRCh37.p13" "GRCh37.p13" 
##           13           14           15           16           17           18 
## "GRCh37.p13" "GRCh37.p13" "GRCh37.p13" "GRCh37.p13" "GRCh37.p13" "GRCh37.p13" 
##           19           20           21           22            X            Y 
## "GRCh37.p13" "GRCh37.p13" "GRCh37.p13" "GRCh37.p13" "GRCh37.p13" "GRCh37.p13" 
##           MT 
## "GRCh37.p13"
#
seqlevels(my_snps) <- paste0("chr",seqlevels(my_snps))
seqlevels(my_snps) <- gsub("MT","M",seqlevels(my_snps))
# change seqlevelsStyle
seqlevelsStyle(my_snps) <- "UCSC"
# change genome
genome(my_snps) <- "hg19"
#
snp_ID <- data.frame(
  posIDX=paste0(seqnames(my_snps),":",pos(my_snps)),
  rsID=my_snps$RefSNP_id,stringsAsFactors = FALSE)
#
matV1 <- data.frame(Variant=names(rd_sub),stringsAsFactors = FALSE)
matV1$chromosome <- gsub("(.*):(.*)_(.*)/(.*)","\\1",matV1$Variant)
matV1$start <- gsub("(.*):(.*)_(.*)/(.*)","\\2",matV1$Variant)
matV1$end <- gsub("(.*):(.*)_(.*)/(.*)","\\2",matV1$Variant)
matV1$ref_allele <- gsub("(.*):(.*)_(.*)/(.*)","\\3",matV1$Variant)
matV1$alt_allele <- gsub("(.*):(.*)_(.*)/(.*)","\\4",matV1$Variant)
matV1$posIDX <- gsub("(.*)_(.*)","\\1",matV1$Variant)
#
matV1$rsID <- snp_ID$rsID[match(matV1$posIDX,snp_ID$posIDX)]
  1. How many variants are in dbSNP?
taC2 <- table(!is.na(matV1$rsID))
taC2_dat <- as.data.frame(taC2)
taC2
## 
## FALSE  TRUE 
##  1906  8724
#
ggplot(taC2_dat,aes(x=Var1,y=Freq,fill=Var1))+
  geom_bar(stat='Identity')+
  labs(x="",y="Counts",fill="in_dbSNP")+
  theme(legend.position = "none")