Genomic Variant manipulation

In this exercise, we will practice how to manipulate VCF files. Please find this VCF file “data/SAMN01882168_filt.vcf.gz” and use it to answer the following questions.

  1. Read in the VCF file and make a VRange object
library(VariantAnnotation)
vcf_file <- "data/SAMN01882168_filt.vcf.gz"
vcf <- readVcf(vcf_file,"hg19")
rd <- rowRanges(vcf)
  1. Please extract the genotype field and explain the abbreviations
geno(header(vcf))
## DataFrame with 10 rows and 3 columns
##             Number        Type            Description
##        <character> <character>            <character>
## GT               1      String               Genotype
## AD               R     Integer Allelic depths for t..
## DP               1     Integer Approximate read dep..
## GQ               1     Integer       Genotype Quality
## MIN_DP           1     Integer Minimum DP observed ..
## PGT              1      String Physical phasing hap..
## PID              1      String Physical phasing ID ..
## PL               G     Integer Normalized, Phred-sc..
## RGQ              1     Integer Unconditional refere..
## SB               4     Integer Per-sample component..
  1. Please calculate the incidence of mutation occurence in each chromosome.
contig_leng <- seqlengths(rd)
tbl <- table(seqnames(rd))
variant_count <- as.vector(tbl)
names(variant_count) <- names(tbl)

var_leng_dat <- data.frame(chr=names(variant_count),
                           stringsAsFactors = FALSE)
var_leng_dat$chrLeng <- contig_leng[match(var_leng_dat$chr,names(contig_leng))]
var_leng_dat$varCount <- variant_count[match(var_leng_dat$chr,names(variant_count))]
var_leng_dat$incidence <- (var_leng_dat$varCount / var_leng_dat$chrLeng) * 1000

ggplot(var_leng_dat,aes(x=reorder(chr,-incidence),y=incidence,label=chr))+
  geom_point()+geom_text(size=2,hjust=0.5, vjust=-1)+
  scale_y_continuous(trans='log10')+
  labs(x="",y="Log10_Incidence")+
  theme(axis.text.x=element_blank())

  1. Subset to just the variants on Chr21.
rd_sub <- rd[seqnames(rd) == "chr21"]
vcf_sub <- vcf[names(rd_sub)]
  1. Extract GT information from VCF subset and make a barchart to describe the variant number in each genotype.
tbl <- table(geno(vcf_sub)$GT)
tbl_dat <- as.data.frame(tbl)

ggplot(tbl_dat,aes(x=Var1,y=Freq,fill=Var1))+
  geom_bar(stat='Identity')+
  labs(x="",y="Counts",fill="")+theme_classic()

  1. Extract DP information from VCFf subset and make a histogram.
matDP <- geno(vcf_sub)$DP
summary(as.vector(matDP))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    3.00   13.00   19.00   25.05   30.00  629.00
#
ggplot(as.data.frame(matDP),aes(x=SAMN01882168))+geom_histogram()+
  labs(x="",y="Counts")+
  scale_x_log10()+
  theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

  1. Extract GQ information from VCF subset and make histogram
matGQ <- geno(vcf_sub)$GQ
summary(as.vector(matGQ))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   99.00   99.00   91.32   99.00   99.00
#
ggplot(as.data.frame(matGQ),aes(x=SAMN01882168))+geom_histogram()+
  labs(x="",y="Counts")+
  scale_x_log10()+
  theme_classic()
## Warning: Transformation introduced infinite values in continuous x-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 12 rows containing non-finite values (stat_bin).

  1. Please make a data.frame including the following information demonstrated below.
# Genotype 1/2
var_2 <- rownames(geno(vcf_sub)$GT)[geno(vcf_sub)$GT=="1/2"]
varTab2 <- data.frame(variant=names(rd_sub)[names(rd_sub) %in% var_2],
                      chr=as.vector(seqnames(rd_sub)[names(rd_sub) %in% var_2]),
                      start=start(rd_sub)[names(rd_sub) %in% var_2],
                      end=end(rd_sub)[names(rd_sub) %in% var_2],
                      refBase=unlist(lapply(lapply(
                        alt(vcf_sub)[rownames(vcf_sub) %in% var_2],`[[`,1),as.character)),
                      altBase=unlist(lapply(lapply(
                        alt(vcf_sub)[rownames(vcf_sub) %in% var_2],`[[`,2),as.character)),
                      refCount=unlist(lapply(
                        geno(vcf_sub)$AD[rownames(geno(vcf_sub)$AD) %in% var_2],`[[`,2)),
                      altCount=unlist(
                        lapply(geno(vcf_sub)$AD[rownames(geno(vcf_sub)$AD) %in% var_2],`[[`,3)),
                      genoType=geno(vcf_sub)$GT[rownames(geno(vcf_sub)$GT) %in% var_2],
                      gtQuality=geno(vcf_sub)$GQ[rownames(geno(vcf_sub)$GQ) %in% var_2],
                      stringsAsFactors = FALSE)
#
# Genotype: 0/1, 1/1
varTab1 <- data.frame(variant=names(rd_sub)[!names(rd_sub) %in% var_2],
                      chr=as.vector(seqnames(rd_sub)[!names(rd_sub) %in% var_2]),
                      start=start(rd_sub)[!names(rd_sub) %in% var_2],
                      end=end(rd_sub)[!names(rd_sub) %in% var_2],
                      refBase=as.character(ref(vcf_sub)[!rownames(vcf_sub) %in% var_2]),
                      altBase=unlist(lapply(lapply(
                        alt(vcf_sub)[!rownames(vcf_sub) %in% var_2],`[[`,1),as.character)),
                      refCount=unlist(lapply(
                        geno(vcf_sub)$AD[!rownames(geno(vcf_sub)$AD) %in% var_2],`[[`,1)),
                      altCount=unlist(lapply(
                        geno(vcf_sub)$AD[!rownames(geno(vcf_sub)$AD) %in% var_2],`[[`,2)),
                      genoType=geno(vcf_sub)$GT[!rownames(geno(vcf_sub)$GT) %in% var_2],
                      gtQuality=geno(vcf_sub)$GQ[!rownames(geno(vcf_sub)$GQ) %in% var_2],
                      stringsAsFactors = FALSE)
#
# merge into table
varTab <- rbind(varTab1,varTab2)
varTab[1:2,]
##              variant   chr   start     end refBase altBase refCount altCount
## 1 chr21:9412358_TA/T chr21 9412358 9412359      TA       T        6        4
## 2  chr21:9413584_G/A chr21 9413584 9413584       G       A        3        8
##   genoType gtQuality
## 1      0/1        81
## 2      0/1        62
  1. Please differentiate variants by types (SNP/INS/DEL/Others) and count variants by each type
for(k in 1:length(varTab$variant)){
  if(width(varTab$refBase[k]) < width(varTab$altBase[k])){
    varTab$mutType[k] <- "INS"
  }else if(width(varTab$refBase[k]) > width(varTab$altBase[k])){
    varTab$mutType[k] <- "DEL"
  }else if(width(varTab$refBase[k])==1&width(varTab$altBase[k])==1){
    varTab$mutType[k] <- "SNP"
  }else{
    varTab$mutType[k] <- "Others"}}
#
tbl <- table(varTab$mutType)
tbl_dat <- as.data.frame(tbl)
tbl
## 
##  DEL  INS  SNP 
##  612  482 9536
#
ggplot(tbl_dat,aes(x=Var1,y=Freq,fill=Var1))+
  geom_bar(stat = 'identity')+
  labs(x="",y="Mutations",fill="")+
  theme_classic()

  1. Evaluate nucleotide substitutions
ti <- c("A>G","G>A","C>T","T>C")
# Transveersion (Tv)
tv <- c("A>T","A>C","G>T","G>C","C>A","C>G","T>A","T>G")
varTab$nuSub <- paste0(varTab$refBase,">",varTab$altBase)
varTab$TiTv[varTab$nuSub %in% ti] <- "Ti"
varTab$TiTv[varTab$nuSub %in% tv] <- "Tv"
varTab[1:2,]
##              variant   chr   start     end refBase altBase refCount altCount
## 1 chr21:9412358_TA/T chr21 9412358 9412359      TA       T        6        4
## 2  chr21:9413584_G/A chr21 9413584 9413584       G       A        3        8
##   genoType gtQuality mutType nuSub TiTv
## 1      0/1        81     DEL  TA>T <NA>
## 2      0/1        62     SNP   G>A   Ti
#
varX <- varTab[varTab$mutType=="SNP",]
tbl <- table(varX$nuSub)
tbl_dat <- as.data.frame(tbl)
ggplot(tbl_dat,aes(x=Var1,y=Freq,fill=Var1))+
  geom_bar(stat = 'identity')+
  labs(x="",y="Mutations",fill="")+
  theme(legend.position = "none")

#
tbl <- table(varX$TiTv)
tbl_dat <- as.data.frame(tbl)
ggplot(as.data.frame(table(varX$TiTv)),aes(x=Var1,y=Freq,fill=Var1))+
  geom_bar(stat = 'identity')+
  labs(x="",y="Mutations",fill="")+
  theme(legend.position = "none")