In this exercise, we will practice how to manipulate VCF files. Please find this VCF file “data/SAMN01882168_filt.vcf.gz” and use it to answer the following questions.
library(VariantAnnotation)
vcf_file <- "data/SAMN01882168_filt.vcf.gz"
vcf <- readVcf(vcf_file,"hg19")
rd <- rowRanges(vcf)
## DataFrame with 10 rows and 3 columns
## Number Type Description
## <character> <character> <character>
## GT 1 String Genotype
## AD R Integer Allelic depths for t..
## DP 1 Integer Approximate read dep..
## GQ 1 Integer Genotype Quality
## MIN_DP 1 Integer Minimum DP observed ..
## PGT 1 String Physical phasing hap..
## PID 1 String Physical phasing ID ..
## PL G Integer Normalized, Phred-sc..
## RGQ 1 Integer Unconditional refere..
## SB 4 Integer Per-sample component..
contig_leng <- seqlengths(rd)
tbl <- table(seqnames(rd))
variant_count <- as.vector(tbl)
names(variant_count) <- names(tbl)
var_leng_dat <- data.frame(chr=names(variant_count),
stringsAsFactors = FALSE)
var_leng_dat$chrLeng <- contig_leng[match(var_leng_dat$chr,names(contig_leng))]
var_leng_dat$varCount <- variant_count[match(var_leng_dat$chr,names(variant_count))]
var_leng_dat$incidence <- (var_leng_dat$varCount / var_leng_dat$chrLeng) * 1000
ggplot(var_leng_dat,aes(x=reorder(chr,-incidence),y=incidence,label=chr))+
geom_point()+geom_text(size=2,hjust=0.5, vjust=-1)+
scale_y_continuous(trans='log10')+
labs(x="",y="Log10_Incidence")+
theme(axis.text.x=element_blank())
tbl <- table(geno(vcf_sub)$GT)
tbl_dat <- as.data.frame(tbl)
ggplot(tbl_dat,aes(x=Var1,y=Freq,fill=Var1))+
geom_bar(stat='Identity')+
labs(x="",y="Counts",fill="")+theme_classic()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.00 13.00 19.00 25.05 30.00 629.00
#
ggplot(as.data.frame(matDP),aes(x=SAMN01882168))+geom_histogram()+
labs(x="",y="Counts")+
scale_x_log10()+
theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 99.00 99.00 91.32 99.00 99.00
#
ggplot(as.data.frame(matGQ),aes(x=SAMN01882168))+geom_histogram()+
labs(x="",y="Counts")+
scale_x_log10()+
theme_classic()
## Warning: Transformation introduced infinite values in continuous x-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 12 rows containing non-finite values (stat_bin).
# Genotype 1/2
var_2 <- rownames(geno(vcf_sub)$GT)[geno(vcf_sub)$GT=="1/2"]
varTab2 <- data.frame(variant=names(rd_sub)[names(rd_sub) %in% var_2],
chr=as.vector(seqnames(rd_sub)[names(rd_sub) %in% var_2]),
start=start(rd_sub)[names(rd_sub) %in% var_2],
end=end(rd_sub)[names(rd_sub) %in% var_2],
refBase=unlist(lapply(lapply(
alt(vcf_sub)[rownames(vcf_sub) %in% var_2],`[[`,1),as.character)),
altBase=unlist(lapply(lapply(
alt(vcf_sub)[rownames(vcf_sub) %in% var_2],`[[`,2),as.character)),
refCount=unlist(lapply(
geno(vcf_sub)$AD[rownames(geno(vcf_sub)$AD) %in% var_2],`[[`,2)),
altCount=unlist(
lapply(geno(vcf_sub)$AD[rownames(geno(vcf_sub)$AD) %in% var_2],`[[`,3)),
genoType=geno(vcf_sub)$GT[rownames(geno(vcf_sub)$GT) %in% var_2],
gtQuality=geno(vcf_sub)$GQ[rownames(geno(vcf_sub)$GQ) %in% var_2],
stringsAsFactors = FALSE)
#
# Genotype: 0/1, 1/1
varTab1 <- data.frame(variant=names(rd_sub)[!names(rd_sub) %in% var_2],
chr=as.vector(seqnames(rd_sub)[!names(rd_sub) %in% var_2]),
start=start(rd_sub)[!names(rd_sub) %in% var_2],
end=end(rd_sub)[!names(rd_sub) %in% var_2],
refBase=as.character(ref(vcf_sub)[!rownames(vcf_sub) %in% var_2]),
altBase=unlist(lapply(lapply(
alt(vcf_sub)[!rownames(vcf_sub) %in% var_2],`[[`,1),as.character)),
refCount=unlist(lapply(
geno(vcf_sub)$AD[!rownames(geno(vcf_sub)$AD) %in% var_2],`[[`,1)),
altCount=unlist(lapply(
geno(vcf_sub)$AD[!rownames(geno(vcf_sub)$AD) %in% var_2],`[[`,2)),
genoType=geno(vcf_sub)$GT[!rownames(geno(vcf_sub)$GT) %in% var_2],
gtQuality=geno(vcf_sub)$GQ[!rownames(geno(vcf_sub)$GQ) %in% var_2],
stringsAsFactors = FALSE)
#
# merge into table
varTab <- rbind(varTab1,varTab2)
varTab[1:2,]
## variant chr start end refBase altBase refCount altCount
## 1 chr21:9412358_TA/T chr21 9412358 9412359 TA T 6 4
## 2 chr21:9413584_G/A chr21 9413584 9413584 G A 3 8
## genoType gtQuality
## 1 0/1 81
## 2 0/1 62
for(k in 1:length(varTab$variant)){
if(width(varTab$refBase[k]) < width(varTab$altBase[k])){
varTab$mutType[k] <- "INS"
}else if(width(varTab$refBase[k]) > width(varTab$altBase[k])){
varTab$mutType[k] <- "DEL"
}else if(width(varTab$refBase[k])==1&width(varTab$altBase[k])==1){
varTab$mutType[k] <- "SNP"
}else{
varTab$mutType[k] <- "Others"}}
#
tbl <- table(varTab$mutType)
tbl_dat <- as.data.frame(tbl)
tbl
##
## DEL INS SNP
## 612 482 9536
#
ggplot(tbl_dat,aes(x=Var1,y=Freq,fill=Var1))+
geom_bar(stat = 'identity')+
labs(x="",y="Mutations",fill="")+
theme_classic()
ti <- c("A>G","G>A","C>T","T>C")
# Transveersion (Tv)
tv <- c("A>T","A>C","G>T","G>C","C>A","C>G","T>A","T>G")
varTab$nuSub <- paste0(varTab$refBase,">",varTab$altBase)
varTab$TiTv[varTab$nuSub %in% ti] <- "Ti"
varTab$TiTv[varTab$nuSub %in% tv] <- "Tv"
varTab[1:2,]
## variant chr start end refBase altBase refCount altCount
## 1 chr21:9412358_TA/T chr21 9412358 9412359 TA T 6 4
## 2 chr21:9413584_G/A chr21 9413584 9413584 G A 3 8
## genoType gtQuality mutType nuSub TiTv
## 1 0/1 81 DEL TA>T <NA>
## 2 0/1 62 SNP G>A Ti
#
varX <- varTab[varTab$mutType=="SNP",]
tbl <- table(varX$nuSub)
tbl_dat <- as.data.frame(tbl)
ggplot(tbl_dat,aes(x=Var1,y=Freq,fill=Var1))+
geom_bar(stat = 'identity')+
labs(x="",y="Mutations",fill="")+
theme(legend.position = "none")
#
tbl <- table(varX$TiTv)
tbl_dat <- as.data.frame(tbl)
ggplot(as.data.frame(table(varX$TiTv)),aes(x=Var1,y=Freq,fill=Var1))+
geom_bar(stat = 'identity')+
labs(x="",y="Mutations",fill="")+
theme(legend.position = "none")