params <- list(isSlides = "no") ## ----setup, include=FALSE----------------------------------------------------- knitr::opts_chunk$set(echo = TRUE) AsSlides <- TRUE library(DESeq2) library(GenomicAlignments) library(Rsubread) ## ---- results='asis',include=TRUE,echo=FALSE---------------------------------- if(params$isSlides != "yes"){ cat("# Crispr Screenining analysis in R --- " ) } ## ----downloadFile,message=FALSE----------------------------------------------- download.file("https://github.com/LewisLabUCSD/PinAPL-Py/archive/master.zip","pieappleData.zip") unzip("pieappleData.zip") download.file("http://pinapl-py.ucsd.edu/example-data","TestData.zip") unzip("TestData.zip") download.file("http://pinapl-py.ucsd.edu/run/download/example-run","Results.zip") unzip("Results.zip") ## ----shortreada,include=FALSE------------------------------------------------- library(ShortRead) ## ----shortread, warning=F, message=F------------------------------------------ library(ShortRead) ## ----crsiprRep1Reads,echo=T,eval=T-------------------------------------------- fqSample <- FastqSampler("PinAPL-Py-master/Data/Tox-A_R01_98_S2_L008_R1_001_x.fastq.gz",n=10^6) fastq <- yield(fqSample) ## ----mycRep1ReadsShortReadQ--------------------------------------------------- fastq ## ----mycRep1ReadsAccessor----------------------------------------------------- readSequences <- sread(fastq) readQuality <- quality(fastq) readIDs <- id(fastq) readSequences ## ----mycRep1ReadsQScores------------------------------------------------------ readQuality <- quality(fastq) readQualities <- alphabetScore(readQuality) readQualities[1:10] ## ----mycRep1ReadsQScoresPlot,fig.height=3,fig.width=8------------------------- library(ggplot2) toPlot <- data.frame(ReadQ=readQualities) ggplot(toPlot,aes(x=ReadQ))+geom_histogram()+theme_minimal() ## ----mycRep1ReadsAlpFreq------------------------------------------------------ readSequences <- sread(fastq) readSequences_AlpFreq <- alphabetFrequency(readSequences) readSequences_AlpFreq[1:3,] ## ----mycRep1ReadsAlpFreqSum--------------------------------------------------- summed__AlpFreq <- colSums(readSequences_AlpFreq) summed__AlpFreq[c("A","C","G","T","N")] ## ----mycRep1ReadsAlpByCycle--------------------------------------------------- readSequences_AlpbyCycle <- alphabetByCycle(readSequences) readSequences_AlpbyCycle[1:4,1:10] ## ----mycRep1ReadsAlpByCyclePlot----------------------------------------------- AFreq <- readSequences_AlpbyCycle["A",] CFreq <- readSequences_AlpbyCycle["C",] GFreq <- readSequences_AlpbyCycle["G",] TFreq <- readSequences_AlpbyCycle["T",] toPlot <- data.frame(Count=c(AFreq,CFreq,GFreq,TFreq), Cycle=rep(1:max(width(readSequences)),4), Base=rep(c("A","C","G","T"),each=max(width(readSequences)))) ## ----mycRep1ReadsAlpByCyclePlot2,fig.height=4,fig.width=8--------------------- ggplot(toPlot,aes(y=Count,x=Cycle,colour=Base))+geom_line()+ theme_bw() ## ----mycRep1ReadsQByCycle----------------------------------------------------- qualAsMatrix <- as(readQuality,"matrix") qualAsMatrix[1:2,] ## ----mycRep1ReadsQByCyclePlotfig.width=8,fig.height=4------------------------- toPlot <- colMeans(qualAsMatrix) plot(toPlot) ## ---- results='asis',include=TRUE,echo=FALSE---------------------------------- if(params$isSlides == "yes"){ cat("class: inverse, center, middle # Aligning data

--- " ) }else{ cat("# Aligning data --- " ) } ## ----fa2---------------------------------------------------------------------- GeCKO <- read.delim("PinAPL-py_demo_data/GeCKOv21_Human.tsv") GeCKO[1:2,] ## ----fa3---------------------------------------------------------------------- require(Biostrings) sgRNAs <- DNAStringSet(GeCKO$seq) names(sgRNAs) <- GeCKO$UID ## ----fa4---------------------------------------------------------------------- writeXStringSet(sgRNAs, file="GeCKO.fa") ## ---- echo=TRUE,eval=TRUE----------------------------------------------------- library(Rsubread) buildindex("GeCKO","GeCKO.fa", indexSplit=FALSE) ## ---- echo=TRUE, eval=TRUE---------------------------------------------------- myFQs <- "PinAPL-py_demo_data/Control_R1_S14_L008_R1_001_x.fastq.gz" myMapped <- align("GeCKO",myFQs,output_file = gsub(".fastq.gz",".bam",myFQs), nthreads=4,unique=TRUE,nBestLocations=1,type = "DNA") ## ---- echo=TRUE,eval=TRUE----------------------------------------------------- myMapped ## ---- echo=TRUE, eval=TRUE---------------------------------------------------- myFQs <- "PinAPL-py_demo_data/Control_R1_S14_L008_R1_001_x.fastq.gz" myMapped <- align("GeCKO",myFQs,output_file = gsub(".fastq.gz",".bam",myFQs), nthreads=4,unique=TRUE,nBestLocations=1,type = "DNA",TH1 = 1) ## ---- echo=TRUE,eval=TRUE----------------------------------------------------- myMapped ## ---- echo=TRUE, eval=TRUE---------------------------------------------------- myFQs <- "PinAPL-py_demo_data/Control_R1_S14_L008_R1_001_x.fastq.gz" myMapped <- align("GeCKO",myFQs,output_file = gsub(".fastq.gz",".bam",myFQs), nthreads=4,unique=TRUE,nBestLocations=1,type = "DNA",TH1 = 1, maxMismatches = 0,indels = 0) ## ---- echo=TRUE,eval=TRUE----------------------------------------------------- myMapped ## ---- echo=TRUE,eval=TRUE----------------------------------------------------- require(GenomicAlignments) temp <- readGAlignments(gsub(".fastq.gz",".bam",myFQs)) temp ## ---- echo=TRUE,eval=TRUE----------------------------------------------------- temp ## ---- echo=TRUE,eval=TRUE----------------------------------------------------- cigars <- cigar(temp) cigars[1:5] ## ---- echo=TRUE,eval=TRUE----------------------------------------------------- cigarRLE <- cigarToRleList(cigars) cigarRLE[1] ## ---- echo=TRUE,eval=TRUE----------------------------------------------------- cigarMat <- matrix(as.vector(unlist(cigarRLE)),ncol=50,byrow = TRUE) cigarMat[1:2,] ## ---- echo=TRUE,eval=TRUE----------------------------------------------------- cigarFreq <- apply(cigarMat,2,table) cigarFreq[1:2,] ## ---- echo=TRUE,eval=TRUE,fig.height=3,fig.width=8---------------------------- require(ggplot2) toPlot <- data.frame(Freq=c(cigarFreq["S",],cigarFreq["M",]), Cigar=rep(c("S","M"),each=ncol(cigarFreq)), Cycle=rep(seq(1,ncol(cigarFreq)),2)) ggplot(toPlot,aes(x=Cycle,y=Freq,colour=Cigar))+geom_line()+theme_bw() ## ---- echo=TRUE,eval=TRUE----------------------------------------------------- counts <- data.frame(table(seqnames(temp)),row.names = "Var1") counts[1:2,,drop=FALSE] ## ---- echo=TRUE,eval=TRUE,fig.height=3,fig.width=8---------------------------- ss <- read.delim("New_Example_1580342908_4/Analysis/01_Alignment_Results/ReadCounts_per_sgRNA/Control_1_GuideCounts.txt") new <- merge(ss,counts,by.x=1,by.y=0) plot(new$X0,new$Freq) ## ---- echo=TRUE,eval=TRUE,fig.height=3,fig.width=8---------------------------- counts <- data.frame(table(seqnames(temp[width(temp) == 20])),row.names = "Var1") new <- merge(ss,counts,by.x=1,by.y=0) plot(new$X0,new$Freq) ## ---- echo=TRUE--------------------------------------------------------------- load("data/sgRNACounts.RData") sgRNAcounts[1:4,] ## ---- echo=TRUE--------------------------------------------------------------- library(DESeq2) metadata <- DataFrame(Group=factor(c("Control","Control","ToxB","ToxB"), levels = c("Control","ToxB")), row.names = colnames(sgRNAcounts)) dds <- DESeqDataSetFromMatrix(sgRNAcounts,colData = metadata,design = ~Group) ## ---- echo=TRUE--------------------------------------------------------------- dds <- DESeq(dds) ## ---- echo=TRUE,fig.height=3,fig.width=8-------------------------------------- normCounts <- counts(dds,normalized=TRUE) boxplot(log2(normCounts+0.1)) ## ---- echo=TRUE--------------------------------------------------------------- ToxBvsControl <- results(dds,contrast=c("Group","ToxB","Control")) ToxBvsControl <- ToxBvsControl[order(ToxBvsControl$pvalue),] ToxBvsControl ## ---- echo=TRUE,fig.height=3,fig.width=8-------------------------------------- ss <- read.delim("New_Example_1580342908_4/Analysis/02_sgRNA-Ranking_Results/sgRNA_Rankings/ToxB_avg_0.01_Sidak_sgRNAList.txt") toPlt <- merge(ss,as.data.frame(ToxBvsControl),by.x=1,by.y=0) corr <- cor(log2(toPlt[!is.na(toPlt$padj),]$fold.change),toPlt[!is.na(toPlt$padj),]$log2FoldChange) plot(log2(toPlt[!is.na(toPlt$padj),]$fold.change),toPlt[!is.na(toPlt$padj),]$log2FoldChange,main=corr) ## ----------------------------------------------------------------------------- ToxBvsControl <- as.data.frame(ToxBvsControl)[order(ToxBvsControl$pvalue),] ToxBvsControl$Enriched <- !is.na(ToxBvsControl$padj) & ToxBvsControl$pvalue < 0.05 &ToxBvsControl$log2FoldChange > 0 ToxBvsControl[1:2,] ## ----------------------------------------------------------------------------- ToxBvsControl <- merge(GeCKO,ToxBvsControl,by.x=2,by.y=0) ToxBvsControl[1:2,] ## ----eval=FALSE--------------------------------------------------------------- ## genes <- unique(ToxBvsControl$gene_id) ## listofGene <- list() ## for(i in 1:length(genes)){ ## tempRes <- ToxBvsControl[ToxBvsControl$gene_id %in% genes[i],] ## meanLogFC <- mean(tempRes$log2FoldChange,na.rm=TRUE) ## logFCs <- paste0(tempRes$log2FoldChange,collapse=";") ## minPvalue <- min(tempRes$pvalue,na.rm=TRUE) ## pvalues <- paste0(tempRes$pvalue,collapse=";") ## nEnriched <- sum(tempRes$Enriched,na.rm=TRUE) ## listofGene[[i]] <- data.frame(Gene=genes[i],meanLogFC,logFCs,minPvalue,pvalues,nEnriched) ## } ## geneTable <- do.call(rbind,listofGene) ## geneTable <- geneTable[order(geneTable$nEnriched,decreasing=TRUE),] ## ----include=FALSE------------------------------------------------------------ load("data/geneTable.RData") ## ----------------------------------------------------------------------------- geneTable[1:3,] ## ---- echo=TRUE,eval=FALSE,include=FALSE-------------------------------------- ## myFQs <- c("/Users/thomascarroll/Downloads/PinAPL-py_demo_data/Control_R1_S14_L008_R1_001_x.fastq.gz", "/Users/thomascarroll/Downloads/PinAPL-py_demo_data/Control_R2_S15_L008_R1_001_x.fastq.gz", "/Users/thomascarroll/Downloads/PinAPL-py_demo_data/ToxA_R1_98_S2_L008_R1_001_x.fastq.gz", "/Users/thomascarroll/Downloads/PinAPL-py_demo_data/ToxA_R2_S2_L005_R1_001_x.fastq.gz", "/Users/thomascarroll/Downloads/PinAPL-py_demo_data/ToxB_R1_98_S4_L008_R1_001_x.fastq.gz", "/Users/thomascarroll/Downloads/PinAPL-py_demo_data/ToxB_R2_S4_L005_R1_001_x.fastq.gz") ## require(GenomicAlignments) ## counts <- list() ## counts2 <- list() ## stats <- list() ## ## for(f in 1:length(myFQs)){ ## stats[[f]] <- align("GeCKO",myFQs[f],output_file = gsub(".fastq.gz",".bam",myFQs[f]), ## nthreads=2,unique=TRUE,nBestLocations=1,type = "DNA",TH1 = 1,maxMismatches = 0,indels = 0) ## temp <- readGAlignments(gsub(".fastq.gz",".bam",myFQs[f])) ## counts[[f]] <- data.frame(table(seqnames(temp[width(temp) == "20"])),row.names = "Var1") ## counts2[[f]] <- data.frame(table(seqnames(temp)),row.names = "Var1") ## } ## myRes <- do.call(cbind,counts) ##