--- title: "Hormaphis cornu Red-Green DE analysis" output: html_notebook --- DE of salivary glands of red vs green samples All samples genotyped at multiple markers in g16073 to confirm genotype Samples from 2020 were of very low quality. Used only 2019 samples. Samples Gall_color Genotype: First set June 15, 2019 Sample_1_AKG_4.txt Green G/G Sample_2_DSG_6.txt Green Sample_3_DSG_7.txt Green G/G Sample_4_DSG_8.txt Green G/G Sample_5_DSG_9.txt Green Sample_6_DSG_10.txt Green Sample_7_DSG_11.txt Green Sample_8_DSG_13.txt Green Sample_9_DSR_8.txt Red R/R Sample_10_DSR_9.txt Red R/G Sample_11_DSR_11.txt Red Sample_12_DSR_12.txt Red R/G Sample_13_DSR_13.txt Red R/G Sample_14_DSR_15.txt Red R/G Sample_15_DSR_16.txt Red R/G Sample_16_DSR_17.txt Red R/G Sample_17_DSR_18.txt Red R/G Sample_7_AKR1.txt Red Sample_8_AKR2.txt Red Sample_9_AKR3.txt Red Sample_10_AKR4.txt Red Sample_11_DSR5.txt Red Sample_12_DSR6.txt Red Sample_13_DSR7.txt Red Sample_14_DSR.txt Red Trim reads #trim paired reads for i in *_1.fastq.gz; do numbertwo=`basename ${i} _1.fastq.gz`_2.fastq.gz; \ bsub -n 8 -o trim_reads \ "cutadapt -j 8 --trim-n \ -a revcompTruSeq_Univ_Adap=AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT \ -A revcompTruSeq_Univ_Adap=AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT \ -a TruSeq_Idx_Adap=GATCGGAAGAGCACACGTCTGAACTCCAGTCACNNNNNNATCTCGTATGCCGTCTTCTGCTTG \ -A TruSeq_Idx_Adap=GATCGGAAGAGCACACGTCTGAACTCCAGTCACNNNNNNATCTCGTATGCCGTCTTCTGCTTG \ -m 36 \ -q 10,10 \ --nextseq-trim=10 \ -o catrim.${i} \ -p catrim.${numbertwo} \ ${i} ${numbertwo}" \ ; done *DONE Map reads with hisat2 for gwas study bsub -n 16 -o hisat2_idx "hisat2-build \ /groups/stern/home/sternd/hormaphis/hormaphis_cornu_26Sep2017_PoQx8.fasta \ ./Hcor_hisat \ -p 16" *DONE home=/groups/stern/home/sternd #within PE folder for i in catrim*1.fastq.gz; do root_name=`basename ${i} _1.fastq.gz`; \ second_name=${root_name}_2.fastq.gz; \ bsub -n 4 -o hisat2_aln${root_name} "hisat2 \ -x ${home}/hormaphis/Hcor_hisat \ -p 4 \ -1 \ ${i} \ -2 \ ${second_name} \ | ${home}/bin/samtools sort -O BAM -@ 4 - > ${root_name}.bam" \ ;done *DONE #grab just g16072 and g16073 for i in *sorted; do name=`basename ${i} .bam`; \ rootname=`basename ${i} .sorted` ; \ bsub -o select_${i} "samtools view -b -o ${rootname}.g16072-3.bam ${i} 'ScPoQx8_4268;HRSCAF=5273:40449000-40511000'"; \ done #in home directory for i in *bam; do samtools index ${i}; done for i in *sorted; do name=`basename ${i} .sorted`; \ bsub -o rmdup_${i} "samtools rmdup ${i} ${name}.rmdup.bam"; \ done *RUNNING Count reads per gene with htseq-count for i in catrim*sorted; do mv ${i} ${i}.bam; done for i in *bam; do \ bsub -o count.${i}.txt \ "htseq-count -i "gene_id" --stranded=no -f bam -t exon \ ${i} /groups/stern/home/sternd/hormaphis/Augustus.updated_w_annots.21Aug20.exons.gff3 > ${i}.counts"; \ done #Pool replicate counts from RNAseq_4vi20 and RNAseq_16vi20 for i in RNAseq_4vi20/*counts; do filename=`basename $i sorted.bam.counts`; paste RNAseq_4vi20/${filename}sorted.bam.counts RNAseq_16vi20/${filename}bam.counts |awk '{print $1 "\t" $2+$4;}' > ${filename}bam.counts; done scp 'sternd@login2.int.janelia.org:~/hormaphis/red-green/RNAseq/*counts' counts scp 'sternd@login2.int.janelia.org:~/hormaphis/red-green/*Sample_20JRC*counts' counts scp 'sternd@login2.int.janelia.org:~/hormaphis/red-green/RNAseq/red2_RNAseq/earlier/*counts' counts If you have not previously loaded the required packages ```{r eval=FALSE, include=FALSE} source("Install_packages.R") ``` Load libraries when you start a new R session ```{r include=FALSE} source("~/Dropbox (HHMI)/Projects/Aphid_gall/LoadLibraries.R") ``` ```{r} #change this command to suit your actual folder setwd("/Users/sternd/Aphid_gall/Hormaphis_cornu/red_vs_green_gwas/RNAseq-SalGl-17Jun19/") ``` RNAseq newly dissected salivary gland samples of red and green Genotyped red to ensure all sequenced animals carry g16073 SNPs load files ```{r} files <- dir(path="counts/",pattern="*") y <- readDGE(files,path="counts/",columns=c(1,2),head=FALSE) y <- y[-grep("_",row.names(y$counts)),] y_start <-y ``` QC - eliminate low count genes ```{r} cpm <- cpm(y) lcpm <- cpm(y,log=TRUE) numsamples <-length(row.names(y$samples)) table(rowSums(y$counts==0) == numsamples) #Only interested in genes that are expressed #Therefore, exclude any genes that are not expressed at cpm > 1 in at least N samples #8 green 17 red samples, keep only genes expressed in at least half of one group, so 4 keep.exprs <- rowSums(cpm>1)>=8 y <- y[keep.exprs,, keep.lib.sizes=FALSE] length(keep.exprs[keep.exprs==TRUE]) ``` QC ```{r} nsamples <- ncol(y) colors <- rainbow(nsamples) par(mfrow=c(1,2)) lcpm<-cpm(y_start,log=TRUE) plot(density(lcpm[,1]), lwd=2, ylim=c(0,0.21), las=2, main="", xlab="") title(main="A. Raw data", xlab="Log-cpm") abline(v=0, lty=3) for (i in 2:nsamples){ den <- density(lcpm[,i]) lines(den$x, den$y, lwd=1,col=colors[i]) } lcpm <- cpm(y, log=TRUE) plot(density(lcpm[,1]), lwd=2, ylim=c(0,0.21), las=2, main="", xlab="") title(main="B. Filtered data", xlab="Log-cpm") abline(v=0, lty=3) for (i in 2:nsamples){ den <- density(lcpm[,i]) lines(den$x, den$y, lwd=1,col=colors[i]) } ``` QC ```{r} # The las argument rotates the axis names barplot(y$samples$lib.size,names=colnames(y),las=2,cex.names=.6) # Add a title to the plot title("Library sizes") # Get log2 counts per million logcounts <- cpm(y,log=TRUE,normalized.lib.sizes=TRUE) # Check distributions of samples using boxplots par(cex.axis=.6) boxplot(logcounts, xlab="", ylab="Log2 counts per million",names=colnames(y),las=2) # Let's add a blue horizontal line that corresponds to the median logCPM abline(h=median(logcounts),col="blue") title(main="Boxplots of logCPMs (unnormalised)",cex=.7) #normalize samples y <- calcNormFactors(y) logcounts <- cpm(y,log=TRUE,normalized.lib.sizes=TRUE) # Check distributions of samples using boxplots par(cex.axis=.5) boxplot(logcounts, xlab="", ylab="Log2 counts per million",las=2) # Let's add a blue horizontal line that corresponds to the median logCPM abline(h=median(logcounts),col="blue") title("Boxplots of logCPMs (normalised)") ``` ```{r} y$samples$group <- factor(y$samples$group,levels = c("Green","Red")) y$samples$group[grepl(pattern = "AKG",x = rownames(y$samples))] <- "Green" y$samples$group[grepl(pattern = "DSG",x = rownames(y$samples))] <- "Green" y$samples$group[grepl(pattern = "AKR",x = rownames(y$samples))] <- "Red" y$samples$group[grepl(pattern = "DSR",x = rownames(y$samples))] <- "Red" y <- calcNormFactors(y) ``` ```{r} glMDSPlot(y,groups=y$samples$group,labels=row.names(y$samples),folder = "Glimma.Hcor.red-green") ``` Outliers catrim.Sample_9_AKR3.sorted.bam catrim.Sample_7_AKR1.sorted.bam catrim.Sample_1_AKG_4.sorted.bam ```{r} to_exclude <- c( "catrim.Sample_9_AKR3.sorted.bam", "catrim.Sample_7_AKR1.sorted.bam", "catrim.Sample_1_AKG_4.sorted.bam" ) ysub <- y[,c(-which(row.names(y$samples) %in% to_exclude,))] ysub <- calcNormFactors(ysub) ``` ```{r} glMDSPlot(ysub,groups=ysub$samples$group,labels=row.names(ysub$samples),folder = "Glimma.Hcor.red-green.subset") ``` DE analysis - Compare Red vs Green ```{r} group <- factor(ysub$samples$group) design <- model.matrix(~0 + group) colnames(design) <- levels(group) ysub <- estimateDisp(ysub,design) fit <- glmQLFit(ysub, design) my.contrasts <- makeContrasts( RedGreen = Red-Green, levels=design) qlf.Red <- glmQLFTest(fit, contrast = my.contrasts) dt.Red<-decideTests(qlf.Red,p.value = 0.05) summary(dt.Red) ``` ```{r} #test for effect of gall down_genes <- row.names(dt.Red)[dt.Red == -1] pdf(file="MDplot_Red1vsGreen.pdf") par(mar = c(5.1, 5.1, 4.1, 2.1)) plotMD(qlf.Red, xlab = "Mean Expression Level (log2 CPM)", ylab = "Salivary Gland (Red - Green) Fold Difference (log2 FC)", cex.axis = 1.3, cex.lab = 1.3, main = NULL, p.value = 0.05 ) #axis(1, las = 1, ) #axis(2, las = 0, cex.axis = 2) text(qlf.Red$table["g16073",]$logCPM,qlf.Red$table["g16073",]$logFC,labels="dgc",pos=4,offset=.75,cex=1.3, adj=c(0,0), col = "blue") dev.off() summary(dt.Red) down_genes ``` g16073 is only strongly significant effect, the same gene as revealed by the GWAS hit. Volcano plot ```{r} DEredgrSG <- qlf.Red$table DEredgrSG$qValue <- p.adjust(DEredgrSG$PValue, method = "fdr") ptlabelsize = 5 pdf("VPRed1vsGr.pdf") p1 <- EnhancedVolcano(toptable = DEredgrSG, lab = NA, x = "logFC", y = "PValue", title = NULL, subtitle = NULL, caption = NULL, pCutoff = 1.14e-5, FCcutoff = FALSE, cutoffLineType = "blank", hline = c(1.14e-5, 2.28e-6),#empirically determined by replacing second value in DEredgrSG$PValue #empirically determined FDR = 0.01 is 2.099e-6 #test <- DEredgrSG$PValue[order(DEredgrSG$PValue)] #test[2] = 1.95e-5;head(p.adjust(test, method = "fdr")) -> fdr = 0.05 #test[4] = 2.28e-6;head(p.adjust(test, method = "fdr")) -> fdr = 0.01 hlineCol = "blue", hlineType = "longdash", hlineWidth = 0.5, pointSize = c(ifelse(DEredgrSG$qValue < 0.05, 3, 1)), ylim = c(0, max(-log10(DEredgrSG$PValue)+1, na.rm=TRUE)), #xlim = c(-5,5), xlab = NULL, col = c("black", "black", "royalblue", "red2") ) p1 + theme(legend.position = 'none', axis.text.x = element_text(angle = 0, size = 18, vjust= 1), axis.text.y = element_text(angle = 0, size = 18, vjust= 0.5), axis.title.x = element_text(angle = 0,size = 18,vjust = 0), axis.title.y = element_text(angle = 90,size = 20,vjust = 0), ) + labs(x = 'Fundatrix Salivary Gland (Red - Green)\nFold Change (log2 CPM)') + geom_text( data=DEredgrSG %>% filter(row.names(DEredgrSG) %in% "g16073"), aes(x = logFC, y = -log10(PValue), label = "dgc"), nudge_x = 1, #nudge_y = 0.000001, size = ptlabelsize+1, colour = "red2", fontface = "plain", label.size = 0) + annotate("text", label = c("FDR = 0.05","FDR = 0.01"), x = 7, y = c(-log10(1.14e-5),-log10(2.28e-6)) + .2, size = ptlabelsize, colour = "blue") dev.off() ``` Plot CPM for red and green samples of g16073 cluster genes Sample names in Green_samples Red_samples ```{r} g16073_bicycle_cluster<-c("g16067", "g16068", "g16070", "g16071", "g16072", "g16073", "g16075", "g16078", "g16079") counts=t(subset(cpm(ysub,normalized.lib.sizes=TRUE) ,rownames(ysub$counts) %in% g16073_bicycle_cluster)) type=as.character(ysub$samples$group) counts=cbind(counts,type) counts = as.data.frame(counts) counts_for_plot = counts %>% gather(Gene, Counts, g16067:g16079) pdf("g16073_cluster_red_green_SG_cpm.pdf", width=16) ggplot(counts_for_plot, aes(x = Gene, y = as.numeric(as.character(Counts)), fill = type)) + geom_boxplot(outlier.shape = NA) + geom_point(pch = 16, size=2, position = position_jitterdodge(jitter.width = .15), aes(group=type)) + scale_y_log10(breaks = c(0,10,100,1000), labels = c(0,10,100,1000), limits = c(1,3000), name="Counts Per Million Reads") + scale_x_discrete(labels = c("16067","16068","16070","16071","16072","dgc","16075","16078","16079")) + theme(axis.text.x=element_text(size=18, angle = 0), axis.text.y=element_text(size=22, vjust = 0.5), axis.title.x=element_text(size=20, vjust = -1), axis.title.y=element_text(size=20), legend.position = "none" ) + scale_fill_manual(values=c("green","red")) dev.off() ``` ```{r} counts=t(subset(cpm(ysub,normalized.lib.sizes=TRUE, logcounts = TRUE) ,rownames(ysub$counts) %in% g16073_bicycle_cluster)) norm_counts<- counts / rowMeans(counts) type=as.character(ysub$samples$group) norm_counts=cbind(norm_counts,type) norm_counts = as.data.frame(norm_counts) norm_counts_for_plot = norm_counts %>% gather(Gene, Counts, g16067:g16079) pdf("g16073_cluster_red_green_SG_norm_cpm.pdf", width=16) ggplot(norm_counts_for_plot, aes(x = Gene, y = as.numeric(as.character(Counts))+1, fill = type)) + geom_boxplot(outlier.shape = NA) + geom_point(pch = 16, size=2, position = position_jitterdodge(jitter.width = .15), aes(group=type)) + #scale_y_continuous(n.breaks = 6, limits = c(0,10), name="Normalized Counts Per Million Reads") + scale_y_log10(breaks = c(1,2,6), labels = c(0,1,5), limits = c(1,7), name="Normalized log CPM)") + scale_x_discrete(labels = c("16067","16068","16070","16071","16072","dgc","16075","16078","16079")) + theme(axis.text.x=element_text(size=18, angle = 0), axis.text.y=element_text(size=22, vjust = 0.5), axis.title.x=element_text(size=20, vjust = -1), axis.title.y=element_text(size=20), legend.position = "none" ) + scale_fill_manual(values=c("green","red")) dev.off() ``` Plot expression for red gall 13 and 16 across g16073 cluster scp 'sternd@login2.int.janelia.org:hormaphis/red-green/*Redgall*counts' red_gall_counts ```{r message=FALSE, include=FALSE} files <- dir(path="red_gall_counts/",pattern="*") yrg <- readDGE(files,path="red_gall_counts/",columns=c(1,2),head=FALSE) yrg <- yrg[-grep("_",row.names(yrg$counts)),] ``` ```{r} redgalls<-t(yrg$counts[g16073_bicycle_cluster,2]) redgalls_norm_counts<- redgalls / rowMeans(redgalls) #type= c("RedGall_13", "RedGall_16") type= c("RedGall_16") redgalls_norm_counts=cbind(redgalls_norm_counts,type) redgalls_norm_counts = as.data.frame(redgalls_norm_counts) redgalls_norm_counts_for_plot = redgalls_norm_counts %>% gather(Gene, Counts, g16067:g16079) pdf("redgalls_SG_norm_cpm.pdf", width=16) ggplot(redgalls_norm_counts_for_plot, aes(x = Gene, y = as.numeric(as.character(Counts))+1, col = type)) + geom_point(pch = 16, size=6, aes(group=type)) + #scale_y_continuous(n.breaks = 6, limits = c(0,10), name="Normalized Counts Per Million Reads") + scale_y_log10(breaks = c(1,2,6), labels = c(0,1,5), limits = c(1,7), name="Normalized log CPM)") + scale_x_discrete(labels = c("16067","16068","16070","16071","16072","dgc","16075","16078","16079")) + theme(axis.text.x=element_text(size=18, angle = 0), axis.text.y=element_text(size=22, vjust = 0.5), axis.title.x=element_text(size=20, vjust = -1), axis.title.y=element_text(size=20), legend.position = "none" ) + scale_color_manual(values=c("blue")) dev.off() #type= c("RedGall_13", "RedGall_16") type= c("RedGall_16") redgalls=cbind(redgalls,type) redgalls = as.data.frame(redgalls) redgalls_counts_for_plot = redgalls %>% gather(Gene, Counts, g16067:g16079) pdf("redgalls_SG_cpm.pdf", width=16) ggplot(redgalls_counts_for_plot, aes(x = Gene, y = as.numeric(as.character(Counts))+1, col = type)) + geom_point(pch = 16, size=6, position = position_jitter(width = 0.05,seed = 1235), aes(group=type)) + scale_y_log10(name="CPM)") + scale_x_discrete(labels = c("16067","16068","16070","16071","16072","dgc","16075","16078","16079")) + theme(axis.text.x=element_text(size=18, angle = 0), axis.text.y=element_text(size=22, vjust = 0.5), axis.title.x=element_text(size=20, vjust = -1), axis.title.y=element_text(size=20), legend.position = "none" ) + scale_color_manual(values=c("blue")) dev.off() ``` Sushi plots ```{r} #read in gene info gff.exon <-readGFF("/Users/sternd/Aphid_gall/Hormaphis_cornu/DE analysis/AnalysisPipeline5.18Aug20/Augustus.updated_w_annots.21Aug20.gff3.gz", filter = list(type = "exon")) gff.bed <- as_tibble(gff.exon) %>% rename(chrom = seqid, stop = end, gene = Parent) %>% dplyr::select(chrom, start, stop, gene, score, strand, type) %>% tidyr::unnest(gene) %>% mutate(strand = case_when(strand == "-" ~ -1, strand == "+" ~ +1)) %>% mutate(type = "exon") gff.bed$chrom <- recode(gff.bed$chrom,"ScPoQx8_4268;HRSCAF=5273" = "chr1", "ScPoQx8_1269;HRSCAF=1516" = "chr2", "ScPoQx8_9746;HRSCAF=11751" = "chr3", "ScPoQx8_7909;HRSCAF=9379" ="chr4", "ScPoQx8_3632;HRSCAF=4405" ="chr5", "ScPoQx8_9972;HRSCAF=12060"= "chr6", "ScPoQx8_8456;HRSCAF=10181"= "chr7", "ScPoQx8_7882;HRSCAF=9334" ="chr8", "ScPoQx8_9720;HRSCAF=11714"= "chr9") gff.bed <- as.data.frame(gff.bed) #gff.bed <- gff.bed[,c(1,2,3,7,4,5,6)] ``` Plot a single gene region Copy/paste convertstrandinfo and plotGenes functions from https://github.com/mmoisse/Sushi/tree/master/R ```{r} #' Converts strand info to 1 / -1 #' #' @param strandvector vector of strand information to convert from +/- to 1/-1 if neccesary convertstrandinfo <- function(strandvector) { if (class((strandvector)) == "character") { print ("yes") b = rep(1,length((strandvector))) b[which((strandvector) == "-")] = -1 strandvector = b } return (strandvector) } ``` ```{r} gff.chr1bed <- gff.bed[gff.bed$chrom == "chr1",] chr <- "chr1" chrstart <- 40360000 chrstop <- 40594000 #p<-layout(matrix(c(1,2),2,1,byrow=TRUE)) #layout.show(p) g16073_bicycle_cluster<-c("g16067", "g16068", "g16070", "g16071", "g16072", "g16073", "g16075", "g16078", "g16079") #select bicycle genes gff.g16073_cluster <- gff.chr1bed[grep(paste(g16073_bicycle_cluster,collapse="|"),x = gff.chr1bed$gene),] #keep only first transcript gff.g16073_cluster <- gff.g16073_cluster[grep(pattern = ".*1$", x = gff.g16073_cluster$gene),] gff.g16073_cluster$gene <- gsub(pattern = ".t.*",replacement = "",x = gff.g16073_cluster$gene) #replace g16073 with dgc gff.g16073_cluster$gene <- gsub(pattern = "g16073", replacement = "dgc",x = gff.g16073_cluster$gene) pdf(file = "g16073_cluster.pdf",width = 16) par(mar=c(15,1,0,1)) pg<-plotGenesStern(geneinfo = gff.g16073_cluster, chrom = chr, chromstart=chrstart, chromend=chrstop, types = gff.g16073_cluster$type, bheight=0.04, plotgenetype="box", col = "black", #packrow = T, #maxrows = 1, bentline=F, labeloffset=.085, fontsize=1.3, wigglefactor = .001, labeltext=T, labelat = "middle", ylim = c(.4,.6) ) labelgenome(chrom = chr,chromstart=chrstart, chromend=chrstop,scale ="Mb", side = 3,line = -7, chromcex = 2, scalecex = 2) dev.off() ```