--- title: "DE_analysis_genes_annot_kallisto_3Nov19" output: html_notebook --- Previously had predicted UTRs and removed internal UTRs in AnalysisPipeline4. "Hcor-3-best-score_Aug_sub_cleanUTRs.gff" Starting with these predictions, update with new annotations. Manually annotated all biCYCle proteins identified in previous analyses Downloaded Annotations.gff3.gz Renamed mv Annotations.gff3.gz Annotations.21Aug20.gff3.gz gunzip Annotations.21Aug20.gff3.gz python fix_chromosome_names.py Annotations.21Aug20.gff3 Annotations.21Aug20.fixed_chrom_names.gff3 python fix_gff_ID+Name.py Annotations.21Aug20.fixed_chrom_names.gff3 Annotations.21Aug20.fixed_chrom_names.ID_fixed.gff3 python replace_BRAKERgff_w_Annotsgff.py Augustus_subtracted.gff Annotations.21Aug20.fixed_chrom_names.ID_fixed.gff3 Augustus.updated_w_annots.21Aug20.gff3 Now have correct gff. Augustus.updated_w_annots.21Aug20.gff3 Make transcripts and pep files gffread Augustus.updated_w_annots.21Aug20.gff3 -g /Users/sternd/Dropbox\ \(HHMI\)/Projects/Aphid_gall/Hormaphis_cornu/genomes/Dovetail_Genomics/Results/HiC_HiCRise_PoQx8/hormaphis_cornu_26Sep2017_PoQx8/hormaphis_cornu_26Sep2017_PoQx8.fasta \ -w Augustus.updated_w_annots.21Aug20.transcripts.fa gffread Augustus.updated_w_annots.21Aug20.gff3 -g /Users/sternd/Dropbox\ \(HHMI\)/Projects/Aphid_gall/Hormaphis_cornu/genomes/Dovetail_Genomics/Results/HiC_HiCRise_PoQx8/hormaphis_cornu_26Sep2017_PoQx8/hormaphis_cornu_26Sep2017_PoQx8.fasta \ -y Augustus.updated_w_annots.21Aug20.pep.fa #remove CDS info from transcripts files sed -i.bak 's/CDS=.*//' Augustus.updated_w_annots.21Aug20.transcripts.fa gzip Augustus.updated_w_annots.21Aug20.gff3 scp Augustus.updated_w_annots.21Aug20.gff3.gz sternd@login2.int.janelia.org:hormaphis **************** qual trim reads for i in *1.fastq.gz; do second=`basename ${i} 1.fastq.gz`2.fastq.gz; \ prefix=`basename ${i} _1.fastq.gz`; \ bsub -n 4 -o qualtrim.`basename ${i} 1.fastq.gz` \ "cutadapt --trim-n -j 4 -n 2 \ -a GATCGGAAGAGCACACGTCTGAACTCCAGTCACNNNNNNATCTCGTATGCCGTCTTCTGCTTG \ -A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT \ -m 36 \ -q 10 \ -o catrim.m36.${prefix}_1.fq.gz \ -p catrim.m36.${prefix}_2.fq.gz \ ${i} ${second}" \ ; done *DONE for sexuals - 3' tags only for i in *fastq.gz; do \ prefix=`basename ${i} _08.pooled.fastq.gz`; \ bsub -n 4 -o qualtrim.`basename ${i} _08.pooled.fastq.gz` \ "cutadapt --trim-n -j 4 -n 2 \ -a AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT \ -m 36 \ -q 10 \ -o catrim.m36.${prefix}.pooled.fq.gz \ ${i}" \ ; done *DONE ************************** Run hisat2 and htseq to quantify reads hisat2-build hormaphis_cornu_26Sep2017_PoQx8/hormaphis_cornu_26Sep2017_PoQx8.fasta Hcor_hisat #Pool samples known to be technical reps. #Two sets of fundatrix samples #paired end for all samples ~/hormaphis/fq/fundatrix/demux_2019 #single end for original run including 1st and 2nd instars. Add to hisat2 mapping ~/hormaphis/fq/fundatrix/demux_2019/first_run_first_samples/ for i in catrim*1.fq.gz; do root_name=`basename ${i} _1.fq.gz`; \ second_name=${root_name}_2.fq.gz; \ bsub -n 4 -o hisat2_aln${root_name} "hisat2 \ -x ${home}/hormaphis/Hcor_hisat \ -p 4 \ -1 \ ${i} \ -2 \ ${second_name} \ | ${home}/bin/samtools sort -O BAM -@ 4 - > ${root_name}.bam" \ ;done *DONE counts *DONE ~/hormaphis/fq/fundatrix/demux_2019/first_run_first_samples Need to align and count PE and SE reads separately and then pool sums home=/groups/stern/home/sternd #within PE folder for i in catrim*1.fq.gz; do root_name=`basename ${i} _1.fq.gz`; \ second_name=${root_name}_2.fq.gz; \ bsub -n 4 -o hisat2_aln${root_name} "hisat2 \ -x ${home}/hormaphis/Hcor_hisat \ -p 4 \ -1 \ ${i} \ -2 \ ${second_name} \ | ${home}/bin/samtools sort -O BAM -@ 4 - > ${root_name}.bam" \ ;done *DONE counts * cd ~/hormaphis/fq/fundatrix/demux_2019/first_run_first_samples #within SE folder for i in catrim*.fq.gz; do root_name=`basename ${i} .fq.gz`; \ bsub -n 4 -o hisat2_aln${root_name} "hisat2 \ -x ${home}/hormaphis/Hcor_hisat \ -p 4 \ -U \ ${single_end_name} \ | ${home}/bin/samtools sort -O BAM -@ 4 - > ${root_name}.bam" \ ;done *DONE counts * #combine PE and SE counts for i in PE/*counts; do filename=`basename $i`; paste PE/${filename} SE/${filename} |awk '{print $1 "\t" $2+$4;}' > ${filename}; done cd ~/hormaphis/fq/fundatrix/WholeBody for i in *1.fq.gz.PwU.qtrim.fq.gz; do root_name=`basename ${i} .1.fq.gz.PwU.qtrim.fq.gz`; \ second_name=${root_name}.2.fq.gz.PwU.qtrim.fq.gz; \ bsub -n 4 -o hisat2_aln${root_name} "hisat2 \ -x ${home}/hormaphis/Hcor_hisat \ -p 4 \ -1 \ ${i} \ -2 \ ${second_name} \ | ${home}/bin/samtools sort -O BAM -@ 4 - > ${root_name}.bam" \ ;done *DONE counts (see command below) *RUNNING #2019 2nd gen samples ~/hormaphis/fq/2nd_gen_SG_RNAseq/catrim*1.fq.gz for i in catrim*1.fq.gz; do root_name=`basename ${i} _1.fq.gz`; \ second_name=${root_name}2.fq.gz; \ bsub -n 4 -o hisat2_aln${root_name} "hisat2 \ -x ${home}/hormaphis/Hcor_hisat \ -p 4 \ -1 \ ${i} \ -2 \ ${second_name} \ | ${home}/bin/samtools sort -O BAM -@ 4 - > ${root_name}.bam" \ ;done *DONE counts (see command below) *DONE #original 2nd gen samples ~/hormaphis/fq/2nd_gen_SG_RNAseq/*1.fq.gz.PwU.qtrim.fq.gz for i in *1.fq.gz.PwU.qtrim.fq.gz; do root_name=`basename ${i} .1.fq.gz.PwU.qtrim.fq.gz`; \ second_name=${root_name}2.fq.gz.PwU.qtrim.fq.gz; \ bsub -n 4 -o hisat2_aln${root_name} "hisat2 \ -x ${home}/hormaphis/Hcor_hisat \ -p 4 \ -1 \ ${i} \ -2 \ ${second_name} \ | ${home}/bin/samtools sort -O BAM -@ 4 - > ${root_name}.bam" \ ;done *DONE counts (see command below) *DONE ~/hormaphis/fq/2nd_gen_SG_RNAseq/carcass for i in *1.fq.gz.PwU.qtrim.fq.gz; do root_name=`basename ${i} .1.fq.gz.PwU.qtrim.fq.gz`; \ second_name=${root_name}.2.fq.gz.PwU.qtrim.fq.gz; \ bsub -n 4 -o hisat2_aln${root_name} "hisat2 \ -x ${home}/hormaphis/Hcor_hisat \ -p 4 \ -1 \ ${i} \ -2 \ ${second_name} \ | ${home}/bin/samtools sort -O BAM -@ 4 - > ${root_name}.bam" \ ;done *DONE counts (see command below) #pooled repeated runs of ex_Betula samples ~/hormaphis/fq/ex_Betula_RNAseq/pooledSG/catrim*1.fq.gz cd ~/hormaphis/fq/ex_Betula_RNAseq/pooledSG for i in catrim*1.fq.gz; do root_name=`basename ${i} _1.fq.gz`; \ second_name=${root_name}_2.fq.gz; \ bsub -n 4 -o hisat2_aln${root_name} "hisat2 \ -x ${home}/hormaphis/Hcor_hisat \ -p 4 \ -1 \ ${i} \ -2 \ ${second_name} \ | ${home}/bin/samtools sort -O BAM -@ 4 - > ${root_name}.bam" \ ;done *DONE counts (see command below) #exBetula carcass ~/hormaphis/fq/ex_Betula_RNAseq/ex_Betula_carcass retrim reads *DONE for i in catrim*1.fq.gz; do root_name=`basename ${i} _1.fq.gz`; \ second_name=${root_name}_2.fq.gz; \ bsub -n 4 -o hisat2_aln${root_name} "hisat2 \ -x ${home}/hormaphis/Hcor_hisat \ -p 4 \ -1 \ ${i} \ -2 \ ${second_name} \ | ${home}/bin/samtools sort -O BAM -@ 4 - > ${root_name}.bam" \ ;done *DONE counts (see command below) *DONE #Sexuals PE wgb ~/hormaphis/fq/sexuals_RNAseq/6xi19/catrim*1.fq.gz cd ~/hormaphis/fq/sexuals_RNAseq/6xi19/ for i in catrim*1.fq.gz; do root_name=`basename ${i} _1.fq.gz`; \ second_name=${root_name}_2.fq.gz; \ bsub -n 4 -o hisat2_aln${root_name} "hisat2 \ -x ${home}/hormaphis/Hcor_hisat \ -p 4 \ -1 \ ${i} \ -2 \ ${second_name} \ | ${home}/bin/samtools sort -O BAM -@ 4 - > ${root_name}.bam" \ ;done *DONE counts (see command below) *DONE #Sexuals 3' tags ~/hormaphis/fq/sexuals_RNAseq/pooled/catrim*pooled.fq.gz for i in catrim*.fq.gz; do root_name=`basename ${i} .fq.gz`; \ bsub -n 4 -o hisat2_aln${root_name} "hisat2 \ -x ${home}/hormaphis/Hcor_hisat \ -p 4 \ -U \ ${i} \ | ${home}/bin/samtools sort -O BAM -@ 4 - > ${i}.bam" \ ;done *DONE counts (see command below) *DONE #Get gff into correct format to htseq #correct format from Cnek 141398 AUGUSTUS exon 1211 1522 . + . transcript_id=g1.t1; gene_id=g1 208668 AUGUSTUS exon 101 260 . + . transcript_id=g2.t1; gene_id=g2 208668 AUGUSTUS exon 418 2055 . - . transcript_id=g3.t1; gene_id=g3 #current format of Augustus.updated_w_annots.21Aug20.gff3.gz Scaffold_11493 AUGUSTUS gene 82 1161 1 + . g1 Scaffold_11493 AUGUSTUS transcript 82 1161 1 + . g1.t1 Scaffold_11493 AUGUSTUS start_codon 82 84 . + 0 transcript_id "g1.t1"; gene_id "g1"; Scaffold_11493 AUGUSTUS CDS 82 1161 1 + 0 transcript_id "g1.t1"; gene_id "g1"; Scaffold_11493 AUGUSTUS exon 82 1161 . + . transcript_id "g1.t1"; gene_id "g1"; Scaffold_11493 AUGUSTUS stop_codon 1159 1161 . + 0 transcript_id "g1.t1"; gene_id "g1"; zgrep exon Augustus.updated_w_annots.21Aug20.gff3.gz | sed 's|Parent|transcript_id|' > exons.gff3 ScPoQx8_4355;HRSCAF=5393 AUGUSTUS exon 1110 1540 . - . transcript_id=g1.t1 ScPoQx8_4355;HRSCAF=5393 AUGUSTUS exon 15395 15430 . + . transcript_id=g2.t1 ScPoQx8_4355;HRSCAF=5393 AUGUSTUS exon 17109 17216 . + . transcript_id=g2.t1 ScPoQx8_4355;HRSCAF=5393 AUGUSTUS exon 17663 17931 . + . transcript_id=g2.t1 sed 's|.*transcript_id=\(g.*\).t.*|; gene_id=\1|' exons.gff3 > genes.gff3 paste -d \0 exons.gff3 genes.gff3 > Augustus.updated_w_annots.21Aug20.exons.gff3 ScPoQx8_4355;HRSCAF=5393 AUGUSTUS exon 1110 1540 . - . transcript_id=g1.t1; gene_id=g1 ScPoQx8_4355;HRSCAF=5393 AUGUSTUS exon 15395 15430 . + . transcript_id=g2.t1; gene_id=g2 ScPoQx8_4355;HRSCAF=5393 AUGUSTUS exon 17109 17216 . + . transcript_id=g2.t1; gene_id=g2 ScPoQx8_4355;HRSCAF=5393 AUGUSTUS exon 17663 17931 . + . transcript_id=g2.t1; gene_id=g2 ScPoQx8_4355;HRSCAF=5393 AUGUSTUS exon 18161 18396 . + . transcript_id=g2.t1; gene_id=g2 ScPoQx8_4355;HRSCAF=5393 AUGUSTUS exon 18588 18880 . + . transcript_id=g2.t1; gene_id=g2 #for each folder above for i in *bam; do \ bsub -o count.${i}.txt \ "htseq-count -i "gene_id" --stranded=no -f bam -t exon \ ${i} /groups/stern/home/sternd/hormaphis/Augustus.updated_w_annots.21Aug20.exons.gff3 > ${i}.counts"; \ done within counts folder scp 'sternd@login2.int.janelia.org:hormaphis/fq/counts/*' . ```{r message=FALSE, warning=FALSE} source("/Users/sternd/Aphid_gall/LoadLibraries.R") ``` ```{r} setwd("/Users/sternd/Aphid_gall/Hormaphis_cornu/DE analysis/AnalysisPipeline5.18Aug20/") ``` colorblind palette ```{r} # The palette with black: #approx: black orange cyan green yellow blue red purple cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") names(cbbPalette) <- c("black", "orange", "cyan", "green", "yellow", "blue", "red", "purple") ``` Redo MD plots, Alignments, LOGOs, and Trees with annotated dataset Load counts data ```{r} files <- dir(path="counts/",pattern="*") y <- readDGE(files,path="counts/",columns=c(1,2),head=FALSE) y <- y[-grep("_",row.names(y$counts)),] y_start <-y ``` Define groups catrim.m36.D02_out Sex_Head catrim.m36.E02_out Sex_Body catrim.m36.F02_out Sex_SG catrim.m36.G02_out Sex_SG catrim.m36.H02_out Sex_SG ```{r} y$samples$group <- factor(y$samples$group,levels = c("SexSG","G1SG","G2SG","SexCarc","G1Carc","G2Carc","GBetSG","GBetCarc")) y$samples$group[grep(pattern = "Sample",x = row.names(y$samples))] <- "G1SG" #replace whole and G2 y$samples$group[grep(pattern = "Sample_whole",x = row.names(y$samples))] <- "G1Carc" y$samples$group[grep(pattern = "Sample_2ndgen_3rdinst_sg",x = row.names(y$samples))] <- "G2SG" y$samples$group[grep(pattern = "Sample_2ndgen_3rdinst_whole",x = row.names(y$samples))] <- "G2Carc" y$samples$group[grep(pattern = "catrim.m36.D02",x = row.names(y$samples))] <- "SexCarc" y$samples$group[grep(pattern = "catrim.m36.E02",x = row.names(y$samples))] <- "SexCarc" y$samples$group[grep(pattern = "catrim.m36.F02",x = row.names(y$samples))] <- "SexSG" y$samples$group[grep(pattern = "catrim.m36.G02",x = row.names(y$samples))] <- "SexSG" y$samples$group[grep(pattern = "catrim.m36.H02",x = row.names(y$samples))] <- "SexSG" y$samples$group[grep(pattern = "sexF_SG",x = row.names(y$samples))] <- "SexSG" y$samples$group[grep(pattern = "2nd_gen_1stinst_SG",x = row.names(y$samples))] <- "G2SG" y$samples$group[grep(pattern = "2nd_gen_Ad_SG",x = row.names(y$samples))] <- "G2SG" y$samples$group[grep(pattern = "Hcor_ex_bet_4th",x = row.names(y$samples))] <- "GBetSG" y$samples$group[grep(pattern = "Hcor_ex_bet_Ad",x = row.names(y$samples))] <- "GBetSG" y$samples$group[grep(pattern = "Hcor_ex_bet_nymphs",x = row.names(y$samples))] <- "GBetCarc" ``` QC ```{r} cpm <- cpm(y) lcpm <- cpm(y,log=TRUE) numsamples <-length(row.names(y$samples)) table(rowSums(y$counts==0) == numsamples) #260 genes have 0 counts in all samples #Have 21 samples of fundatrix salivary glands #Only interested in genes that are expressed in these glands #Therefore, exclude any genes that are not expressed at cpm > 1 #in at least these samples keep.exprs <- rowSums(cpm>1)>=21 y <- y[keep.exprs,, keep.lib.sizes=FALSE] length(keep.exprs[keep.exprs==TRUE]) ``` QC ```{r} nsamples <- ncol(y) colors <- rainbow(nsamples) par(mfrow=c(1,2)) lcpm<-cpm(y_start,log=TRUE) plot(density(lcpm[,1]), lwd=2, ylim=c(0,0.21), las=2, main="", xlab="") title(main="A. Raw data", xlab="Log-cpm") abline(v=0, lty=3) for (i in 2:nsamples){ den <- density(lcpm[,i]) lines(den$x, den$y, lwd=1,col=colors[i]) } lcpm <- cpm(y, log=TRUE) plot(density(lcpm[,1]), lwd=2, ylim=c(0,0.21), las=2, main="", xlab="") title(main="B. Filtered data", xlab="Log-cpm") abline(v=0, lty=3) for (i in 2:nsamples){ den <- density(lcpm[,i]) lines(den$x, den$y, lwd=1,col=colors[i]) } ``` QC ```{r} # The las argument rotates the axis names barplot(y$samples$lib.size,names=colnames(y),las=2,cex.names=.4) # Add a title to the plot title("Library sizes") # Get log2 counts per million logcounts <- cpm(y,log=TRUE) # Check distributions of samples using boxplots par(cex.axis=.5) boxplot(logcounts, xlab="", ylab="Log2 counts per million",names=colnames(y),las=2) # Let's add a blue horizontal line that corresponds to the median logCPM abline(h=median(logcounts),col="blue") title(main="Boxplots of logCPMs (unnormalised)",cex=.7) #normalize samples y <- calcNormFactors(y) logcounts <- cpm(y,log=TRUE) # Check distributions of samples using boxplots par(cex.axis=.5) boxplot(logcounts, xlab="", ylab="Log2 counts per million",las=2) # Let's add a blue horizontal line that corresponds to the median logCPM abline(h=median(logcounts),col="blue") title("Boxplots of logCPMs (normalised)") ``` interactive MDS plot ```{r} glMDSPlot(y,groups=y$samples$group,labels=row.names(y$samples),folder = "MDSPlot.AllSamples",gene.selection = "pairwise",top = 500,prior.count = 2) ``` Remove outliers catrim.m36.Sample_1st_I_catrimFundSG (Outlier in Dim 1) catrim.m36.Sample_2nd_gen_1stinst_SG2 (Outlier in Dim 2) catrim.m36.G02.pooled.fq.gz (Outlier in Dim 2) catrim.m36.Sample_2nd_gen_1stinst_SG1 (Outlier in Dim 6) ```{r} ysub <- y[,c(-grep(pattern = "Sample_1st_I.bam", x = row.names(y$samples)), -grep(pattern = "Sample_2nd_gen_1stinst_SG2", x = row.names(y$samples)), -grep(pattern = "G02.pooled", x = row.names(y$samples)), -grep(pattern = "Sample_2nd_gen_1stinst_SG1", x = row.names(y$samples)))] ``` export all salivary gland data ```{r eval=FALSE, include=FALSE} ysub_SG <- ysub[,ysub$samples$group == "G1SG"] save(ysub_SG, file = "Fund_SG_RNAseq.Rmd") ``` ```{r} glMDSPlot(ysub,groups=ysub$samples$group,labels=row.names(ysub$samples),folder = "MDSPlot.Subset",gene.selection = "pairwise",top = 500,prior.count = 2) ``` Search for genes upregulated in fundatrix sal gland relative to sexual sal gland and upregulated in fund sal gland relative to whole body, then look for intersection to get frundatrix salivary gland specific upregulated genes ```{r} group <- factor(ysub$samples$group) design <- model.matrix(~ 0 + group) colnames(design) <- levels(group) ysub <- estimateDisp(ysub,design) fit <- glmQLFit(ysub, design) my.contrasts <- makeContrasts( SalGl.FundvsSex = G1SG-SexSG, SalGLvsWhole.Fund = G1SG-G1Carc, levels=design) #test both contrasts simultaneously mult.contrasts.fit <- glmQLFTest(fit, contrast=my.contrasts) #Test one at a time qlf.SGFundvsSGSex <- glmQLFTest(fit, contrast=my.contrasts[,"SalGl.FundvsSex"]) qlf.SGvsWholeFund <- glmQLFTest(fit, contrast=my.contrasts[,"SalGLvsWhole.Fund"]) dt.SG<-decideTests(qlf.SGFundvsSGSex) dt.SGvsWholeFund <-decideTests(qlf.SGvsWholeFund) dt.all<-merge(dt.SG,dt.SGvsWholeFund,by="row.names",all=TRUE) vennC_up <- vennCounts2(dt.all[,2:3],method = "up") #Plotting options #venn from Broman - simple, scaled, not flexible - does not show # intersecting ``` ```{r} pdf(file = "VennDiagram.pdf") vennDiagram(dt.all[,2:3]==1, names=c('Fundatrix vs Sexual\nSalivary Gland','Fundatrix Salivary Gland\nvs Whole Body'),circle.col=c("turquoise", "salmon"),cex=1,main=NULL) dev.off() summary(decideTests(qlf.SGFundvsSGSex)) summary(decideTests(qlf.SGvsWholeFund)) save(qlf.SGFundvsSGSex, file = "qlf.SGFundvsSGSex.RData") ``` MD plot of fundatrix salivary gland specific upregulated genes. ```{r} pdf(file = "MDplot_FundVSSexSalGl.pdf") par(mar = c(5.1, 7.1, 4.1, 2.1)) plotMD(qlf.SGFundvsSGSex, main = NULL, xlab = "Mean Expression Level (log2 CPM)", ylab = "Salivary Gland (Fundatrix - Sexual)\nFold Difference (log2)", hl.col = c(colwalpha("blue",alpha = .5),colwalpha("red",alpha = .5)), hl.cex = c(.75,.75), bg.pch = 16, bg.cex = .3, bg.col = colwalpha("grey75"), cex.lab = 2, cex.axis = 2, legend="topleft") dev.off() pdf(file = "MDplot_FundSGVSFundWhole.pdf") par(mar = c(5.1, 7.1, 4.1, 2.1)) plotMD(qlf.SGvsWholeFund, main = NULL, xlab = "Mean Expression Level (log2 CPM)", ylab = "Fundatrix (Salivary Gland - Body)\nFold Difference (log2)", hl.col = c(colwalpha("blue",alpha = .5),colwalpha("red",alpha = .5)), hl.cex = c(.75,.75), bg.pch = 16, bg.cex = .3, bg.col = colwalpha("grey75"), cex.lab = 2, cex.axis = 2, legend = "bottomright") dev.off() #Plot only Fund SG specific genes that are DE #transcripts upregulated in both conditions Fund Sal Gl > Emig Sal Gl & Fund Sal Gl > Fund Body sigUpSG <- qlf.SGFundvsSGSex[decideTests(qlf.SGFundvsSGSex) == 1,] sigUpSGvsWhole <- qlf.SGvsWholeFund[decideTests(qlf.SGvsWholeFund) == 1,] sigUpSGgenes <- row.names(sigUpSG) sigUpSGvsWholegenes <- row.names(sigUpSGvsWhole) FundSGgenes <- intersect(sigUpSGgenes,sigUpSGvsWholegenes) gene_status <- if_else(rownames(qlf.SGFundvsSGSex$table) %in% FundSGgenes, "Fund SG Specific","Not Specific") pdf(file = "MDplot_FundSGSpp_FundVSSexSalGl.pdf") par(mar = c(5.1, 7.1, 4.1, 2.1)) plotMD(qlf.SGFundvsSGSex, status = gene_status, main = NULL, xlab = "Mean Expression Level (log2 CPM)", ylab = "Salivary Gland (Fundatrix - Sexual)\nFold Difference (log2)", hl.col = c(colwalpha("purple",alpha = .7),"black"), hl.cex = c(.75,.3), bg.pch = 16, bg.cex = .7, bg.col = colwalpha("black",alpha = 0.3), cex.lab = 2, cex.axis = 2, legend = "topleft") dev.off() pdf(file = "MDplot_FundSGSpp_FundSGVSBody.pdf") par(mar = c(5.1, 7.1, 4.1, 2.1)) plotMD(qlf.SGvsWholeFund, status = gene_status, main = NULL, xlab = "Mean Expression Level (log2 CPM)", ylab = "Fundatrix (Salivary Gland - Carcass)\nFold Difference (log2)", hl.col = c(colwalpha("purple",alpha = .7),"black"), hl.cex = c(.75,.3), bg.pch = 16, bg.cex = .7, bg.col = colwalpha("black",alpha = 0.3), cex.lab = 2, cex.axis = 2, legend = "bottomright") dev.off() ``` Volcano plots ```{r} DEredgrSG <- qlf.SGFundvsSGSex$table DEredgrSG$qValue <- p.adjust(DEredgrSG$PValue, method = "fdr") orderedDEredgrSG<-DEredgrSG[order(DEredgrSG$qValue),] #find cutoffs P05cutoff<-orderedDEredgrSG[orderedDEredgrSG$qValue>0.05,][1,] P01cutoff<-orderedDEredgrSG[orderedDEredgrSG$qValue>0.01,][1,] ptlabelsize = 4 p <- P05cutoff$PValue keyvals <- rep('grey75', nrow(DEredgrSG)) names(keyvals) <- rep('NS', nrow(DEredgrSG)) keyvals[which(DEredgrSG$PValue < p & DEredgrSG$logFC > 0)] <- 'blue2' names(keyvals)[which(DEredgrSG$PValue < p & DEredgrSG$logFC > 0)] <- 'Signif. down-regulated' keyvals[which(DEredgrSG$PValue < p & DEredgrSG$logFC < 0)] <- 'red2' names(keyvals)[which(DEredgrSG$PValue < p & DEredgrSG$logFC < 0)] <- 'Signif. up-regulated' unique(keyvals) unique(names(keyvals)) pdf("VPFundVsSexSG.pdf") p1 <- EnhancedVolcano(toptable = DEredgrSG, lab = NA, x = "logFC", y = "PValue", title = NULL, subtitle = NULL, caption = NULL, colCustom = keyvals, pCutoff = p, FCcutoff = FALSE, cutoffLineType = "blank", hline = c(p,P01cutoff$PValue), hlineCol = "black", hlineType = "longdash", hlineWidth = 0.5, pointSize = c(ifelse(DEredgrSG$qValue < 0.05, 3, 1)), ylim = c(0, max(-log10(DEredgrSG$PValue), na.rm=TRUE)), xlab = "Salivary Gland (Fundatrix - Sexual)\nFold Change (log2)", col = c("black", "black", "royalblue", "red2") ) p1 + theme(legend.position = 'none', axis.text.x = element_text(angle = 0, size = 18, vjust= 1), axis.text.y = element_text(angle = 0, size = 18, vjust= 0.5), axis.title.x = element_text(angle = 0,size = 20,vjust = 0), axis.title.y = element_text(angle = 90,size = 22,vjust = 0) ) + annotate("text", label =c("FDR = 0.05", "FDR = 0.01"), x = c(-11.5,-11.5), y = c(0.9,2.95), size = 5, colour = "black") dev.off() ``` ```{r} DEredgrSG <- qlf.SGvsWholeFund$table DEredgrSG$qValue <- p.adjust(DEredgrSG$PValue, method = "fdr") orderedDEredgrSG<-DEredgrSG[order(DEredgrSG$qValue),] #find cutoffs P05cutoff<-orderedDEredgrSG[orderedDEredgrSG$qValue>0.05,][1,] P01cutoff<-orderedDEredgrSG[orderedDEredgrSG$qValue>0.01,][1,] ptlabelsize = 4 p <- P05cutoff$PValue keyvals <- rep('grey75', nrow(DEredgrSG)) names(keyvals) <- rep('NS', nrow(DEredgrSG)) keyvals[which(DEredgrSG$PValue < p & DEredgrSG$logFC > 0)] <- 'blue2' names(keyvals)[which(DEredgrSG$PValue < p & DEredgrSG$logFC > 0)] <- 'Signif. down-regulated' keyvals[which(DEredgrSG$PValue < p & DEredgrSG$logFC < 0)] <- 'red2' names(keyvals)[which(DEredgrSG$PValue < p & DEredgrSG$logFC < 0)] <- 'Signif. up-regulated' unique(keyvals) unique(names(keyvals)) pdf("VPFundSGVsBody.pdf") p1 <- EnhancedVolcano(toptable = DEredgrSG, lab = NA, x = "logFC", y = "PValue", title = NULL, subtitle = NULL, caption = NULL, colCustom = keyvals, pCutoff = p, FCcutoff = FALSE, cutoffLineType = "blank", hline = c(p,P01cutoff$PValue), hlineCol = "black", hlineType = "longdash", hlineWidth = 0.5, pointSize = c(ifelse(DEredgrSG$qValue < 0.05, 3, 1)), ylim = c(0, max(-log10(DEredgrSG$PValue), na.rm=TRUE)), xlab = "Fundatrix (Salivary Gland - Body)\nFold Change (log2)", col = c("black", "black", "royalblue", "red2") ) p1 + theme(legend.position = 'none', axis.text.x = element_text(angle = 0, size = 18, vjust= 1), axis.text.y = element_text(angle = 0, size = 18, vjust= 0.5), axis.title.x = element_text(angle = 0,size = 20,vjust = 0), axis.title.y = element_text(angle = 90,size = 22,vjust = 0) ) + annotate("text", label =c("FDR = 0.05", "FDR = 0.01"), x = c(-7.5,-7.5), y = c(0.8,3), size = 5, colour = "black") dev.off() ``` Get CDS and PEP files from Hcor-3-best-score.utrme.UTR.gff Make transcripts and pep files gffread Augustus.updated_w_annots.21Aug20.gff3 -g /Users/sternd/Dropbox\ \(HHMI\)/Projects/Aphid_gall/Hormaphis_cornu/genomes/Dovetail_Genomics/Results/HiC_HiCRise_PoQx8/hormaphis_cornu_26Sep2017_PoQx8/hormaphis_cornu_26Sep2017_PoQx8.fasta \ -w Augustus.updated_w_annots.21Aug20.transcripts.fa *DONE gffread Augustus.updated_w_annots.21Aug20.gff3 -g /Users/sternd/Dropbox\ \(HHMI\)/Projects/Aphid_gall/Hormaphis_cornu/genomes/Dovetail_Genomics/Results/HiC_HiCRise_PoQx8/hormaphis_cornu_26Sep2017_PoQx8/hormaphis_cornu_26Sep2017_PoQx8.fasta \ -y Augustus.updated_w_annots.21Aug20.pep.fa *DONE #remove CDS info from transcripts files sed -i.bak 's/CDS=.*//' Augustus.updated_w_annots.21Aug20.transcripts.fa *DONE scp Augustus.updated_w_annots.21Aug20*fa sternd@login2.int.janelia.org:hormaphis/Annotate On cluster, annotate predicted transcripts: bsub -n 16 -We 600 -o blastx "/usr/local/blast+/bin/blastx -query Augustus.updated_w_annots.21Aug20.transcripts.fa -db ~/sternlab/blast_stuff/uniprot_sprot.pep -num_threads 16 -max_target_seqs 1 -outfmt 6 -evalue 1e-3 > Augustus.CDS.fasta.blastx.outfmt6" *DONE bsub -n 16 -We 600 -o blastp "/usr/local/blast+/bin/blastp -query Augustus.updated_w_annots.21Aug20.pep.fa -db ~/sternlab/blast_stuff/uniprot_sprot.pep -num_threads 16 -max_target_seqs 1 -outfmt 6 > Augustus.pep.fasta.blastp.outfmt6" *DONE bsub -n 16 -We 600 -o hmmscan "/usr/local/hmmer-3.1b2/bin/hmmscan --cpu 16 --cut_ga --domtblout hmmscan.out ~/sternlab/blast_stuff/Pfam-A.hmm Augustus.updated_w_annots.21Aug20.pep.fa > pfam.log" *DONE bsub -n 8 -We 60 -o signalp "mkdir /scratch/sternd/signalp; signalp -format short -tmp /scratch/sternd/signalp -gff3 -prefix Augustus.pep.fasta.signalp5 -fasta Augustus.updated_w_annots.21Aug20.pep.fa" *DONE bsub -n 8 -We 60 -o tmhmm "/usr/local/tmhmm-2.0/bin/tmhmm --short < Augustus.updated_w_annots.21Aug20.pep.fa > Augustus.pep.fasta.tmhmm.out" gzip files scp "sternd@login2.int.janelia.org:hormaphis/Annotate/*.gz" . import annotations ```{r message=FALSE, include=FALSE} #read_blast6, from package eclectic, not available under R4.0 blastx <- read.blast("Annotate/Augustus.CDS.fasta.blastx.outfmt6.gz") blastp <- read.blast("Annotate/Augustus.pep.fasta.blastp.outfmt6.gz") #remove values with evalue >0.001 (cutoff used in blastx) blastp <- blastp[blastp$evalue <= 0.001,] hmm <- read_domtblout("Annotate/hmmscan.out.gz") hmm <- hmm[hmm$domain_ievalue <= 0.001,] colnames(hmm)[4] <- "qseqid" hmm$sseqid <- paste(hmm$domain_name,hmm$description) sigp <- readGFF("Annotate/Augustus.pep.fasta.signalp5.gff3.gz",version=3) colnames(sigp)[1] <- "qseqid" sigp$sseqid <- paste("sigp",sigp$start,sigp$end,sep="_") tm <- read.csv("Annotate/Augustus.pep.fasta.tmhmm.out.gz",sep = "\t",header = FALSE, col.names = c("qseqid", "protein_length", "expAA", "First60", "PredHel", "Topology")) tm$protein_length <- gsub("len=","",tm$protein_length) tm$expAA <- gsub("ExpAA=","",tm$expAA) tm$First60 <- gsub("First60=","",tm$First60) tm$PredHel <- gsub("PredHel=","",tm$PredHel) tm$Topology <- gsub("Topology=","",tm$Topology) #filter out non-TM hits tm <- tm[tm$PredHel > 0,] #Make new sseqid columns tm$sseqid <- paste("PredHel=",tm$PredHel,sep="") ``` ```{r} annot_genes <- data.frame(FundSGgenes,row.names = FundSGgenes) #make new column in each annotation file of gene name. Then reduce to best hit per gene blastp_gene <- blastp %>% mutate(gene = gsub("\\..*","",query.id)) %>% group_by(gene) %>% top_n(-1,evalue) %>% dplyr::slice(1) blastp_gene <- rename(blastp_gene,sseqid = subject.id) blastx_gene <- blastx %>% mutate(gene = gsub("\\..*","",query.id)) %>% group_by(gene) %>% top_n(-1,evalue) %>% dplyr::slice(1) blastx_gene <- rename(blastx_gene,sseqid = subject.id) hmm_gene <- hmm %>% mutate(gene = gsub("\\..*","",qseqid)) %>% group_by(gene) %>% top_n(-1,sequence_evalue) %>% dplyr::slice(1) tm_gene <- tm %>% mutate(gene = gsub("\\..*","",qseqid)) %>% group_by(gene) %>% dplyr::slice(1) sigp_gene <- sigp %>% mutate(gene = gsub("\\..*","",qseqid)) %>% group_by(gene) %>% top_n(1,score) %>% dplyr::slice(1) match_transcripts <- function(x,y){ l <- match(x$FundSGgenes, y$gene, nomatch = NA) y$sseqid[l] # return only the replaced column } #sort annotation dataframes from best hit to worst to ensure that pmatch, which takes only first hit, gets the best hit annot_genes$blastx <- match_transcripts(annot_genes,blastx_gene) annot_genes$blastp <- match_transcripts(annot_genes,blastp_gene) annot_genes$hmm <- match_transcripts(annot_genes,hmm_gene) annot_genes$sigp <- match_transcripts(annot_genes,sigp_gene) annot_genes$tm <- match_transcripts(annot_genes,tm_gene) #mark all transcripts that have no database annotations annot_genes$AnyAnnots<-ifelse(is.na(annot_genes$blastx) & is.na(annot_genes$blastp) & is.na(annot_genes$hmm),0,1) up_no_annot <-sum(annot_genes$AnyAnnots==0) up_annot <-sum(annot_genes$AnyAnnots==1) num_annot <-dim(annot_genes)[1] up_sigp <- sum(!is.na(annot_genes$sigp)) num_no_annot_sigp <- sum(!is.na(annot_genes$sigp) & annot_genes$AnyAnnots==0) num_annot_sigp <- sum(!is.na(annot_genes$sigp) & annot_genes$AnyAnnots==1) print("Number of upregulated genes with no annotations") up_no_annot print("Total number of upregulated genes") num_annot print("Number of upregulated genes with predicted signal peptides") up_sigp print("Number of unannotated upregulated genes with predicted signal peptides") num_no_annot_sigp print("Number of annotated upregulated genes with predicted signal peptides") num_annot_sigp print("Fraction of upregulated unannotated with signal peptides") num_no_annot_sigp / up_no_annot print("Fraction of upregulated annotated with signal peptides") num_annot_sigp / up_annot sigp_annot_test <- matrix(c (num_no_annot_sigp,up_no_annot-num_no_annot_sigp, num_annot_sigp,up_annot-num_annot_sigp), nrow=2) dimnames(sigp_annot_test) <- list(Sigp = c("Yes","No"), Annotation = c("No","Yes") ) Xsq<-chisq.test(sigp_annot_test)#,simulate.p.value = TRUE, B=10000) Xsq$observed Xsq print("Over-representation of sigp in unnanotated genes highly unlikely by chance") up_no_annot_names <-row.names(annot_genes)[annot_genes$AnyAnnots==0] up_annot_names <-row.names(annot_genes)[annot_genes$AnyAnnots==1] annot_names <-row.names(annot_genes) up_sigp_names <- row.names(annot_genes)[!is.na(annot_genes$sigp)] no_annot_sigp_names <- row.names(annot_genes)[!is.na(annot_genes$sigp) & annot_genes$AnyAnnots==0] annot_sigp_names <- row.names(annot_genes)[!is.na(annot_genes$sigp) & annot_genes$AnyAnnots==1] no_annot_no_sigp_names <- row.names(annot_genes)[is.na(annot_genes$sigp) & annot_genes$AnyAnnots==0] annot_no_sigp_names <- row.names(annot_genes)[is.na(annot_genes$sigp) & annot_genes$AnyAnnots==1] write(up_no_annot_names, file = "up_no_annot_names") write(up_annot_names, file = "up_annot_names") write(annot_names, file = "annot_names") write(up_sigp_names, file = "up_sigp_names") write(no_annot_sigp_names, file = "no_annot_sigp_names") write(annot_sigp_names, file = "annot_sigp_names") write(no_annot_no_sigp_names, file = "no_annot_no_sigp_names") write(annot_no_sigp_names, file = "annot_no_sigp_names") ``` MD Plot salivary gland samples, different symbols for annotated and uannotated genes, than add sigp ```{r} annotated_genes <- annot_genes$FundSGgenes[annot_genes$AnyAnnots == 1] unannotated_genes <- annot_genes$FundSGgenes[annot_genes$AnyAnnots == 0] annotation_gene_status <- if_else(rownames(qlf.SGFundvsSGSex$table) %in% annotated_genes, true = "Annotated",if_else(rownames(qlf.SGFundvsSGSex$table) %in% unannotated_genes ,true="Not Annotated", false = "Not Enriched")) annotation_gene_status<- factor(annotation_gene_status,c("Not Enriched","Not Annotated","Annotated")) pdf(file = "MDplot_FundSGSpp_Annotated_FundVSSexSalGl.pdf") par(mar = c(5.1, 7.1, 4.1, 2.1)) plotMD(qlf.SGFundvsSGSex, status = annotation_gene_status, main = NULL, xlab = "Mean Expression Level (log2 CPM)", ylab = "Salivary Gland (Fundatrix - Sexual)\nFold Difference (log2)", hl.col = c(colwalpha("purple",alpha = .6),colwalpha("orange",alpha = .6)), hl.cex = c(.75,.75), cex.lab = 2, cex.axis = 2, legend ="topleft" ) dev.off() #plot sigP status annot_sigp <- annot_genes %>% dplyr::filter(!is.na(sigp) & AnyAnnots ==1) unannot_sigp <- annot_genes %>% dplyr::filter(!is.na(sigp) & AnyAnnots ==0) annotation_gene_status <- if_else(rownames(qlf.SGFundvsSGSex$table) %in% annot_sigp$FundSGgenes, true = "Annotated with Signal Peptides",if_else(rownames(qlf.SGFundvsSGSex$table) %in% unannot_sigp$FundSGgenes ,true="Unannotated with Signal Peptides", false = "No Signal Peptide")) annotation_gene_status<- factor(annotation_gene_status,c("No Signal Peptide","Unannotated with Signal Peptides","Annotated with Signal Peptides")) #deep pink for Unannotated genes, blue for annotated pdf(file = "MDplot_FundSGSpp_SigP_FundVSSexSalGl.pdf") par(mar = c(5.1, 7.1, 4.1, 2.1)) plotMD(qlf.SGFundvsSGSex, status = annotation_gene_status, main=NULL, xlab = "Mean Expression Level (log2 CPM)", ylab = "Salivary Gland (Fundatrix - Sexual)\nFold Difference (log2)", hl.col = c(colwalpha("purple", alpha = .6), colwalpha("orange",alpha = .6)), hl.cex = c(.75,.75), cex.lab = 2, cex.axis = 2, legend ="topleft") dev.off() ``` Volcano plots of same above. ```{r} annotated_genes <- annot_genes$FundSGgenes[annot_genes$AnyAnnots == 1] unannotated_genes <- annot_genes$FundSGgenes[annot_genes$AnyAnnots == 0] annotation_gene_status <- if_else(rownames(qlf.SGFundvsSGSex$table) %in% annotated_genes, true = "Annotated",if_else(rownames(qlf.SGFundvsSGSex$table) %in% unannotated_genes ,true="Not Annotated", false = "Not Enriched")) annotation_gene_status<- factor(annotation_gene_status,c("Not Enriched","Not Annotated","Annotated")) DEredgrSG <- qlf.SGFundvsSGSex$table DEredgrSG$qValue <- p.adjust(DEredgrSG$PValue, method = "fdr") orderedDEredgrSG<-DEredgrSG[order(DEredgrSG$qValue),] #find cutoffs P05cutoff<-orderedDEredgrSG[orderedDEredgrSG$qValue>0.05,][1,] P01cutoff<-orderedDEredgrSG[orderedDEredgrSG$qValue>0.01,][1,] ptlabelsize = 4 p <- P05cutoff$PValue keyvals <- rep('grey75', nrow(DEredgrSG)) names(keyvals) <- rep('NS', nrow(DEredgrSG)) keyvals[which(annotation_gene_status == "Annotated")] <- cbbPalette["cyan"] names(keyvals)[which(annotation_gene_status == "Annotated")] <- 'Annotated' keyvals[which(annotation_gene_status == "Not Annotated")] <- cbbPalette["red"] names(keyvals)[which(annotation_gene_status == "Not Annotated")] <- 'Not Annotated' unique(keyvals) unique(names(keyvals)) pdf("VPFundVsSexSG_FundSGSpecHighlight.pdf") p1 <- EnhancedVolcano(toptable = DEredgrSG, lab = NA, x = "logFC", y = "PValue", title = NULL, subtitle = NULL, caption = NULL, colCustom = keyvals, pCutoff = p, FCcutoff = FALSE, cutoffLineType = "blank", hline = c(p,P01cutoff$PValue), hlineCol = "black", hlineType = "longdash", hlineWidth = 0.5, pointSize = c(ifelse(keyvals == 'grey75', 1, 3)), ylim = c(0, max(-log10(DEredgrSG$PValue), na.rm=TRUE)), xlab = "Salivary Gland (Fundatrix - Sexual)\nFold Change (log2)" ) p1 + theme(legend.position = c(.2,.97), legend.text = element_text(size = 16, colour = "black"), axis.text.x = element_text(angle = 0, size = 18, vjust= 1), axis.text.y = element_text(angle = 0, size = 18, vjust= 0.5), axis.title.x = element_text(angle = 0,size = 20,vjust = 0), axis.title.y = element_text(angle = 90,size = 22,vjust = 0) ) + annotate("text", label =c("FDR = 0.05", "FDR = 0.01"), x = c(-10,-10), y = c(0.9,2.9), size = 5, colour = "black") dev.off() ``` This figure provides compelling evidence that most of the most strongly enriched genes are not annotated. Examine annotated genes first, then cluster unannotated genes ```{r} #get longest transcript for BLAST searches #import peps and get lengths fa<-seqinr::read.fasta(file="Annotate/Augustus.updated_w_annots.21Aug20.pep.fa.gz") aa_length <- data.frame(transcript = getName(fa),transcript_length=getLength(fa),row.names = getName(fa)) rm(fa) #get longest transcript aa_length <- aa_length %>% mutate(gene = gsub("\\..*","",transcript)) %>% group_by(gene) %>% top_n(1,transcript_length) %>% dplyr::slice(1) #paste longest transcript name into annot_genes annot_genes_length<-merge(x=annot_genes,y=aa_length,by.x = "FundSGgenes",by.y = "gene") write(as.character(annot_genes_length$transcript[annot_genes_length$AnyAnnots == 1]),file="sal_gl_up_annotated_gene_names") write(as.character(annot_genes_length$transcript[annot_genes_length$AnyAnnots == 0]),file="sal_gl_up_unannotated_gene_names") ``` In Python, compile mfasta of these genes for blastp GO analysis of annotated genes Make fa of annotated genes and map to D. melanogaster genome ********************** seqkit grep -f sal_gl_up_annotated_gene_names Annotate/Augustus.updated_w_annots.21Aug20.pep.fa.gz > sal_gl_up_annotated_gene_names.fa scp sal_gl_up_annotated_gene_names.fa sternd@login2.int.janelia.org:hormaphis/Annotate on cluster: blastp to dmel-all-translation-r6.24.fasta bsub -n 16 -We 300 -o blastp "/usr/local/blast+/bin/blastp -query sal_gl_up_annotated_gene_names.fa -db /groups/stern/home/sternd/sternlab/blast_stuff/dmel-all-translation-r6.24.fasta -num_threads 16 -max_target_seqs 1 -outfmt 6 > sal_gl_up_annotated_gene_names.fa.blastp.outfmt6" *RUNNING scp sternd@login2.int.janelia.org:hormaphis/Annotate/sal_gl_up_annotated_gene_names.fa.blastp.outfmt6 . ```{r message=FALSE, include=FALSE} annot_dmel_blastp <- read.blast("Annotate/sal_gl_up_annotated_gene_names.fa.blastp.outfmt6") dmel_converter <-read.table("fbgn_fbtr_fbpp_fb_2018_05.tsv.gz",sep="",header=FALSE,col.names=c("FlyBase_FBgn","FlyBase_FBtr","FlyBase_FBpp"),skip = 6,fill=TRUE) match_FB_annots <- function(x,y){ l <- match(x$subject.id, y$FlyBase_FBpp, nomatch = NA) y$FlyBase_FBgn[l] } annot_dmel_blastp$FlyBase_FBgn <- match_FB_annots(annot_dmel_blastp,dmel_converter) #for http://www.webgestalt.org/, append "FLYBASE:" write.table(paste("FLYBASE:",annot_dmel_blastp$FlyBase_FBgn,sep = ""),file="sal_gl_up_annotated_dmel_hits.txt",quote=FALSE,row.names=FALSE,col.names = FALSE) ``` Submit list of hits blastp hits of upregulated genes to http://www.webgestalt.org/ Choose FlybaseID ********************* ********************* Cluster unannotated genes to look for similar clusters. jackhmmer unnanotated against self to get scores. compile mfasta of unannotated transcripts for jackhmmer > seqkit grep -f sal_gl_up_unannotated_gene_names Annotate/Augustus.updated_w_annots.21Aug20.pep.fa.gz > sal_gl_up_unannotated_gene_names.fa *DONE jackhmmer fails because of period "." found in sequence. Remove these. >sed -i.bak -E '/>/!s/\.//g' sal_gl_up_unannotated_gene_names.fa scp sal_gl_up_unannotated_gene_names.fa sternd@login2.int.janelia.org:hormaphis/Annotate bsub -n 32 -We 300 -o jackhmmer.SGUUSGUU "/usr/local/hmmer-3.1b2/bin/jackhmmer --cpu 32 -E 0.01 --incE 0.01 --max --tblout jackhmmer.SGUU_to_SGUU.hits.table.txt sal_gl_up_unannotated_gene_names.fa sal_gl_up_unannotated_gene_names.fa" *DONE scp sternd@login2.int.janelia.org:hormaphis/Annotate/jackhmmer.SGUU_to_SGUU.hits.table.txt.gz . Hierarchical clustering of jackhmmer hits ```{r} jhmmer_SGUU_SGUU <- read_tblout("Annotate/jackhmmer.SGUU_to_SGUU.hits.table.txt.gz") #using only significant hits #Use simple Bonferroni correction to get E-value threshold # num_searches <- jhmmer_SGUU_SGUU %>% distinct(query_name) %>% summarize(num_searches = length(query_name)) num_searches <- as.integer(num_searches[1]) #I prefer a conservative cutoff of 0.01 P_val_cutoff <- 0.01 / num_searches #reduce table to only hits meeting this threshold SGUU_SGUU_hits <- jhmmer_SGUU_SGUU %>% dplyr::filter(best_domain_evalue <= P_val_cutoff) SGUU_SGUU_similarity_matrix <- reshape2::dcast(SGUU_SGUU_hits,query_name ~ domain_name,value.var = "sequence_score",fill=0) row.names(SGUU_SGUU_similarity_matrix) <- SGUU_SGUU_similarity_matrix$query_name SGUU_SGUU_similarity_matrix$query_name <- NULL my.breaks <- c(seq(-4.4, -1.05, length.out=50),seq(-1, 2, length.out=50),seq(2.05,4.4, length.out=50)) ``` ```{r} distance = dist(data.matrix(SGUU_SGUU_similarity_matrix), method = "canberra") clusterrow = hclust(distance, method = "ward.D2") distance = dist(data.matrix(t(SGUU_SGUU_similarity_matrix)), method = "canberra") clustercol = hclust(distance, method = "ward.D2") ``` Label each of the groups ```{r} pdf(file = "Heatmap_SGUU_2_SGUU.canberra.wardD2.labelled.pdf") heatmap.2(data.matrix(SGUU_SGUU_similarity_matrix), Rowv = as.dendrogram(clusterrow), Colv = as.dendrogram(clustercol), col=bluered(149), trace="none", cexCol=.6, labRow=FALSE, labCol=FALSE, key=0, dendrogram='both', breaks = my.breaks, add.expr=text(x=c(600,625), y=c(255,500), cex = 1.5, labels=c("bicycle genes","CWG genes")) ) dev.off() ``` ```{r} #cuts into 3 groups based on column clusters gene_clusters<-cutree(clustercol, k = 3) #which cluster is which gene_cluster1 <- names(gene_clusters[gene_clusters == 1]) gene_cluster2 <- names(gene_clusters[gene_clusters == 2]) gene_cluster3 <- names(gene_clusters[gene_clusters == 3]) print("Number of genes in 'Clusters' 1, 2, 3, 4") length(gene_cluster1) length(gene_cluster2) length(gene_cluster3) length(unique(gsub("\\..*","", gene_cluster1))) length(unique(gsub("\\..*","", gene_cluster2))) length(unique(gsub("\\..*","", gene_cluster3))) ``` Cluster1 = unclustered Cluster2 = bicycle Cluster3 = CWG LOGO plots of bicycle and CWG ```{r} write(gene_cluster1,file="gene_cluster1.names") write(gene_cluster2,file="gene_cluster2.names") write(gene_cluster3,file="gene_cluster3.names") ``` seqkit grep -f gene_cluster2.names Annotate/Augustus.updated_w_annots.21Aug20.pep.fa.gz > gene_cluster2.fa seqkit grep -f gene_cluster3.names Annotate/Augustus.updated_w_annots.21Aug20.pep.fa.gz > gene_cluster3.fa Align with MAFFT mafft --auto --thread 2 gene_cluster2.fa > gene_cluster2.mafft_auto.fa mafft --auto --thread 2 gene_cluster3.fa > gene_cluster3.mafft_auto.fa trim alignments #include only seqns with at least 50% coverage trimal -in gene_cluster2.mafft_auto.fa -out gene_cluster2.mafft_auto.trimal_50.fa -gt 0.50 trimal -in gene_cluster3.mafft_auto.fa -out gene_cluster3.mafft_auto.trimal_50.fa -gt 0.50 trimal -in gene_cluster3.mafft_auto.fa -out gene_cluster3.mafft_auto.trimal_70.fa -gt 0.70 trimal -in gene_cluster3.mafft_auto.fa -out gene_cluster3.mafft_auto.trimal_80.fa -gt 0.80 trimal -in gene_cluster2.mafft_auto.fa -out gene_cluster2.mafft_auto.trimal_30.fa -gt 0.30 trimal -in gene_cluster3.mafft_auto.fa -out gene_cluster3.mafft_auto.trimal_30.fa -gt 0.30 trimal -in gene_cluster2.mafft_auto.fa -out gene_cluster2.mafft_auto.trimal_55.fa -gt 0.55 ```{r} aln<-read.alignment("gene_cluster2.mafft_auto.trimal_50.fa",format="fasta") aln$seq <- unlist(aln$seq) aln$seq <- toupper(aln$seq) logo <- ggseqlogo(aln$seq) + theme(axis.text.x = element_blank()) pdf(file = "gene_cluster2.mafft_auto.trimal_50.pdf",width=20,height=4) logo dev.off() aln<-read.alignment("gene_cluster2.mafft_auto.trimal_30.fa",format="fasta") aln$seq <- unlist(aln$seq) aln$seq <- toupper(aln$seq) logo <- ggseqlogo(aln$seq) + theme(axis.text.x = element_blank()) pdf(file = "gene_cluster2.mafft_auto.trimal_30.pdf",width=20,height=4) logo dev.off() aln<-read.alignment("gene_cluster3.mafft_auto.trimal_50.fa",format="fasta") aln$seq <- unlist(aln$seq) aln$seq <- toupper(aln$seq) seq_length <- str_length(aln$seq[[1]]) seqlist <-list() seqlist$line1 <- substr(aln$seq,1,ceiling(seq_length/4)) seqlist$line2 <- substr(aln$seq,ceiling(seq_length/4) + 1,ceiling(seq_length/2)) seqlist$line3 <- substr(aln$seq,ceiling(seq_length/2) + 1,ceiling(3*seq_length/4)) seqlist$line4 <- substr(aln$seq,ceiling(3*seq_length/4) + 1,ceiling(seq_length)) logo <- ggseqlogo(seqlist,facet = "wrap", nrow = 4) + theme(axis.text.x = element_blank(), strip.text = element_blank()) pdf(file = "gene_cluster3.mafft_auto.trimal_50.pdf",width=20,height=16) logo dev.off() aln<-read.alignment("gene_cluster3.mafft_auto.trimal_30.fa",format="fasta") aln$seq <- unlist(aln$seq) aln$seq <- toupper(aln$seq) seq_length <- str_length(aln$seq[[1]]) seqlist <-list() seqlist$line1 <- substr(aln$seq,1,ceiling(seq_length/4)) seqlist$line2 <- substr(aln$seq,ceiling(seq_length/4) + 1,ceiling(seq_length/2)) seqlist$line3 <- substr(aln$seq,ceiling(seq_length/2) + 1,ceiling(3*seq_length/4)) seqlist$line4 <- substr(aln$seq,ceiling(3*seq_length/4) + 1,ceiling(seq_length)) logo <- ggseqlogo(seqlist,facet = "wrap", nrow = 4) + theme(axis.text.x = element_blank(), strip.text = element_blank()) pdf(file = "gene_cluster3.mafft_auto.trimal_30.pdf",width=20,height=16) logo dev.off() aln<-read.alignment("gene_cluster3.mafft_auto.trimal_70.fa",format="fasta") aln$seq <- unlist(aln$seq) aln$seq <- toupper(aln$seq) seq_length <- str_length(aln$seq[[1]]) seqlist <-list() seqlist$line1 <- substr(aln$seq,1,ceiling(seq_length/4)) seqlist$line2 <- substr(aln$seq,ceiling(seq_length/4) + 1,ceiling(seq_length/2)) seqlist$line3 <- substr(aln$seq,ceiling(seq_length/2) + 1,ceiling(3*seq_length/4)) seqlist$line4 <- substr(aln$seq,ceiling(3*seq_length/4) + 1,ceiling(seq_length)) logo <- ggseqlogo(seqlist,facet = "wrap", nrow = 4) + theme(axis.text.x = element_blank(), strip.text = element_blank()) pdf(file = "gene_cluster3.mafft_auto.trimal_70.pdf",width=20,height=16) logo dev.off() ``` bicycle logo for pub ```{r} aln<-read.alignment("gene_cluster2.mafft_auto.trimal_55.fa",format="fasta") aln$seq <- unlist(aln$seq) aln$seq <- toupper(aln$seq) seq_length <- str_length(aln$seq[[1]]) seqlist <-list() seqlist$line1 <- substr(aln$seq,1,seq_length) logo <- ggseqlogo(seqlist,facet = "wrap",nrow = 1) + theme(axis.text.x = element_blank(),strip.text.x = element_blank(), legend.position = c(.5,-0.1), legend.direction = "horizontal") + annotate('segment', x = c(0.5, 24.5,83.5), xend=c(18.5,76.5,137.5), y=-0.1, yend=-0.1, size=1) + annotate('text', x=9.25, y=-0.7, label = "signal\nsequence", lineheight=0.8) + annotate('text', x=c(50,111), y=-0.68, label='CYC motif') + scale_x_continuous(expand=c(0,.1)) + scale_y_continuous(limits = c(-1,4.2), breaks=c(-1,0,1,2,3,4), labels = c("",0,1,2,3,4)) pdf(file = "gene_cluster2.mafft_auto.trimal_55.pdf",width=10,height=2) logo dev.off() ``` ```{r} aln<-read.alignment("gene_cluster3.mafft_auto.trimal_80.fa",format="fasta") aln$seq <- unlist(aln$seq) aln$seq <- toupper(aln$seq) seq_length <- str_length(aln$seq[[1]]) seqlist <-list() seqlist$line1 <- substr(aln$seq,1,ceiling(seq_length/4)) seqlist$line2 <- substr(aln$seq,ceiling(seq_length/4) + 1,ceiling(seq_length/2)) seqlist$line3 <- substr(aln$seq,ceiling(seq_length/2) + 1,ceiling(3*seq_length/4)) seqlist$line4 <- substr(aln$seq,ceiling(3*seq_length/4) + 1,ceiling(seq_length)) logo <- ggseqlogo(seqlist,facet = "wrap", nrow = 4) + theme(axis.text.x = element_blank(), strip.text = element_blank(), legend.position = "none", axis.title.y = element_text(size=30), axis.text.y = element_text(size=24)) pdf(file = "gene_cluster3.mafft_auto.trimal_80.pdf",width=26,height=16) logo dev.off() ``` MD and VP and life cycle plot these genes ```{r} unannotated_genes <- annot_genes$FundSGgenes[annot_genes$AnyAnnots == 0] bicycle_genes <- annot_genes$FundSGgenes[annot_genes$FundSGgenes %in% gsub("\\..*","",gene_cluster2)] annotation_gene_status <- if_else(rownames(qlf.SGFundvsSGSex$table) %in% bicycle_genes, true = "bicycle", if_else(rownames(qlf.SGFundvsSGSex$table) %in% unannotated_genes ,true="Not Annotated", false = "Not Enriched")) annotation_gene_status<- factor(annotation_gene_status,c("Not Enriched","Not Annotated","bicycle")) pdf(file = "MDplot_FundSGSpp_bicycle_FundVSSexSalGl.pdf") par(mar = c(5.1, 7.1, 4.1, 2.1)) plotMD(qlf.SGFundvsSGSex, status = annotation_gene_status, main = NULL, xlab = "Mean Expression Level (log2 CPM)", ylab = "Salivary Gland (Fundatrix - Sexual)\nFold Difference (log2)", hl.col = c(colwalpha(cbbPalette["green"],alpha = .6),colwalpha(cbbPalette["red"],alpha = .6)), hl.cex = c(.75,.75), cex.lab = 2, cex.axis = 2, legend ="topleft" ) dev.off() #Volcano plots of same above. DEredgrSG <- qlf.SGFundvsSGSex$table DEredgrSG$qValue <- p.adjust(DEredgrSG$PValue, method = "fdr") orderedDEredgrSG<-DEredgrSG[order(DEredgrSG$qValue),] #find cutoffs P05cutoff<-orderedDEredgrSG[orderedDEredgrSG$qValue>0.05,][1,] P01cutoff<-orderedDEredgrSG[orderedDEredgrSG$qValue>0.01,][1,] ptlabelsize = 4 p <- P05cutoff$PValue keyvals <- rep('grey75', nrow(DEredgrSG)) names(keyvals) <- rep('NS', nrow(DEredgrSG)) keyvals[which(annotation_gene_status == "bicycle")] <- cbbPalette["green"] names(keyvals)[which(annotation_gene_status == "bicycle")] <- 'bicycle' keyvals[which(annotation_gene_status == "Not Annotated")] <- cbbPalette["red"] names(keyvals)[which(annotation_gene_status == "Not Annotated")] <- 'Not Annotated' unique(keyvals) unique(names(keyvals)) pdf("VPFundVsSexSG_bicycle.pdf") p1 <- EnhancedVolcano(toptable = DEredgrSG, lab = NA, x = "logFC", y = "PValue", title = NULL, subtitle = NULL, caption = NULL, colCustom = keyvals, pCutoff = p, FCcutoff = FALSE, cutoffLineType = "blank", hline = c(p,P01cutoff$PValue), hlineCol = "black", hlineType = "longdash", hlineWidth = 0.5, pointSize = c(ifelse(keyvals == 'grey75', 1, 3)), ylim = c(0, max(-log10(DEredgrSG$PValue), na.rm=TRUE)), xlab = "Salivary Gland (Fundatrix - Sexual)\nFold Change (log2)" ) p1 + theme(legend.position = c(.2,.97), legend.text = element_text(size = 16, colour = "black"), axis.text.x = element_text(angle = 0, size = 18, vjust= 1), axis.text.y = element_text(angle = 0, size = 18, vjust= 0.5), axis.title.x = element_text(angle = 0,size = 20,vjust = 0), axis.title.y = element_text(angle = 90,size = 22,vjust = 0) ) + annotate("text", label =c("FDR = 0.05", "FDR = 0.01"), x = c(-10,-10), y = c(0.9,2.9), size = 5, colour = "black") dev.off() ``` CWG ```{r} unannotated_genes <- annot_genes$FundSGgenes[annot_genes$AnyAnnots == 0] CWG_genes <- annot_genes$FundSGgenes[annot_genes$FundSGgenes %in% gsub("\\..*","",gene_cluster3)] annotation_gene_status <- if_else(rownames(qlf.SGFundvsSGSex$table) %in% CWG_genes, true = "CWG", if_else(rownames(qlf.SGFundvsSGSex$table) %in% unannotated_genes ,true="Not Annotated", false = "Not Enriched")) annotation_gene_status<- factor(annotation_gene_status,c("Not Enriched","Not Annotated","CWG")) pdf(file = "MDplot_FundSGSpp_CWG_FundVSSexSalGl.pdf") par(mar = c(5.1, 7.1, 4.1, 2.1)) plotMD(qlf.SGFundvsSGSex, status = annotation_gene_status, main = NULL, xlab = "Mean Expression Level (log2 CPM)", ylab = "Salivary Gland (Fundatrix - Sexual)\nFold Difference (log2)", hl.col = c(colwalpha(cbbPalette["red"],alpha = .6),colwalpha(cbbPalette["blue"],alpha = .6)), hl.cex = c(.75,.75), cex.lab = 2, cex.axis = 2, legend ="topleft" ) dev.off() #Volcano plots of same above. DEredgrSG <- qlf.SGFundvsSGSex$table DEredgrSG$qValue <- p.adjust(DEredgrSG$PValue, method = "fdr") orderedDEredgrSG<-DEredgrSG[order(DEredgrSG$qValue),] #find cutoffs P05cutoff<-orderedDEredgrSG[orderedDEredgrSG$qValue>0.05,][1,] P01cutoff<-orderedDEredgrSG[orderedDEredgrSG$qValue>0.01,][1,] ptlabelsize = 4 p <- P05cutoff$PValue keyvals <- rep('grey75', nrow(DEredgrSG)) names(keyvals) <- rep('NS', nrow(DEredgrSG)) keyvals[which(annotation_gene_status == "CWG")] <- cbbPalette["blue"] names(keyvals)[which(annotation_gene_status == "CWG")] <- 'CWG' keyvals[which(annotation_gene_status == "Not Annotated")] <- cbbPalette["red"] names(keyvals)[which(annotation_gene_status == "Not Annotated")] <- 'Not Annotated' unique(keyvals) unique(names(keyvals)) pdf("VPFundVsSexSG_CWG.pdf") p1 <- EnhancedVolcano(toptable = DEredgrSG, lab = NA, x = "logFC", y = "PValue", title = NULL, subtitle = NULL, caption = NULL, colCustom = keyvals, pCutoff = p, FCcutoff = FALSE, cutoffLineType = "blank", hline = c(p,P01cutoff$PValue), hlineCol = "black", hlineType = "longdash", hlineWidth = 0.5, pointSize = c(ifelse(keyvals == 'grey75', 1, 3)), ylim = c(0, max(-log10(DEredgrSG$PValue), na.rm=TRUE)), xlab = "Salivary Gland (Fundatrix - Sexual)\nFold Change (log2)" ) p1 + theme(legend.position = c(.2,.97), legend.text = element_text(size = 16, colour = "black"), axis.text.x = element_text(angle = 0, size = 18, vjust= 1), axis.text.y = element_text(angle = 0, size = 18, vjust= 0.5), axis.title.x = element_text(angle = 0,size = 20,vjust = 0), axis.title.y = element_text(angle = 90,size = 22,vjust = 0) ) + annotate("text", label =c("FDR = 0.05", "FDR = 0.01"), x = c(-15,-15), y = c(0.9,2.95), size = 5, colour = "black") dev.off() ``` Unclustered ```{r} unannotated_genes <- annot_genes$FundSGgenes[annot_genes$AnyAnnots == 0] unclustered_genes <- annot_genes$FundSGgenes[annot_genes$FundSGgenes %in% gsub("\\..*","",gene_cluster1)] annotation_gene_status <- if_else(rownames(qlf.SGFundvsSGSex$table) %in% unclustered_genes, true = "Unclustered", if_else(rownames(qlf.SGFundvsSGSex$table) %in% unannotated_genes ,true="Not Annotated", false = "Not Enriched")) annotation_gene_status<- factor(annotation_gene_status,c("Not Enriched","Not Annotated","Unclustered")) pdf(file = "MDplot_FundSGSpp_Unclustered_FundVSSexSalGl.pdf") par(mar = c(5.1, 7.1, 4.1, 2.1)) plotMD(qlf.SGFundvsSGSex, status = annotation_gene_status, main = NULL, xlab = "Mean Expression Level (log2 CPM)", ylab = "Salivary Gland (Fundatrix - Sexual)\nFold Difference (log2)", hl.col = c(colwalpha(cbbPalette["red"],alpha = .6),colwalpha(cbbPalette["blue"],alpha = .6)), hl.cex = c(.75,.75), cex.lab = 2, cex.axis = 2, legend ="topleft" ) dev.off() #Volcano plots of same above. DEredgrSG <- qlf.SGFundvsSGSex$table DEredgrSG$qValue <- p.adjust(DEredgrSG$PValue, method = "fdr") orderedDEredgrSG<-DEredgrSG[order(DEredgrSG$qValue),] #find cutoffs P05cutoff<-orderedDEredgrSG[orderedDEredgrSG$qValue>0.05,][1,] P01cutoff<-orderedDEredgrSG[orderedDEredgrSG$qValue>0.01,][1,] ptlabelsize = 4 p <- P05cutoff$PValue keyvals <- rep('grey75', nrow(DEredgrSG)) names(keyvals) <- rep('NS', nrow(DEredgrSG)) keyvals[which(annotation_gene_status == "Unclustered")] <- cbbPalette["blue"] names(keyvals)[which(annotation_gene_status == "Unclustered")] <- 'Unclustered' keyvals[which(annotation_gene_status == "Not Annotated")] <- cbbPalette["red"] names(keyvals)[which(annotation_gene_status == "Not Annotated")] <- 'Not Annotated' unique(keyvals) unique(names(keyvals)) pdf("VPFundVsSexSG_Unclustered.pdf") p1 <- EnhancedVolcano(toptable = DEredgrSG, lab = NA, x = "logFC", y = "PValue", title = NULL, subtitle = NULL, caption = NULL, colCustom = keyvals, pCutoff = p, FCcutoff = FALSE, cutoffLineType = "blank", hline = c(p,P01cutoff$PValue), hlineCol = "black", hlineType = "longdash", hlineWidth = 0.5, pointSize = c(ifelse(keyvals == 'grey75', 1, 3)), ylim = c(0, max(-log10(DEredgrSG$PValue), na.rm=TRUE)), xlab = "Salivary Gland (Fundatrix - Sexual)\nFold Change (log2)", col = c("black", "black", "purple", "cyan") ) p1 + theme(legend.position = c(.2,.97), legend.text = element_text(size = 16, colour = "black"), axis.text.x = element_text(angle = 0, size = 18, vjust= 1), axis.text.y = element_text(angle = 0, size = 18, vjust= 0.5), axis.title.x = element_text(angle = 0,size = 20,vjust = 0), axis.title.y = element_text(angle = 90,size = 22,vjust = 0) ) + annotate("text", label =c("FDR = 0.05", "FDR = 0.01"), x = c(-15,-15), y = c(0.9,2.95), size = 5, colour = "black") dev.off() ``` life cycle plots ```{r} ybiCYCle <- ysub[row.names(ysub) %in% sub("\\..*","",gene_cluster2),] life_cycle_counts <- data.frame( "gene" = row.names(ybiCYCle$counts), "maxCount" = rowMeans(cpm(ybiCYCle)[,ybiCYCle$samples$group == "G1SG"]), "G1SG" = rowMeans(cpm(ybiCYCle)[,ybiCYCle$samples$group == "G1SG"]), "G2SG" = rowMeans(cpm(ybiCYCle)[,ybiCYCle$samples$group == "G2SG"]), "GBetSG" = rowMeans(cpm(ybiCYCle)[,ybiCYCle$samples$group == "GBetSG"]), "SexSG" = rowMeans(cpm(ybiCYCle)[,ybiCYCle$samples$group == "SexSG"]) ) life_cycle_counts_for_plot <- life_cycle_counts %>% gather(Generation, Counts, G1SG:SexSG) life_cycle_counts_for_plot$maxCount <- life_cycle_counts_for_plot$maxCount + 1 life_cycle_counts_for_plot$Counts <- life_cycle_counts_for_plot$Counts + 1 pdf("LifeCycleGallBicycleCounts.pdf") ggplot(data = life_cycle_counts_for_plot, mapping = aes(x = Generation,y=Counts, group = gene)) + scale_color_gradientn(colours=c('red', 'yellow', 'cyan', 'blue'), trans = "log10")+ geom_line(size = .4, alpha=.3,aes(colour=maxCount)) + geom_jitter(size = 1, alpha=.9,col = "black", width = .33, height = 0 ) + scale_y_log10(breaks = c(1,11,101,1001,10001), labels = c(0,10,100,1000,10000), limits = c(1,3e4), name="Counts Per Million Reads") + geom_hline(yintercept=c(log10(1)),alpha=0.5) + scale_x_discrete(labels = c("G1SG"="G1","G2SG"="G2","GBetSG"="G5","SexSG"="G7")) + theme(axis.text.x=element_text(size=16), axis.text.y=element_text(size=16, vjust = 0.5), axis.title.x=element_text(size=20), axis.title.y=element_text(size=20), legend.position = "none" ) dev.off() ybiCYCle <- ysub[row.names(ysub) %in% sub("\\..*","",gene_cluster3),] life_cycle_counts <- data.frame( "gene" = row.names(ybiCYCle$counts), "maxCount" = rowMeans(cpm(ybiCYCle)[,ybiCYCle$samples$group == "G1SG"]), "G1SG" = rowMeans(cpm(ybiCYCle)[,ybiCYCle$samples$group == "G1SG"]), "G2SG" = rowMeans(cpm(ybiCYCle)[,ybiCYCle$samples$group == "G2SG"]), "GBetSG" = rowMeans(cpm(ybiCYCle)[,ybiCYCle$samples$group == "GBetSG"]), "SexSG" = rowMeans(cpm(ybiCYCle)[,ybiCYCle$samples$group == "SexSG"]) ) life_cycle_counts_for_plot <- life_cycle_counts %>% gather(Generation, Counts, G1SG:SexSG) life_cycle_counts_for_plot$maxCount <- life_cycle_counts_for_plot$maxCount + 1 life_cycle_counts_for_plot$Counts <- life_cycle_counts_for_plot$Counts + 1 pdf("LifeCycleGallCWGCounts.pdf") ggplot(data = life_cycle_counts_for_plot, mapping = aes(x = Generation,y=Counts, group = gene)) + scale_color_gradientn(colours=c('red', 'yellow', 'cyan', 'blue'), trans = "log10")+ geom_line(size = .4, alpha=.3,aes(colour=maxCount)) + geom_jitter(size = 1, alpha=.9,col = "black", width = .33, height = 0 ) + scale_y_log10(breaks = c(1,11,101,1001,10001), labels = c(0,10,100,1000,10000), limits = c(1,3e4), name="Counts Per Million Reads") + geom_hline(yintercept=c(log10(1)),alpha=0.5) + scale_x_discrete(labels = c("G1SG"="G1","G2SG"="G2","GBetSG"="G5","SexSG"="G7")) + theme(axis.text.x=element_text(size=16), axis.text.y=element_text(size=16, vjust = 0.5), axis.title.x=element_text(size=20), axis.title.y=element_text(size=20), legend.position = "none" ) dev.off() ybiCYCle <- ysub[row.names(ysub) %in% sub("\\..*","",gene_cluster1),] life_cycle_counts <- data.frame( "gene" = row.names(ybiCYCle$counts), "maxCount" = rowMeans(cpm(ybiCYCle)[,ybiCYCle$samples$group == "G1SG"]), "G1SG" = rowMeans(cpm(ybiCYCle)[,ybiCYCle$samples$group == "G1SG"]), "G2SG" = rowMeans(cpm(ybiCYCle)[,ybiCYCle$samples$group == "G2SG"]), "GBetSG" = rowMeans(cpm(ybiCYCle)[,ybiCYCle$samples$group == "GBetSG"]), "SexSG" = rowMeans(cpm(ybiCYCle)[,ybiCYCle$samples$group == "SexSG"]) ) life_cycle_counts_for_plot <- life_cycle_counts %>% gather(Generation, Counts, G1SG:SexSG) life_cycle_counts_for_plot$maxCount <- life_cycle_counts_for_plot$maxCount + 1 life_cycle_counts_for_plot$Counts <- life_cycle_counts_for_plot$Counts + 1 pdf("LifeCycleGallUnclusteredCounts.pdf") ggplot(data = life_cycle_counts_for_plot, mapping = aes(x = Generation,y=Counts, group = gene)) + scale_color_gradientn(colours=c('red', 'yellow', 'cyan', 'blue'), trans = "log10")+ geom_line(size = .4, alpha=.3,aes(colour=maxCount)) + geom_jitter(size = 1, alpha=.9,col = "black", width = .33, height = 0 ) + scale_y_log10(breaks = c(1,11,101,1001,10001), labels = c(0,10,100,1000,10000), limits = c(1,3e4), name="Counts Per Million Reads") + geom_hline(yintercept=c(log10(1)),alpha=0.5) + scale_x_discrete(labels = c("G1SG"="G1","G2SG"="G2","GBetSG"="G5","SexSG"="G7")) + theme(axis.text.x=element_text(size=16), axis.text.y=element_text(size=16, vjust = 0.5), axis.title.x=element_text(size=20), axis.title.y=element_text(size=20), legend.position = "none" ) dev.off() ``` Plot histograms of expression levels for each group ```{r} #renumber gene_clusters to match renamed clusters gene_clusters<-cutree(clustercol, k = 3) gene_clusters <- data.frame(gene = gsub("\\..*","",names(gene_clusters)), cluster = as.vector(gene_clusters)) gene_clusters<-unique(gene_clusters) #gene_clusters <- rbind(gene_clusters, data.frame(gene = annotated_genes, cluster = 5)) SGUU_scores <- qlf.SGFundvsSGSex$table[row.names(qlf.SGFundvsSGSex$table) %in% gene_clusters$gene,] SGUU_scores$gene <- gene_clusters$gene SGUU_scores <- merge(SGUU_scores, gene_clusters, by= "gene") pdf(file = "Density_logCPM_groups.pdf") ggplot(data = SGUU_scores, aes(data = logCPM, group = cluster, color = as.factor(cluster))) + geom_line(aes(logCPM), size=1.5, stat="density") + #geom_density() + scale_color_discrete(name=NULL, labels = c("Unclustered","biCYCle genes", "CWG genes")) + xlab("Counts Per Million (log2)") + ylab("Density") + theme(legend.position = c(.8,.9), legend.text = element_text(size = 16), axis.text=element_text(size=18), axis.title=element_text(size=22), plot.margin = margin(.5, .5, .5, .5, "cm")) dev.off() ``` Are signal peptides over-represented in any of the three gene clusters? Chi^2 of sigP with gene cluster ```{r} sigp_genes<-rownames(annot_genes[!is.na(annot_genes$sigp),]) Num_Clus1_sigp <- sum(sub("\\..*","",gene_cluster1) %in% sigp_genes) Num_Clus2_sigp <- sum(sub("\\..*","",gene_cluster2) %in% sigp_genes) Num_Clus3_sigp <- sum(sub("\\..*","",gene_cluster3) %in% sigp_genes) print("Proportion of sigP genes in Cluster 1, 2, 3") sigP_prop_Clus1 <- Num_Clus1_sigp/length(gene_cluster1) sigP_prop_Clus2 <- Num_Clus2_sigp/length(gene_cluster2) sigP_prop_Clus3 <- Num_Clus3_sigp/length(gene_cluster3) sigP_prop_Clus1 sigP_prop_Clus2 sigP_prop_Clus3 ``` sigp strongly enriched in bicycle and CWG Do unclustered genes with sigP show strong DE? ```{r} #get names of Cluster 5 sigP genes Cluster1_sigP <- sigp_genes[sigp_genes %in% sub("\\..*","",gene_cluster1)] pointcols <- c(colwalpha("firebrick",.7)) pointsizes <- c(.9) pointtypes <- c(16) annotation_gene_status <- if_else(rownames(qlf.SGFundvsSGSex$table) %in% Cluster1_sigP,true="Cluster1_sigP", false = "Not Enriched") annotation_gene_status<- factor(annotation_gene_status,c("Not Enriched","Cluster1_sigP")) pdf(file = "MDplot_Cluster1_sigP.PDF") plotMD(qlf.SGFundvsSGSex, status = annotation_gene_status,main="Cluster 1 sigP", xlab = "Mean Expression Level (log2 CPM)", ylab = "Salivary Gland (Fundatrix - Sexual)\nFold Difference (log2)", scale_y_continuous(breaks = c(-10,0,10,20)), hl.col = pointcols, hl.cex = pointsizes, hl.pch = pointtypes, legend = FALSE) dev.off() ``` Examine biCYCle gene expression patterns across the fundatrix instars ```{r} ybiCYCle <- ysub[row.names(ysub) %in% sub("\\..*","",gene_cluster2),] life_cycle_counts <- data.frame( "gene" = row.names(ybiCYCle$counts), "maxCount" = rowMeans(cpm(ybiCYCle)[,ybiCYCle$samples$group == "G1SG"]), "1st" = rowMeans(cpm(ybiCYCle)[,grepl(pattern = "1st",x = ybiCYCle$samples$files) & ybiCYCle$samples$group == "G1SG"]), "2nd" = rowMeans(cpm(ybiCYCle)[,grepl(pattern = "2nd",x = ybiCYCle$samples$files) & ybiCYCle$samples$group == "G1SG"]), "3rd" = rowMeans(cpm(ybiCYCle)[,grepl(pattern = "3rd",x = ybiCYCle$samples$files) & ybiCYCle$samples$group == "G1SG"]), "4th" = rowMeans(cpm(ybiCYCle)[,grepl(pattern = "4th",x = ybiCYCle$samples$files) & ybiCYCle$samples$group == "G1SG"]), "Ad" = rowMeans(cpm(ybiCYCle)[,grepl(pattern = "adult",x = ybiCYCle$samples$files) & ybiCYCle$samples$group == "G1SG"]) ) life_cycle_counts_for_plot <- life_cycle_counts %>% gather(Stage, Counts, X1st:Ad) life_cycle_counts_for_plot$maxCount <- life_cycle_counts_for_plot$maxCount + 1 life_cycle_counts_for_plot$Counts <- life_cycle_counts_for_plot$Counts + 1 pdf("FundNymphbiCYCleCounts.pdf") ggplot(data = life_cycle_counts_for_plot, mapping = aes(x = Stage,y=Counts, group = gene)) + scale_color_gradientn(colours=c('red', 'yellow', 'cyan', 'blue'), trans = "log10")+ geom_line(size = .4, alpha=.3,aes(colour=maxCount)) + geom_jitter(size = 1, alpha=.9,col = "black", width = .33, height = 0 ) + scale_y_log10(breaks = c(1,11,101,1001,10001), labels = c(0,10,100,1000,10000), limits = c(1,7e4), name="Counts Per Million Reads") + geom_hline(yintercept=c(log10(1)),alpha=0.5) + scale_x_discrete(labels = c("X1st"="1","X2nd"="2","X3rd"="3","X4th"="4","Ad"="Ad"), limits = c("X1st","X2nd","X3rd","X4th","Ad"), name = "Fundatrix Stage") + theme(axis.text.x=element_text(size=24), axis.text.y=element_text(size=24, vjust = 0.5), axis.title.x=element_text(size=28), axis.title.y=element_text(size=28), legend.position = "none" ) dev.off() ``` Examine Cluster3 gene expression patterns across the life cycle ```{r} yCluster2 <- ysub[row.names(ysub) %in% sub("\\..*","",gene_cluster3),] life_cycle_counts <- data.frame( "gene" = row.names(yCluster2$counts), "maxCount" = rowMeans(cpm(yCluster2)[,yCluster2$samples$group == "G1SG"]), "G1SG" = rowMeans(cpm(yCluster2)[,yCluster2$samples$group == "G1SG"]), "G2SG" = rowMeans(cpm(yCluster2)[,yCluster2$samples$group == "G2SG"]), "GBetSG" = rowMeans(cpm(yCluster2)[,yCluster2$samples$group == "GBetSG"]), "SexSG" = rowMeans(cpm(yCluster2)[,yCluster2$samples$group == "SexSG"]) ) life_cycle_counts_for_plot <- life_cycle_counts %>% gather(Generation, Counts, G1SG:SexSG) life_cycle_counts_for_plot$maxCount <- life_cycle_counts_for_plot$maxCount + 1 life_cycle_counts_for_plot$Counts <- life_cycle_counts_for_plot$Counts + 1 pdf("LifeCycleCluster3Counts.pdf") ggplot(data = life_cycle_counts_for_plot, mapping = aes(x = Generation,y=Counts, group = gene)) + ggtitle(label = "Cluster 3") + scale_color_gradientn(colours=c('red', 'yellow', 'cyan', 'blue'), trans = "log10")+ geom_line(size = .4, alpha=.3,aes(colour=maxCount)) + geom_jitter(size = 1, alpha=.9,col = "black", width = .33, height = 0 ) + scale_y_log10(breaks = c(1,11,101,1001,10001), labels = c(0,10,100,1000,10000), limits = c(1,7e4), name="Counts Per Million Reads") + geom_hline(yintercept=c(log10(1)),alpha=0.5) + scale_x_discrete(labels = c("G1SG"="G1","G2SG"="G2","GBetSG"="G5","SexSG"="G7")) + theme(axis.text.x=element_text(size=16), axis.text.y=element_text(size=16, vjust = 0.5), axis.title.x=element_text(size=20), axis.title.y=element_text(size=20), legend.position = "none", plot.title = element_text(hjust=0.5, margin = margin(t = 10, b = -20), size = 20) ) dev.off() ``` Examine Cluster1 gene expression patterns across the life cycle ```{r} yCluster3 <- ysub[row.names(ysub) %in% sub("\\..*","",gene_cluster1),] life_cycle_counts <- data.frame( "gene" = row.names(yCluster3$counts), "maxCount" = rowMeans(cpm(yCluster3)[,yCluster3$samples$group == "G1SG"]), "G1SG" = rowMeans(cpm(yCluster3)[,yCluster3$samples$group == "G1SG"]), "G2SG" = rowMeans(cpm(yCluster3)[,yCluster3$samples$group == "G2SG"]), "GBetSG" = rowMeans(cpm(yCluster3)[,yCluster3$samples$group == "GBetSG"]), "SexSG" = rowMeans(cpm(yCluster3)[,yCluster3$samples$group == "SexSG"]) ) life_cycle_counts_for_plot <- life_cycle_counts %>% gather(Generation, Counts, G1SG:SexSG) life_cycle_counts_for_plot$maxCount <- life_cycle_counts_for_plot$maxCount + 1 life_cycle_counts_for_plot$Counts <- life_cycle_counts_for_plot$Counts + 1 pdf("LifeCycleCluster1Counts.pdf") ggplot(data = life_cycle_counts_for_plot, mapping = aes(x = Generation,y=Counts, group = gene)) + ggtitle(label = "Cluster 1") + scale_color_gradientn(colours=c('red', 'yellow', 'cyan', 'blue'), trans = "log10")+ geom_line(size = .4, alpha=.3,aes(colour=maxCount)) + geom_jitter(size = 1, alpha=.9,col = "black", width = .33, height = 0 ) + scale_y_log10(breaks = c(1,11,101,1001,10001), labels = c(0,10,100,1000,10000), limits = c(1,7e4), name="Counts Per Million Reads") + geom_hline(yintercept=c(log10(1)),alpha=0.5) + scale_x_discrete(labels = c("G1SG"="G1","G2SG"="G2","GBetSG"="G5","SexSG"="G7")) + theme(axis.text.x=element_text(size=16), axis.text.y=element_text(size=16, vjust = 0.5), axis.title.x=element_text(size=20), axis.title.y=element_text(size=20), legend.position = "none", plot.title = element_text(hjust=0.5, margin = margin(t = 10, b = -20), size = 20) ) dev.off() ``` ********************* ********************* Some genes related to bicycle in unclustered? Grab these and check how similar to bicycle ```{r} Cluster2_by_Cluster1<-SGUU_SGUU_similarity_matrix[gene_cluster2,gene_cluster1] Cluster1_hitby_Cluster2 <- Cluster2_by_Cluster1[,which(colSums(Cluster2_by_Cluster1)>20)] hist(colSums(Cluster1_hitby_Cluster2),20) genes_Cluster1_hitby_Cluster2 <- gsub(pattern = "\\..*",replacement = "",names(Cluster1_hitby_Cluster2)) ``` Find other salivary gland expressed small exon genes plot exon sizes - read in exon sizes of all genes ```{r message=FALSE, include=FALSE} #gff.genes already read in gff.exons <- rtracklayer::readGFF("Augustus.updated_w_annots.21Aug20.gff3.gz",filter=list(type="exon")) gff.exons$Parent <- unlist(gff.exons$Parent) parents <- unique(gff.exons$Parent) gff.exons$Parent <-factor(gff.exons$Parent,levels = parents) gff.exons <- data.frame(gff.exons) gff.median.exon.sizes <- gff.exons %>% mutate(size = end - start + 1) %>% group_by(Parent) %>% summarise(num_exons = n(),median = median(size)) P3Cgff.exons <- gff.exons[unlist(gff.exons$Parent %in% gene_cluster2),] P3C.median.exon.sizes <- P3Cgff.exons %>% mutate(size = end - start + 1) %>% group_by(Parent) %>% summarise(num_exons = n(),median = median(size)) ``` Plot exon size vs number for all genes Then color bicycle subset different color ```{r} pdf(file = "ExonSizevsNumExons.pdf",width=9,height=6,paper='special') plot(log10(gff.median.exon.sizes$median),gff.median.exon.sizes$num_exons,pch = 19, cex = .7,xlab = "Median Exon Size (bp, log10)", ylab = "Number of Exons Per Gene",col=colwalpha("firebrick",alpha=.15),axes=FALSE) #rgb(1,20/255,147/255,.15) points(log10(P3C.median.exon.sizes$median),P3C.median.exon.sizes$num_exons,pch = 19, cex = 1,xlab = "Median Exon Size (bp, log10)", ylab = "Number of Exons Per Gene",col=colwalpha("steelblue",.4)) #rgb(0,1,0,.4) axis(side=1, at=c(1,2,3,4),labels = c(10,100,1000,10000)) axis(side=2, at=c(0,50,100,150)) leg<-legend(x = c(3,4),y=c(130,80),legend=c("non-bicycle genes","bicycle genes"),col=c(rgb(1,20/255,147/255,.15),rgb(0,1,0,.4)),y.intersp=.2,pch=19,plot=FALSE) dev.off() #try to change position of text, unclear how to use this to plot #leg$text$y <- c(130,100) #average size of median P3C exons mean(P3C.median.exon.sizes$median) pdf(file = "ExonSizevsNumExonsZoom.pdf",width=9,height=6,paper='special') plot(gff.median.exon.sizes$median,gff.median.exon.sizes$num_exons,pch = 19, cex = .7,xlab = "Median Exon Size (bp)", ylab = "Number of Exons Per Gene",col=colwalpha("firebrick",alpha=.15),xlim=c(0,300),ylim=c(0,40),axes=FALSE) points(P3C.median.exon.sizes$median,P3C.median.exon.sizes$num_exons,pch = 19, cex = .7,xlab = "Median Exon Size (bp)", ylab = "Number of Exons Per Gene",col=colwalpha("steelblue",.4)) axis(side=1) axis(side=2) dev.off() ``` Clearly additional small exon genes in same region as bicycle. Grab these and annotate and make logo ```{r} Cluster1_smlexon<-gff.median.exon.sizes[gff.median.exon.sizes$num_exons>9 & gff.median.exon.sizes$median<60 & gff.median.exon.sizes$Parent %in% gene_cluster1,] Cluster1_smlexon_unlikebicycle <- Cluster1_smlexon[!Cluster1_smlexon %in% genes_Cluster1_hitby_Cluster2] genes_Cluster1_smlexon_unlikebicycle <- gsub(pattern = "\\..*",replacement = "",Cluster1_smlexon_unlikebicycle$Parent) Cluster1_smlexon_unlikebicycle$Parent qlf.SGvsWholeFund$table[genes_Cluster1_smlexon_unlikebicycle,] %>% arrange(desc(logCPM)) annot_genes[genes_Cluster1_smlexon_unlikebicycle,] write.csv(x = qlf.SGvsWholeFund$table[genes_Cluster1_smlexon_unlikebicycle,] %>% arrange(desc(logCPM)), file = "genes_Cluster1_smlexon_unlikebicycle.csv") ``` Align and logo these two sets of genes ```{r} write(x =paste(names(Cluster1_hitby_Cluster2)), file = "transcripts_Cluster1_hitby_Cluster2") write(x = str_trim(format(Cluster1_smlexon_unlikebicycle$Parent)), file = "transcripts_Cluster1_smlexon_unlikebicycle") ``` seqkit grep -f transcripts_Cluster1_hitby_Cluster2 Annotate/Augustus.updated_w_annots.21Aug20.pep.fa.gz > transcripts_Cluster1_hitby_Cluster2.fa seqkit grep -f transcripts_Cluster1_smlexon_unlikebicycle Annotate/Augustus.updated_w_annots.21Aug20.pep.fa.gz > transcripts_Cluster1_smlexon_unlikebicycle.fa mafft --auto --thread 2 transcripts_Cluster1_hitby_Cluster2.fa > transcripts_Cluster1_hitby_Cluster2.mafft_auto.fa mafft --auto --thread 2 transcripts_Cluster1_smlexon_unlikebicycle.fa > transcripts_Cluster1_smlexon_unlikebicycle.mafft_auto.fa trim alignments #include only seqns with at least 50% coverage trimal -in transcripts_Cluster1_hitby_Cluster2.mafft_auto.fa -out transcripts_Cluster1_hitby_Cluster2.mafft_auto.trimal50.fa -gt 0.50 trimal -in transcripts_Cluster1_smlexon_unlikebicycle.mafft_auto.fa -out transcripts_Cluster1_smlexon_unlikebicycle.mafft_auto.trimal50.fa -gt 0.50 trimal -in transcripts_Cluster1_hitby_Cluster2.mafft_auto.fa -out transcripts_Cluster1_hitby_Cluster2.mafft_auto.trimal30.fa -gt 0.30 trimal -in transcripts_Cluster1_smlexon_unlikebicycle.mafft_auto.fa -out transcripts_Cluster1_smlexon_unlikebicycle.mafft_auto.trimal30.fa -gt 0.30 ```{r} aln<-read.alignment("transcripts_Cluster1_hitby_Cluster2.mafft_auto.trimal50.fa",format="fasta") aln$seq <- unlist(aln$seq) aln$seq <- toupper(aln$seq) logo <- ggseqlogo(aln$seq) + theme(axis.text.x = element_blank()) pdf(file = "transcripts_Cluster1_hitby_Cluster2.mafft_auto.trimal50.pdf",width=20,height=4) logo dev.off() aln<-read.alignment("transcripts_Cluster1_smlexon_unlikebicycle.mafft_auto.trimal50.fa",format="fasta") aln$seq <- unlist(aln$seq) aln$seq <- toupper(aln$seq) logo <- ggseqlogo(aln$seq) + theme(axis.text.x = element_blank()) pdf(file = "transcripts_Cluster1_smlexon_unlikebicycle.mafft_auto.trimal50.pdf",width=20,height=4) logo dev.off() aln<-read.alignment("transcripts_Cluster1_hitby_Cluster2.mafft_auto.trimal30.fa",format="fasta") aln$seq <- unlist(aln$seq) aln$seq <- toupper(aln$seq) logo <- ggseqlogo(aln$seq) + theme(axis.text.x = element_blank()) pdf(file = "transcripts_Cluster1_hitby_Cluster2.mafft_auto.trimal30.pdf",width=20,height=4) logo dev.off() aln<-read.alignment("transcripts_Cluster1_smlexon_unlikebicycle.mafft_auto.trimal30.fa",format="fasta") aln$seq <- unlist(aln$seq) aln$seq <- toupper(aln$seq) logo <- ggseqlogo(aln$seq) + theme(axis.text.x = element_blank()) pdf(file = "transcripts_Cluster1_smlexon_unlikebicycle.mafft_auto.trimal30.pdf",width=20,height=4) logo dev.off() ``` Small exon genes show some similarity to bicycle genes, many Asparigines. CNYC-like genes. cluster1 hit by bicycle also show vague similarity. Make list of totally non-bicycle-like cluster 1 genes ```{r} non_bicycle_cluster1 <- gene_cluster1[!gene_cluster1 %in% Cluster1_smlexon_unlikebicycle$Parent] write(non_bicycle_cluster1, file = "non_bicycle_cluster1.names") bicycle_cluster1 <- gene_cluster1[gene_cluster1 %in% Cluster1_smlexon_unlikebicycle$Parent] write(bicycle_cluster1, file = "bicycle_cluster1.names") ``` seqkit grep -f non_bicycle_cluster1.names Annotate/Augustus.updated_w_annots.21Aug20.pep.fa.gz > non_bicycle_cluster1.fa mafft --auto --thread 2 non_bicycle_cluster1.fa > non_bicycle_cluster1.mafft_auto.fa trim alignments #include only seqns with at least 50% coverage trimal -in non_bicycle_cluster1.mafft_auto.fa -out non_bicycle_cluster1.mafft_auto.trimal50.fa -gt 0.50 trimal -in non_bicycle_cluster1.mafft_auto.fa -out non_bicycle_cluster1.mafft_auto.trimal30.fa -gt 0.30 ```{r} aln<-read.alignment("non_bicycle_cluster1.mafft_auto.trimal50.fa",format="fasta") aln$seq <- unlist(aln$seq) aln$seq <- toupper(aln$seq) logo <- ggseqlogo(aln$seq) + theme(axis.text.x = element_blank()) pdf(file = "non_bicycle_cluster1.mafft_auto.trimal50.pdf",width=20,height=4) logo dev.off() aln<-read.alignment("non_bicycle_cluster1.mafft_auto.trimal30.fa",format="fasta") aln$seq <- unlist(aln$seq) aln$seq <- toupper(aln$seq) logo <- ggseqlogo(aln$seq) + theme(axis.text.x = element_blank()) pdf(file = "non_bicycle_cluster1.mafft_auto.trimal30.pdf",width=20,height=4) logo dev.off() ``` ****************** ****************** Search for additional bicycle genes jackhmmer bicycle against whole peptome scp gene_cluster2.fa sternd@login2.int.janelia.org:hormaphis/Annotate bsub -n 32 -We 300 -o jackhmmer.clust22Hcor "/usr/local/hmmer-3.1b2/bin/jackhmmer --cpu 32 -E 0.01 --incE 0.01 --max --tblout jackhmmer.clust2_to_Hcor.hits.table.txt gene_cluster2.fa Augustus.updated_w_annots.21Aug20.pep.fa" scp sternd@login2.int.janelia.org:hormaphis/Annotate/jackhmmer.clust2_to_Hcor.hits.table.txt.gz Annotate/. Hierarchical clustering of jackhmmer hits ```{r} jhmmer_clust2_Hcor <- read_tblout("Annotate/jackhmmer.clust2_to_Hcor.hits.table.txt.gz") #using only significant hits #Use simple Bonferroni correction to get E-value threshold # num_searches <- jhmmer_clust2_Hcor %>% distinct(query_name) %>% summarize(num_searches = length(query_name)) num_searches <- as.integer(num_searches[1]) #I prefer a conservative cutoff of 0.01 P_val_cutoff <- 0.01 / num_searches #reduce table to only hits meeting this threshold clust2_Hcor_hits <- jhmmer_clust2_Hcor %>% dplyr::filter(best_domain_evalue <= P_val_cutoff) clust2_Hcor_similarity_matrix <- reshape2::dcast(clust2_Hcor_hits,query_name ~ domain_name,value.var = "sequence_score",fill=0) row.names(clust2_Hcor_similarity_matrix) <- clust2_Hcor_similarity_matrix$query_name clust2_Hcor_similarity_matrix$query_name <- NULL my.breaks <- c(seq(-4.4, -1.05, length.out=50),seq(-1, 2, length.out=50),seq(2.05,4.4, length.out=50)) ``` ```{r} distance = dist(data.matrix(clust2_Hcor_similarity_matrix), method = "canberra") clusterrow = hclust(distance, method = "ward.D2") distance = dist(data.matrix(t(clust2_Hcor_similarity_matrix)), method = "canberra") clustercol = hclust(distance, method = "ward.D2") ``` Label each of the groups ```{r} pdf(file = "Heatmap_clust2_2_Hcor.canberra.wardD2.labelled.pdf") heatmap.2(data.matrix(clust2_Hcor_similarity_matrix), Rowv = as.dendrogram(clusterrow), Colv = as.dendrogram(clustercol), col=bluered(149), trace="none", cexCol=.6, labRow=FALSE, labCol=FALSE, key=0, dendrogram='both', breaks = my.breaks ) dev.off() ``` Twp major clusters. Take each one, make logo ```{r} #cuts into 3 groups based on column clusters jckhmmer_gene_clusters<-cutree(clustercol, k = 2) #which cluster is which jckhmmer_gene_cluster1 <- names(jckhmmer_gene_clusters[jckhmmer_gene_clusters == 1]) jckhmmer_gene_cluster2 <- names(jckhmmer_gene_clusters[jckhmmer_gene_clusters == 2]) print("Number of transcripts in 'Clusters' 1, 2") length(jckhmmer_gene_cluster1) length(jckhmmer_gene_cluster2) print('Number of genes in Clusters') length(unique(gsub(pattern = "\\..*",replacement = "",x = jckhmmer_gene_cluster1))) length(unique(gsub(pattern = "\\..*",replacement = "",x = jckhmmer_gene_cluster2))) ``` 513 bicycle genes ```{r} write(x = jckhmmer_gene_cluster1, file = "jckhmmer_gene_cluster1") write(x = jckhmmer_gene_cluster2, file = "jckhmmer_gene_cluster2") ``` seqkit grep -f jckhmmer_gene_cluster1 Annotate/Augustus.updated_w_annots.21Aug20.pep.fa.gz > jckhmmer_gene_cluster1.fa seqkit grep -f jckhmmer_gene_cluster2 Annotate/Augustus.updated_w_annots.21Aug20.pep.fa.gz > jckhmmer_gene_cluster2.fa mafft --auto --thread 2 jckhmmer_gene_cluster1.fa > jckhmmer_gene_cluster1.mafft_auto.fa mafft --auto --thread 2 jckhmmer_gene_cluster2.fa > jckhmmer_gene_cluster2.mafft_auto.fa trim alignments #include only seqns with at least 50% coverage trimal -in jckhmmer_gene_cluster1.mafft_auto.fa -out jckhmmer_gene_cluster1.mafft_auto.trimal50.fa -gt 0.50 trimal -in jckhmmer_gene_cluster2.mafft_auto.fa -out jckhmmer_gene_cluster2.mafft_auto.trimal50.fa -gt 0.50 trimal -in jckhmmer_gene_cluster1.mafft_auto.fa -out jckhmmer_gene_cluster1.mafft_auto.trimal30.fa -gt 0.30 trimal -in jckhmmer_gene_cluster2.mafft_auto.fa -out jckhmmer_gene_cluster2.mafft_auto.trimal30.fa -gt 0.30 ```{r} aln<-read.alignment("jckhmmer_gene_cluster1.mafft_auto.trimal50.fa",format="fasta") aln$seq <- unlist(aln$seq) aln$seq <- toupper(aln$seq) logo <- ggseqlogo(aln$seq) + theme(axis.text.x = element_blank()) pdf(file = "jckhmmer_gene_cluster1.mafft_auto.trimal50.pdf",width=20,height=4) logo dev.off() aln<-read.alignment("jckhmmer_gene_cluster2.mafft_auto.trimal50.fa",format="fasta") aln$seq <- unlist(aln$seq) aln$seq <- toupper(aln$seq) logo <- ggseqlogo(aln$seq) + theme(axis.text.x = element_blank()) pdf(file = "jckhmmer_gene_cluster2.mafft_auto.trimal50.pdf",width=20,height=4) logo dev.off() aln<-read.alignment("jckhmmer_gene_cluster1.mafft_auto.trimal30.fa",format="fasta") aln$seq <- unlist(aln$seq) aln$seq <- toupper(aln$seq) logo <- ggseqlogo(aln$seq) + theme(axis.text.x = element_blank()) pdf(file = "jckhmmer_gene_cluster1.mafft_auto.trimal30.pdf",width=20,height=4) logo dev.off() aln<-read.alignment("jckhmmer_gene_cluster2.mafft_auto.trimal30.fa",format="fasta") aln$seq <- unlist(aln$seq) aln$seq <- toupper(aln$seq) logo <- ggseqlogo(aln$seq) + theme(axis.text.x = element_blank()) pdf(file = "jckhmmer_gene_cluster2.mafft_auto.trimal30.pdf",width=20,height=4) logo dev.off() ``` Cluster 2 are clear bicycle = 513 genes Cluster 1 looks related = 80 genes make logo for all bicycle genes extract just 1 transcript of each bicycle genes (take -00001) ```{r} bicycle_transcripts <- c(jckhmmer_gene_cluster2[grep(pattern = "-00001", x = jckhmmer_gene_cluster2)], jckhmmer_gene_cluster1[grep(pattern = "-00001", x = jckhmmer_gene_cluster1)]) write(x = bicycle_transcripts,file = "bicycle_transcripts") ``` seqkit grep -f bicycle_transcripts Annotate/Augustus.updated_w_annots.21Aug20.pep.fa.gz > bicycle_transcripts.fa mafft --auto --thread 2 bicycle_transcripts.fa > bicycle_transcripts.mafft_auto.fa trimal -in bicycle_transcripts.mafft_auto.fa -out bicycle_transcripts.mafft_auto.trimal50.fa -gt 0.50 ```{r} aln<-read.alignment("bicycle_transcripts.mafft_auto.trimal50.fa",format="fasta") aln$seq <- unlist(aln$seq) aln$seq <- toupper(aln$seq) logo <- ggseqlogo(aln$seq) + theme(axis.text.x = element_blank()) pdf(file = "bicycle_transcripts.mafft_auto.trimal50.pdf",width=20,height=4) logo dev.off() ``` Plot exon # vs size for all bicycle genes ```{r message=FALSE, include=FALSE} #gff.genes already read in gff.exons <- rtracklayer::readGFF("Augustus.updated_w_annots.21Aug20.gff3.gz",filter=list(type="exon")) gff.exons$Parent <- unlist(gff.exons$Parent) parents <- unique(gff.exons$Parent) gff.exons$Parent <-factor(gff.exons$Parent,levels = parents) gff.exons <- data.frame(gff.exons) gff.median.exon.sizes <- gff.exons %>% mutate(size = end - start + 1) %>% group_by(Parent) %>% summarise(num_exons = n(),median = median(size)) P3Cgff.exons <- gff.exons[unlist(gff.exons$Parent %in% c(jckhmmer_gene_cluster2)),] P3C.median.exon.sizes <- P3Cgff.exons %>% mutate(size = end - start + 1) %>% group_by(Parent) %>% summarise(num_exons = n(),median = median(size)) ``` Plot exon size vs number for all genes Then color bicycle subset different color ```{r} pdf(file = "ExonSizevsNumExons_bicycle.pdf",width=9,height=6,paper='special') par(mar = c(5.1,4.3,4.1,2.1)) plot(log10(gff.median.exon.sizes$median),gff.median.exon.sizes$num_exons,pch = 19, cex = .7,xlab = "Median Exon Size (bp, log10)", ylab = "Number of Exons Per Gene",col=colwalpha("firebrick",alpha=.15),axes=FALSE, cex.lab = 1.5) points(log10(P3C.median.exon.sizes$median),P3C.median.exon.sizes$num_exons,pch = 19, cex = .7,xlab = "Median Exon Size (bp, log10)", ylab = "Number of Exons Per Gene",col=colwalpha("steelblue",.4)) #rgb(0,1,0,.4) axis(side=1, at=c(1,2,3,4),labels = c(10,100,1000,10000), cex.axis = 1.25) axis(side=2, at=c(0,50,100,150), las = 2, cex.axis = 1.25) legend(x = c(3,4),y=c(150,115),legend=c("non bicycle genes","bicycle genes"), col=c("firebrick","steelblue"),pch=19, cex = 1.4) dev.off() #try to change position of text, unclear how to use this to plot #leg$text$y <- c(130,100) #average size of median P3C exons mean(P3C.median.exon.sizes$median) pdf(file = "ExonSizevsNumExonsZoom_bicycle.pdf",width=9,height=6,paper='special') plot(gff.median.exon.sizes$median,gff.median.exon.sizes$num_exons,pch = 19, cex = .7,xlab = "Median Exon Size (bp)", ylab = "Number of Exons Per Gene",col=colwalpha("firebrick",alpha=.15),xlim=c(0,300),ylim=c(0,40),axes=FALSE) points(P3C.median.exon.sizes$median,P3C.median.exon.sizes$num_exons,pch = 19, cex = .7,xlab = "Median Exon Size (bp)", ylab = "Number of Exons Per Gene",col=colwalpha("steelblue",.4)) axis(side=1) axis(side=2) dev.off() ``` Clearly additional small exon genes in same region as bicycle. Grab these and annotate and make logo ```{r} Cluster1_smlexon<-gff.median.exon.sizes[gff.median.exon.sizes$num_exons>9 & gff.median.exon.sizes$median<60 & gff.median.exon.sizes$Parent %in% gene_cluster1,] Cluster1_smlexon_unlikebicycle <- Cluster1_smlexon[!Cluster1_smlexon %in% genes_Cluster1_hitby_Cluster2] genes_Cluster1_smlexon_unlikebicycle <- gsub(pattern = "\\..*",replacement = "",Cluster1_smlexon_unlikebicycle$Parent) Cluster1_smlexon_unlikebicycle$Parent qlf.SGvsWholeFund$table[genes_Cluster1_smlexon_unlikebicycle,] %>% arrange(desc(logCPM)) annot_genes[genes_Cluster1_smlexon_unlikebicycle,] write.csv(x = qlf.SGvsWholeFund$table[genes_Cluster1_smlexon_unlikebicycle,] %>% arrange(desc(logCPM)), file = "genes_bicycle_smlexon_unlikebicycle.csv") ``` ******************* ******************* Make tree of biCYCle genes FastTree < jckhmmer_gene_cluster2.mafft_auto.fa > bicycle_transcripts.mafft_auto.tree ```{r} tree<-read.newick("bicycle_transcripts.mafft_auto.tree") CYCtree <- ggtree(tree, size=.2, aes(color="steelblue")) + scale_y_reverse() + scale_color_manual(values = c("steelblue")) CYCtree_scale <- CYCtree + theme_tree2() + xlab(label = "Divergence (amino acid substitutions per site)") + theme(axis.title.x = element_text(size = 16), axis.text.x = element_text(size=14), axis.line=element_line(color='black',size = .2), legend.position = 'none') pdf(file = "biCYCle_tree.pdf",width = 6.2,height = 7) CYCtree_scale dev.off() ``` Examine gene structure ```{r} Hcornu_chroms <- data.frame( chrom = c("chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9"), chromStart = c(1, 1, 1, 1, 1, 1, 1, 1, 1), chromEnd = c(60222264, 39701450, 38924103, 36083769, 32569495, 24543776, 23553545, 20814484, 17934322), name = c("chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9"), chrom_real_name = c("ScPoQx8_4268;HRSCAF=5273", "ScPoQx8_1269;HRSCAF=1516", "ScPoQx8_9746;HRSCAF=11751", "ScPoQx8_7909;HRSCAF=9379", "ScPoQx8_3632;HRSCAF=4405", "ScPoQx8_9972;HRSCAF=12060", "ScPoQx8_8456;HRSCAF=10181", "ScPoQx8_7882;HRSCAF=9334", "ScPoQx8_9720;HRSCAF=11714" ), score = c(1, 1, 1, 1, 1, 1, 1, 1, 1) ) hplotter <- regioneR::toGRanges(Hcornu_chroms) kp <- plotKaryotype(hplotter) ``` import real data, break into chromosomes, strip transcript id and "g" ```{r message=FALSE, warning=FALSE} #read in P3C genes as dataframe P3C <- read.delim(file="jckhmmer_gene_cluster2",head=FALSE,col.names = "transcript",colClasses = "character") P3C$gene <- gsub("\\..*","",P3C$transcript) #read in gene info gff.genes <- readGFF("Augustus.updated_w_annots.21Aug20.gff3.gz",filter=list(type="gene")) #get reduced gff that includes only P3Cgenes P3Cgff <- gff.genes[gff.genes$ID %in% P3C$gene,] #sort P3Cgff by chromosome and then by gene position P3Cgff <- P3Cgff[order(P3Cgff$seqid,P3Cgff$start),] #use gene ID # as surrogate for gene position # e.g. g134 > 134 P3Cgff$gene_position <- gsub("[^0-9\\.]", "",P3Cgff$ID) #calculate neighbor distances for all genes P3Cgff$num_genes_between <- c(diff(as.integer(P3Cgff$gene_position)) - 1,NA) #some genes are g###a and g###b. These are calculated as -1. Change these to 0 P3Cgff$num_genes_between[P3Cgff$num_genes_between == -1] <- 0 P3Cgff$gene_length <- P3Cgff$end - P3Cgff$start #then NA for genes at start of chromosomes (first gene that is not in a consecutive duplicate is marked FALSE) P3Cgff$num_genes_between[(which(!duplicated(P3Cgff$seqid))-1)[2:8]] = NA ggplot(P3Cgff, aes(x=num_genes_between)) + geom_histogram(na.rm = TRUE,bins=200) + labs(title="Number of genes interrupting biCYCle genes", x = "Number of genes between biCYCle genes", y = "Number of biCYCle genes") + theme_classic() + theme(plot.title = element_text(hjust = 0.5)) ggplot(P3Cgff, aes(x=num_genes_between)) + geom_histogram(na.rm = TRUE,bins=21,binwidth = 1) + labs(title="Number of genes interrupting biCYCle genes", x = "Number of genes between biCYCle genes", y = "Number of biCYCle genes") + theme_classic() + theme(plot.title = element_text(hjust = 0.5)) + scale_x_continuous(limits = c(-1, 21)) pdf(file = "hist_biCYCle_gene_lengths.pdf", width = 9, height = 4) ggplot(P3Cgff, aes(x=gene_length)) + geom_histogram(na.rm = TRUE,bins=21) + labs(title="Length of biCYCle genes", x = "Gene length", y = "Number of biCYCle genes") + theme_classic() + theme(plot.title = element_text(hjust = 0.5)) dev.off() ``` biCYCle genes separated by <= 10 genes will be considered within a single cluster Define cluster by first 5' gene in cluster ```{r} cluster_break_value = 10 P3Cgff$new_cluster <- data.table::shift(P3Cgff$num_genes_between > cluster_break_value,n=1) #replace NA (beginning of chrom) with TRUE (start of cluster) P3Cgff$new_cluster[is.na(P3Cgff$new_cluster)] <- TRUE #name clusterID as first gene in cluster P3Cgff$clusterID <- ifelse(P3Cgff$new_cluster == TRUE,P3Cgff$ID,NA) #fill in remaining with na.locf from zoo package P3Cgff$clusterID <- na.locf(P3Cgff$clusterID) #add column of summary chromosome names P3Cgff <- merge(P3Cgff,Hcornu_chroms,by.x = "seqid",by.y="chrom_real_name") gff.genes <- merge(gff.genes,Hcornu_chroms,by.x = "seqid",by.y="chrom_real_name") print("Number of unique clusters") length(unique(P3Cgff$clusterID)) ``` Plot genomic distribution of clusters and histogram of #genes per cluster ```{r} clusterID <- unique(P3Cgff$clusterID) P3Cgff$clusterID <-factor(P3Cgff$clusterID,levels = clusterID) x0 <- P3Cgff %>% group_by(clusterID) %>% summarise(x0 = min(start)) x1 <- P3Cgff %>% group_by(clusterID) %>% summarise(x1 = max(end)) y0 <- integer(length(clusterID)) y1 <- P3Cgff %>% group_by(clusterID) %>% summarise(y1=n()) chr <- P3Cgff %>% group_by(clusterID) %>% summarise(chr=unique(chrom)) clusters <-data.frame(chrs = chr$chr,start = x0$x0, end = x1$x1,y0, y1 = y1$y1,clusterID) #remove clusters that do not occur on major 9 chromosomes (or are missing some other data) clusters <- clusters[complete.cases(clusters),] clusters.GRange <- regioneR::toGRanges(clusters) clusters.GRange$y1<-clusters.GRange$y1/40 setEPS(width=9,height=6) postscript(file = "Number of genes per cluster.eps") par(fig=c(0, 1, 0, 1)) pp <- getDefaultPlotParams(plot.type=1) pp$ideogramheight <- 30 kp <- plotKaryotype(genome=hplotter,plot.type=1,plot.params = pp,cex=.9) kpBars(kp,data=clusters.GRange,data.panel=1,col = 'lightblue') kpAxis(kp,chr="chr7",side=2,tick.len=5e5,numticks=2,data.panel=1,labels=c("0 genes","40 genes"),label.margin = 100,cex=.75) par(fig=c(0.65,1,0,.8), new=TRUE) hist_info <- hist(clusters$y1,breaks=1:38,plot = FALSE) hist_info$counts[hist_info$counts==0]<-NA #hist_info$density[hist_info$density==0]<-NA kp<-plot(hist_info,xlim=c(1,40),ylim=c(1,120),xlab="Number of Genes \nPer Cluster",ylab="Number of Clusters",main=NA,col='lightblue',cex =.9 , cex.lab=.9, axes=FALSE) axis(side=1, at=c(0.5,10.5,20.5,30.5,40.5), labels=seq(0,40,10)) axis(side=2,las = 1) dev.off() ``` So, biCYCle genes have an extremely unusual exonic structure. Many tiny exons. Plot example from big Cluster on Chormosome 7 FAILS. Reason unknown. Use older plot. ```{r} gff.exon <-readGFF("Augustus.updated_w_annots.21Aug20.gff3.gz", filter = list(type = "CDS")) gff.bed <- as_tibble(gff.exon) %>% rename(chrom = seqid, stop = end, gene = Parent) %>% dplyr::select(chrom, start, stop, gene, score, strand, type) %>% tidyr::unnest(gene) %>% mutate(strand = case_when(strand == "-" ~ -1, strand == "+" ~ +1)) %>% mutate(type = "exon") gff.bed$chrom <- recode(gff.bed$chrom,"ScPoQx8_4268;HRSCAF=5273" = "chr1", "ScPoQx8_1269;HRSCAF=1516" = "chr2", "ScPoQx8_9746;HRSCAF=11751" = "chr3", "ScPoQx8_7909;HRSCAF=9379" ="chr4", "ScPoQx8_3632;HRSCAF=4405" ="chr5", "ScPoQx8_9972;HRSCAF=12060"= "chr6", "ScPoQx8_8456;HRSCAF=10181"= "chr7", "ScPoQx8_7882;HRSCAF=9334" ="chr8", "ScPoQx8_9720;HRSCAF=11714"= "chr9") gff.bed <- as.data.frame(gff.bed) gff.bed <- gff.bed[,c(1,2,3,7,4,5,6)]#format for sushi gff.chr7bed <- gff.bed[gff.bed$chrom == "chr7",] ``` ```{r eval=FALSE, include=FALSE} gff.chr7bed <- gff.bed[gff.bed$chrom == "chr7",] chrom = "chr7" exampleregion = c(17e6,17.45e6) pdf(file = "P3C_gene_region.chr7.17Mbp.pdf",width = 9,height = 3) plotGenesStern(geneinfo = gff.chr7bed, chrom = chrom, chromstart=exampleregion[1], chromend=exampleregion[2], types = gff.chr7bed$type, bheight=0.2, plotgenetype="box", bentline=FALSE, labeloffset=.4, fontsize=1.2, labelat = "middle", labeltext=F) dev.off() ``` **************** **************** volcano plots of fund vs sex of the genes from previous publications ```{r} #RING genes hmm <- read_domtblout("Annotate/hmmscan.out.gz") hmm <- hmm[hmm$domain_ievalue <= 0.001,] colnames(hmm)[4] <- "qseqid" hmm$sseqid <- paste(hmm$domain_name,hmm$description) sigp <- rtracklayer::readGFF("Annotate/Augustus.pep.fasta.signalp5.gff3.gz",version=3) colnames(sigp)[1] <- "qseqid" sigp$sseqid <- paste("sigp",sigp$start,sigp$end,sep="_") RING_transcripts <- hmm$qseqid[hmm$description %in% "Ring finger domain"] #SPRING_transcripts <- sigp$qseqid[sigp$qseqid %in% RING_transcripts] SPRING_genes <- gsub("\\..*","",SPRING_transcripts) #SG vs body DEredgrSG <- qlf.SGvsWholeFund$table DEredgrSG$qValue <- p.adjust(DEredgrSG$PValue, method = "fdr") orderedDEredgrSG<-DEredgrSG[order(DEredgrSG$qValue),] #find cutoffs P05cutoff<-orderedDEredgrSG[orderedDEredgrSG$qValue>0.05,][1,] P01cutoff<-orderedDEredgrSG[orderedDEredgrSG$qValue>0.01,][1,] ptlabelsize = 4 p <- P05cutoff$PValue keyvals <- rep('grey75', nrow(DEredgrSG)) names(keyvals) <- rep('NS', nrow(DEredgrSG)) keyvals[which(row.names(DEredgrSG) %in% SPRING_genes)] <- 'red2' names(keyvals)[which(row.names(DEredgrSG) %in% SPRING_genes)] <- 'SPRING genes' pdf("VPFundSGVsBody_SPRINGGenes.pdf") p1 <- EnhancedVolcano(toptable = DEredgrSG[order(keyvals),], lab = NA, x = "logFC", y = "PValue", title = NULL, subtitle = NULL, caption = NULL, colCustom = keyvals[order(keyvals)], pCutoff = p, FCcutoff = FALSE, cutoffLineType = "blank", hline = c(p,P01cutoff$PValue), hlineCol = "black", hlineType = "longdash", hlineWidth = 0.5, pointSize = c(ifelse(row.names(DEredgrSG[order(keyvals),]) %in% SPRING_genes, 3, .5)), colAlpha = c(ifelse(row.names(DEredgrSG[order(keyvals),]) %in% SPRING_genes, 1, 0.5)), ylim = c(0, max(-log10(DEredgrSG$PValue[order(keyvals)]), na.rm=TRUE)), xlab = "Fundatrix (Salivary Gland - Body)\nFold Change (log2)", col = c("black", "red2") ) p1 + theme(legend.position = c(.16,.95),, axis.text.x = element_text(angle = 0, size = 18, vjust= 1), axis.text.y = element_text(angle = 0, size = 18, vjust= 0.5), axis.title.x = element_text(angle = 0,size = 20,vjust = 0), axis.title.y = element_text(angle = 90,size = 22,vjust = 0) ) + scale_color_manual(breaks = c("SPRING genes"), values = c("gray75","red2")) + annotate("text", label =c("FDR = 0.05", "FDR = 0.01"), x = c(-6.5,-6.5), y = c(.95,2.85), size = 5, colour = "black") dev.off() #fundatrix vs sexuals DEredgrSG <- qlf.SGFundvsSGSex$table DEredgrSG$qValue <- p.adjust(DEredgrSG$PValue, method = "fdr") orderedDEredgrSG<-DEredgrSG[order(DEredgrSG$qValue),] #find cutoffs P05cutoff<-orderedDEredgrSG[orderedDEredgrSG$qValue>0.05,][1,] P01cutoff<-orderedDEredgrSG[orderedDEredgrSG$qValue>0.01,][1,] ptlabelsize = 4 p <- P05cutoff$PValue keyvals <- rep('grey75', nrow(DEredgrSG)) names(keyvals) <- rep('NS', nrow(DEredgrSG)) keyvals[which(row.names(DEredgrSG) %in% SPRING_genes)] <- 'red2' names(keyvals)[which(row.names(DEredgrSG) %in% SPRING_genes)] <- 'SPRING genes' DEredgrSG <- qlf.SGFundvsSGSex$table DEredgrSG$qValue <- p.adjust(DEredgrSG$PValue, method = "fdr") orderedDEredgrSG<-DEredgrSG[order(DEredgrSG$qValue),] #find cutoffs P05cutoff<-orderedDEredgrSG[orderedDEredgrSG$qValue>0.05,][1,] P01cutoff<-orderedDEredgrSG[orderedDEredgrSG$qValue>0.01,][1,] ptlabelsize = 4 p <- P05cutoff$PValue keyvals <- rep('grey75', nrow(DEredgrSG)) names(keyvals) <- rep('NS', nrow(DEredgrSG)) keyvals[which(row.names(DEredgrSG) %in% SPRING_genes)] <- 'red2' names(keyvals)[which(row.names(DEredgrSG) %in% SPRING_genes)] <- 'SPRING genes' pdf("VPFundSGVsSex_SPRINGGenes.pdf") p1 <- EnhancedVolcano(toptable = DEredgrSG[order(keyvals),], lab = NA, x = "logFC", y = "PValue", title = NULL, subtitle = NULL, caption = NULL, colCustom = keyvals[order(keyvals)], pCutoff = p, FCcutoff = FALSE, cutoffLineType = "blank", hline = c(p,P01cutoff$PValue), hlineCol = "black", hlineType = "longdash", hlineWidth = 0.5, pointSize = c(ifelse(row.names(DEredgrSG[order(keyvals),]) %in% SPRING_genes, 3, .5)), colAlpha = c(ifelse(row.names(DEredgrSG[order(keyvals),]) %in% SPRING_genes, 1, 0.5)), ylim = c(0, max(-log10(DEredgrSG$PValue[order(keyvals)]), na.rm=TRUE)), xlab = "Salivary Gland (Fundatrix - Sexual)\nFold Change (log2)", #col = c("black", "red2") ) p1 + theme(legend.position = c(.19,.95), axis.text.x = element_text(angle = 0, size = 18, vjust= 1), axis.text.y = element_text(angle = 0, size = 18, vjust= 0.5), axis.title.x = element_text(angle = 0,size = 20,vjust = 0), axis.title.y = element_text(angle = 90,size = 22,vjust = 0) ) + scale_color_manual(breaks = c("SPRING genes"), values = c("gray75","red2")) + annotate("text", label =c("FDR = 0.05", "FDR = 0.01"), x = c(-10,-10), y = c(.95,2.85), size = 5, colour = "black") dev.off() sigp$qseqid[sigp$qseqid %in% RING_transcripts] #both have sigp ``` ```{r} #E3 genes hmm <- read_domtblout("Annotate/hmmscan.out.gz") #hmm <- hmm[hmm$domain_ievalue <= 0.001,] colnames(hmm)[4] <- "qseqid" hmm$sseqid <- paste(hmm$domain_name,hmm$description) sigp <- rtracklayer::readGFF("Annotate/Augustus.pep.fasta.signalp5.gff3.gz",version=3) colnames(sigp)[1] <- "qseqid" sigp$sseqid <- paste("sigp",sigp$start,sigp$end,sep="_") E3_transcripts <- hmm$qseqid[grepl("E3", hmm$description)] #E3_transcripts <- sigp$qseqid[sigp$qseqid %in% E3_transcripts] E3_genes <- unique(gsub("\\..*","",E3_transcripts)) #SG vs body DEredgrSG <- qlf.SGvsWholeFund$table DEredgrSG$qValue <- p.adjust(DEredgrSG$PValue, method = "fdr") orderedDEredgrSG<-DEredgrSG[order(DEredgrSG$qValue),] #find cutoffs P05cutoff<-orderedDEredgrSG[orderedDEredgrSG$qValue>0.05,][1,] P01cutoff<-orderedDEredgrSG[orderedDEredgrSG$qValue>0.01,][1,] ptlabelsize = 4 p <- P05cutoff$PValue keyvals <- rep('grey75', nrow(DEredgrSG)) names(keyvals) <- rep('NS', nrow(DEredgrSG)) keyvals[which(row.names(DEredgrSG) %in% E3_genes)] <- 'red2' names(keyvals)[which(row.names(DEredgrSG) %in% E3_genes)] <- 'E3 genes' pdf("VPFundSGVsBody_E3Genes.pdf") p1 <- EnhancedVolcano(toptable = DEredgrSG[order(keyvals),], lab = NA, x = "logFC", y = "PValue", title = NULL, subtitle = NULL, caption = NULL, colCustom = keyvals[order(keyvals)], pCutoff = p, FCcutoff = FALSE, cutoffLineType = "blank", hline = c(p,P01cutoff$PValue), hlineCol = "black", hlineType = "longdash", hlineWidth = 0.5, pointSize = c(ifelse(row.names(DEredgrSG[order(keyvals),]) %in% E3_genes, 3, .5)), colAlpha = c(ifelse(row.names(DEredgrSG[order(keyvals),]) %in% E3_genes, 1, 0.5)), ylim = c(0, max(-log10(DEredgrSG$PValue[order(keyvals)]), na.rm=TRUE)), xlab = "Fundatrix (Salivary Gland - Body)\nFold Change (log2)", col = c("black", "red2") ) p1 + theme(legend.position = c(.15,.95),, axis.text.x = element_text(angle = 0, size = 18, vjust= 1), axis.text.y = element_text(angle = 0, size = 18, vjust= 0.5), axis.title.x = element_text(angle = 0,size = 20,vjust = 0), axis.title.y = element_text(angle = 90,size = 22,vjust = 0) ) + scale_color_manual(breaks = c("E3 genes"), values = c("red2","gray75")) + annotate("text", label =c("FDR = 0.05", "FDR = 0.01"), x = c(-6.5,-6.5), y = c(.95,2.85), size = 5, colour = "black") dev.off() DEredgrSG[row.names(DEredgrSG) %in% E3_genes,] #logFC logCPM F PValue #g14470 0.5673248 7.950138 16.6676328 1.351564e-04 #g17408 -0.9016488 7.811883 10.3052224 2.142471e-03 #g18778 0.7584107 5.322955 17.5198962 9.544200e-05 #g20040 -0.4910237 9.031297 5.8559084 1.860668e-02 #g3454 -0.3676010 4.551843 0.8047886 3.732917e-01 #g3815 -0.4448331 9.200505 5.4793043 2.262349e-02 #g4092 -4.0647890 1.642754 36.5893382 1.054310e-07 #fundatrix vs sexuals DEredgrSG <- qlf.SGFundvsSGSex$table DEredgrSG$qValue <- p.adjust(DEredgrSG$PValue, method = "fdr") orderedDEredgrSG<-DEredgrSG[order(DEredgrSG$qValue),] #find cutoffs P05cutoff<-orderedDEredgrSG[orderedDEredgrSG$qValue>0.05,][1,] P01cutoff<-orderedDEredgrSG[orderedDEredgrSG$qValue>0.01,][1,] ptlabelsize = 4 p <- P05cutoff$PValue keyvals <- rep('grey75', nrow(DEredgrSG)) names(keyvals) <- rep('NS', nrow(DEredgrSG)) keyvals[which(row.names(DEredgrSG) %in% E3_genes)] <- 'red2' names(keyvals)[which(row.names(DEredgrSG) %in% E3_genes)] <- 'E3 genes' pdf("VPFundSGVsSex_E3Genes.pdf") p1 <- EnhancedVolcano(toptable = DEredgrSG[order(keyvals),], lab = NA, x = "logFC", y = "PValue", title = NULL, subtitle = NULL, caption = NULL, colCustom = keyvals[order(keyvals)], pCutoff = p, FCcutoff = FALSE, cutoffLineType = "blank", hline = c(p,P01cutoff$PValue), hlineCol = "black", hlineType = "longdash", hlineWidth = 0.5, pointSize = c(ifelse(row.names( DEredgrSG[order(keyvals),]) %in% E3_genes, 3, .5)), colAlpha = c(ifelse(row.names(DEredgrSG[order(keyvals),]) %in% E3_genes, 1, 0.5)), ylim = c(0, max(-log10(DEredgrSG$PValue[order(keyvals)]), na.rm=TRUE)), xlab = "Salivary Gland (Fundatrix - Sexual)\nFold Change (log2)", col = c("black", "red2") ) p1 + theme(legend.position = c(.15,.95), axis.text.x = element_text(angle = 0, size = 18, vjust= 1), axis.text.y = element_text(angle = 0, size = 18, vjust= 0.5), axis.title.x = element_text(angle = 0,size = 20,vjust = 0), axis.title.y = element_text(angle = 90,size = 22,vjust = 0) ) + scale_color_manual(breaks = c("E3 genes"), values = c("red2", "gray75")) + annotate("text", label =c("FDR = 0.05", "FDR = 0.01"), x = c(-11,-11), y = c(.95,2.85), size = 5, colour = "black") dev.off() DEredgrSG[row.names(DEredgrSG) %in% E3_genes,] #logFC logCPM F PValue qValue #g14470 1.7473142 7.950138 75.4993536 3.617463e-12 7.306998e-11 #g17408 1.9346385 7.811883 21.4716190 2.014083e-05 6.643575e-05 #g18778 0.6244664 5.322955 6.6019870 1.271936e-02 2.166660e-02 #g20040 1.9716946 9.031297 42.5363586 1.697499e-08 1.255372e-07 #g3454 -3.9346027 4.551843 63.2753196 6.302088e-11 9.053027e-10 #g3815 2.7998421 9.200505 87.5264294 2.784923e-13 8.023013e-12 #g4092 -0.5345668 1.642754 0.3987644 5.301538e-01 5.909029e-01 sigp$qseqid[sigp$qseqid %in% E3_transcripts] #none has sigp ``` #SSGP genes #get the sequence of all the SSGP-71s genes from the Zhao et al. paper awk -F "\"*,\"*" '{print $9}' Zhao2015_SSGP-71s.csv > Zhao2015_SSGP-71s.txt awk -F "\"*,\"*" '{print $1}' Zhao2015_SSGP-71s.csv > Zhao2015_SSGP-71s_genes.txt sed -e 's/^/>/' Zhao2015_SSGP-71s_genes.txt > Zhao2015_SSGP-71s_genes_header.txt paste -d '\n' Zhao2015_SSGP-71s_genes_header.txt Zhao2015_SSGP-71s.txt > Zhao2015_SSGP-71s.fasta ```{R} SSGP_genes = c("g3528", "g8289", "g8292","g6324") SSGP_transcripts = c("g3528.t1", "g8289.t1", "g8292.t1","g6324.t1") #SSGP_transcripts <- sigp$qseqid[sigp$qseqid %in% SSGP_transcripts] #SG vs body DEredgrSG <- qlf.SGvsWholeFund$table DEredgrSG$qValue <- p.adjust(DEredgrSG$PValue, method = "fdr") orderedDEredgrSG<-DEredgrSG[order(DEredgrSG$qValue),] #find cutoffs P05cutoff<-orderedDEredgrSG[orderedDEredgrSG$qValue>0.05,][1,] P01cutoff<-orderedDEredgrSG[orderedDEredgrSG$qValue>0.01,][1,] ptlabelsize = 4 p <- P05cutoff$PValue keyvals <- rep('grey75', nrow(DEredgrSG)) names(keyvals) <- rep('NS', nrow(DEredgrSG)) keyvals[which(row.names(DEredgrSG) %in% SSGP_genes)] <- 'red2' names(keyvals)[which(row.names(DEredgrSG) %in% SSGP_genes)] <- 'SSGP genes' pdf("VPFundSGVsBody_SSGPGenes.pdf") p1 <- EnhancedVolcano(toptable = DEredgrSG[order(keyvals),], lab = NA, x = "logFC", y = "PValue", title = NULL, subtitle = NULL, caption = NULL, colCustom = keyvals[order(keyvals)], pCutoff = p, FCcutoff = FALSE, cutoffLineType = "blank", hline = c(p,P01cutoff$PValue), hlineCol = "black", hlineType = "longdash", hlineWidth = 0.5, pointSize = c(ifelse(row.names(DEredgrSG[order(keyvals),]) %in% SSGP_genes, 3, .5)), colAlpha = c(ifelse(row.names(DEredgrSG[order(keyvals),]) %in% SSGP_genes, 1, 0.5)), ylim = c(0, max(-log10(DEredgrSG$PValue[order(keyvals)]), na.rm=TRUE)), xlab = "Fundatrix (Salivary Gland - Body)\nFold Change (log2)", col = c("black", "red2") ) p1 + theme(legend.position = c(.16,.95),, axis.text.x = element_text(angle = 0, size = 18, vjust= 1), axis.text.y = element_text(angle = 0, size = 18, vjust= 0.5), axis.title.x = element_text(angle = 0,size = 20,vjust = 0), axis.title.y = element_text(angle = 90,size = 22,vjust = 0) ) + scale_color_manual(breaks = c("SSGP genes"), values = c("gray75","red2")) + annotate("text", label =c("FDR = 0.05", "FDR = 0.01"), x = c(-6.5,-6.5), y = c(.95,2.85), size = 5, colour = "black") dev.off() DEredgrSG[row.names(DEredgrSG) %in% SSGP_genes,] # logFC logCPM F PValue qValue #g3528 -2.876629 5.692213 36.97458 9.333253e-08 5.734838e-07 #g6324 2.036075 7.484944 23.59803 9.037310e-06 3.246657e-05 #fundatrix vs sexuals DEredgrSG <- qlf.SGFundvsSGSex$table DEredgrSG$qValue <- p.adjust(DEredgrSG$PValue, method = "fdr") orderedDEredgrSG<-DEredgrSG[order(DEredgrSG$qValue),] #find cutoffs P05cutoff<-orderedDEredgrSG[orderedDEredgrSG$qValue>0.05,][1,] P01cutoff<-orderedDEredgrSG[orderedDEredgrSG$qValue>0.01,][1,] ptlabelsize = 4 p <- P05cutoff$PValue keyvals <- rep('grey75', nrow(DEredgrSG)) names(keyvals) <- rep('NS', nrow(DEredgrSG)) keyvals[which(row.names(DEredgrSG) %in% SSGP_genes)] <- 'red2' names(keyvals)[which(row.names(DEredgrSG) %in% SSGP_genes)] <- 'SSGP genes' pdf("VPFundSGVsSex_SSGPGenes.pdf") p1 <- EnhancedVolcano(toptable = DEredgrSG[order(keyvals),], lab = NA, x = "logFC", y = "PValue", title = NULL, subtitle = NULL, caption = NULL, colCustom = keyvals[order(keyvals)], pCutoff = p, FCcutoff = FALSE, cutoffLineType = "blank", hline = c(p,P01cutoff$PValue), hlineCol = "black", hlineType = "longdash", hlineWidth = 0.5, pointSize = c(ifelse(row.names( DEredgrSG[order(keyvals),]) %in% SSGP_genes, 3, .5)), colAlpha = c(ifelse(row.names(DEredgrSG[order(keyvals),]) %in% SSGP_genes, 1, 0.5)), ylim = c(0, max(-log10(DEredgrSG$PValue[order(keyvals)]), na.rm=TRUE)), xlab = "Salivary Gland (Fundatrix - Sexual)\nFold Change (log2)", col = c("black", "red2") ) p1 + theme(legend.position = c(.16,.95), axis.text.x = element_text(angle = 0, size = 18, vjust= 1), axis.text.y = element_text(angle = 0, size = 18, vjust= 0.5), axis.title.x = element_text(angle = 0,size = 20,vjust = 0), axis.title.y = element_text(angle = 90,size = 22,vjust = 0) ) + scale_color_manual(breaks = c("SSGP genes"), values = c("gray75","red2")) + annotate("text", label =c("FDR = 0.05", "FDR = 0.01"), x = c(-11,-11), y = c(.95,2.85), size = 5, colour = "black") dev.off() sigp$qseqid[sigp$qseqid %in% SSGP_transcripts] #g6324.t1 has sigp DEredgrSG[row.names(DEredgrSG) %in% SSGP_genes,] ``` None Plot g16073 across all nymphal and generation stages [5] "catrim.m36.Sample_1st_II_catrimFundSG" [6] "catrim.m36.Sample_1st_III_catrimFundSG" [20] "catrim.m36.Sample_early2nd_I_catrimFundSG" [21] "catrim.m36.Sample_early2nd_II_catrimFundSG" [22] "catrim.m36.Sample_early2nd_III_catrimFundSG" [35] "catrim.m36.Sample_late2nd_I_catrimFundSG" [36] "catrim.m36.Sample_late2nd_II_catrimFundSG" [37] "catrim.m36.Sample_late2nd_III_catrimFundSG" [38] "catrim.m36.Sample_late2nd_IV_catrimFundSG" [39] "catrim.m36.Sample_late2nd_V_catrimFundSG" [40] "catrim.m36.Sample_late2nd_VI_catrimFundSG" [11] "catrim.m36.Sample_3rd_I_catrimFundSG" [12] "catrim.m36.Sample_3rd_II_catrimFundSG" [13] "catrim.m36.Sample_3rd_III_catrimFundSG" [14] "catrim.m36.Sample_4th_I_catrimFundSG" [15] "catrim.m36.Sample_4th_II_catrimFundSG" [16] "catrim.m36.Sample_4th_III_catrimFundSG" [17] "catrim.m36.Sample_adult_I_catrimFundSG" [18] "catrim.m36.Sample_adult_II_catrimFundSG" [19] "catrim.m36.Sample_adult_III_catrimFundSG" [7] "catrim.m36.Sample_2nd_gen_1stinst_SG3" "Sample_2ndgen_3rdinst_sg_I_1" "Sample_2ndgen_3rdinst_sg_II_1" "Sample_2ndgen_3rdinst_sg_III_1" "Sample_2ndgen_3rdinst_whole_I_1" "Sample_2ndgen_3rdinst_whole_II_1" "Sample_2ndgen_3rdinst_whole_III_1" [8] "catrim.m36.Sample_2nd_gen_Ad_SG4" [9] "catrim.m36.Sample_2nd_gen_Ad_SG5" [10] "catrim.m36.Sample_2nd_gen_Ad_SG6" [23] "catrim.m36.Sample_Hcor_ex_bet_4thSG10" [24] "catrim.m36.Sample_Hcor_ex_bet_4thSG9" [25] "catrim.m36.Sample_Hcor_ex_bet_AdSG11" [26] "catrim.m36.Sample_Hcor_ex_bet_AdSG12" [27] "catrim.m36.Sample_Hcor_ex_bet_AdSG13" [28] "catrim.m36.Sample_Hcor_ex_bet_AdSG8" 31] "catrim.m36.Sample_Hcor_sexF_SG_19_1" [32] "catrim.m36.Sample_Hcor_sexF_SG_19_3" [33] "catrim.m36.Sample_Hcor_sexF_SG_19_4" [34] "catrim.m36.Sample_Hcor_sexF_SG_19_5" [3] "catrim.m36.F02.pooled.fq.gz" [4] "catrim.m36.H02.pooled.fq.gz" Carcass [41] "Sample_whole_1st_I_catrimFundSG" [46] "Sample_whole_early2nd_I_catrimFundSG" [47] "Sample_whole_late2nd_I_catrimFundSG" [48] "Sample_whole_v_late2nd_I_catrimFundSG" [42] "Sample_whole_3rd_I_catrimFundSG" [43] "Sample_whole_3rd_II_catrimFundSG" [44] "Sample_whole_4th_I_catrimFundSG" [45] "Sample_whole_adult_I_catrimFundSG" [29] "catrim.m36.Sample_Hcor_ex_bet_nymphs_15" [30] "catrim.m36.Sample_Hcor_ex_bet_nymphs_16" [1] "catrim.m36.D02.pooled.fq.gz" [2] "catrim.m36.E02.pooled.fq.gz" Make dataframe of sample type (stage and body part) and values ```{r} g16073_row <- which(row.names(ysub$counts) == "g16073") g16073_counts <- data.frame(stage = "1", generation = "G1", sample = "Salivary Gland", CPM = cpm(ysub)[g16073_row,c( "catrim.m36.Sample_1st_II.bam", "catrim.m36.Sample_1st_III.bam")]) g16073_counts <- rbind(g16073_counts,data.frame(stage = "2", generation = "G1", sample = "Salivary Gland", CPM = cpm(ysub)[g16073_row,c( "catrim.m36.Sample_early2nd_I.bam" , "catrim.m36.Sample_early2nd_II.bam" , "catrim.m36.Sample_early2nd_III.bam", "catrim.m36.Sample_late2nd_I.bam" , "catrim.m36.Sample_late2nd_II.bam" , "catrim.m36.Sample_late2nd_III.bam" , "catrim.m36.Sample_late2nd_IV.bam" , "catrim.m36.Sample_late2nd_V.bam" , "catrim.m36.Sample_late2nd_VI.bam" )])) g16073_counts <- rbind(g16073_counts,data.frame(stage = "3", generation = "G1", sample = "Salivary Gland", CPM = cpm(ysub)[g16073_row,c( "catrim.m36.Sample_3rd_I.bam" , "catrim.m36.Sample_3rd_II.bam" , "catrim.m36.Sample_3rd_III.bam" )])) g16073_counts <- rbind(g16073_counts,data.frame(stage = "4", generation = "G1", sample = "Salivary Gland", CPM = cpm(ysub)[g16073_row,c( "catrim.m36.Sample_4th_I.bam" , "catrim.m36.Sample_4th_II.bam" , "catrim.m36.Sample_4th_III.bam" )])) g16073_counts <- rbind(g16073_counts,data.frame(stage = "Ad", generation = "G1", sample = "Salivary Gland", CPM = cpm(ysub)[g16073_row,c( "catrim.m36.Sample_adult_I.bam" , "catrim.m36.Sample_adult_II.bam" , "catrim.m36.Sample_adult_III.bam" )])) g16073_counts <- rbind(g16073_counts,data.frame(stage = "1", generation = "G2", sample = "Salivary Gland", CPM = cpm(ysub)[g16073_row,"catrim.m36.Sample_2nd_gen_1stinst_SG3.bam"])) g16073_counts <- rbind(g16073_counts,data.frame(stage = "3", generation = "G2", sample = "Salivary Gland", CPM = cpm(ysub)[g16073_row,c( "Sample_2ndgen_3rdinst_sg_I_1.fastq.unmapped.bam" , "Sample_2ndgen_3rdinst_sg_II_1.fastq.unmapped.bam" , "Sample_2ndgen_3rdinst_sg_III_1.fastq.unmapped.bam")])) g16073_counts <- rbind(g16073_counts,data.frame(stage = "Ad", generation = "G2", sample = "Salivary Gland", CPM = cpm(ysub)[g16073_row,c( "catrim.m36.Sample_2nd_gen_Ad_SG4.bam", "catrim.m36.Sample_2nd_gen_Ad_SG5.bam", "catrim.m36.Sample_2nd_gen_Ad_SG6.bam" )])) g16073_counts <- rbind(g16073_counts,data.frame(stage = "4", generation = "G5", sample = "Salivary Gland", CPM = cpm(ysub)[g16073_row,c( "catrim.m36.Sample_Hcor_ex_bet_4thSG10.bam" , "catrim.m36.Sample_Hcor_ex_bet_4thSG9.bam" )])) g16073_counts <- rbind(g16073_counts,data.frame(stage = "Ad", generation = "G5",sample = "Salivary Gland", CPM = cpm(ysub)[g16073_row,c( "catrim.m36.Sample_Hcor_ex_bet_AdSG11.bam" , "catrim.m36.Sample_Hcor_ex_bet_AdSG12.bam" , "catrim.m36.Sample_Hcor_ex_bet_AdSG13.bam" , "catrim.m36.Sample_Hcor_ex_bet_AdSG8.bam")])) g16073_counts <- rbind(g16073_counts,data.frame(stage = "Ny", generation = "Sexual", sample = "Salivary Gland", CPM = cpm(ysub)[g16073_row,c( "catrim.m36.Sample_Hcor_sexF_SG_19_1.bam" , "catrim.m36.Sample_Hcor_sexF_SG_19_3.bam" , "catrim.m36.Sample_Hcor_sexF_SG_19_4.bam" , "catrim.m36.Sample_Hcor_sexF_SG_19_5.bam" , "catrim.m36.F02.pooled.fq.gz.bam" , "catrim.m36.H02.pooled.fq.gz.bam" )])) g16073_counts <- rbind(g16073_counts,data.frame(stage = "1", generation = "G1", sample = "Body", CPM = cpm(ysub)[g16073_row,c("catrim.m36.Sample_whole_1st_I.bam")])) g16073_counts <- rbind(g16073_counts,data.frame(stage = "2", generation = "G1", sample = "Body", CPM = cpm(ysub)[g16073_row,c( "catrim.m36.Sample_whole_early2nd_I.bam" , "catrim.m36.Sample_whole_late2nd_I.bam" , "catrim.m36.Sample_whole_v_late2nd_I.bam")])) g16073_counts <- rbind(g16073_counts,data.frame(stage = "3", generation = "G1", sample = "Body", CPM = cpm(ysub)[g16073_row,c( "catrim.m36.Sample_whole_3rd_I.bam" , "catrim.m36.Sample_whole_3rd_II.bam" )])) g16073_counts <- rbind(g16073_counts,data.frame(stage = "4", generation = "G1", sample = "Body", CPM = cpm(ysub)[g16073_row,c("catrim.m36.Sample_whole_4th_I.bam" )])) g16073_counts <- rbind(g16073_counts,data.frame(stage = "Ad", generation = "G1", sample = "Body", CPM = cpm(ysub)[g16073_row,c("catrim.m36.Sample_whole_adult_I.bam" )])) g16073_counts <- rbind(g16073_counts,data.frame(stage = "3rd", generation = "G2", sample = "Body", CPM = cpm(ysub)[g16073_row,c("Sample_2ndgen_3rdinst_whole_I_1.fastq.unmapped.bam", "Sample_2ndgen_3rdinst_whole_II_1.fastq.unmapped.bam", "Sample_2ndgen_3rdinst_whole_III_1.fastq.unmapped.bam" )])) g16073_counts <- rbind(g16073_counts,data.frame(stage = "Ny", generation = "G5", sample = "Carcass", CPM = cpm(ysub)[g16073_row,c("catrim.m36.Sample_Hcor_ex_bet_nymphs_15.bam" , "catrim.m36.Sample_Hcor_ex_bet_nymphs_16.bam" )])) g16073_counts <- rbind(g16073_counts,data.frame(stage = "Ny", generation = "Sexual", sample = "Carcass", CPM = cpm(ysub)[g16073_row,c( "catrim.m36.D02.pooled.fq.gz.bam" , "catrim.m36.E02.pooled.fq.gz.bam" )])) g16073_counts$CPM <- g16073_counts$CPM + 1 ``` ```{r} pdf("LifeCycleg16073Counts.pdf", width = 9, height = 7) ggplot(data = g16073_counts, mapping = aes(x = stage,y=CPM)) + geom_jitter(size = 2, alpha=.8,col = "black", width = .2, height = 0 ) + scale_y_log10(breaks = c(1,11,101,1001,10001), labels = c(0,10,100,1000,10000), limits = c(1,1.3e4), name="Counts Per Million Reads") + geom_hline(yintercept=c(log10(1)),alpha=0.5) + facet_wrap(facets = c("sample","generation"), strip.position = "bottom", switch = "x",scales = "free_x", drop = TRUE, nrow = 1) + theme(axis.text.x=element_text(size=12), axis.text.y=element_text(size=12, vjust = 0.5), axis.title.x=element_blank(), axis.title.y=element_text(size=20), legend.position = "none", panel.spacing = unit(0, "lines"), strip.background = element_blank(), strip.placement = "outside", strip.text.x = element_text(size = 10) ) dev.off() ``` Examine SPRING counts across life cycle ```{r} ybiCYCle <- ysub[row.names(ysub) %in% sub("\\..*","",SPRING_genes),] life_cycle_counts <- data.frame( "gene" = row.names(ybiCYCle$counts), "maxCount" = rowMeans(cpm(ybiCYCle)[,ybiCYCle$samples$group == "G1SG"]), "G1SG" = rowMeans(cpm(ybiCYCle)[,ybiCYCle$samples$group == "G1SG"]), "G2SG" = rowMeans(cpm(ybiCYCle)[,ybiCYCle$samples$group == "G2SG"]), "GBetSG" = rowMeans(cpm(ybiCYCle)[,ybiCYCle$samples$group == "GBetSG"]), "SexSG" = rowMeans(cpm(ybiCYCle)[,ybiCYCle$samples$group == "SexSG"]) ) life_cycle_counts_for_plot <- life_cycle_counts %>% gather(Generation, Counts, G1SG:SexSG) life_cycle_counts_for_plot$maxCount <- life_cycle_counts_for_plot$maxCount + 1 life_cycle_counts_for_plot$Counts <- life_cycle_counts_for_plot$Counts + 1 pdf("LifeCycleSPRINGCounts.pdf") ggplot(data = life_cycle_counts_for_plot, mapping = aes(x = Generation,y=Counts, group = gene)) + scale_color_gradientn(colours=c('red', 'yellow', 'cyan', 'blue'), trans = "log10")+ geom_line(size = .4, alpha=.3,aes(colour=maxCount)) + geom_jitter(size = 1, alpha=.9,col = "black", width = .33, height = 0 ) + scale_y_log10(breaks = c(1,11,101,1001,10001), labels = c(0,10,100,1000,10000), limits = c(1,3e4), name="Counts Per Million Reads") + geom_hline(yintercept=c(log10(1)),alpha=0.5) + scale_x_discrete(labels = c("G1SG"="G1","G2SG"="G2","GBetSG"="G5","SexSG"="G7")) + theme(axis.text.x=element_text(size=16), axis.text.y=element_text(size=16, vjust = 0.5), axis.title.x=element_text(size=20), axis.title.y=element_text(size=20), legend.position = "none" ) dev.off() ``` Examine SPRING counts across fundatrix instars ```{r} ybiCYCle <- ysub[row.names(ysub) %in% sub("\\..*","",SPRING_genes),] life_cycle_counts <- data.frame( "gene" = row.names(ybiCYCle$counts), "maxCount" = rowMeans(cpm(ybiCYCle)[,ybiCYCle$samples$group == "G1SG"]), "1st" = rowMeans(cpm(ybiCYCle)[,grepl(pattern = "1st",x = ybiCYCle$samples$files) & ybiCYCle$samples$group == "G1SG"]), "2nd" = rowMeans(cpm(ybiCYCle)[,grepl(pattern = "2nd",x = ybiCYCle$samples$files) & ybiCYCle$samples$group == "G1SG"]), "3rd" = rowMeans(cpm(ybiCYCle)[,grepl(pattern = "3rd",x = ybiCYCle$samples$files) & ybiCYCle$samples$group == "G1SG"]), "4th" = rowMeans(cpm(ybiCYCle)[,grepl(pattern = "4th",x = ybiCYCle$samples$files) & ybiCYCle$samples$group == "G1SG"]), "Ad" = rowMeans(cpm(ybiCYCle)[,grepl(pattern = "adult",x = ybiCYCle$samples$files) & ybiCYCle$samples$group == "G1SG"]) ) life_cycle_counts_for_plot <- life_cycle_counts %>% gather(Stage, Counts, X1st:Ad) life_cycle_counts_for_plot$maxCount <- life_cycle_counts_for_plot$maxCount + 1 life_cycle_counts_for_plot$Counts <- life_cycle_counts_for_plot$Counts + 1 pdf("FundNymphSPRINGCounts.pdf") ggplot(data = life_cycle_counts_for_plot, mapping = aes(x = Stage,y=Counts, group = gene)) + scale_color_gradientn(colours=c('red', 'yellow', 'cyan', 'blue'), trans = "log10")+ geom_line(size = .4, alpha=.3,aes(colour=maxCount)) + geom_jitter(size = 1, alpha=.9,col = "black", width = .33, height = 0 ) + scale_y_log10(breaks = c(1,11,101,1001,10001), labels = c(0,10,100,1000,10000), limits = c(1,7e4), name="Counts Per Million Reads") + geom_hline(yintercept=c(log10(1)),alpha=0.5) + scale_x_discrete(labels = c("X1st"="1","X2nd"="2","X3rd"="3","X4th"="4","Ad"="Ad"), limits = c("X1st","X2nd","X3rd","X4th","Ad"), name = "Fundatrix Stage") + theme(axis.text.x=element_text(size=24), axis.text.y=element_text(size=24, vjust = 0.5), axis.title.x=element_text(size=28), axis.title.y=element_text(size=28), legend.position = "none" ) dev.off() ```