--- title: "classifier_orthogroup_Hcor.Rmd" output: html_document: df_print: paged --- this code tests how prrn would perform when aligning a group of bicycle genes and an unrelated group of paralog genes (related to each other, but not to bicycle genes) ```{r, echo=FALSE} library(seqinr) library(ggplot2) library(dplyr) library(tidyr) library(stringr) library(rtracklayer) library(GenomicFeatures) library(gridExtra) library(RFLPtools) library(rhmmer) library(rcompanion) library(glmnet) library(broom) library(reshape2) library(scales) library(tidyverse) library(class) library(regioneR) library(ggseqlogo) ``` ```{r} gff_file = "Hcor.Augustus.updated_w_annots.3ix21_final.gff3" gff <- readGFF(gff_file, filter=list(type="CDS")) gff.exons=gff gff.exons$Parent <- unlist(gff.exons$Parent) parents <- unique(gff.exons$Parent) gff.exons$Parent <-factor(gff.exons$Parent,levels = parents) gff.exons <- data.frame(gff.exons) exons.summary = gff.exons %>% mutate(size = end - start + 1) %>% group_by(Parent) %>% summarise(num_total_exons = n(), total_exon_length = sum(size)) ``` read in orthofinder results to find potential paralog group in Hcor that has a lot of exons (like bicycle genes) and a sufficient number of genes in the paralog group orthofinder result from hanc/bicycle_classifier/prrn_orthogroup/Hcor_Apis_whole_genome/OrthoFinder/Results_Oct21/ ```{r} of = read.table("Augustus.updated_w_annots.3ix21_final.pep_longest__v__Apis.Augustus.updated_w_annots.7vii21.pep_longest.tsv", header = TRUE, stringsAsFactors = FALSE, sep = "\t") colnames(of) = c("orthogroup", "Hcor", "Apis") of_Hcor = of$Hcor of_Hcor_summary = as.data.frame(matrix(, nrow=length(of_Hcor), ncol=4)) colnames(of_Hcor_summary) = c("orthogroup", "num_genes", "ave_num_exons", "ave_total_exon_length") of_Hcor_summary$orthogroup = of$orthogroup for (i in 1:length(of_Hcor)){ curr_transcripts = unlist(str_split(of_Hcor[i], pattern=", ")) exons.summary.curr = exons.summary[exons.summary$Parent %in% curr_transcripts,] of_Hcor_summary[i,2] = length(curr_transcripts) of_Hcor_summary[i,3] = exons.summary.curr %>% summarise(mean(num_total_exons)) of_Hcor_summary[i,4] = exons.summary.curr %>% summarise(mean(total_exon_length)) } of_Hcor_summary %>% filter(ave_num_exons>10 & num_genes>3) ``` OG0000200 g3145.t2: ACATN_HUMAN Acetyl-coenzyme A transporter 1 (SLC33A1) g3146.t2: ACATN_HUMAN g3147.t1: ACATN_HUMAN g3148.t1: ACATN_RAT g3149.t2: ACATN_HUMAN g3150.t1: ACATN_HUMAN g3151.t1: ACATN_HUMAN #look for all genes with ACATN blast hits to see if we can increase the sample size of the paralog cluster cd ~/bicycle_classifier/prrn_orthogroup/Hcor_Apis_whole_genome zgrep "ACATN" ~/bicycle_classifier/Hcor/Augustus.3ix21_final.pep.fasta.blastp.outfmt6.gz > Augustus.3ix21_final.blastp.ACATN.tsv check structure of all ACATN genes ```{r} interest_gene = read.table("Augustus.3ix21_final.blastp.ACATN.tsv", stringsAsFactors = FALSE, header = FALSE) interest_gene.summary = exons.summary %>% filter(Parent %in% interst_gene$V1) write.table(interest_gene.summary$Parent, "Augustus.3ix21_final.ACATN.transcripts_longest.txt", col.names = FALSE, row.names = FALSE, quote = FALSE) ``` #make logo of all ACATN genes longest transcripts seqkit grep -f Augustus.3ix21_final.ACATN.transcripts_longest.txt Augustus.updated_w_annots.3ix21_final.pep_longest.fa > Augustus.3ix21_final.ACATN.fa #Align with MAFFT mafft --auto --thread 2 Augustus.3ix21_final.ACATN.fa > Augustus.3ix21_final.ACATN.mafft_auto.fa #trim alignments trimal -in Augustus.3ix21_final.ACATN.mafft_auto.fa -out Augustus.3ix21_final.ACATN.mafft_auto.trimal_50.fa -gt 0.50 ```{r} clustergenes<-read.alignment("Augustus.3ix21_final.ACATN.mafft_auto.trimal_50.fa",format="fasta") clustergenes$seq <- unlist(clustergenes$seq) clustergenes$seq <- toupper(clustergenes$seq) clustergenes.logo <- ggseqlogo(clustergenes$seq) + theme(axis.text.x = element_blank(), axis.text.y = element_text(size=26), axis.title = element_text(size=26), legend.position = "None") pdf(file = "Augustus.3ix21_final.ACATN.mafft_auto.trimal_50_logo.pdf",width=60,height=4) clustergenes.logo dev.off() ``` Almost all matches on sequence level #run prrn for Hcor DE 200 (subsampled to same number of samples vs this group of genes) #bicycle /groups/stern/home/sternd/bin/anno2gsiseq.pl -f INSECTBASE -o original_cluster2_gene_100_v5.5.6.ext.fa Hcor.Augustus.updated_w_annots.3ix21_final.gff3 original_cluster2_gene_100_v5.5.6.fa #don't need to subsample because only 22 seqs seqkit grep -f <(grep "^>" original_cluster2_gene_200_v5.5.6.fa | cut -c 2- | shuf -n 23) original_cluster2_gene_200_v5.5.6.fa > original_cluster2_gene_200_v5.5.6_sample23.fa /groups/stern/home/sternd/bin/anno2gsiseq.pl -f INSECTBASE -o original_cluster2_gene_200_v5.5.6_sample23.ext.fa Hcor.Augustus.updated_w_annots.3ix21_final.gff3 original_cluster2_gene_200_v5.5.6_sample23.fa sed -i '/^>/ s/$/_DE/' original_cluster2_gene_100_v5.5.6.ext.fa sed -i '/^>/ s/$/_DE/' original_cluster2_gene_200_v5.5.6_sample23.ext.fa #ACATN /groups/stern/home/sternd/bin/anno2gsiseq.pl -f INSECTBASE -o Augustus.3ix21_final.ACATN.ext.fa Hcor.Augustus.updated_w_annots.3ix21_final.gff3 Augustus.3ix21_final.ACATN.fa sed -i '/^>/ s/$/_ACATN/' Augustus.3ix21_final.ACATN.ext.fa #combine ext fasta cat original_cluster2_gene_100_v5.5.6.ext.fa Augustus.3ix21_final.ACATN.ext.fa > original_cluster2_gene_100+ACATN.ext.fa cat original_cluster2_gene_200_v5.5.6_sample23.ext.fa Augustus.3ix21_final.ACATN.ext.fa > original_cluster2_gene_200_v5.5.6_sample23+ACATN.ext.fa #run prrn bsub -o log.txt -We 10 "prrn5 -KA -l2000 -F8 -ph -ps -o original_cluster2_gene_100+ACATN.align.clustal.html original_cluster2_gene_100+ACATN.ext.fa" bsub -o log.txt -We 10 "prrn5 -KA -l2000 -F8 -ph -ps -o original_cluster2_gene_200_v5.5.6_sample23+ACATN.align.clustal.html original_cluster2_gene_200_v5.5.6_sample23+ACATN.ext.fa" #Replace intron residues with Z perl -0777 -pe 's/.*?b>/Z/gs' original_cluster2_gene_100+ACATN.align.clustal.html | grep -v "^<" > original_cluster2_gene_100+ACATN.align.clustal.replaceZ.txt perl -0777 -pe 's/.*?b>/Z/gs' original_cluster2_gene_200_v5.5.6_sample23+ACATN.align.clustal.html | grep -v "^<" > original_cluster2_gene_200_v5.5.6_sample23+ACATN.align.clustal.replaceZ.txt ```{r} plot_introns = function(file_name, group, class_label){ replaceZaln <- read_table(file = paste0(file_name, ".align.clustal.replaceZ.txt"), col_names = c("transcript","sequence"), col_types = c("c","c"), skip = 1) candidate_intron_positions<-str_locate_all(replaceZaln$sequence[grepl(pattern = group,x = replaceZaln$transcript)], "Z") original_intron_positions<-str_locate_all(replaceZaln$sequence[!grepl(pattern = group,x = replaceZaln$transcript)], "Z") candidate_pos <- do.call(rbind,candidate_intron_positions)[,1] original_pos <- do.call(rbind,original_intron_positions)[,1] seq_length <-str_length(replaceZaln$sequence[1]) candidate_hist <- hist(candidate_pos, breaks = (1:seq_length)) original_hist <- hist(original_pos, breaks = (1:seq_length)) hist_values <- tibble(position = c(1:length(candidate_hist$counts)), candidate_counts = candidate_hist$counts, candidate_density = candidate_hist$density, original_counts = original_hist$counts, original_density = original_hist$density) plot_hist_density<-hist_values %>% tidyr::pivot_longer(cols = c(original_density,candidate_density)) %>% dplyr::select(-c(candidate_counts,original_counts)) hist_values <- tibble(position = c(1:length(candidate_hist$counts)), candidate_counts = candidate_hist$counts, candidate_freq = candidate_hist$counts / length(candidate_intron_positions), original_counts = original_hist$counts, original_freq =original_hist$count / length(original_intron_positions)) #remove all positions where there are no introns in either group #do not run this if want to plot every position on x-axis (9/23/21) hist_values = hist_values[!(hist_values$candidate_counts==0 & hist_values$original_counts==0),] #add column for overlap frequency hist_values = hist_values %>% rowwise() %>% mutate(overlap_freq=min(c(original_freq, candidate_freq))) plot_hist_freq <-hist_values %>% tidyr::pivot_longer(cols = c(original_freq,candidate_freq,overlap_freq)) %>% dplyr::select(-c(candidate_counts,original_counts)) candidate_count = length(unique(gsub("\\..*","",replaceZaln$transcript[grepl(pattern = group,x = replaceZaln$transcript)]))) original_count = length(unique(gsub("\\..*","",replaceZaln$transcript[!grepl(pattern = group,x = replaceZaln$transcript)]))) #re-number the axis axis to skip the positions that do not have non-zero count (comment out if want to plot all x, 9/23/21) plot_hist_freq = plot_hist_freq %>% mutate(position_new = rep(1:(dim(plot_hist_freq)[1]/3), each=3)) plot_intron_table = ggplot(plot_hist_freq, aes(x = position_new, y = value, fill = name)) + geom_bar(stat = "identity", position = "identity", width = 1,show.legend = FALSE) + scale_x_continuous(name = "Relative Position of Intron in Alignment", expand = c(0, 0)) + scale_y_continuous(name = "Frequency of intron", limits = c(0,1)) + scale_fill_manual(values = c("original_freq"="#E69F00", "candidate_freq"="#56B4E9", "overlap_freq"="#CC79A7")) + annotate("text", x=10, y=c(1,.95), hjust = 0, label = c(paste0("original ", class_label, ": ", original_count), paste0(group, ": ", candidate_count)), fontface = "bold.italic", colour = c("#E69F00","#56B4E9"), size = 7.5) + theme(axis.line = element_line(colour = "black", size = 0.2), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.border = element_blank(), panel.background = element_blank(), axis.text = element_text(size=22), axis.title = element_text(size=26)) pdf(paste0(file_name, "_intronpos_ACATN_comparison.pdf"),width = 18,height = 6) plot(plot_intron_table) dev.off() #get regression value for number of sequences with intron at each position between the two groups rsqr = summary(lm(candidate_counts ~ original_counts, data = hist_values))$r.squared return(rsqr) } ``` ```{r} rsqr = plot_introns("original_cluster2_gene_100+ACATN", "ACATN", "unicycle") rsqr #0.0002779153 rsqr = plot_introns("original_cluster2_gene_200_v5.5.6_sample23+ACATN", "ACATN", "bicycle") rsqr #9.018962e-06 ``` plot of example intron position regression ```{r} file_name="original_cluster2_gene_200_v5.5.6_sample23+ACATN" group="ACATN" #run content of function above pdf("original_cluster2_gene_200_v5.5.6_sample23+ACATN_intronpos_ACATN_regression.pdf", width=5, height=5) ggplot(hist_values, aes(original_counts, candidate_counts)) + geom_point(size=2) + theme_classic() + geom_smooth(method = "lm", se = FALSE) + theme(axis.line = element_line(colour = "black", size = 0.2), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.border = element_blank(), panel.background = element_blank(), axis.text = element_text(size=22), axis.title = element_text(size=20)) + xlab("Number of bicycle Genes") + ylab("Number of ACATN Genes") dev.off() ``` ####################################################################################################################### #check orthogroup #2 OG0000247 g14021.t1: ABCGN_DICDI g14022.t1: ABCGN_DICDI g16289.t1: ABCGN_DICDI g16379.t1: ABCGN_DICDI g17197.t1: ABCGN_DICDI ABC transporter G family member 23 (abcG23, slime mold) zgrep "ABCGN_" ~/bicycle_classifier/Hcor/Augustus.3ix21_final.pep.fasta.blastp.outfmt6.gz > Augustus.3ix21_final.blastp.ABCGN.tsv #run prrn for Hcor DE 200 (subsampled to same number of samples vs this group of genes) ```{r} interest_gene = read.table("Augustus.3ix21_final.blastp.ABCGN.tsv", stringsAsFactors = FALSE, header = FALSE) interest_gene.summary = exons.summary %>% filter(Parent %in% interest_gene$V1) write.table(interest_gene.summary$Parent, "Augustus.3ix21_final.ABCGN.transcripts_longest.txt", col.names = FALSE, row.names = FALSE, quote = FALSE) ``` seqkit grep -f Augustus.3ix21_final.ABCGN.transcripts_longest.txt Augustus.updated_w_annots.3ix21_final.pep_longest.fa > Augustus.3ix21_final.ABCGN.fa #16 hits mafft --auto --thread 2 Augustus.3ix21_final.ABCGN.fa > Augustus.3ix21_final.ABCGN.mafft_auto.fa #probably should have removed g8957 and g16023 #bicycle #/groups/stern/home/sternd/bin/anno2gsiseq.pl -f INSECTBASE -o original_cluster2_gene_100_v5.5.6.ext.fa Hcor.Augustus.updated_w_annots.3ix21_final.gff3 original_cluster2_gene_100_v5.5.6.fa #don't need to subsample because only 22 seqs seqkit grep -f <(grep "^>" original_cluster2_gene_200_v5.5.6.fa | cut -c 2- | shuf -n 16) original_cluster2_gene_200_v5.5.6.fa > original_cluster2_gene_200_v5.5.6_sample16.fa /groups/stern/home/sternd/bin/anno2gsiseq.pl -f INSECTBASE -o original_cluster2_gene_200_v5.5.6_sample16.ext.fa Hcor.Augustus.updated_w_annots.3ix21_final.gff3 original_cluster2_gene_200_v5.5.6_sample16.fa sed -i '/^>/ s/$/_DE/' original_cluster2_gene_200_v5.5.6_sample16.ext.fa #ABCGN /groups/stern/home/sternd/bin/anno2gsiseq.pl -f INSECTBASE -o Augustus.3ix21_final.ABCGN.ext.fa Hcor.Augustus.updated_w_annots.3ix21_final.gff3 Augustus.3ix21_final.ABCGN.fa sed -i '/^>/ s/$/_ABCGN/' Augustus.3ix21_final.ABCGN.ext.fa #combine ext fasta cat original_cluster2_gene_100_v5.5.6.ext.fa Augustus.3ix21_final.ABCGN.ext.fa > original_cluster2_gene_100+ABCGN.ext.fa cat original_cluster2_gene_200_v5.5.6_sample16.ext.fa Augustus.3ix21_final.ABCGN.ext.fa > original_cluster2_gene_200_v5.5.6_sample16+ABCGN.ext.fa #run prrn bsub -o log.txt -We 10 "prrn5 -KA -l2000 -F8 -ph -ps -o original_cluster2_gene_100+ABCGN.align.clustal.html original_cluster2_gene_100+ABCGN.ext.fa" bsub -o log.txt -We 10 "prrn5 -KA -l2000 -F8 -ph -ps -o original_cluster2_gene_200_v5.5.6_sample16+ABCGN.align.clustal.html original_cluster2_gene_200_v5.5.6_sample16+ABCGN.ext.fa" #Replace intron residues with Z perl -0777 -pe 's/.*?b>/Z/gs' original_cluster2_gene_100+ABCGN.align.clustal.html | grep -v "^<" > original_cluster2_gene_100+ABCGN.align.clustal.replaceZ.txt perl -0777 -pe 's/.*?b>/Z/gs' original_cluster2_gene_200_v5.5.6_sample16+ABCGN.align.clustal.html | grep -v "^<" > original_cluster2_gene_200_v5.5.6_sample16+ABCGN.align.clustal.replaceZ.txt ```{r} rsqr = plot_introns("original_cluster2_gene_100+ABCGN", "ABCGN", "unicycle") rsqr #5.059805e-06 rsqr = plot_introns("original_cluster2_gene_200_v5.5.6_sample16+ABCGN", "ABCGN", "bicycle") rsqr #0.002650872 ``` ####################################################################################################################### #check orthogroup #3 OG0000364 g12410.t1: NRF6_CAEEL g7075.t1: NRF6_CAEEL g7076.t1: NRF6_CAEEL g6399.t1: NRF6_CAEEL g15031.t1: NRF6_CAEEL Nose resistant to fluoxetine protein 6 (nrf-6; c. elegans) zgrep "NRF6_" ~/bicycle_classifier/Hcor/Augustus.3ix21_final.pep.fasta.blastp.outfmt6.gz > Augustus.3ix21_final.blastp.NRF6.tsv #run prrn for Hcor DE 200 (subsampled to same number of samples vs this group of genes) ```{r} interest_gene = read.table("Augustus.3ix21_final.blastp.NRF6.tsv", stringsAsFactors = FALSE, header = FALSE) interest_gene.summary = exons.summary %>% filter(Parent %in% interest_gene$V1) write.table(interest_gene.summary$Parent, "Augustus.3ix21_final.NRF6.transcripts_longest.txt", col.names = FALSE, row.names = FALSE, quote = FALSE) ``` seqkit grep -f Augustus.3ix21_final.NRF6.transcripts_longest.txt Augustus.updated_w_annots.3ix21_final.pep_longest.fa > Augustus.3ix21_final.NRF6.fa #16 hits mafft --auto --thread 2 Augustus.3ix21_final.NRF6.fa > Augustus.3ix21_final.NRF6.mafft_auto.fa #bicycle #/groups/stern/home/sternd/bin/anno2gsiseq.pl -f INSECTBASE -o original_cluster2_gene_100_v5.5.6.ext.fa Hcor.Augustus.updated_w_annots.3ix21_final.gff3 original_cluster2_gene_100_v5.5.6.fa #don't need to subsample because only 22 seqs seqkit grep -f <(grep "^>" original_cluster2_gene_200_v5.5.6.fa | cut -c 2- | shuf -n 15) original_cluster2_gene_200_v5.5.6.fa > original_cluster2_gene_200_v5.5.6_sample15.fa /groups/stern/home/sternd/bin/anno2gsiseq.pl -f INSECTBASE -o original_cluster2_gene_200_v5.5.6_sample15.ext.fa Hcor.Augustus.updated_w_annots.3ix21_final.gff3 original_cluster2_gene_200_v5.5.6_sample15.fa sed -i '/^>/ s/$/_DE/' original_cluster2_gene_200_v5.5.6_sample15.ext.fa #NRF6 /groups/stern/home/sternd/bin/anno2gsiseq.pl -f INSECTBASE -o Augustus.3ix21_final.NRF6.ext.fa Hcor.Augustus.updated_w_annots.3ix21_final.gff3 Augustus.3ix21_final.NRF6.fa sed -i '/^>/ s/$/_NRF6/' Augustus.3ix21_final.NRF6.ext.fa #combine ext fasta cat original_cluster2_gene_100_v5.5.6.ext.fa Augustus.3ix21_final.NRF6.ext.fa > original_cluster2_gene_100+NRF6.ext.fa cat original_cluster2_gene_200_v5.5.6_sample15.ext.fa Augustus.3ix21_final.NRF6.ext.fa > original_cluster2_gene_200_v5.5.6_sample15+NRF6.ext.fa #run prrn bsub -o log.txt -We 10 "prrn5 -KA -l2000 -F8 -ph -ps -o original_cluster2_gene_100+NRF6.align.clustal.html original_cluster2_gene_100+NRF6.ext.fa" bsub -o log.txt -We 10 "prrn5 -KA -l2000 -F8 -ph -ps -o original_cluster2_gene_200_v5.5.6_sample15+NRF6.align.clustal.html original_cluster2_gene_200_v5.5.6_sample15+NRF6.ext.fa" #Replace intron residues with Z perl -0777 -pe 's/.*?b>/Z/gs' original_cluster2_gene_100+NRF6.align.clustal.html | grep -v "^<" > original_cluster2_gene_100+NRF6.align.clustal.replaceZ.txt perl -0777 -pe 's/.*?b>/Z/gs' original_cluster2_gene_200_v5.5.6_sample15+NRF6.align.clustal.html | grep -v "^<" > original_cluster2_gene_200_v5.5.6_sample15+NRF6.align.clustal.replaceZ.txt ```{r} rsqr = plot_introns("original_cluster2_gene_100+NRF6", "NRF6", "unicycle") rsqr #0.002364937 rsqr = plot_introns("original_cluster2_gene_200_v5.5.6_sample15+NRF6", "NRF6", "bicycle") rsqr #0.02184701 ``` ####################################################################################################################### OG0000355 g20082.t1: YQII_BACSU g20083.t1: PSAL_TRIEI g20084.t1: AMPD_CITFR g20085.t1: RL9_EHRRW ignore #check orthogroup #4 (only 4 genes in ortholog group; should not use) OG0000487 g2418.t1: ROCK2_HUMAN g2427.t2: ROCK2_HUMAN g4049.t1: ROCK2_HUMAN g4059.t2: ROCK2_HUMAN Rho-associated protein kinase 2 zgrep "ROCK2_" ~/bicycle_classifier/Hcor/Augustus.3ix21_final.pep.fasta.blastp.outfmt6.gz > Augustus.3ix21_final.blastp.ROCK2.tsv #run prrn for Hcor DE 200 (subsampled to same number of samples vs this group of genes) ```{r} interest_gene = read.table("Augustus.3ix21_final.blastp.ROCK2.tsv", stringsAsFactors = FALSE, header = FALSE) interest_gene.summary = exons.summary %>% filter(Parent %in% interest_gene$V1) write.table(interest_gene.summary$Parent, "Augustus.3ix21_final.ROCK2.transcripts_longest.txt", col.names = FALSE, row.names = FALSE, quote = FALSE) ``` seqkit grep -f Augustus.3ix21_final.ROCK2.transcripts_longest.txt Augustus.updated_w_annots.3ix21_final.pep_longest.fa > Augustus.3ix21_final.ROCK2.fa #16 hits mafft --auto --thread 2 Augustus.3ix21_final.ROCK2.fa > Augustus.3ix21_final.ROCK2.mafft_auto.fa #bicycle #/groups/stern/home/sternd/bin/anno2gsiseq.pl -f INSECTBASE -o original_cluster2_gene_100_v5.5.6.ext.fa Hcor.Augustus.updated_w_annots.3ix21_final.gff3 original_cluster2_gene_100_v5.5.6.fa #don't need to subsample because only 22 seqs seqkit grep -f <(grep "^>" original_cluster2_gene_200_v5.5.6.fa | cut -c 2- | shuf -n 4) original_cluster2_gene_200_v5.5.6.fa > original_cluster2_gene_200_v5.5.6_sample4.fa /groups/stern/home/sternd/bin/anno2gsiseq.pl -f INSECTBASE -o original_cluster2_gene_200_v5.5.6_sample4.ext.fa Hcor.Augustus.updated_w_annots.3ix21_final.gff3 original_cluster2_gene_200_v5.5.6_sample4.fa sed -i '/^>/ s/$/_DE/' original_cluster2_gene_200_v5.5.6_sample4.ext.fa #ROCK2 /groups/stern/home/sternd/bin/anno2gsiseq.pl -f INSECTBASE -o Augustus.3ix21_final.ROCK2.ext.fa Hcor.Augustus.updated_w_annots.3ix21_final.gff3 Augustus.3ix21_final.ROCK2.fa sed -i '/^>/ s/$/_ROCK2/' Augustus.3ix21_final.ROCK2.ext.fa #combine ext fasta cat original_cluster2_gene_100_v5.5.6.ext.fa Augustus.3ix21_final.ROCK2.ext.fa > original_cluster2_gene_100+ROCK2.ext.fa cat original_cluster2_gene_200_v5.5.6_sample4.ext.fa Augustus.3ix21_final.ROCK2.ext.fa > original_cluster2_gene_200_v5.5.6_sample4+ROCK2.ext.fa #run prrn bsub -o log.txt -We 10 "prrn5 -KA -l2000 -F8 -ph -ps -o original_cluster2_gene_100+ROCK2.align.clustal.html original_cluster2_gene_100+ROCK2.ext.fa" bsub -o log.txt -We 10 "prrn5 -KA -l2000 -F8 -ph -ps -o original_cluster2_gene_200_v5.5.6_sample4+ROCK2.align.clustal.html original_cluster2_gene_200_v5.5.6_sample4+ROCK2.ext.fa" #Replace intron residues with Z perl -0777 -pe 's/.*?b>/Z/gs' original_cluster2_gene_100+ROCK2.align.clustal.html | grep -v "^<" > original_cluster2_gene_100+ROCK2.align.clustal.replaceZ.txt perl -0777 -pe 's/.*?b>/Z/gs' original_cluster2_gene_200_v5.5.6_sample4+ROCK2.align.clustal.html | grep -v "^<" > original_cluster2_gene_200_v5.5.6_sample4+ROCK2.align.clustal.replaceZ.txt ```{r} rsqr = plot_introns("original_cluster2_gene_100+ROCK2", "ROCK2", "unicycle") rsqr #0.03727323 rsqr = plot_introns("original_cluster2_gene_200_v5.5.6_sample4+ROCK2", "ROCK2", "bicycle") rsqr #0.1692689 ``` ####################################################################################################################### OG0000490 are bicycle genes #check orthogroup #5 (only 6 genes in ortholog group; should not use) OG0000514 g11151.t1: ABCA3_MOUSE g11627.t1: ABCA3_MOUSE g6780.t2: ABCA3_HUMAN g6779.t1: ABCA3_MOUSE Phospholipid-transporting ATPase ABCA3 zgrep "ABCA3_" ~/bicycle_classifier/Hcor/Augustus.3ix21_final.pep.fasta.blastp.outfmt6.gz > Augustus.3ix21_final.blastp.ABCA3.tsv #run prrn for Hcor DE 200 (subsampled to same number of samples vs this group of genes) ```{r} interest_gene = read.table("Augustus.3ix21_final.blastp.ABCA3.tsv", stringsAsFactors = FALSE, header = FALSE) interest_gene.summary = exons.summary %>% filter(Parent %in% interest_gene$V1) write.table(interest_gene.summary$Parent, "Augustus.3ix21_final.ABCA3.transcripts_longest.txt", col.names = FALSE, row.names = FALSE, quote = FALSE) ``` seqkit grep -f Augustus.3ix21_final.ABCA3.transcripts_longest.txt Augustus.updated_w_annots.3ix21_final.pep_longest.fa > Augustus.3ix21_final.ABCA3.fa #16 hits mafft --auto --thread 2 Augustus.3ix21_final.ABCA3.fa > Augustus.3ix21_final.ABCA3.mafft_auto.fa #bicycle #/groups/stern/home/sternd/bin/anno2gsiseq.pl -f INSECTBASE -o original_cluster2_gene_100_v5.5.6.ext.fa Hcor.Augustus.updated_w_annots.3ix21_final.gff3 original_cluster2_gene_100_v5.5.6.fa #don't need to subsample because only 22 seqs seqkit grep -f <(grep "^>" original_cluster2_gene_200_v5.5.6.fa | cut -c 2- | shuf -n 6) original_cluster2_gene_200_v5.5.6.fa > original_cluster2_gene_200_v5.5.6_sample6.fa /groups/stern/home/sternd/bin/anno2gsiseq.pl -f INSECTBASE -o original_cluster2_gene_200_v5.5.6_sample6.ext.fa Hcor.Augustus.updated_w_annots.3ix21_final.gff3 original_cluster2_gene_200_v5.5.6_sample6.fa sed -i '/^>/ s/$/_DE/' original_cluster2_gene_200_v5.5.6_sample6.ext.fa #ABCA3 /groups/stern/home/sternd/bin/anno2gsiseq.pl -f INSECTBASE -o Augustus.3ix21_final.ABCA3.ext.fa Hcor.Augustus.updated_w_annots.3ix21_final.gff3 Augustus.3ix21_final.ABCA3.fa sed -i '/^>/ s/$/_ABCA3/' Augustus.3ix21_final.ABCA3.ext.fa #combine ext fasta cat original_cluster2_gene_100_v5.5.6.ext.fa Augustus.3ix21_final.ABCA3.ext.fa > original_cluster2_gene_100+ABCA3.ext.fa cat original_cluster2_gene_200_v5.5.6_sample6.ext.fa Augustus.3ix21_final.ABCA3.ext.fa > original_cluster2_gene_200_v5.5.6_sample6+ABCA3.ext.fa #run prrn bsub -o log.txt -We 10 "prrn5 -KA -l2000 -F8 -ph -ps -o original_cluster2_gene_100+ABCA3.align.clustal.html original_cluster2_gene_100+ABCA3.ext.fa" bsub -o log.txt -We 10 "prrn5 -KA -l2000 -F8 -ph -ps -o original_cluster2_gene_200_v5.5.6_sample6+ABCA3.align.clustal.html original_cluster2_gene_200_v5.5.6_sample6+ABCA3.ext.fa" #Replace intron residues with Z perl -0777 -pe 's/.*?b>/Z/gs' original_cluster2_gene_100+ABCA3.align.clustal.html | grep -v "^<" > original_cluster2_gene_100+ABCA3.align.clustal.replaceZ.txt perl -0777 -pe 's/.*?b>/Z/gs' original_cluster2_gene_200_v5.5.6_sample6+ABCA3.align.clustal.html | grep -v "^<" > original_cluster2_gene_200_v5.5.6_sample6+ABCA3.align.clustal.replaceZ.txt ```{r} rsqr = plot_introns("original_cluster2_gene_100+ABCA3", "ABCA3", "unicycle") rsqr #0.06400076 rsqr = plot_introns("original_cluster2_gene_200_v5.5.6_sample6+ABCA3", "ABCA3", "bicycle") rsqr #0.002418773 ```